// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

// Author: Ge,Jun (gejun@baidu.com)
// Date: Mon. Apr. 18 19:52:34 CST 2011

// Iteratively split a string by one or multiple separators.

#ifndef BUTIL_STRING_SPLITTER_H
#define BUTIL_STRING_SPLITTER_H

#include <stdlib.h>
#include <stdint.h>
#include "butil/strings/string_piece.h"

// It's common to encode data into strings separated by special characters
// and decode them back, but functions such as `split_string' have to modify
// the input string, which is bad. If we parse the string from scratch, the
// code will be filled with pointer operations and obscure to understand.
//
// What we want is:
// - Scan the string once: just do simple things efficiently.
// - Do not modify input string: Changing input is bad, it may bring hidden
//   bugs, concurrency issues and non-const propagations.
// - Split the string in-place without additional buffer/array.
//
// StringSplitter does meet these requirements.
// Usage:
//     const char* the_string_to_split = ...;
//     for (StringSplitter s(the_string_to_split, '\t'); s; ++s) {
//         printf("%.*s\n", (int)s.length(), s.field());
//     }
// 
// "s" behaves as an iterator and evaluates to true before ending.
// "s.field()" and "s.length()" are address and length of current field
// respectively. Notice that "s.field()" may not end with '\0' because
// we don't modify input. You can copy the field to a dedicated buffer
// or apply a function supporting length.

namespace butil {

// Policy for zero-length fields encountered while splitting.
enum EmptyFieldAction {
    // Zero-length fields are skipped by the splitters.
    SKIP_EMPTY_FIELD = 0,
    // Zero-length fields are not skipped (presumably reported as fields of
    // length 0 -- confirm against the splitter implementation).
    ALLOW_EMPTY_FIELD = 1
};

// Split a string with one separator character.
// The splitter acts as a forward iterator over the fields of the input:
// test it in a boolean context, read the current field with field() and
// length(), and advance it with ++. The input is never modified.
class StringSplitter {
public:
    // Split `input' with `separator'. If `action' is SKIP_EMPTY_FIELD, zero-
    // length() field() will be skipped.
    inline StringSplitter(const char* input, char separator,
                          EmptyFieldAction action = SKIP_EMPTY_FIELD);

    // Split the range starting at `str_begin' and ending at `str_end'.
    // Allows containing embedded '\0' characters and separator can be '\0',
    // if str_end is not NULL.
    inline StringSplitter(const char* str_begin, const char* str_end,
                          char separator,
                          EmptyFieldAction action = SKIP_EMPTY_FIELD);

    // Split the characters referenced by `input'.
    // Allows containing embedded '\0' characters and separator can be '\0'.
    inline StringSplitter(const StringPiece& input, char separator,
                          EmptyFieldAction action = SKIP_EMPTY_FIELD);

    // Move splitter forward.
    inline StringSplitter& operator++();
    inline StringSplitter operator++(int);

    // True iff field() is valid.
    // Returned as `const void*' so the splitter can be tested in conditions
    // such as `for (...; s; ++s)'.
    inline operator const void*() const;

    // Beginning address and length of the field. *(field() + length()) may
    // not be '\0' because we don't modify `input'.
    inline const char* field() const;
    inline size_t length() const;

    // The current field as a StringPiece.
    // NOTE(review): presumably equivalent to StringPiece(field(), length());
    // the definition is not in this header -- confirm in the _inl.h file.
    inline StringPiece field_sp() const;

    // Cast field to specific type, and write the value into `pv'.
    // Returns 0 on success, -1 otherwise.
    // NOTE: If separator is a digit, casting functions always return -1.
    inline int to_int8(int8_t *pv) const;
    inline int to_uint8(uint8_t *pv) const;
    inline int to_int(int *pv) const;
    inline int to_uint(unsigned int *pv) const;
    inline int to_long(long *pv) const;
    inline int to_ulong(unsigned long *pv) const;
    inline int to_longlong(long long *pv) const;
    inline int to_ulonglong(unsigned long long *pv) const;
    inline int to_float(float *pv) const;
    inline int to_double(double *pv) const;

private:
    // Internal helpers; their definitions are not in this header.
    inline bool not_end(const char* p) const;
    inline void init();

    // NOTE(review): presumably _head/_tail delimit the current field and
    // _str_tail marks the end of the whole input (may be NULL) -- confirm
    // against the implementation. Declaration order is kept as-is because
    // constructor initializer lists elsewhere may rely on it.
    const char* _head;
    const char* _tail;
    const char* _str_tail;
    const char _sep;
    const EmptyFieldAction _empty_field_action;
};

// Split a string with one of the separators
class StringMultiSplitter {
public:
    // Split `input' with one character of `separators'. If `action' is
    // SKIP_EMPTY_FIELD, zero-length() field() will be skipped.
    // NOTE: This utility stores pointer of `separators' directly rather than
    //       copying the content because this utility is intended to be used
    //       in ad-hoc manner where lifetime of `separators' is generally
    //       longer than this utility.
    inline StringMultiSplitter(const char* input, const char* separators,
                               EmptyFieldAction action = SKIP_EMPTY_FIELD);
126 127
    // Allows containing embedded '\0' characters if str_end is not NULL.
    // NOTE: `separators` cannot contain embedded '\0' character.
gejun's avatar
gejun committed
128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
    inline StringMultiSplitter(const char* str_begin, const char* str_end,
                               const char* separators,
                               EmptyFieldAction action = SKIP_EMPTY_FIELD);

    // Move splitter forward.
    inline StringMultiSplitter& operator++();
    inline StringMultiSplitter operator++(int);

    // True iff field() is valid.
    inline operator const void*() const;

    // Beginning address and length of the field. *(field() + length()) may
    // not be '\0' because we don't modify `input'.
    inline const char* field() const;
    inline size_t length() const;
143
    inline StringPiece field_sp() const;
gejun's avatar
gejun committed
144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170

    // Cast field to specific type, and write the value into `pv'.
    // Returns 0 on success, -1 otherwise.
    // NOTE: If separators contains digit, casting functions always return -1.
    inline int to_int8(int8_t *pv) const;
    inline int to_uint8(uint8_t *pv) const;
    inline int to_int(int *pv) const;
    inline int to_uint(unsigned int *pv) const;
    inline int to_long(long *pv) const;
    inline int to_ulong(unsigned long *pv) const;
    inline int to_longlong(long long *pv) const;
    inline int to_ulonglong(unsigned long long *pv) const;
    inline int to_float(float *pv) const;
    inline int to_double(double *pv) const;

private:
    inline bool is_sep(char c) const;
    inline bool not_end(const char* p) const;
    inline void init();
    
    const char* _head;
    const char* _tail;
    const char* _str_tail;
    const char* const _seps;
    const EmptyFieldAction _empty_field_action;
};

// Split query in the format according to the given delimiters.
// This class can also handle some exceptional cases.
// 1. consecutive pair_delimiter are omitted, for example,
//    suppose key_value_delimiter is '=' and pair_delimiter
//    is '&', then 'k1=v1&&&k2=v2' is normalized to 'k1=v1&k2=v2'.
// 2. key or value can be empty or both can be empty.
// 3. consecutive key_value_delimiter are not omitted, for example,
//    suppose input is 'k1===v2' and key_value_delimiter is '=', then
//    key() returns 'k1', value() returns '==v2'.
class KeyValuePairsSplitter {
public:
    inline KeyValuePairsSplitter(const char* str_begin,
                                 const char* str_end,
184 185 186
                                 char pair_delimiter,
                                 char key_value_delimiter)
        : _sp(str_begin, str_end, pair_delimiter)
187 188
        , _delim_pos(StringPiece::npos)
        , _key_value_delim(key_value_delimiter) {
189
        UpdateDelimiterPosition();
190 191 192
    }

    inline KeyValuePairsSplitter(const char* str_begin,
193 194
                                 char pair_delimiter,
                                 char key_value_delimiter)
195
        : KeyValuePairsSplitter(str_begin, NULL,
196
                pair_delimiter, key_value_delimiter) {}
197 198

    inline KeyValuePairsSplitter(const StringPiece &sp,
199 200
                                 char pair_delimiter,
                                 char key_value_delimiter)
201
        : KeyValuePairsSplitter(sp.begin(), sp.end(),
202
                pair_delimiter, key_value_delimiter) {}
203

204
    inline StringPiece key() {
205
        return key_and_value().substr(0, _delim_pos);
206 207
    }

208
    inline StringPiece value() {
209
        return key_and_value().substr(_delim_pos + 1);
210 211
    }

212
    // Get the current value of key and value
213
    // in the format of "key=value"
214
    inline StringPiece key_and_value() {
215 216 217 218 219 220
        return StringPiece(_sp.field(), _sp.length());
    }

    // Move splitter forward.
    inline KeyValuePairsSplitter& operator++() {
        ++_sp;
221
        UpdateDelimiterPosition();
222 223 224 225 226 227 228 229 230 231 232 233
        return *this;
    }

    inline KeyValuePairsSplitter operator++(int) {
        KeyValuePairsSplitter tmp = *this;
        operator++();
        return tmp;
    }

    inline operator const void*() const { return _sp; }

private:
234
    inline void UpdateDelimiterPosition();
235 236 237

private:
    StringSplitter _sp;
238 239
    StringPiece::size_type _delim_pos;
    const char _key_value_delim;
240 241
};

}  // namespace butil

#include "butil/string_splitter_inl.h"

#endif  // BUTIL_STRING_SPLITTER_H