utf_string_conversion_utils.h 4.22 KB
Newer Older
gejun's avatar
gejun committed
1 2 3 4
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

5 6
#ifndef BUTIL_STRINGS_UTF_STRING_CONVERSION_UTILS_H_
#define BUTIL_STRINGS_UTF_STRING_CONVERSION_UTILS_H_
gejun's avatar
gejun committed
7 8 9

// This should only be used by the various UTF string conversion files.

10 11
#include "butil/base_export.h"
#include "butil/strings/string16.h"
gejun's avatar
gejun committed
12

13
namespace butil {
gejun's avatar
gejun committed
14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39

inline bool IsValidCodepoint(uint32_t code_point) {
  // A code point is accepted when it lies inside the Unicode range
  // [0, 0x10FFFF] and outside the surrogate block [0xD800, 0xDFFF].
  // Non-characters and unassigned code points are deliberately accepted.
  if (code_point > 0x10FFFFu) {
    return false;
  }
  const bool is_surrogate = code_point >= 0xD800u && code_point <= 0xDFFFu;
  return !is_surrogate;
}

inline bool IsValidCharacter(uint32_t code_point) {
  // Stricter than a plain code point check: besides surrogates and values
  // above 0x10FFFF, this also rejects the non-characters, i.e. the block
  // U+FDD0..U+FDEF and every code point whose low 16 bits are 0xFFFE or
  // 0xFFFF.
  const bool beyond_unicode = code_point > 0x10FFFFu;
  const bool is_surrogate = code_point >= 0xD800u && code_point <= 0xDFFFu;
  const bool in_noncharacter_block =
      code_point >= 0xFDD0u && code_point <= 0xFDEFu;
  // Masking off bit 0 lets a single comparison match both 0xFFFE and 0xFFFF.
  const bool is_plane_final = (code_point & 0xFFFEu) == 0xFFFEu;

  if (beyond_unicode || is_surrogate) {
    return false;
  }
  return !in_noncharacter_block && !is_plane_final;
}

// ReadUnicodeCharacter --------------------------------------------------------

// Reads a UTF-8 stream, placing the next code point into the given output
// |*code_point_out|. |src| represents the entire string to read, and
// |*char_index| is the character offset within the string to start reading at.
// |*char_index| will be updated to index the last character read, such that
// incrementing it (as in a for loop) will take the reader to the next
// character.
//
// Returns true on success. On failure, |*code_point_out| will be invalid.
40
// UTF-8 overload.
//   src            - the entire string being read, not just the unread tail.
//   src_len        - presumably the length of |src| in bytes; TODO confirm
//                    against the implementation.
//   char_index     - in: offset at which to start reading; out: index of the
//                    last unit read, so that incrementing it moves to the
//                    next character.
//   code_point_out - receives the decoded code point; not meaningful when the
//                    function returns false.
BUTIL_EXPORT bool ReadUnicodeCharacter(const char* src,
gejun's avatar
gejun committed
41 42 43 44 45
                                      int32_t src_len,
                                      int32_t* char_index,
                                      uint32_t* code_point_out);

// Reads a UTF-16 character. The usage is the same as the 8-bit version above.
46
// UTF-16 overload: takes char16 input and is used the same way as the
// const char* (8-bit) overload, with the result stored through |code_point|.
BUTIL_EXPORT bool ReadUnicodeCharacter(const char16* src,
gejun's avatar
gejun committed
47 48 49 50 51 52
                                      int32_t src_len,
                                      int32_t* char_index,
                                      uint32_t* code_point);

#if defined(WCHAR_T_IS_UTF32)
// Reads UTF-32 character. The usage is the same as the 8-bit version above.
53
// UTF-32 overload: only declared when WCHAR_T_IS_UTF32 is defined. Takes
// wchar_t input and is used the same way as the const char* (8-bit) overload.
BUTIL_EXPORT bool ReadUnicodeCharacter(const wchar_t* src,
gejun's avatar
gejun committed
54 55 56 57 58 59 60 61 62 63
                                      int32_t src_len,
                                      int32_t* char_index,
                                      uint32_t* code_point);
#endif  // defined(WCHAR_T_IS_UTF32)

// WriteUnicodeCharacter -------------------------------------------------------

// Appends a UTF-8 character to the given 8-bit string.  Returns the number of
// bytes written.
// TODO(brettw) Bug 79631: This function should not be exposed.
64
// UTF-8 overload: appends |code_point| to the 8-bit string |*output|; the
// return value is the number of bytes written.
BUTIL_EXPORT size_t WriteUnicodeCharacter(uint32_t code_point,
gejun's avatar
gejun committed
65 66 67 68
                                         std::string* output);

// Appends the given code point as a UTF-16 character to the given 16-bit
// string.  Returns the number of 16-bit values written.
69
// UTF-16 overload: appends |code_point| to the 16-bit string |*output|; the
// return value is the number of 16-bit values written.
BUTIL_EXPORT size_t WriteUnicodeCharacter(uint32_t code_point, string16* output);
gejun's avatar
gejun committed
70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94

#if defined(WCHAR_T_IS_UTF32)
// UTF-32 flavour of WriteUnicodeCharacter: the code point is stored as a
// single wchar_t at the end of |*output|. Returns the number of 32-bit values
// written, which is always 1.
inline size_t WriteUnicodeCharacter(uint32_t code_point, std::wstring* output) {
  // No encoding step is needed here; one code point is one string element.
  const wchar_t unit = static_cast<wchar_t>(code_point);
  output->append(1, unit);
  return 1;
}
#endif  // defined(WCHAR_T_IS_UTF32)

// Generalized Unicode converter -----------------------------------------------

// Sizes |*output| ahead of a conversion to UTF-8: guesses the length of the
// UTF-8 result in bytes, clears |*output|, and reserves that amount of space.
//
// |src| and |src_len| describe the input that is about to be converted. The
// estimate assumes that CHAR is an unsigned character type, which will be
// true for UTF-16 and UTF-32 on our systems.
//
// This header only declares the template; no definition is provided here.
template<typename CHAR>
void PrepareForUTF8Output(const CHAR* src, size_t src_len, std::string* output);

// Counterpart of PrepareForUTF8Output() for the opposite direction: prepares
// |*output|, a STRING holding either UTF-16 or UTF-32 data, to receive the
// conversion of the UTF-8 input given by |src| and |src_len|.
//
// This header only declares the template; no definition is provided here.
template<typename STRING>
void PrepareForUTF16Or32Output(const char* src, size_t src_len, STRING* output);

95
}  // namespace butil
gejun's avatar
gejun committed
96

97
#endif  // BUTIL_STRINGS_UTF_STRING_CONVERSION_UTILS_H_