utf_offset_string_conversions.cc

// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "butil/strings/utf_offset_string_conversions.h"

#include <algorithm>

#include "butil/logging.h"
#include "butil/memory/scoped_ptr.h"
#include "butil/strings/string_piece.h"
#include "butil/strings/utf_string_conversion_utils.h"

namespace butil {

OffsetAdjuster::Adjustment::Adjustment(size_t original_offset,
                                       size_t original_length,
                                       size_t output_length)
    : original_offset(original_offset),
      original_length(original_length),
      output_length(output_length) {
}

// static
void OffsetAdjuster::AdjustOffsets(
    const Adjustments& adjustments,
    std::vector<size_t>* offsets_for_adjustment) {
  if (!offsets_for_adjustment || adjustments.empty())
    return;
  for (std::vector<size_t>::iterator i(offsets_for_adjustment->begin());
       i != offsets_for_adjustment->end(); ++i)
    AdjustOffset(adjustments, &(*i));
}

// static
void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments,
                                  size_t* offset) {
  if (*offset == string16::npos)
    return;
  int adjustment = 0;
  for (Adjustments::const_iterator i = adjustments.begin();
       i != adjustments.end(); ++i) {
    if (*offset <= i->original_offset)
      break;
    if (*offset < (i->original_offset + i->original_length)) {
      *offset = string16::npos;
      return;
    }
    adjustment += static_cast<int>(i->original_length - i->output_length);
  }
  *offset -= adjustment;
}

// static
void OffsetAdjuster::UnadjustOffsets(
    const Adjustments& adjustments,
    std::vector<size_t>* offsets_for_unadjustment) {
  if (!offsets_for_unadjustment || adjustments.empty())
    return;
  for (std::vector<size_t>::iterator i(offsets_for_unadjustment->begin());
       i != offsets_for_unadjustment->end(); ++i)
    UnadjustOffset(adjustments, &(*i));
}

// static
void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments,
                                    size_t* offset) {
  if (*offset == string16::npos)
    return;
  int adjustment = 0;
  for (Adjustments::const_iterator i = adjustments.begin();
       i != adjustments.end(); ++i) {
    if (*offset + adjustment <= i->original_offset)
      break;
    adjustment += static_cast<int>(i->original_length - i->output_length);
    if ((*offset + adjustment) <
        (i->original_offset + i->original_length)) {
      *offset = string16::npos;
      return;
    }
  }
  *offset += adjustment;
}

// static
void OffsetAdjuster::MergeSequentialAdjustments(
    const Adjustments& first_adjustments,
    Adjustments* adjustments_on_adjusted_string) {
  Adjustments::iterator adjusted_iter = adjustments_on_adjusted_string->begin();
  Adjustments::const_iterator first_iter = first_adjustments.begin();
  // Simultaneously iterate over all |adjustments_on_adjusted_string| and
  // |first_adjustments|, adding adjustments to or correcting the adjustments
  // in |adjustments_on_adjusted_string| as we go.  |shift| keeps track of the
  // current number of characters collapsed by |first_adjustments| up to this
  // point.  |currently_collapsing| keeps track of the number of characters
  // collapsed by |first_adjustments| into the current |adjusted_iter|'s
  // length.  These are characters that will change |shift| as soon as we're
  // done processing the current |adjusted_iter|; they are not yet reflected in
  // |shift|.
  size_t shift = 0;
  size_t currently_collapsing = 0;
  while (adjusted_iter != adjustments_on_adjusted_string->end()) {
    if ((first_iter == first_adjustments.end()) ||
        ((adjusted_iter->original_offset + shift +
          adjusted_iter->original_length) <= first_iter->original_offset)) {
      // Entire |adjusted_iter| (accounting for its shift and including its
      // whole original length) comes before |first_iter|.
      //
      // Correct the offset at |adjusted_iter| and move onto the next
      // adjustment that needs revising.
      adjusted_iter->original_offset += shift;
      shift += currently_collapsing;
      currently_collapsing = 0;
      ++adjusted_iter;
    } else if ((adjusted_iter->original_offset + shift) >
               first_iter->original_offset) {
      // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|).

      // It's not possible for the adjustments to overlap.  (It shouldn't
      // be possible that we have an |adjusted_iter->original_offset| that,
      // when adjusted by the computed |shift|, is in the middle of
      // |first_iter|'s output's length.  After all, that would mean the
      // current adjustment_on_adjusted_string somehow points to an offset
      // that was supposed to have been eliminated by the first set of
      // adjustments.)
      DCHECK_LE(first_iter->original_offset + first_iter->output_length,
                adjusted_iter->original_offset + shift);

      // Add the |first_adjustment_iter| to the full set of adjustments while
      // making sure |adjusted_iter| continues pointing to the same element.
      // We do this by inserting the |first_adjustment_iter| right before
      // |adjusted_iter|, then incrementing |adjusted_iter| so it points to
      // the following element.
      shift += first_iter->original_length - first_iter->output_length;
      adjusted_iter = adjustments_on_adjusted_string->insert(
          adjusted_iter, *first_iter);
      ++adjusted_iter;
      ++first_iter;
    } else {
      // The first adjustment adjusted something that then got further adjusted
      // by the second set of adjustments.  In other words, |first_iter| points
      // to something in the range covered by |adjusted_iter|'s length (after
      // accounting for |shift|).  Precisely,
      //   adjusted_iter->original_offset + shift
      //   <=
      //   first_iter->original_offset
      //   <=
      //   adjusted_iter->original_offset + shift +
      //       adjusted_iter->original_length

      // Modify the current |adjusted_iter| to include whatever collapsing
      // happened in |first_iter|, then advance to the next |first_adjustments|
      // because we dealt with the current one.
      const int collapse = static_cast<int>(first_iter->original_length) -
          static_cast<int>(first_iter->output_length);
      // This function does not know how to deal with a string that expands and
      // then gets modified, only strings that collapse and then get modified.
      DCHECK_GT(collapse, 0);
      adjusted_iter->original_length += collapse;
      currently_collapsing += collapse;
      ++first_iter;
    }
  }
  DCHECK_EQ(0u, currently_collapsing);
  if (first_iter != first_adjustments.end()) {
    // Only first adjustments are left.  These do not need to be modified.
    // (Their offsets are already correct with respect to the original string.)
    // Append them all.
    DCHECK(adjusted_iter == adjustments_on_adjusted_string->end());
    adjustments_on_adjusted_string->insert(
        adjustments_on_adjusted_string->end(), first_iter,
        first_adjustments.end());
  }
}

// Converts the given source Unicode character type to the given destination
// Unicode character type as a STL string. The given input buffer and size
// determine the source, and the given output STL string will be replaced by
// the result.  If non-NULL, |adjustments| is set to reflect the all the
// alterations to the string that are not one-character-to-one-character.
// It will always be sorted by increasing offset.
template<typename SrcChar, typename DestStdString>
bool ConvertUnicode(const SrcChar* src,
                    size_t src_len,
                    DestStdString* output,
                    OffsetAdjuster::Adjustments* adjustments) {
  if (adjustments)
    adjustments->clear();
  // ICU requires 32-bit numbers.
  bool success = true;
  int32_t src_len32 = static_cast<int32_t>(src_len);
  for (int32_t i = 0; i < src_len32; i++) {
    uint32_t code_point;
    size_t original_i = i;
    size_t chars_written = 0;
    if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
      chars_written = WriteUnicodeCharacter(code_point, output);
    } else {
      chars_written = WriteUnicodeCharacter(0xFFFD, output);
      success = false;
    }

    // Only bother writing an adjustment if this modification changed the
    // length of this character.
    // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
    // character read, not after it (so that incrementing it in the loop
    // increment will place it at the right location), so we need to account
    // for that in determining the amount that was read.
    if (adjustments && ((i - original_i + 1) != chars_written)) {
      adjustments->push_back(OffsetAdjuster::Adjustment(
          original_i, i - original_i + 1, chars_written));
    }
  }
  return success;
}

bool UTF8ToUTF16WithAdjustments(
    const char* src,
    size_t src_len,
    string16* output,
    butil::OffsetAdjuster::Adjustments* adjustments) {
  PrepareForUTF16Or32Output(src, src_len, output);
  return ConvertUnicode(src, src_len, output, adjustments);
}

string16 UTF8ToUTF16WithAdjustments(
    const butil::StringPiece& utf8,
    butil::OffsetAdjuster::Adjustments* adjustments) {
  string16 result;
  UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments);
  return result;
}

string16 UTF8ToUTF16AndAdjustOffsets(
    const butil::StringPiece& utf8,
    std::vector<size_t>* offsets_for_adjustment) {
  std::for_each(offsets_for_adjustment->begin(),
                offsets_for_adjustment->end(),
                LimitOffset<butil::StringPiece>(utf8.length()));
  OffsetAdjuster::Adjustments adjustments;
  string16 result = UTF8ToUTF16WithAdjustments(utf8, &adjustments);
  OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
  return result;
}

std::string UTF16ToUTF8AndAdjustOffsets(
    const butil::StringPiece16& utf16,
    std::vector<size_t>* offsets_for_adjustment) {
  std::for_each(offsets_for_adjustment->begin(),
                offsets_for_adjustment->end(),
                LimitOffset<butil::StringPiece16>(utf16.length()));
  std::string result;
  PrepareForUTF8Output(utf16.data(), utf16.length(), &result);
  OffsetAdjuster::Adjustments adjustments;
  ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments);
  OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
  return result;
}

}  // namespace butil