/*
**********************************************************************
*   Copyright (C) 2008-2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   05/11/2008  Andy Heninger  Port from Java
**********************************************************************
*/

#include "unicode/utypes.h"

#if  !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION

#include "unicode/unifilt.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/brkiter.h"
#include "brktrans.h"
#include "unicode/uchar.h"
#include "cmemory.h"
#include "uprops.h"
#include "uinvchar.h"
#include "util.h"
#include "uvectr32.h"

U_NAMESPACE_BEGIN

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)

// Default insertion string: a single U+0020 SPACE.
static const UChar SPACE       = 32;  // ' '


/**
 * Constructs a transliterator with the ID "Any-BreakInternal" whose
 * insertion string is a single space.  The break iterator is not
 * created here; it is built lazily by getBreakIterator().
 *
 * @param adoptedFilter  filter handed on to the Transliterator base class.
 */
BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
        Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
        fInsertion(SPACE) {
    UErrorCode ec = U_ZERO_ERROR;
    bi = NULL;
    // The status of the vector construction is not inspected here.
    boundaries = new UVector32(ec);
}


/**
 * Destructor.  Releases the break iterator and the boundary stack.
 */
BreakTransliterator::~BreakTransliterator() {
    delete bi;
    bi = NULL;

    delete boundaries;
    boundaries = NULL;
}


/**
 * Copy constructor.  Clones the source's break iterator when it has one,
 * copies the insertion string, and starts with a fresh, empty boundary
 * stack (the source's stack contents are not copied).
 */
BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
        Transliterator(o),
        fInsertion(o.fInsertion) {
    bi = (o.bi != NULL) ? o.bi->clone() : NULL;

    UErrorCode ec = U_ZERO_ERROR;
    boundaries = new UVector32(ec);
}


/**
 * Transliterator API.  Returns a heap-allocated copy of this object,
 * made with the copy constructor.
 */
Transliterator* BreakTransliterator::clone(void) const {
    BreakTransliterator *copy = new BreakTransliterator(*this);
    return copy;
}


/**
 * Implements {@link Transliterator#handleTransliterate}.
*/ void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, UBool isIncremental ) const { UErrorCode status = U_ZERO_ERROR; boundaries->removeAllElements(); BreakTransliterator *nonConstThis = (BreakTransliterator *)this; nonConstThis->getBreakIterator(); // Lazy-create it if necessary UnicodeString sText = replaceableAsString(text); bi->setText(sText); bi->preceding(offsets.start); // To make things much easier, we will stack the boundaries, and then insert at the end. // generally, we won't need too many, since we will be filtered. int32_t boundary; for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) { if (boundary == 0) continue; // HACK: Check to see that preceeding item was a letter UChar32 cp = sText.char32At(boundary-1); int type = u_charType(cp); //System.out.println(Integer.toString(cp,16) + " (before): " + type); if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; cp = sText.char32At(boundary); type = u_charType(cp); //System.out.println(Integer.toString(cp,16) + " (after): " + type); if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; boundaries->addElement(boundary, status); // printf("Boundary at %d\n", boundary); } int delta = 0; int lastBoundary = 0; if (boundaries->size() != 0) { // if we found something, adjust delta = boundaries->size() * fInsertion.length(); lastBoundary = boundaries->lastElementi(); // we do this from the end backwards, so that we don't have to keep updating. while (boundaries->size() > 0) { boundary = boundaries->popi(); text.handleReplaceBetween(boundary, boundary, fInsertion); } } // Now fix up the return values offsets.contextLimit += delta; offsets.limit += delta; offsets.start = isIncremental ? lastBoundary + delta : offsets.limit; // TODO: do something with U_FAILURE(status); // (need to look at transliterators overall, not just here.) 
} // // getInsertion() // const UnicodeString &BreakTransliterator::getInsertion() const { return fInsertion; } // // setInsertion() // void BreakTransliterator::setInsertion(const UnicodeString &insertion) { this->fInsertion = insertion; } // // getBreakIterator Lazily create the break iterator if it does // not already exist. Copied from Java, probably // better to just create it in the constructor. // BreakIterator *BreakTransliterator::getBreakIterator() { UErrorCode status = U_ZERO_ERROR; if (bi == NULL) { // Note: Thai breaking behavior is universal, it is not // tied to the Thai locale. bi = BreakIterator::createWordInstance(Locale::getEnglish(), status); } return bi; } // // replaceableAsString Hack to let break iterators work // on the replaceable text from transliterators. // In practice, the only real Replaceable type that we // will be seeing is UnicodeString, so this function // will normally be efficient. // UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) { UnicodeString s; UnicodeString *rs = dynamic_cast<UnicodeString *>(&r); if (rs != NULL) { s = *rs; } else { r.extractBetween(0, r.length(), s); } return s; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */