Home | History | Annotate | Download | only in editing
      1 /*
      2  * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010 Apple Inc. All rights reserved.
      3  * Copyright (C) 2005 Alexey Proskuryakov.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  *
     14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
     15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
     18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25  */
     26 
     27 #include "config.h"
     28 #include "TextIterator.h"
     29 
     30 #include "Document.h"
     31 #include "Frame.h"
     32 #include "HTMLElement.h"
     33 #include "HTMLNames.h"
     34 #include "htmlediting.h"
     35 #include "InlineTextBox.h"
     36 #include "Range.h"
     37 #include "RenderTableCell.h"
     38 #include "RenderTableRow.h"
     39 #include "RenderTextControl.h"
     40 #include "RenderTextFragment.h"
     41 #include "TextBoundaries.h"
     42 #include "TextBreakIterator.h"
     43 #include "VisiblePosition.h"
     44 #include "visible_units.h"
     45 #include <wtf/unicode/CharacterNames.h>
     46 
     47 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
     48 #include "TextBreakIteratorInternalICU.h"
     49 #include <unicode/usearch.h>
     50 #endif
     51 
     52 using namespace WTF::Unicode;
     53 using namespace std;
     54 
     55 namespace WebCore {
     56 
     57 using namespace HTMLNames;
     58 
     59 // Buffer that knows how to compare with a search target.
     60 // Keeps enough of the previous text to be able to search in the future, but no more.
     61 // Non-breaking spaces are always equal to normal spaces.
     62 // Case folding is also done if the CaseInsensitive option is specified.
     63 // Matches are further filtered if the AtWordStarts option is specified, although some
     64 // matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well.
     65 class SearchBuffer {
     66     WTF_MAKE_NONCOPYABLE(SearchBuffer);
     67 public:
     68     SearchBuffer(const String& target, FindOptions);
     69     ~SearchBuffer();
     70 
     71     // Returns number of characters appended; guaranteed to be in the range [1, length].
     72     size_t append(const UChar*, size_t length);
     73     bool needsMoreContext() const;
     74     void prependContext(const UChar*, size_t length);
     75     void reachedBreak();
     76 
     77     // Result is the size in characters of what was found.
     78     // And <startOffset> is the number of characters back to the start of what was found.
     79     size_t search(size_t& startOffset);
     80     bool atBreak() const;
     81 
     82 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
     83 
     84 private:
     85     bool isBadMatch(const UChar*, size_t length) const;
     86     bool isWordStartMatch(size_t start, size_t length) const;
     87 
     88     String m_target;
     89     FindOptions m_options;
     90 
     91     Vector<UChar> m_buffer;
     92     size_t m_overlap;
     93     size_t m_prefixLength;
     94     bool m_atBreak;
     95     bool m_needsMoreContext;
     96 
     97     bool m_targetRequiresKanaWorkaround;
     98     Vector<UChar> m_normalizedTarget;
     99     mutable Vector<UChar> m_normalizedMatch;
    100 
    101 #else
    102 
    103 private:
    104     void append(UChar, bool isCharacterStart);
    105     size_t length() const;
    106 
    107     String m_target;
    108     FindOptions m_options;
    109 
    110     Vector<UChar> m_buffer;
    111     Vector<bool> m_isCharacterStartBuffer;
    112     bool m_isBufferFull;
    113     size_t m_cursor;
    114 
    115 #endif
    116 };
    117 
    118 // --------
    119 
    120 static const unsigned bitsInWord = sizeof(unsigned) * 8;
    121 static const unsigned bitInWordMask = bitsInWord - 1;
    122 
    123 BitStack::BitStack()
    124     : m_size(0)
    125 {
    126 }
    127 
    128 BitStack::~BitStack()
    129 {
    130 }
    131 
    132 void BitStack::push(bool bit)
    133 {
    134     unsigned index = m_size / bitsInWord;
    135     unsigned shift = m_size & bitInWordMask;
    136     if (!shift && index == m_words.size()) {
    137         m_words.grow(index + 1);
    138         m_words[index] = 0;
    139     }
    140     unsigned& word = m_words[index];
    141     unsigned mask = 1U << shift;
    142     if (bit)
    143         word |= mask;
    144     else
    145         word &= ~mask;
    146     ++m_size;
    147 }
    148 
    149 void BitStack::pop()
    150 {
    151     if (m_size)
    152         --m_size;
    153 }
    154 
    155 bool BitStack::top() const
    156 {
    157     if (!m_size)
    158         return false;
    159     unsigned shift = (m_size - 1) & bitInWordMask;
    160     return m_words.last() & (1U << shift);
    161 }
    162 
    163 unsigned BitStack::size() const
    164 {
    165     return m_size;
    166 }
    167 
    168 // --------
    169 
    170 #if !ASSERT_DISABLED
    171 
    172 static unsigned depthCrossingShadowBoundaries(Node* node)
    173 {
    174     unsigned depth = 0;
    175     for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
    176         ++depth;
    177     return depth;
    178 }
    179 
    180 #endif
    181 
    182 // This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees.
    183 static Node* nextInPreOrderCrossingShadowBoundaries(Node* rangeEndContainer, int rangeEndOffset)
    184 {
    185     if (!rangeEndContainer)
    186         return 0;
    187     if (rangeEndOffset >= 0 && !rangeEndContainer->offsetInCharacters()) {
    188         if (Node* next = rangeEndContainer->childNode(rangeEndOffset))
    189             return next;
    190     }
    191     for (Node* node = rangeEndContainer; node; node = node->parentOrHostNode()) {
    192         if (Node* next = node->nextSibling())
    193             return next;
    194     }
    195     return 0;
    196 }
    197 
    198 // --------
    199 
    200 static inline bool fullyClipsContents(Node* node)
    201 {
    202     RenderObject* renderer = node->renderer();
    203     if (!renderer || !renderer->isBox() || !renderer->hasOverflowClip())
    204         return false;
    205     return toRenderBox(renderer)->size().isEmpty();
    206 }
    207 
    208 static inline bool ignoresContainerClip(Node* node)
    209 {
    210     RenderObject* renderer = node->renderer();
    211     if (!renderer || renderer->isText())
    212         return false;
    213     EPosition position = renderer->style()->position();
    214     return position == AbsolutePosition || position == FixedPosition;
    215 }
    216 
    217 static void pushFullyClippedState(BitStack& stack, Node* node)
    218 {
    219     ASSERT(stack.size() == depthCrossingShadowBoundaries(node));
    220 
    221     // Push true if this node full clips its contents, or if a parent already has fully
    222     // clipped and this is not a node that ignores its container's clip.
    223     stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node)));
    224 }
    225 
    226 static void setUpFullyClippedStack(BitStack& stack, Node* node)
    227 {
    228     // Put the nodes in a vector so we can iterate in reverse order.
    229     Vector<Node*, 100> ancestry;
    230     for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
    231         ancestry.append(parent);
    232 
    233     // Call pushFullyClippedState on each node starting with the earliest ancestor.
    234     size_t size = ancestry.size();
    235     for (size_t i = 0; i < size; ++i)
    236         pushFullyClippedState(stack, ancestry[size - i - 1]);
    237     pushFullyClippedState(stack, node);
    238 
    239     ASSERT(stack.size() == 1 + depthCrossingShadowBoundaries(node));
    240 }
    241 
    242 #if OS(ANDROID)
    243 static bool checkFormControlElement(Node* startNode)
    244 {
    245     Node* node = startNode;
    246     while (node) {
    247         if (node->isElementNode() && static_cast<Element*>(node)->isFormControlElement())
    248             return true;
    249         node = node->parentNode();
    250     }
    251     return false;
    252 }
    253 #endif
    254 
    255 
    256 // --------
    257 
    258 TextIterator::TextIterator()
    259     : m_startContainer(0)
    260     , m_startOffset(0)
    261     , m_endContainer(0)
    262     , m_endOffset(0)
    263     , m_positionNode(0)
    264     , m_textCharacters(0)
    265     , m_textLength(0)
    266     , m_remainingTextBox(0)
    267     , m_firstLetterText(0)
    268     , m_lastCharacter(0)
    269     , m_emitsCharactersBetweenAllVisiblePositions(false)
    270     , m_entersTextControls(false)
    271     , m_emitsTextWithoutTranscoding(false)
    272     , m_handledFirstLetter(false)
    273     , m_ignoresStyleVisibility(false)
    274     , m_emitsObjectReplacementCharacters(false)
    275 #if OS(ANDROID)
    276     , m_stopsOnFormControls(false)
    277     , m_shouldStop(false)
    278 #endif
    279 {
    280 }
    281 
    282 TextIterator::TextIterator(const Range* r, TextIteratorBehavior behavior)
    283     : m_startContainer(0)
    284     , m_startOffset(0)
    285     , m_endContainer(0)
    286     , m_endOffset(0)
    287     , m_positionNode(0)
    288     , m_textCharacters(0)
    289     , m_textLength(0)
    290     , m_remainingTextBox(0)
    291     , m_firstLetterText(0)
    292     , m_emitsCharactersBetweenAllVisiblePositions(behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)
    293     , m_entersTextControls(behavior & TextIteratorEntersTextControls)
    294     , m_emitsTextWithoutTranscoding(behavior & TextIteratorEmitsTextsWithoutTranscoding)
    295     , m_handledFirstLetter(false)
    296     , m_ignoresStyleVisibility(behavior & TextIteratorIgnoresStyleVisibility)
    297     , m_emitsObjectReplacementCharacters(behavior & TextIteratorEmitsObjectReplacementCharacters)
    298 #if OS(ANDROID)
    299     , m_stopsOnFormControls(behavior & TextIteratorStopsOnFormControls)
    300     , m_shouldStop(false)
    301 #endif
    302 {
    303     if (!r)
    304         return;
    305 
    306     // get and validate the range endpoints
    307     Node* startContainer = r->startContainer();
    308     if (!startContainer)
    309         return;
    310     int startOffset = r->startOffset();
    311     Node* endContainer = r->endContainer();
    312     int endOffset = r->endOffset();
    313 
    314     // Callers should be handing us well-formed ranges. If we discover that this isn't
    315     // the case, we could consider changing this assertion to an early return.
    316     ASSERT(r->boundaryPointsValid());
    317 
    318     // remember range - this does not change
    319     m_startContainer = startContainer;
    320     m_startOffset = startOffset;
    321     m_endContainer = endContainer;
    322     m_endOffset = endOffset;
    323 
    324     // set up the current node for processing
    325     m_node = r->firstNode();
    326     if (!m_node)
    327         return;
    328     setUpFullyClippedStack(m_fullyClippedStack, m_node);
    329     m_offset = m_node == m_startContainer ? m_startOffset : 0;
    330     m_handledNode = false;
    331     m_handledChildren = false;
    332 
    333     // calculate first out of bounds node
    334     m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(endContainer, endOffset);
    335 
    336     // initialize node processing state
    337     m_needsAnotherNewline = false;
    338     m_textBox = 0;
    339 
    340     // initialize record of previous node processing
    341     m_hasEmitted = false;
    342     m_lastTextNode = 0;
    343     m_lastTextNodeEndedWithCollapsedSpace = false;
    344     m_lastCharacter = 0;
    345 
    346 #ifndef NDEBUG
    347     // need this just because of the assert in advance()
    348     m_positionNode = m_node;
    349 #endif
    350 
    351     // identify the first run
    352     advance();
    353 }
    354 
    355 TextIterator::~TextIterator()
    356 {
    357 }
    358 
    359 bool TextIterator::atEnd() const
    360 {
    361 #if OS(ANDROID)
    362     return !m_positionNode || m_shouldStop;
    363 #else
    364     return !m_positionNode;
    365 #endif
    366 }
    367 
    368 void TextIterator::advance()
    369 {
    370 #if OS(ANDROID)
    371     if (m_shouldStop)
    372         return;
    373 #endif
    374     // reset the run information
    375     m_positionNode = 0;
    376     m_textLength = 0;
    377 
    378     // handle remembered node that needed a newline after the text node's newline
    379     if (m_needsAnotherNewline) {
    380         // Emit the extra newline, and position it *inside* m_node, after m_node's
    381         // contents, in case it's a block, in the same way that we position the first
    382         // newline.  The range for the emitted newline should start where the line
    383         // break begins.
    384         // FIXME: It would be cleaner if we emitted two newlines during the last
    385         // iteration, instead of using m_needsAnotherNewline.
    386         Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
    387         emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
    388         m_needsAnotherNewline = false;
    389         return;
    390     }
    391 
    392     if (!m_textBox && m_remainingTextBox) {
    393         m_textBox = m_remainingTextBox;
    394         m_remainingTextBox = 0;
    395         m_firstLetterText = 0;
    396         m_offset = 0;
    397     }
    398     // handle remembered text box
    399     if (m_textBox) {
    400         handleTextBox();
    401         if (m_positionNode)
    402             return;
    403     }
    404 
    405     while (m_node && m_node != m_pastEndNode) {
    406 #if OS(ANDROID)
    407         if (!m_shouldStop && m_stopsOnFormControls && checkFormControlElement(m_node))
    408             m_shouldStop = true;
    409 #endif
    410         // if the range ends at offset 0 of an element, represent the
    411         // position, but not the content, of that element e.g. if the
    412         // node is a blockflow element, emit a newline that
    413         // precedes the element
    414         if (m_node == m_endContainer && m_endOffset == 0) {
    415             representNodeOffsetZero();
    416             m_node = 0;
    417             return;
    418         }
    419 
    420         RenderObject* renderer = m_node->renderer();
    421         if (!renderer) {
    422             m_handledNode = true;
    423             m_handledChildren = true;
    424         } else {
    425             // handle current node according to its type
    426             if (!m_handledNode) {
    427                 if (renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) // FIXME: What about CDATA_SECTION_NODE?
    428                     m_handledNode = handleTextNode();
    429                 else if (renderer && (renderer->isImage() || renderer->isWidget() ||
    430                          (renderer->node() && renderer->node()->isElementNode() &&
    431                           static_cast<Element*>(renderer->node())->isFormControlElement())))
    432                     m_handledNode = handleReplacedElement();
    433                 else
    434                     m_handledNode = handleNonTextNode();
    435                 if (m_positionNode)
    436                     return;
    437             }
    438         }
    439 
    440         // find a new current node to handle in depth-first manner,
    441         // calling exitNode() as we come back thru a parent node
    442         Node* next = m_handledChildren ? 0 : m_node->firstChild();
    443         m_offset = 0;
    444         if (!next) {
    445             next = m_node->nextSibling();
    446             if (!next) {
    447                 bool pastEnd = m_node->traverseNextNode() == m_pastEndNode;
    448                 Node* parentNode = m_node->parentOrHostNode();
    449                 while (!next && parentNode) {
    450                     if ((pastEnd && parentNode == m_endContainer) || m_endContainer->isDescendantOf(parentNode))
    451                         return;
    452                     bool haveRenderer = m_node->renderer();
    453                     m_node = parentNode;
    454                     m_fullyClippedStack.pop();
    455                     parentNode = m_node->parentOrHostNode();
    456                     if (haveRenderer)
    457                         exitNode();
    458                     if (m_positionNode) {
    459                         m_handledNode = true;
    460                         m_handledChildren = true;
    461                         return;
    462                     }
    463                     next = m_node->nextSibling();
    464                 }
    465             }
    466             m_fullyClippedStack.pop();
    467         }
    468 
    469         // set the new current node
    470         m_node = next;
    471         if (m_node)
    472             pushFullyClippedState(m_fullyClippedStack, m_node);
    473         m_handledNode = false;
    474         m_handledChildren = false;
    475         m_handledFirstLetter = false;
    476         m_firstLetterText = 0;
    477 
    478         // how would this ever be?
    479         if (m_positionNode)
    480             return;
    481     }
    482 }
    483 
    484 bool TextIterator::handleTextNode()
    485 {
    486     if (m_fullyClippedStack.top() && !m_ignoresStyleVisibility)
    487         return false;
    488 
    489     RenderText* renderer = toRenderText(m_node->renderer());
    490 
    491     m_lastTextNode = m_node;
    492     String str = renderer->text();
    493 
    494     // handle pre-formatted text
    495     if (!renderer->style()->collapseWhiteSpace()) {
    496         int runStart = m_offset;
    497         if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) {
    498             emitCharacter(' ', m_node, 0, runStart, runStart);
    499             return false;
    500         }
    501         if (!m_handledFirstLetter && renderer->isTextFragment()) {
    502             handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
    503             if (m_firstLetterText) {
    504                 String firstLetter = m_firstLetterText->text();
    505                 emitText(m_node, m_firstLetterText, m_offset, m_offset + firstLetter.length());
    506                 m_firstLetterText = 0;
    507                 m_textBox = 0;
    508                 return false;
    509             }
    510         }
    511         if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
    512             return false;
    513         int strLength = str.length();
    514         int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX;
    515         int runEnd = min(strLength, end);
    516 
    517         if (runStart >= runEnd)
    518             return true;
    519 
    520         emitText(m_node, runStart, runEnd);
    521         return true;
    522     }
    523 
    524     if (!renderer->firstTextBox() && str.length() > 0) {
    525         if (!m_handledFirstLetter && renderer->isTextFragment()) {
    526             handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
    527             if (m_firstLetterText) {
    528                 handleTextBox();
    529                 return false;
    530             }
    531         }
    532         if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
    533             return false;
    534         m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space
    535         return true;
    536     }
    537 
    538     // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text)
    539     if (renderer->containsReversedText()) {
    540         m_sortedTextBoxes.clear();
    541         for (InlineTextBox* textBox = renderer->firstTextBox(); textBox; textBox = textBox->nextTextBox()) {
    542             m_sortedTextBoxes.append(textBox);
    543         }
    544         std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart);
    545         m_sortedTextBoxesPosition = 0;
    546     }
    547 
    548     m_textBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
    549     if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset)
    550         handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
    551     handleTextBox();
    552     return true;
    553 }
    554 
    555 void TextIterator::handleTextBox()
    556 {
    557     RenderText* renderer = m_firstLetterText ? m_firstLetterText : toRenderText(m_node->renderer());
    558     if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) {
    559         m_textBox = 0;
    560         return;
    561     }
    562     String str = renderer->text();
    563     unsigned start = m_offset;
    564     unsigned end = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX;
    565     while (m_textBox) {
    566         unsigned textBoxStart = m_textBox->start();
    567         unsigned runStart = max(textBoxStart, start);
    568 
    569         // Check for collapsed space at the start of this run.
    570         InlineTextBox* firstTextBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
    571         bool needSpace = m_lastTextNodeEndedWithCollapsedSpace
    572             || (m_textBox == firstTextBox && textBoxStart == runStart && runStart > 0);
    573         if (needSpace && !isCollapsibleWhitespace(m_lastCharacter) && m_lastCharacter) {
    574             if (m_lastTextNode == m_node && runStart > 0 && str[runStart - 1] == ' ') {
    575                 unsigned spaceRunStart = runStart - 1;
    576                 while (spaceRunStart > 0 && str[spaceRunStart - 1] == ' ')
    577                     --spaceRunStart;
    578                 emitText(m_node, renderer, spaceRunStart, spaceRunStart + 1);
    579             } else
    580                 emitCharacter(' ', m_node, 0, runStart, runStart);
    581             return;
    582         }
    583         unsigned textBoxEnd = textBoxStart + m_textBox->len();
    584         unsigned runEnd = min(textBoxEnd, end);
    585 
    586         // Determine what the next text box will be, but don't advance yet
    587         InlineTextBox* nextTextBox = 0;
    588         if (renderer->containsReversedText()) {
    589             if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size())
    590                 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1];
    591         } else
    592             nextTextBox = m_textBox->nextTextBox();
    593 
    594         if (runStart < runEnd) {
    595             // Handle either a single newline character (which becomes a space),
    596             // or a run of characters that does not include a newline.
    597             // This effectively translates newlines to spaces without copying the text.
    598             if (str[runStart] == '\n') {
    599                 emitCharacter(' ', m_node, 0, runStart, runStart + 1);
    600                 m_offset = runStart + 1;
    601             } else {
    602                 size_t subrunEnd = str.find('\n', runStart);
    603                 if (subrunEnd == notFound || subrunEnd > runEnd)
    604                     subrunEnd = runEnd;
    605 
    606                 m_offset = subrunEnd;
    607                 emitText(m_node, renderer, runStart, subrunEnd);
    608             }
    609 
    610             // If we are doing a subrun that doesn't go to the end of the text box,
    611             // come back again to finish handling this text box; don't advance to the next one.
    612             if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd)
    613                 return;
    614 
    615             // Advance and return
    616             unsigned nextRunStart = nextTextBox ? nextTextBox->start() : str.length();
    617             if (nextRunStart > runEnd)
    618                 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end
    619             m_textBox = nextTextBox;
    620             if (renderer->containsReversedText())
    621                 ++m_sortedTextBoxesPosition;
    622             return;
    623         }
    624         // Advance and continue
    625         m_textBox = nextTextBox;
    626         if (renderer->containsReversedText())
    627             ++m_sortedTextBoxesPosition;
    628     }
    629     if (!m_textBox && m_remainingTextBox) {
    630         m_textBox = m_remainingTextBox;
    631         m_remainingTextBox = 0;
    632         m_firstLetterText = 0;
    633         m_offset = 0;
    634         handleTextBox();
    635     }
    636 }
    637 
    638 void TextIterator::handleTextNodeFirstLetter(RenderTextFragment* renderer)
    639 {
    640     if (renderer->firstLetter()) {
    641         RenderObject* r = renderer->firstLetter();
    642         if (r->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
    643             return;
    644         for (RenderObject *currChild = r->firstChild(); currChild; currChild->nextSibling()) {
    645             if (currChild->isText()) {
    646                 RenderText* firstLetter = toRenderText(currChild);
    647                 m_handledFirstLetter = true;
    648                 m_remainingTextBox = m_textBox;
    649                 m_textBox = firstLetter->firstTextBox();
    650                 m_firstLetterText = firstLetter;
    651                 return;
    652             }
    653         }
    654     }
    655     m_handledFirstLetter = true;
    656 }
    657 
    658 bool TextIterator::handleReplacedElement()
    659 {
    660     if (m_fullyClippedStack.top())
    661         return false;
    662 
    663     RenderObject* renderer = m_node->renderer();
    664     if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
    665         return false;
    666 
    667     if (m_lastTextNodeEndedWithCollapsedSpace) {
    668         emitCharacter(' ', m_lastTextNode->parentNode(), m_lastTextNode, 1, 1);
    669         return false;
    670     }
    671 
    672     if (m_entersTextControls && renderer->isTextControl()) {
    673         if (HTMLElement* innerTextElement = toRenderTextControl(renderer)->innerTextElement()) {
    674             m_node = innerTextElement->shadowTreeRootNode();
    675             pushFullyClippedState(m_fullyClippedStack, m_node);
    676             m_offset = 0;
    677             return false;
    678         }
    679     }
    680 
    681     m_hasEmitted = true;
    682 
    683     if (m_emitsObjectReplacementCharacters && renderer && renderer->isReplaced()) {
    684         emitCharacter(objectReplacementCharacter, m_node->parentNode(), m_node, 0, 1);
    685         return true;
    686     }
    687 
    688     if (m_emitsCharactersBetweenAllVisiblePositions) {
    689         // We want replaced elements to behave like punctuation for boundary
    690         // finding, and to simply take up space for the selection preservation
    691         // code in moveParagraphs, so we use a comma.
    692         emitCharacter(',', m_node->parentNode(), m_node, 0, 1);
    693         return true;
    694     }
    695 
    696     m_positionNode = m_node->parentNode();
    697     m_positionOffsetBaseNode = m_node;
    698     m_positionStartOffset = 0;
    699     m_positionEndOffset = 1;
    700 
    701     m_textCharacters = 0;
    702     m_textLength = 0;
    703 
    704     m_lastCharacter = 0;
    705 
    706     return true;
    707 }
    708 
    709 bool TextIterator::hasVisibleTextNode(RenderText* renderer)
    710 {
    711     if (renderer->style()->visibility() == VISIBLE)
    712         return true;
    713     if (renderer->isTextFragment()) {
    714         RenderTextFragment* fragment = static_cast<RenderTextFragment*>(renderer);
    715         if (fragment->firstLetter() && fragment->firstLetter()->style()->visibility() == VISIBLE)
    716             return true;
    717     }
    718     return false;
    719 }
    720 
    721 static bool shouldEmitTabBeforeNode(Node* node)
    722 {
    723     RenderObject* r = node->renderer();
    724 
    725     // Table cells are delimited by tabs.
    726     if (!r || !isTableCell(node))
    727         return false;
    728 
    729     // Want a tab before every cell other than the first one
    730     RenderTableCell* rc = toRenderTableCell(r);
    731     RenderTable* t = rc->table();
    732     return t && (t->cellBefore(rc) || t->cellAbove(rc));
    733 }
    734 
    735 static bool shouldEmitNewlineForNode(Node* node)
    736 {
    737     // br elements are represented by a single newline.
    738     RenderObject* r = node->renderer();
    739     if (!r)
    740         return node->hasTagName(brTag);
    741 
    742     return r->isBR();
    743 }
    744 
    745 static bool shouldEmitNewlinesBeforeAndAfterNode(Node* node)
    746 {
    747     // Block flow (versus inline flow) is represented by having
    748     // a newline both before and after the element.
    749     RenderObject* r = node->renderer();
    750     if (!r) {
    751         return (node->hasTagName(blockquoteTag)
    752                 || node->hasTagName(ddTag)
    753                 || node->hasTagName(divTag)
    754                 || node->hasTagName(dlTag)
    755                 || node->hasTagName(dtTag)
    756                 || node->hasTagName(h1Tag)
    757                 || node->hasTagName(h2Tag)
    758                 || node->hasTagName(h3Tag)
    759                 || node->hasTagName(h4Tag)
    760                 || node->hasTagName(h5Tag)
    761                 || node->hasTagName(h6Tag)
    762                 || node->hasTagName(hrTag)
    763                 || node->hasTagName(liTag)
    764                 || node->hasTagName(listingTag)
    765                 || node->hasTagName(olTag)
    766                 || node->hasTagName(pTag)
    767                 || node->hasTagName(preTag)
    768                 || node->hasTagName(trTag)
    769                 || node->hasTagName(ulTag));
    770     }
    771 
    772     // Need to make an exception for table cells, because they are blocks, but we
    773     // want them tab-delimited rather than having newlines before and after.
    774     if (isTableCell(node))
    775         return false;
    776 
    777     // Need to make an exception for table row elements, because they are neither
    778     // "inline" or "RenderBlock", but we want newlines for them.
    779     if (r->isTableRow()) {
    780         RenderTable* t = toRenderTableRow(r)->table();
    781         if (t && !t->isInline())
    782             return true;
    783     }
    784 
    785     return !r->isInline() && r->isRenderBlock() && !r->isFloatingOrPositioned() && !r->isBody();
    786 }
    787 
    788 static bool shouldEmitNewlineAfterNode(Node* node)
    789 {
    790     // FIXME: It should be better but slower to create a VisiblePosition here.
    791     if (!shouldEmitNewlinesBeforeAndAfterNode(node))
    792         return false;
    793     // Check if this is the very last renderer in the document.
    794     // If so, then we should not emit a newline.
    795     while ((node = node->traverseNextSibling()))
    796         if (node->renderer())
    797             return true;
    798     return false;
    799 }
    800 
    801 static bool shouldEmitNewlineBeforeNode(Node* node)
    802 {
    803     return shouldEmitNewlinesBeforeAndAfterNode(node);
    804 }
    805 
    806 static bool shouldEmitExtraNewlineForNode(Node* node)
    807 {
    808     // When there is a significant collapsed bottom margin, emit an extra
    809     // newline for a more realistic result.  We end up getting the right
    810     // result even without margin collapsing. For example: <div><p>text</p></div>
    811     // will work right even if both the <div> and the <p> have bottom margins.
    812     RenderObject* r = node->renderer();
    813     if (!r || !r->isBox())
    814         return false;
    815 
    816     // NOTE: We only do this for a select set of nodes, and fwiw WinIE appears
    817     // not to do this at all
    818     if (node->hasTagName(h1Tag)
    819         || node->hasTagName(h2Tag)
    820         || node->hasTagName(h3Tag)
    821         || node->hasTagName(h4Tag)
    822         || node->hasTagName(h5Tag)
    823         || node->hasTagName(h6Tag)
    824         || node->hasTagName(pTag)) {
    825         RenderStyle* style = r->style();
    826         if (style) {
    827             int bottomMargin = toRenderBox(r)->collapsedMarginAfter();
    828             int fontSize = style->fontDescription().computedPixelSize();
    829             if (bottomMargin * 2 >= fontSize)
    830                 return true;
    831         }
    832     }
    833 
    834     return false;
    835 }
    836 
    837 static int collapsedSpaceLength(RenderText* renderer, int textEnd)
    838 {
    839     const UChar* characters = renderer->text()->characters();
    840     int length = renderer->text()->length();
    841     for (int i = textEnd; i < length; ++i) {
    842         if (!renderer->style()->isCollapsibleWhiteSpace(characters[i]))
    843             return i - textEnd;
    844     }
    845 
    846     return length - textEnd;
    847 }
    848 
    849 static int maxOffsetIncludingCollapsedSpaces(Node* node)
    850 {
    851     int offset = caretMaxOffset(node);
    852 
    853     if (node->renderer() && node->renderer()->isText())
    854         offset += collapsedSpaceLength(toRenderText(node->renderer()), offset);
    855 
    856     return offset;
    857 }
    858 
    859 // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic).
    860 bool TextIterator::shouldRepresentNodeOffsetZero()
    861 {
    862     if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isTable())
    863         return true;
    864 
    865     // Leave element positioned flush with start of a paragraph
    866     // (e.g. do not insert tab before a table cell at the start of a paragraph)
    867     if (m_lastCharacter == '\n')
    868         return false;
    869 
    870     // Otherwise, show the position if we have emitted any characters
    871     if (m_hasEmitted)
    872         return true;
    873 
    874     // We've not emitted anything yet. Generally, there is no need for any positioning then.
    875     // The only exception is when the element is visually not in the same line as
    876     // the start of the range (e.g. the range starts at the end of the previous paragraph).
    877     // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we
    878     // make quicker checks to possibly avoid that. Another check that we could make is
    879     // is whether the inline vs block flow changed since the previous visible element.
    880     // I think we're already in a special enough case that that won't be needed, tho.
    881 
    882     // No character needed if this is the first node in the range.
    883     if (m_node == m_startContainer)
    884         return false;
    885 
    886     // If we are outside the start container's subtree, assume we need to emit.
    887     // FIXME: m_startContainer could be an inline block
    888     if (!m_node->isDescendantOf(m_startContainer))
    889         return true;
    890 
    891     // If we started as m_startContainer offset 0 and the current node is a descendant of
    892     // the start container, we already had enough context to correctly decide whether to
    893     // emit after a preceding block. We chose not to emit (m_hasEmitted is false),
    894     // so don't second guess that now.
    895     // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably
    896     // immaterial since we likely would have already emitted something by now.
    897     if (m_startOffset == 0)
    898         return false;
    899 
    900     // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning.
    901     // Additionally, if the range we are iterating over contains huge sections of unrendered content,
    902     // we would create VisiblePositions on every call to this function without this check.
    903     if (!m_node->renderer() || m_node->renderer()->style()->visibility() != VISIBLE)
    904         return false;
    905 
    906     // The startPos.isNotNull() check is needed because the start could be before the body,
    907     // and in that case we'll get null. We don't want to put in newlines at the start in that case.
    908     // The currPos.isNotNull() check is needed because positions in non-HTML content
    909     // (like SVG) do not have visible positions, and we don't want to emit for them either.
    910     VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM);
    911     VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM);
    912     return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos);
    913 }
    914 
    915 bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node* node)
    916 {
    917     return node->renderer() && node->renderer()->isTable() && (node->renderer()->isInline() || m_emitsCharactersBetweenAllVisiblePositions);
    918 }
    919 
    920 void TextIterator::representNodeOffsetZero()
    921 {
    922     // Emit a character to show the positioning of m_node.
    923 
    924     // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can
    925     // create VisiblePositions, which is expensive.  So, we perform the inexpensive checks
    926     // on m_node to see if it necessitates emitting a character first and will early return
    927     // before encountering shouldRepresentNodeOffsetZero()s worse case behavior.
    928     if (shouldEmitTabBeforeNode(m_node)) {
    929         if (shouldRepresentNodeOffsetZero())
    930             emitCharacter('\t', m_node->parentNode(), m_node, 0, 0);
    931     } else if (shouldEmitNewlineBeforeNode(m_node)) {
    932         if (shouldRepresentNodeOffsetZero())
    933             emitCharacter('\n', m_node->parentNode(), m_node, 0, 0);
    934     } else if (shouldEmitSpaceBeforeAndAfterNode(m_node)) {
    935         if (shouldRepresentNodeOffsetZero())
    936             emitCharacter(' ', m_node->parentNode(), m_node, 0, 0);
    937     }
    938 }
    939 
    940 bool TextIterator::handleNonTextNode()
    941 {
    942     if (shouldEmitNewlineForNode(m_node))
    943         emitCharacter('\n', m_node->parentNode(), m_node, 0, 1);
    944     else if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isHR())
    945         emitCharacter(' ', m_node->parentNode(), m_node, 0, 1);
    946     else
    947         representNodeOffsetZero();
    948 
    949     return true;
    950 }
    951 
    952 void TextIterator::exitNode()
    953 {
    954     // prevent emitting a newline when exiting a collapsed block at beginning of the range
    955     // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could
    956     // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and
    957     // therefore look like a blank line.
    958     if (!m_hasEmitted)
    959         return;
    960 
    961     // Emit with a position *inside* m_node, after m_node's contents, in
    962     // case it is a block, because the run should start where the
    963     // emitted character is positioned visually.
    964     Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
    965     // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making
    966     // the logic in _web_attributedStringFromRange match.  We'll get that for free when we switch to use
    967     // TextIterator in _web_attributedStringFromRange.
    968     // See <rdar://problem/5428427> for an example of how this mismatch will cause problems.
    969     if (m_lastTextNode && shouldEmitNewlineAfterNode(m_node)) {
    970         // use extra newline to represent margin bottom, as needed
    971         bool addNewline = shouldEmitExtraNewlineForNode(m_node);
    972 
    973         // FIXME: We need to emit a '\n' as we leave an empty block(s) that
    974         // contain a VisiblePosition when doing selection preservation.
    975         if (m_lastCharacter != '\n') {
    976             // insert a newline with a position following this block's contents.
    977             emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
    978             // remember whether to later add a newline for the current node
    979             ASSERT(!m_needsAnotherNewline);
    980             m_needsAnotherNewline = addNewline;
    981         } else if (addNewline)
    982             // insert a newline with a position following this block's contents.
    983             emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
    984     }
    985 
    986     // If nothing was emitted, see if we need to emit a space.
    987     if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(m_node))
    988         emitCharacter(' ', baseNode->parentNode(), baseNode, 1, 1);
    989 }
    990 
    991 void TextIterator::emitCharacter(UChar c, Node* textNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset)
    992 {
    993     m_hasEmitted = true;
    994 
    995     // remember information with which to construct the TextIterator::range()
    996     // NOTE: textNode is often not a text node, so the range will specify child nodes of positionNode
    997     m_positionNode = textNode;
    998     m_positionOffsetBaseNode = offsetBaseNode;
    999     m_positionStartOffset = textStartOffset;
   1000     m_positionEndOffset = textEndOffset;
   1001 
   1002     // remember information with which to construct the TextIterator::characters() and length()
   1003     m_singleCharacterBuffer = c;
   1004     m_textCharacters = &m_singleCharacterBuffer;
   1005     m_textLength = 1;
   1006 
   1007     // remember some iteration state
   1008     m_lastTextNodeEndedWithCollapsedSpace = false;
   1009     m_lastCharacter = c;
   1010 }
   1011 
   1012 void TextIterator::emitText(Node* textNode, RenderObject* renderObject, int textStartOffset, int textEndOffset)
   1013 {
   1014     RenderText* renderer = toRenderText(renderObject);
   1015     m_text = m_emitsTextWithoutTranscoding ? renderer->textWithoutTranscoding() : renderer->text();
   1016     ASSERT(m_text.characters());
   1017 
   1018     m_positionNode = textNode;
   1019     m_positionOffsetBaseNode = 0;
   1020     m_positionStartOffset = textStartOffset;
   1021     m_positionEndOffset = textEndOffset;
   1022     m_textCharacters = m_text.characters() + textStartOffset;
   1023     m_textLength = textEndOffset - textStartOffset;
   1024     m_lastCharacter = m_text[textEndOffset - 1];
   1025 
   1026     m_lastTextNodeEndedWithCollapsedSpace = false;
   1027     m_hasEmitted = true;
   1028 }
   1029 
   1030 void TextIterator::emitText(Node* textNode, int textStartOffset, int textEndOffset)
   1031 {
   1032     emitText(textNode, m_node->renderer(), textStartOffset, textEndOffset);
   1033 }
   1034 
   1035 PassRefPtr<Range> TextIterator::range() const
   1036 {
   1037     // use the current run information, if we have it
   1038     if (m_positionNode) {
   1039         if (m_positionOffsetBaseNode) {
   1040             int index = m_positionOffsetBaseNode->nodeIndex();
   1041             m_positionStartOffset += index;
   1042             m_positionEndOffset += index;
   1043             m_positionOffsetBaseNode = 0;
   1044         }
   1045         return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
   1046     }
   1047 
   1048     // otherwise, return the end of the overall range we were given
   1049     if (m_endContainer)
   1050         return Range::create(m_endContainer->document(), m_endContainer, m_endOffset, m_endContainer, m_endOffset);
   1051 
   1052     return 0;
   1053 }
   1054 
   1055 Node* TextIterator::node() const
   1056 {
   1057     RefPtr<Range> textRange = range();
   1058     if (!textRange)
   1059         return 0;
   1060 
   1061     Node* node = textRange->startContainer();
   1062     if (!node)
   1063         return 0;
   1064     if (node->offsetInCharacters())
   1065         return node;
   1066 
   1067     return node->childNode(textRange->startOffset());
   1068 }
   1069 
   1070 // --------
   1071 
   1072 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator()
   1073     : m_behavior(TextIteratorDefaultBehavior)
   1074     , m_node(0)
   1075     , m_positionNode(0)
   1076 #if OS(ANDROID)
   1077     , m_stopsOnFormControls(false)
   1078     , m_shouldStop(false)
   1079 #endif
   1080 {
   1081 }
   1082 
   1083 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range* r, TextIteratorBehavior behavior)
   1084     : m_behavior(behavior)
   1085     , m_node(0)
   1086     , m_positionNode(0)
   1087 #if OS(ANDROID)
   1088     , m_stopsOnFormControls(behavior & TextIteratorStopsOnFormControls)
   1089     , m_shouldStop(false)
   1090 #endif
   1091 {
   1092 #if OS(ANDROID)
   1093     ASSERT(m_behavior == TextIteratorDefaultBehavior || m_behavior == TextIteratorStopsOnFormControls);
   1094 #else
   1095     ASSERT(m_behavior == TextIteratorDefaultBehavior);
   1096 #endif
   1097 
   1098     if (!r)
   1099         return;
   1100 
   1101     Node* startNode = r->startContainer();
   1102     if (!startNode)
   1103         return;
   1104     Node* endNode = r->endContainer();
   1105     int startOffset = r->startOffset();
   1106     int endOffset = r->endOffset();
   1107 
   1108     if (!startNode->offsetInCharacters()) {
   1109         if (startOffset >= 0 && startOffset < static_cast<int>(startNode->childNodeCount())) {
   1110             startNode = startNode->childNode(startOffset);
   1111             startOffset = 0;
   1112         }
   1113     }
   1114     if (!endNode->offsetInCharacters()) {
   1115         if (endOffset > 0 && endOffset <= static_cast<int>(endNode->childNodeCount())) {
   1116             endNode = endNode->childNode(endOffset - 1);
   1117             endOffset = lastOffsetInNode(endNode);
   1118         }
   1119     }
   1120 
   1121     m_node = endNode;
   1122     setUpFullyClippedStack(m_fullyClippedStack, m_node);
   1123     m_offset = endOffset;
   1124     m_handledNode = false;
   1125     m_handledChildren = endOffset == 0;
   1126 
   1127     m_startNode = startNode;
   1128     m_startOffset = startOffset;
   1129     m_endNode = endNode;
   1130     m_endOffset = endOffset;
   1131 
   1132 #ifndef NDEBUG
   1133     // Need this just because of the assert.
   1134     m_positionNode = endNode;
   1135 #endif
   1136 
   1137     m_lastTextNode = 0;
   1138     m_lastCharacter = '\n';
   1139 
   1140     m_havePassedStartNode = false;
   1141 
   1142     advance();
   1143 }
   1144 
   1145 bool SimplifiedBackwardsTextIterator::atEnd() const
   1146 {
   1147 #if OS(ANDROID)
   1148     return !m_positionNode || m_shouldStop;
   1149 #else
   1150     return !m_positionNode;
   1151 #endif
   1152 }
   1153 
   1154 void SimplifiedBackwardsTextIterator::advance()
   1155 {
   1156     ASSERT(m_positionNode);
   1157 
   1158 #if OS(ANDROID)
   1159     if (m_shouldStop)
   1160         return;
   1161 
   1162     // Prevent changing the iterator position if a form control element was found and advance should stop on it.
   1163     if (m_stopsOnFormControls && checkFormControlElement(m_node)) {
   1164         m_shouldStop = true;
   1165         return;
   1166     }
   1167 #endif
   1168 
   1169     m_positionNode = 0;
   1170     m_textLength = 0;
   1171 
   1172     while (m_node && !m_havePassedStartNode) {
   1173         // Don't handle node if we start iterating at [node, 0].
   1174         if (!m_handledNode && !(m_node == m_endNode && m_endOffset == 0)) {
   1175             RenderObject* renderer = m_node->renderer();
   1176             if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) {
   1177                 // FIXME: What about CDATA_SECTION_NODE?
   1178                 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
   1179                     m_handledNode = handleTextNode();
   1180             } else if (renderer && (renderer->isImage() || renderer->isWidget())) {
   1181                 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
   1182                     m_handledNode = handleReplacedElement();
   1183             } else
   1184                 m_handledNode = handleNonTextNode();
   1185             if (m_positionNode)
   1186                 return;
   1187         }
   1188 
   1189         if (!m_handledChildren && m_node->hasChildNodes()) {
   1190             m_node = m_node->lastChild();
   1191             pushFullyClippedState(m_fullyClippedStack, m_node);
   1192         } else {
   1193             // Exit empty containers as we pass over them or containers
   1194             // where [container, 0] is where we started iterating.
   1195             if (!m_handledNode
   1196                     && canHaveChildrenForEditing(m_node)
   1197                     && m_node->parentNode()
   1198                     && (!m_node->lastChild() || (m_node == m_endNode && !m_endOffset))) {
   1199                 exitNode();
   1200                 if (m_positionNode) {
   1201                     m_handledNode = true;
   1202                     m_handledChildren = true;
   1203                     return;
   1204                 }
   1205             }
   1206 
   1207             // Exit all other containers.
   1208             while (!m_node->previousSibling()) {
   1209                 if (!advanceRespectingRange(m_node->parentOrHostNode()))
   1210                     break;
   1211                 m_fullyClippedStack.pop();
   1212                 exitNode();
   1213                 if (m_positionNode) {
   1214                     m_handledNode = true;
   1215                     m_handledChildren = true;
   1216                     return;
   1217                 }
   1218             }
   1219 
   1220             m_fullyClippedStack.pop();
   1221             if (advanceRespectingRange(m_node->previousSibling()))
   1222                 pushFullyClippedState(m_fullyClippedStack, m_node);
   1223             else
   1224                 m_node = 0;
   1225         }
   1226 
   1227         // For the purpose of word boundary detection,
   1228         // we should iterate all visible text and trailing (collapsed) whitespaces.
   1229         m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(m_node) : 0;
   1230         m_handledNode = false;
   1231         m_handledChildren = false;
   1232 
   1233         if (m_positionNode)
   1234             return;
   1235     }
   1236 }
   1237 
   1238 bool SimplifiedBackwardsTextIterator::handleTextNode()
   1239 {
   1240     m_lastTextNode = m_node;
   1241 
   1242     RenderText* renderer = toRenderText(m_node->renderer());
   1243     String str = renderer->text();
   1244 
   1245     if (!renderer->firstTextBox() && str.length() > 0)
   1246         return true;
   1247 
   1248     m_positionEndOffset = m_offset;
   1249 
   1250     m_offset = (m_node == m_startNode) ? m_startOffset : 0;
   1251     m_positionNode = m_node;
   1252     m_positionStartOffset = m_offset;
   1253     m_textLength = m_positionEndOffset - m_positionStartOffset;
   1254     m_textCharacters = str.characters() + m_positionStartOffset;
   1255 
   1256     m_lastCharacter = str[m_positionEndOffset - 1];
   1257 
   1258     return true;
   1259 }
   1260 
   1261 bool SimplifiedBackwardsTextIterator::handleReplacedElement()
   1262 {
   1263     unsigned index = m_node->nodeIndex();
   1264     // We want replaced elements to behave like punctuation for boundary
   1265     // finding, and to simply take up space for the selection preservation
   1266     // code in moveParagraphs, so we use a comma.  Unconditionally emit
   1267     // here because this iterator is only used for boundary finding.
   1268     emitCharacter(',', m_node->parentNode(), index, index + 1);
   1269     return true;
   1270 }
   1271 
   1272 bool SimplifiedBackwardsTextIterator::handleNonTextNode()
   1273 {
   1274     // We can use a linefeed in place of a tab because this simple iterator is only used to
   1275     // find boundaries, not actual content.  A linefeed breaks words, sentences, and paragraphs.
   1276     if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineAfterNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
   1277         unsigned index = m_node->nodeIndex();
   1278         // The start of this emitted range is wrong. Ensuring correctness would require
   1279         // VisiblePositions and so would be slow. previousBoundary expects this.
   1280         emitCharacter('\n', m_node->parentNode(), index + 1, index + 1);
   1281     }
   1282     return true;
   1283 }
   1284 
   1285 void SimplifiedBackwardsTextIterator::exitNode()
   1286 {
   1287     if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineBeforeNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
   1288         // The start of this emitted range is wrong. Ensuring correctness would require
   1289         // VisiblePositions and so would be slow. previousBoundary expects this.
   1290         emitCharacter('\n', m_node, 0, 0);
   1291     }
   1292 }
   1293 
   1294 void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node* node, int startOffset, int endOffset)
   1295 {
   1296     m_singleCharacterBuffer = c;
   1297     m_positionNode = node;
   1298     m_positionStartOffset = startOffset;
   1299     m_positionEndOffset = endOffset;
   1300     m_textCharacters = &m_singleCharacterBuffer;
   1301     m_textLength = 1;
   1302     m_lastCharacter = c;
   1303 }
   1304 
   1305 bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next)
   1306 {
   1307     if (!next)
   1308         return false;
   1309     m_havePassedStartNode |= m_node == m_startNode;
   1310     if (m_havePassedStartNode)
   1311         return false;
   1312     m_node = next;
   1313     return true;
   1314 }
   1315 
   1316 PassRefPtr<Range> SimplifiedBackwardsTextIterator::range() const
   1317 {
   1318     if (m_positionNode)
   1319         return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
   1320 
   1321     return Range::create(m_startNode->document(), m_startNode, m_startOffset, m_startNode, m_startOffset);
   1322 }
   1323 
   1324 // --------
   1325 
   1326 CharacterIterator::CharacterIterator()
   1327     : m_offset(0)
   1328     , m_runOffset(0)
   1329     , m_atBreak(true)
   1330 {
   1331 }
   1332 
   1333 CharacterIterator::CharacterIterator(const Range* r, TextIteratorBehavior behavior)
   1334     : m_offset(0)
   1335     , m_runOffset(0)
   1336     , m_atBreak(true)
   1337     , m_textIterator(r, behavior)
   1338 {
   1339     while (!atEnd() && m_textIterator.length() == 0)
   1340         m_textIterator.advance();
   1341 }
   1342 
   1343 PassRefPtr<Range> CharacterIterator::range() const
   1344 {
   1345     RefPtr<Range> r = m_textIterator.range();
   1346     if (!m_textIterator.atEnd()) {
   1347         if (m_textIterator.length() <= 1) {
   1348             ASSERT(m_runOffset == 0);
   1349         } else {
   1350             Node* n = r->startContainer();
   1351             ASSERT(n == r->endContainer());
   1352             int offset = r->startOffset() + m_runOffset;
   1353             ExceptionCode ec = 0;
   1354             r->setStart(n, offset, ec);
   1355             r->setEnd(n, offset + 1, ec);
   1356             ASSERT(!ec);
   1357         }
   1358     }
   1359     return r.release();
   1360 }
   1361 
   1362 void CharacterIterator::advance(int count)
   1363 {
   1364     if (count <= 0) {
   1365         ASSERT(count == 0);
   1366         return;
   1367     }
   1368 
   1369     m_atBreak = false;
   1370 
   1371     // easy if there is enough left in the current m_textIterator run
   1372     int remaining = m_textIterator.length() - m_runOffset;
   1373     if (count < remaining) {
   1374         m_runOffset += count;
   1375         m_offset += count;
   1376         return;
   1377     }
   1378 
   1379     // exhaust the current m_textIterator run
   1380     count -= remaining;
   1381     m_offset += remaining;
   1382 
   1383     // move to a subsequent m_textIterator run
   1384     for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
   1385         int runLength = m_textIterator.length();
   1386         if (runLength == 0)
   1387             m_atBreak = true;
   1388         else {
   1389             // see whether this is m_textIterator to use
   1390             if (count < runLength) {
   1391                 m_runOffset = count;
   1392                 m_offset += count;
   1393                 return;
   1394             }
   1395 
   1396             // exhaust this m_textIterator run
   1397             count -= runLength;
   1398             m_offset += runLength;
   1399         }
   1400     }
   1401 
   1402     // ran to the end of the m_textIterator... no more runs left
   1403     m_atBreak = true;
   1404     m_runOffset = 0;
   1405 }
   1406 
   1407 String CharacterIterator::string(int numChars)
   1408 {
   1409     Vector<UChar> result;
   1410     result.reserveInitialCapacity(numChars);
   1411     while (numChars > 0 && !atEnd()) {
   1412         int runSize = min(numChars, length());
   1413         result.append(characters(), runSize);
   1414         numChars -= runSize;
   1415         advance(runSize);
   1416     }
   1417     return String::adopt(result);
   1418 }
   1419 
   1420 static PassRefPtr<Range> characterSubrange(CharacterIterator& it, int offset, int length)
   1421 {
   1422     it.advance(offset);
   1423     RefPtr<Range> start = it.range();
   1424 
   1425     if (length > 1)
   1426         it.advance(length - 1);
   1427     RefPtr<Range> end = it.range();
   1428 
   1429     return Range::create(start->startContainer()->document(),
   1430         start->startContainer(), start->startOffset(),
   1431         end->endContainer(), end->endOffset());
   1432 }
   1433 
   1434 BackwardsCharacterIterator::BackwardsCharacterIterator()
   1435     : m_offset(0)
   1436     , m_runOffset(0)
   1437     , m_atBreak(true)
   1438 {
   1439 }
   1440 
   1441 BackwardsCharacterIterator::BackwardsCharacterIterator(const Range* range, TextIteratorBehavior behavior)
   1442     : m_offset(0)
   1443     , m_runOffset(0)
   1444     , m_atBreak(true)
   1445     , m_textIterator(range, behavior)
   1446 {
   1447     while (!atEnd() && !m_textIterator.length())
   1448         m_textIterator.advance();
   1449 }
   1450 
   1451 PassRefPtr<Range> BackwardsCharacterIterator::range() const
   1452 {
   1453     RefPtr<Range> r = m_textIterator.range();
   1454     if (!m_textIterator.atEnd()) {
   1455         if (m_textIterator.length() <= 1)
   1456             ASSERT(m_runOffset == 0);
   1457         else {
   1458             Node* n = r->startContainer();
   1459             ASSERT(n == r->endContainer());
   1460             int offset = r->endOffset() - m_runOffset;
   1461             ExceptionCode ec = 0;
   1462             r->setStart(n, offset - 1, ec);
   1463             r->setEnd(n, offset, ec);
   1464             ASSERT(!ec);
   1465         }
   1466     }
   1467     return r.release();
   1468 }
   1469 
   1470 void BackwardsCharacterIterator::advance(int count)
   1471 {
   1472     if (count <= 0) {
   1473         ASSERT(!count);
   1474         return;
   1475     }
   1476 
   1477     m_atBreak = false;
   1478 
   1479     int remaining = m_textIterator.length() - m_runOffset;
   1480     if (count < remaining) {
   1481         m_runOffset += count;
   1482         m_offset += count;
   1483         return;
   1484     }
   1485 
   1486     count -= remaining;
   1487     m_offset += remaining;
   1488 
   1489     for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
   1490         int runLength = m_textIterator.length();
   1491         if (runLength == 0)
   1492             m_atBreak = true;
   1493         else {
   1494             if (count < runLength) {
   1495                 m_runOffset = count;
   1496                 m_offset += count;
   1497                 return;
   1498             }
   1499 
   1500             count -= runLength;
   1501             m_offset += runLength;
   1502         }
   1503     }
   1504 
   1505     m_atBreak = true;
   1506     m_runOffset = 0;
   1507 }
   1508 
   1509 // --------
   1510 
   1511 WordAwareIterator::WordAwareIterator()
   1512     : m_previousText(0)
   1513     , m_didLookAhead(false)
   1514 {
   1515 }
   1516 
   1517 WordAwareIterator::WordAwareIterator(const Range* r)
   1518     : m_previousText(0)
   1519     , m_didLookAhead(true) // so we consider the first chunk from the text iterator
   1520     , m_textIterator(r)
   1521 {
   1522     advance(); // get in position over the first chunk of text
   1523 }
   1524 
   1525 WordAwareIterator::~WordAwareIterator()
   1526 {
   1527 }
   1528 
   1529 // We're always in one of these modes:
   1530 // - The current chunk in the text iterator is our current chunk
   1531 //      (typically its a piece of whitespace, or text that ended with whitespace)
   1532 // - The previous chunk in the text iterator is our current chunk
   1533 //      (we looked ahead to the next chunk and found a word boundary)
   1534 // - We built up our own chunk of text from many chunks from the text iterator
   1535 
   1536 // FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries.
   1537 
   1538 void WordAwareIterator::advance()
   1539 {
   1540     m_previousText = 0;
   1541     m_buffer.clear();      // toss any old buffer we built up
   1542 
   1543     // If last time we did a look-ahead, start with that looked-ahead chunk now
   1544     if (!m_didLookAhead) {
   1545         ASSERT(!m_textIterator.atEnd());
   1546         m_textIterator.advance();
   1547     }
   1548     m_didLookAhead = false;
   1549 
   1550     // Go to next non-empty chunk
   1551     while (!m_textIterator.atEnd() && m_textIterator.length() == 0)
   1552         m_textIterator.advance();
   1553     m_range = m_textIterator.range();
   1554 
   1555     if (m_textIterator.atEnd())
   1556         return;
   1557 
   1558     while (1) {
   1559         // If this chunk ends in whitespace we can just use it as our chunk.
   1560         if (isSpaceOrNewline(m_textIterator.characters()[m_textIterator.length() - 1]))
   1561             return;
   1562 
   1563         // If this is the first chunk that failed, save it in previousText before look ahead
   1564         if (m_buffer.isEmpty()) {
   1565             m_previousText = m_textIterator.characters();
   1566             m_previousLength = m_textIterator.length();
   1567         }
   1568 
   1569         // Look ahead to next chunk.  If it is whitespace or a break, we can use the previous stuff
   1570         m_textIterator.advance();
   1571         if (m_textIterator.atEnd() || m_textIterator.length() == 0 || isSpaceOrNewline(m_textIterator.characters()[0])) {
   1572             m_didLookAhead = true;
   1573             return;
   1574         }
   1575 
   1576         if (m_buffer.isEmpty()) {
   1577             // Start gobbling chunks until we get to a suitable stopping point
   1578             m_buffer.append(m_previousText, m_previousLength);
   1579             m_previousText = 0;
   1580         }
   1581         m_buffer.append(m_textIterator.characters(), m_textIterator.length());
   1582         int exception = 0;
   1583         m_range->setEnd(m_textIterator.range()->endContainer(), m_textIterator.range()->endOffset(), exception);
   1584     }
   1585 }
   1586 
   1587 int WordAwareIterator::length() const
   1588 {
   1589     if (!m_buffer.isEmpty())
   1590         return m_buffer.size();
   1591     if (m_previousText)
   1592         return m_previousLength;
   1593     return m_textIterator.length();
   1594 }
   1595 
   1596 const UChar* WordAwareIterator::characters() const
   1597 {
   1598     if (!m_buffer.isEmpty())
   1599         return m_buffer.data();
   1600     if (m_previousText)
   1601         return m_previousText;
   1602     return m_textIterator.characters();
   1603 }
   1604 
   1605 // --------
   1606 
   1607 static inline UChar foldQuoteMarkOrSoftHyphen(UChar c)
   1608 {
   1609     switch (c) {
   1610         case hebrewPunctuationGershayim:
   1611         case leftDoubleQuotationMark:
   1612         case rightDoubleQuotationMark:
   1613             return '"';
   1614         case hebrewPunctuationGeresh:
   1615         case leftSingleQuotationMark:
   1616         case rightSingleQuotationMark:
   1617             return '\'';
   1618         case softHyphen:
   1619             // Replace soft hyphen with an ignorable character so that their presence or absence will
   1620             // not affect string comparison.
   1621             return 0;
   1622         default:
   1623             return c;
   1624     }
   1625 }
   1626 
   1627 static inline void foldQuoteMarksAndSoftHyphens(String& s)
   1628 {
   1629     s.replace(hebrewPunctuationGeresh, '\'');
   1630     s.replace(hebrewPunctuationGershayim, '"');
   1631     s.replace(leftDoubleQuotationMark, '"');
   1632     s.replace(leftSingleQuotationMark, '\'');
   1633     s.replace(rightDoubleQuotationMark, '"');
   1634     s.replace(rightSingleQuotationMark, '\'');
   1635     // Replace soft hyphen with an ignorable character so that their presence or absence will
   1636     // not affect string comparison.
   1637     s.replace(softHyphen, 0);
   1638 }
   1639 
   1640 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
   1641 
   1642 static inline void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length)
   1643 {
   1644     for (size_t i = 0; i < length; ++i)
   1645         data[i] = foldQuoteMarkOrSoftHyphen(data[i]);
   1646 }
   1647 
   1648 static const size_t minimumSearchBufferSize = 8192;
   1649 
   1650 #ifndef NDEBUG
   1651 static bool searcherInUse;
   1652 #endif
   1653 
   1654 static UStringSearch* createSearcher()
   1655 {
   1656     // Provide a non-empty pattern and non-empty text so usearch_open will not fail,
   1657     // but it doesn't matter exactly what it is, since we don't perform any searches
   1658     // without setting both the pattern and the text.
   1659     UErrorCode status = U_ZERO_ERROR;
   1660     UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, currentSearchLocaleID(), 0, &status);
   1661     ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING);
   1662     return searcher;
   1663 }
   1664 
   1665 static UStringSearch* searcher()
   1666 {
   1667     static UStringSearch* searcher = createSearcher();
   1668     return searcher;
   1669 }
   1670 
   1671 static inline void lockSearcher()
   1672 {
   1673 #ifndef NDEBUG
   1674     ASSERT(!searcherInUse);
   1675     searcherInUse = true;
   1676 #endif
   1677 }
   1678 
   1679 static inline void unlockSearcher()
   1680 {
   1681 #ifndef NDEBUG
   1682     ASSERT(searcherInUse);
   1683     searcherInUse = false;
   1684 #endif
   1685 }
   1686 
   1687 // ICU's search ignores the distinction between small kana letters and ones
   1688 // that are not small, and also characters that differ only in the voicing
   1689 // marks when considering only primary collation strength diffrences.
   1690 // This is not helpful for end users, since these differences make words
   1691 // distinct, so for our purposes we need these to be considered.
   1692 // The Unicode folks do not think the collation algorithm should be
   1693 // changed. To work around this, we would like to tailor the ICU searcher,
   1694 // but we can't get that to work yet. So instead, we check for cases where
   1695 // these differences occur, and skip those matches.
   1696 
   1697 // We refer to the above technique as the "kana workaround". The next few
   1698 // functions are helper functinos for the kana workaround.
   1699 
   1700 static inline bool isKanaLetter(UChar character)
   1701 {
   1702     // Hiragana letters.
   1703     if (character >= 0x3041 && character <= 0x3096)
   1704         return true;
   1705 
   1706     // Katakana letters.
   1707     if (character >= 0x30A1 && character <= 0x30FA)
   1708         return true;
   1709     if (character >= 0x31F0 && character <= 0x31FF)
   1710         return true;
   1711 
   1712     // Halfwidth katakana letters.
   1713     if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70)
   1714         return true;
   1715 
   1716     return false;
   1717 }
   1718 
   1719 static inline bool isSmallKanaLetter(UChar character)
   1720 {
   1721     ASSERT(isKanaLetter(character));
   1722 
   1723     switch (character) {
   1724     case 0x3041: // HIRAGANA LETTER SMALL A
   1725     case 0x3043: // HIRAGANA LETTER SMALL I
   1726     case 0x3045: // HIRAGANA LETTER SMALL U
   1727     case 0x3047: // HIRAGANA LETTER SMALL E
   1728     case 0x3049: // HIRAGANA LETTER SMALL O
   1729     case 0x3063: // HIRAGANA LETTER SMALL TU
   1730     case 0x3083: // HIRAGANA LETTER SMALL YA
   1731     case 0x3085: // HIRAGANA LETTER SMALL YU
   1732     case 0x3087: // HIRAGANA LETTER SMALL YO
   1733     case 0x308E: // HIRAGANA LETTER SMALL WA
   1734     case 0x3095: // HIRAGANA LETTER SMALL KA
   1735     case 0x3096: // HIRAGANA LETTER SMALL KE
   1736     case 0x30A1: // KATAKANA LETTER SMALL A
   1737     case 0x30A3: // KATAKANA LETTER SMALL I
   1738     case 0x30A5: // KATAKANA LETTER SMALL U
   1739     case 0x30A7: // KATAKANA LETTER SMALL E
   1740     case 0x30A9: // KATAKANA LETTER SMALL O
   1741     case 0x30C3: // KATAKANA LETTER SMALL TU
   1742     case 0x30E3: // KATAKANA LETTER SMALL YA
   1743     case 0x30E5: // KATAKANA LETTER SMALL YU
   1744     case 0x30E7: // KATAKANA LETTER SMALL YO
   1745     case 0x30EE: // KATAKANA LETTER SMALL WA
   1746     case 0x30F5: // KATAKANA LETTER SMALL KA
   1747     case 0x30F6: // KATAKANA LETTER SMALL KE
   1748     case 0x31F0: // KATAKANA LETTER SMALL KU
   1749     case 0x31F1: // KATAKANA LETTER SMALL SI
   1750     case 0x31F2: // KATAKANA LETTER SMALL SU
   1751     case 0x31F3: // KATAKANA LETTER SMALL TO
   1752     case 0x31F4: // KATAKANA LETTER SMALL NU
   1753     case 0x31F5: // KATAKANA LETTER SMALL HA
   1754     case 0x31F6: // KATAKANA LETTER SMALL HI
   1755     case 0x31F7: // KATAKANA LETTER SMALL HU
   1756     case 0x31F8: // KATAKANA LETTER SMALL HE
   1757     case 0x31F9: // KATAKANA LETTER SMALL HO
   1758     case 0x31FA: // KATAKANA LETTER SMALL MU
   1759     case 0x31FB: // KATAKANA LETTER SMALL RA
   1760     case 0x31FC: // KATAKANA LETTER SMALL RI
   1761     case 0x31FD: // KATAKANA LETTER SMALL RU
   1762     case 0x31FE: // KATAKANA LETTER SMALL RE
   1763     case 0x31FF: // KATAKANA LETTER SMALL RO
   1764     case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A
   1765     case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I
   1766     case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U
   1767     case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E
   1768     case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O
   1769     case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA
   1770     case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU
   1771     case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO
   1772     case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU
   1773         return true;
   1774     }
   1775     return false;
   1776 }
   1777 
   1778 enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark };
   1779 
   1780 static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character)
   1781 {
   1782     ASSERT(isKanaLetter(character));
   1783 
   1784     switch (character) {
   1785     case 0x304C: // HIRAGANA LETTER GA
   1786     case 0x304E: // HIRAGANA LETTER GI
   1787     case 0x3050: // HIRAGANA LETTER GU
   1788     case 0x3052: // HIRAGANA LETTER GE
   1789     case 0x3054: // HIRAGANA LETTER GO
   1790     case 0x3056: // HIRAGANA LETTER ZA
   1791     case 0x3058: // HIRAGANA LETTER ZI
   1792     case 0x305A: // HIRAGANA LETTER ZU
   1793     case 0x305C: // HIRAGANA LETTER ZE
   1794     case 0x305E: // HIRAGANA LETTER ZO
   1795     case 0x3060: // HIRAGANA LETTER DA
   1796     case 0x3062: // HIRAGANA LETTER DI
   1797     case 0x3065: // HIRAGANA LETTER DU
   1798     case 0x3067: // HIRAGANA LETTER DE
   1799     case 0x3069: // HIRAGANA LETTER DO
   1800     case 0x3070: // HIRAGANA LETTER BA
   1801     case 0x3073: // HIRAGANA LETTER BI
   1802     case 0x3076: // HIRAGANA LETTER BU
   1803     case 0x3079: // HIRAGANA LETTER BE
   1804     case 0x307C: // HIRAGANA LETTER BO
   1805     case 0x3094: // HIRAGANA LETTER VU
   1806     case 0x30AC: // KATAKANA LETTER GA
   1807     case 0x30AE: // KATAKANA LETTER GI
   1808     case 0x30B0: // KATAKANA LETTER GU
   1809     case 0x30B2: // KATAKANA LETTER GE
   1810     case 0x30B4: // KATAKANA LETTER GO
   1811     case 0x30B6: // KATAKANA LETTER ZA
   1812     case 0x30B8: // KATAKANA LETTER ZI
   1813     case 0x30BA: // KATAKANA LETTER ZU
   1814     case 0x30BC: // KATAKANA LETTER ZE
   1815     case 0x30BE: // KATAKANA LETTER ZO
   1816     case 0x30C0: // KATAKANA LETTER DA
   1817     case 0x30C2: // KATAKANA LETTER DI
   1818     case 0x30C5: // KATAKANA LETTER DU
   1819     case 0x30C7: // KATAKANA LETTER DE
   1820     case 0x30C9: // KATAKANA LETTER DO
   1821     case 0x30D0: // KATAKANA LETTER BA
   1822     case 0x30D3: // KATAKANA LETTER BI
   1823     case 0x30D6: // KATAKANA LETTER BU
   1824     case 0x30D9: // KATAKANA LETTER BE
   1825     case 0x30DC: // KATAKANA LETTER BO
   1826     case 0x30F4: // KATAKANA LETTER VU
   1827     case 0x30F7: // KATAKANA LETTER VA
   1828     case 0x30F8: // KATAKANA LETTER VI
   1829     case 0x30F9: // KATAKANA LETTER VE
   1830     case 0x30FA: // KATAKANA LETTER VO
   1831         return VoicedSoundMark;
   1832     case 0x3071: // HIRAGANA LETTER PA
   1833     case 0x3074: // HIRAGANA LETTER PI
   1834     case 0x3077: // HIRAGANA LETTER PU
   1835     case 0x307A: // HIRAGANA LETTER PE
   1836     case 0x307D: // HIRAGANA LETTER PO
   1837     case 0x30D1: // KATAKANA LETTER PA
   1838     case 0x30D4: // KATAKANA LETTER PI
   1839     case 0x30D7: // KATAKANA LETTER PU
   1840     case 0x30DA: // KATAKANA LETTER PE
   1841     case 0x30DD: // KATAKANA LETTER PO
   1842         return SemiVoicedSoundMark;
   1843     }
   1844     return NoVoicedSoundMark;
   1845 }
   1846 
   1847 static inline bool isCombiningVoicedSoundMark(UChar character)
   1848 {
   1849     switch (character) {
   1850     case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
   1851     case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
   1852         return true;
   1853     }
   1854     return false;
   1855 }
   1856 
   1857 static inline bool containsKanaLetters(const String& pattern)
   1858 {
   1859     const UChar* characters = pattern.characters();
   1860     unsigned length = pattern.length();
   1861     for (unsigned i = 0; i < length; ++i) {
   1862         if (isKanaLetter(characters[i]))
   1863             return true;
   1864     }
   1865     return false;
   1866 }
   1867 
   1868 static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer)
   1869 {
   1870     ASSERT(length);
   1871 
   1872     buffer.resize(length);
   1873 
   1874     UErrorCode status = U_ZERO_ERROR;
   1875     size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status);
   1876     ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR);
   1877     ASSERT(bufferSize);
   1878 
   1879     buffer.resize(bufferSize);
   1880 
   1881     if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING)
   1882         return;
   1883 
   1884     status = U_ZERO_ERROR;
   1885     unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status);
   1886     ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
   1887 }
   1888 
   1889 static bool isNonLatin1Separator(UChar32 character)
   1890 {
   1891     ASSERT_ARG(character, character >= 256);
   1892 
   1893     return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK);
   1894 }
   1895 
   1896 static inline bool isSeparator(UChar32 character)
   1897 {
   1898     static const bool latin1SeparatorTable[256] = {
   1899         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1900         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1901         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . /
   1902         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, //                         : ; < = > ?
   1903         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   @
   1904         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, //                         [ \ ] ^ _
   1905         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   `
   1906         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, //                           { | } ~
   1907         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1908         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1909         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
   1910         1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
   1911         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1912         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
   1913         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1914         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
   1915     };
   1916 
   1917     if (character < 256)
   1918         return latin1SeparatorTable[character];
   1919 
   1920     return isNonLatin1Separator(character);
   1921 }
   1922 
   1923 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
   1924     : m_target(target)
   1925     , m_options(options)
   1926     , m_prefixLength(0)
   1927     , m_atBreak(true)
   1928     , m_needsMoreContext(options & AtWordStarts)
   1929     , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target))
   1930 {
   1931     ASSERT(!m_target.isEmpty());
   1932 
   1933     // FIXME: We'd like to tailor the searcher to fold quote marks for us instead
   1934     // of doing it in a separate replacement pass here, but ICU doesn't offer a way
   1935     // to add tailoring on top of the locale-specific tailoring as of this writing.
   1936     foldQuoteMarksAndSoftHyphens(m_target);
   1937 
   1938     size_t targetLength = m_target.length();
   1939     m_buffer.reserveInitialCapacity(max(targetLength * 8, minimumSearchBufferSize));
   1940     m_overlap = m_buffer.capacity() / 4;
   1941 
   1942     if ((m_options & AtWordStarts) && targetLength) {
   1943         UChar32 targetFirstCharacter;
   1944         U16_GET(m_target.characters(), 0, 0, targetLength, targetFirstCharacter);
   1945         // Characters in the separator category never really occur at the beginning of a word,
   1946         // so if the target begins with such a character, we just ignore the AtWordStart option.
   1947         if (isSeparator(targetFirstCharacter)) {
   1948             m_options &= ~AtWordStarts;
   1949             m_needsMoreContext = false;
   1950         }
   1951     }
   1952 
   1953     // Grab the single global searcher.
   1954     // If we ever have a reason to do more than once search buffer at once, we'll have
   1955     // to move to multiple searchers.
   1956     lockSearcher();
   1957 
   1958     UStringSearch* searcher = WebCore::searcher();
   1959     UCollator* collator = usearch_getCollator(searcher);
   1960 
   1961     UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY;
   1962     if (ucol_getStrength(collator) != strength) {
   1963         ucol_setStrength(collator, strength);
   1964         usearch_reset(searcher);
   1965     }
   1966 
   1967     UErrorCode status = U_ZERO_ERROR;
   1968     usearch_setPattern(searcher, m_target.characters(), targetLength, &status);
   1969     ASSERT(status == U_ZERO_ERROR);
   1970 
   1971     // The kana workaround requires a normalized copy of the target string.
   1972     if (m_targetRequiresKanaWorkaround)
   1973         normalizeCharacters(m_target.characters(), m_target.length(), m_normalizedTarget);
   1974 }
   1975 
   1976 inline SearchBuffer::~SearchBuffer()
   1977 {
   1978     // Leave the static object pointing to a valid string.
   1979     UErrorCode status = U_ZERO_ERROR;
   1980     usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status);
   1981     ASSERT(status == U_ZERO_ERROR);
   1982 
   1983     unlockSearcher();
   1984 }
   1985 
   1986 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
   1987 {
   1988     ASSERT(length);
   1989 
   1990     if (m_atBreak) {
   1991         m_buffer.shrink(0);
   1992         m_prefixLength = 0;
   1993         m_atBreak = false;
   1994     } else if (m_buffer.size() == m_buffer.capacity()) {
   1995         memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
   1996         m_prefixLength -= min(m_prefixLength, m_buffer.size() - m_overlap);
   1997         m_buffer.shrink(m_overlap);
   1998     }
   1999 
   2000     size_t oldLength = m_buffer.size();
   2001     size_t usableLength = min(m_buffer.capacity() - oldLength, length);
   2002     ASSERT(usableLength);
   2003     m_buffer.append(characters, usableLength);
   2004     foldQuoteMarksAndSoftHyphens(m_buffer.data() + oldLength, usableLength);
   2005     return usableLength;
   2006 }
   2007 
   2008 inline bool SearchBuffer::needsMoreContext() const
   2009 {
   2010     return m_needsMoreContext;
   2011 }
   2012 
   2013 inline void SearchBuffer::prependContext(const UChar* characters, size_t length)
   2014 {
   2015     ASSERT(m_needsMoreContext);
   2016     ASSERT(m_prefixLength == m_buffer.size());
   2017 
   2018     if (!length)
   2019         return;
   2020 
   2021     m_atBreak = false;
   2022 
   2023     size_t wordBoundaryContextStart = length;
   2024     if (wordBoundaryContextStart) {
   2025         U16_BACK_1(characters, 0, wordBoundaryContextStart);
   2026         wordBoundaryContextStart = startOfLastWordBoundaryContext(characters, wordBoundaryContextStart);
   2027     }
   2028 
   2029     size_t usableLength = min(m_buffer.capacity() - m_prefixLength, length - wordBoundaryContextStart);
   2030     m_buffer.prepend(characters + length - usableLength, usableLength);
   2031     m_prefixLength += usableLength;
   2032 
   2033     if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity())
   2034         m_needsMoreContext = false;
   2035 }
   2036 
   2037 inline bool SearchBuffer::atBreak() const
   2038 {
   2039     return m_atBreak;
   2040 }
   2041 
   2042 inline void SearchBuffer::reachedBreak()
   2043 {
   2044     m_atBreak = true;
   2045 }
   2046 
   2047 inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const
   2048 {
   2049     // This function implements the kana workaround. If usearch treats
   2050     // it as a match, but we do not want to, then it's a "bad match".
   2051     if (!m_targetRequiresKanaWorkaround)
   2052         return false;
   2053 
   2054     // Normalize into a match buffer. We reuse a single buffer rather than
   2055     // creating a new one each time.
   2056     normalizeCharacters(match, matchLength, m_normalizedMatch);
   2057 
   2058     const UChar* a = m_normalizedTarget.begin();
   2059     const UChar* aEnd = m_normalizedTarget.end();
   2060 
   2061     const UChar* b = m_normalizedMatch.begin();
   2062     const UChar* bEnd = m_normalizedMatch.end();
   2063 
   2064     while (true) {
   2065         // Skip runs of non-kana-letter characters. This is necessary so we can
   2066         // correctly handle strings where the target and match have different-length
   2067         // runs of characters that match, while still double checking the correctness
   2068         // of matches of kana letters with other kana letters.
   2069         while (a != aEnd && !isKanaLetter(*a))
   2070             ++a;
   2071         while (b != bEnd && !isKanaLetter(*b))
   2072             ++b;
   2073 
   2074         // If we reached the end of either the target or the match, we should have
   2075         // reached the end of both; both should have the same number of kana letters.
   2076         if (a == aEnd || b == bEnd) {
   2077             ASSERT(a == aEnd);
   2078             ASSERT(b == bEnd);
   2079             return false;
   2080         }
   2081 
   2082         // Check for differences in the kana letter character itself.
   2083         if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b))
   2084             return true;
   2085         if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b))
   2086             return true;
   2087         ++a;
   2088         ++b;
   2089 
   2090         // Check for differences in combining voiced sound marks found after the letter.
   2091         while (1) {
   2092             if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) {
   2093                 if (b != bEnd && isCombiningVoicedSoundMark(*b))
   2094                     return true;
   2095                 break;
   2096             }
   2097             if (!(b != bEnd && isCombiningVoicedSoundMark(*b)))
   2098                 return true;
   2099             if (*a != *b)
   2100                 return true;
   2101             ++a;
   2102             ++b;
   2103         }
   2104     }
   2105 }
   2106 
   2107 inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
   2108 {
   2109     ASSERT(m_options & AtWordStarts);
   2110 
   2111     if (!start)
   2112         return true;
   2113 
   2114     if (m_options & TreatMedialCapitalAsWordStart) {
   2115         int size = m_buffer.size();
   2116         int offset = start;
   2117         UChar32 firstCharacter;
   2118         U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);
   2119         UChar32 previousCharacter;
   2120         U16_PREV(m_buffer.data(), 0, offset, previousCharacter);
   2121 
   2122         if (isSeparator(firstCharacter)) {
   2123             // The start of a separator run is a word start (".org" in "webkit.org").
   2124             if (!isSeparator(previousCharacter))
   2125                 return true;
   2126         } else if (isASCIIUpper(firstCharacter)) {
   2127             // The start of an uppercase run is a word start ("Kit" in "WebKit").
   2128             if (!isASCIIUpper(previousCharacter))
   2129                 return true;
   2130             // The last character of an uppercase run followed by a non-separator, non-digit
   2131             // is a word start ("Request" in "XMLHTTPRequest").
   2132             offset = start;
   2133             U16_FWD_1(m_buffer.data(), offset, size);
   2134             UChar32 nextCharacter = 0;
   2135             if (offset < size)
   2136                 U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
   2137             if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
   2138                 return true;
   2139         } else if (isASCIIDigit(firstCharacter)) {
   2140             // The start of a digit run is a word start ("2" in "WebKit2").
   2141             if (!isASCIIDigit(previousCharacter))
   2142                 return true;
   2143         } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) {
   2144             // The start of a non-separator, non-uppercase, non-digit run is a word start,
   2145             // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
   2146             return true;
   2147         }
   2148     }
   2149 
   2150     size_t wordBreakSearchStart = start + length;
   2151     while (wordBreakSearchStart > start)
   2152         wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */);
   2153     return wordBreakSearchStart == start;
   2154 }
   2155 
   2156 inline size_t SearchBuffer::search(size_t& start)
   2157 {
   2158     size_t size = m_buffer.size();
   2159     if (m_atBreak) {
   2160         if (!size)
   2161             return 0;
   2162     } else {
   2163         if (size != m_buffer.capacity())
   2164             return 0;
   2165     }
   2166 
   2167     UStringSearch* searcher = WebCore::searcher();
   2168 
   2169     UErrorCode status = U_ZERO_ERROR;
   2170     usearch_setText(searcher, m_buffer.data(), size, &status);
   2171     ASSERT(status == U_ZERO_ERROR);
   2172 
   2173     usearch_setOffset(searcher, m_prefixLength, &status);
   2174     ASSERT(status == U_ZERO_ERROR);
   2175 
   2176     int matchStart = usearch_next(searcher, &status);
   2177     ASSERT(status == U_ZERO_ERROR);
   2178 
   2179 nextMatch:
   2180     if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) {
   2181         ASSERT(matchStart == USEARCH_DONE);
   2182         return 0;
   2183     }
   2184 
   2185     // Matches that start in the overlap area are only tentative.
   2186     // The same match may appear later, matching more characters,
   2187     // possibly including a combining character that's not yet in the buffer.
   2188     if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
   2189         size_t overlap = m_overlap;
   2190         if (m_options & AtWordStarts) {
   2191             // Ensure that there is sufficient context before matchStart the next time around for
   2192             // determining if it is at a word boundary.
   2193             int wordBoundaryContextStart = matchStart;
   2194             U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart);
   2195             wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart);
   2196             overlap = min(size - 1, max(overlap, size - wordBoundaryContextStart));
   2197         }
   2198         memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar));
   2199         m_prefixLength -= min(m_prefixLength, size - overlap);
   2200         m_buffer.shrink(overlap);
   2201         return 0;
   2202     }
   2203 
   2204     size_t matchedLength = usearch_getMatchedLength(searcher);
   2205     ASSERT(matchStart + matchedLength <= size);
   2206 
   2207     // If this match is "bad", move on to the next match.
   2208     if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) {
   2209         matchStart = usearch_next(searcher, &status);
   2210         ASSERT(status == U_ZERO_ERROR);
   2211         goto nextMatch;
   2212     }
   2213 
   2214     size_t newSize = size - (matchStart + 1);
   2215     memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar));
   2216     m_prefixLength -= min<size_t>(m_prefixLength, matchStart + 1);
   2217     m_buffer.shrink(newSize);
   2218 
   2219     start = size - matchStart;
   2220     return matchedLength;
   2221 }
   2222 
   2223 #else // !ICU_UNICODE
   2224 
   2225 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
   2226     : m_target(options & CaseInsensitive ? target.foldCase() : target)
   2227     , m_options(options)
   2228     , m_buffer(m_target.length())
   2229     , m_isCharacterStartBuffer(m_target.length())
   2230     , m_isBufferFull(false)
   2231     , m_cursor(0)
   2232 {
   2233     ASSERT(!m_target.isEmpty());
   2234     m_target.replace(noBreakSpace, ' ');
   2235     foldQuoteMarksAndSoftHyphens(m_target);
   2236 }
   2237 
   2238 inline SearchBuffer::~SearchBuffer()
   2239 {
   2240 }
   2241 
   2242 inline void SearchBuffer::reachedBreak()
   2243 {
   2244     m_cursor = 0;
   2245     m_isBufferFull = false;
   2246 }
   2247 
   2248 inline bool SearchBuffer::atBreak() const
   2249 {
   2250     return !m_cursor && !m_isBufferFull;
   2251 }
   2252 
   2253 inline void SearchBuffer::append(UChar c, bool isStart)
   2254 {
   2255     m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMarkOrSoftHyphen(c);
   2256     m_isCharacterStartBuffer[m_cursor] = isStart;
   2257     if (++m_cursor == m_target.length()) {
   2258         m_cursor = 0;
   2259         m_isBufferFull = true;
   2260     }
   2261 }
   2262 
   2263 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
   2264 {
   2265     ASSERT(length);
   2266     if (!(m_options & CaseInsensitive)) {
   2267         append(characters[0], true);
   2268         return 1;
   2269     }
   2270     const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough
   2271     UChar foldedCharacters[maxFoldedCharacters];
   2272     bool error;
   2273     int numFoldedCharacters = foldCase(foldedCharacters, maxFoldedCharacters, characters, 1, &error);
   2274     ASSERT(!error);
   2275     ASSERT(numFoldedCharacters);
   2276     ASSERT(numFoldedCharacters <= maxFoldedCharacters);
   2277     if (!error && numFoldedCharacters) {
   2278         numFoldedCharacters = min(numFoldedCharacters, maxFoldedCharacters);
   2279         append(foldedCharacters[0], true);
   2280         for (int i = 1; i < numFoldedCharacters; ++i)
   2281             append(foldedCharacters[i], false);
   2282     }
   2283     return 1;
   2284 }
   2285 
   2286 inline bool SearchBuffer::needsMoreContext() const
   2287 {
   2288     return false;
   2289 }
   2290 
   2291 void SearchBuffer::prependContext(const UChar*, size_t)
   2292 {
   2293     ASSERT_NOT_REACHED();
   2294 }
   2295 
   2296 inline size_t SearchBuffer::search(size_t& start)
   2297 {
   2298     if (!m_isBufferFull)
   2299         return 0;
   2300     if (!m_isCharacterStartBuffer[m_cursor])
   2301         return 0;
   2302 
   2303     size_t tailSpace = m_target.length() - m_cursor;
   2304     if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0)
   2305         return 0;
   2306     if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0)
   2307         return 0;
   2308 
   2309     start = length();
   2310 
   2311     // Now that we've found a match once, we don't want to find it again, because those
   2312     // are the SearchBuffer semantics, allowing for a buffer where you append more than one
   2313     // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if
   2314     // we want to get rid of that in the future we could track this with a separate boolean
   2315     // or even move the characters to the start of the buffer and set m_isBufferFull to false.
   2316     m_isCharacterStartBuffer[m_cursor] = false;
   2317 
   2318     return start;
   2319 }
   2320 
   2321 // Returns the number of characters that were appended to the buffer (what we are searching in).
   2322 // That's not necessarily the same length as the passed-in target string, because case folding
   2323 // can make two strings match even though they're not the same length.
   2324 size_t SearchBuffer::length() const
   2325 {
   2326     size_t bufferSize = m_target.length();
   2327     size_t length = 0;
   2328     for (size_t i = 0; i < bufferSize; ++i)
   2329         length += m_isCharacterStartBuffer[i];
   2330     return length;
   2331 }
   2332 
   2333 #endif // !ICU_UNICODE
   2334 
   2335 // --------
   2336 
   2337 int TextIterator::rangeLength(const Range* r, bool forSelectionPreservation)
   2338 {
   2339     int length = 0;
   2340     for (TextIterator it(r, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance())
   2341         length += it.length();
   2342 
   2343     return length;
   2344 }
   2345 
   2346 PassRefPtr<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount)
   2347 {
   2348     CharacterIterator entireRangeIterator(entireRange);
   2349     return characterSubrange(entireRangeIterator, characterOffset, characterCount);
   2350 }
   2351 
   2352 PassRefPtr<Range> TextIterator::rangeFromLocationAndLength(Element* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation)
   2353 {
   2354     RefPtr<Range> resultRange = scope->document()->createRange();
   2355 
   2356     int docTextPosition = 0;
   2357     int rangeEnd = rangeLocation + rangeLength;
   2358     bool startRangeFound = false;
   2359 
   2360     RefPtr<Range> textRunRange;
   2361 
   2362     TextIterator it(rangeOfContents(scope).get(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior);
   2363 
   2364     // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>.
   2365     if (rangeLocation == 0 && rangeLength == 0 && it.atEnd()) {
   2366         textRunRange = it.range();
   2367 
   2368         ExceptionCode ec = 0;
   2369         resultRange->setStart(textRunRange->startContainer(), 0, ec);
   2370         ASSERT(!ec);
   2371         resultRange->setEnd(textRunRange->startContainer(), 0, ec);
   2372         ASSERT(!ec);
   2373 
   2374         return resultRange.release();
   2375     }
   2376 
   2377     for (; !it.atEnd(); it.advance()) {
   2378         int len = it.length();
   2379         textRunRange = it.range();
   2380 
   2381         bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + len;
   2382         bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + len;
   2383 
   2384         // Fix textRunRange->endPosition(), but only if foundStart || foundEnd, because it is only
   2385         // in those cases that textRunRange is used.
   2386         if (foundEnd) {
   2387             // FIXME: This is a workaround for the fact that the end of a run is often at the wrong
   2388             // position for emitted '\n's.
   2389             if (len == 1 && it.characters()[0] == '\n') {
   2390                 scope->document()->updateLayoutIgnorePendingStylesheets();
   2391                 it.advance();
   2392                 if (!it.atEnd()) {
   2393                     RefPtr<Range> range = it.range();
   2394                     ExceptionCode ec = 0;
   2395                     textRunRange->setEnd(range->startContainer(), range->startOffset(), ec);
   2396                     ASSERT(!ec);
   2397                 } else {
   2398                     Position runStart = textRunRange->startPosition();
   2399                     Position runEnd = VisiblePosition(runStart).next().deepEquivalent();
   2400                     if (runEnd.isNotNull()) {
   2401                         ExceptionCode ec = 0;
   2402                         textRunRange->setEnd(runEnd.containerNode(), runEnd.computeOffsetInContainerNode(), ec);
   2403                         ASSERT(!ec);
   2404                     }
   2405                 }
   2406             }
   2407         }
   2408 
   2409         if (foundStart) {
   2410             startRangeFound = true;
   2411             int exception = 0;
   2412             if (textRunRange->startContainer()->isTextNode()) {
   2413                 int offset = rangeLocation - docTextPosition;
   2414                 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
   2415             } else {
   2416                 if (rangeLocation == docTextPosition)
   2417                     resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset(), exception);
   2418                 else
   2419                     resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset(), exception);
   2420             }
   2421         }
   2422 
   2423         if (foundEnd) {
   2424             int exception = 0;
   2425             if (textRunRange->startContainer()->isTextNode()) {
   2426                 int offset = rangeEnd - docTextPosition;
   2427                 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
   2428             } else {
   2429                 if (rangeEnd == docTextPosition)
   2430                     resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset(), exception);
   2431                 else
   2432                     resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
   2433             }
   2434             docTextPosition += len;
   2435             break;
   2436         }
   2437         docTextPosition += len;
   2438     }
   2439 
   2440     if (!startRangeFound)
   2441         return 0;
   2442 
   2443     if (rangeLength != 0 && rangeEnd > docTextPosition) { // rangeEnd is out of bounds
   2444         int exception = 0;
   2445         resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
   2446     }
   2447 
   2448     return resultRange.release();
   2449 }
   2450 
   2451 bool TextIterator::locationAndLengthFromRange(const Range* range, size_t& location, size_t& length)
   2452 {
   2453     location = notFound;
   2454     length = 0;
   2455 
   2456     if (!range->startContainer())
   2457         return false;
   2458 
   2459     Element* selectionRoot = range->ownerDocument()->frame()->selection()->rootEditableElement();
   2460     Element* scope = selectionRoot ? selectionRoot : range->ownerDocument()->documentElement();
   2461 
   2462     // The critical assumption is that this only gets called with ranges that
   2463     // concentrate on a given area containing the selection root. This is done
   2464     // because of text fields and textareas. The DOM for those is not
   2465     // directly in the document DOM, so ensure that the range does not cross a
   2466     // boundary of one of those.
   2467     if (range->startContainer() != scope && !range->startContainer()->isDescendantOf(scope))
   2468         return false;
   2469     if (range->endContainer() != scope && !range->endContainer()->isDescendantOf(scope))
   2470         return false;
   2471 
   2472     RefPtr<Range> testRange = Range::create(scope->document(), scope, 0, range->startContainer(), range->startOffset());
   2473     ASSERT(testRange->startContainer() == scope);
   2474     location = TextIterator::rangeLength(testRange.get());
   2475 
   2476     ExceptionCode ec;
   2477     testRange->setEnd(range->endContainer(), range->endOffset(), ec);
   2478     ASSERT(testRange->startContainer() == scope);
   2479     length = TextIterator::rangeLength(testRange.get()) - location;
   2480     return true;
   2481 }
   2482 
   2483 // --------
   2484 
   2485 UChar* plainTextToMallocAllocatedBuffer(const Range* r, unsigned& bufferLength, bool isDisplayString, TextIteratorBehavior defaultBehavior)
   2486 {
   2487     UChar* result = 0;
   2488 
   2489     // Do this in pieces to avoid massive reallocations if there is a large amount of text.
   2490     // Use system malloc for buffers since they can consume lots of memory and current TCMalloc is unable return it back to OS.
   2491     static const unsigned cMaxSegmentSize = 1 << 16;
   2492     bufferLength = 0;
   2493     typedef pair<UChar*, unsigned> TextSegment;
   2494     OwnPtr<Vector<TextSegment> > textSegments;
   2495     Vector<UChar> textBuffer;
   2496     textBuffer.reserveInitialCapacity(cMaxSegmentSize);
   2497     TextIteratorBehavior behavior = defaultBehavior;
   2498     if (!isDisplayString)
   2499         behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding);
   2500 
   2501     for (TextIterator it(r, behavior); !it.atEnd(); it.advance()) {
   2502         if (textBuffer.size() && textBuffer.size() + it.length() > cMaxSegmentSize) {
   2503             UChar* newSegmentBuffer = static_cast<UChar*>(malloc(textBuffer.size() * sizeof(UChar)));
   2504             if (!newSegmentBuffer)
   2505                 goto exit;
   2506             memcpy(newSegmentBuffer, textBuffer.data(), textBuffer.size() * sizeof(UChar));
   2507             if (!textSegments)
   2508                 textSegments = adoptPtr(new Vector<TextSegment>);
   2509             textSegments->append(make_pair(newSegmentBuffer, (unsigned)textBuffer.size()));
   2510             textBuffer.clear();
   2511         }
   2512         textBuffer.append(it.characters(), it.length());
   2513         bufferLength += it.length();
   2514     }
   2515 
   2516     if (!bufferLength)
   2517         return 0;
   2518 
   2519     // Since we know the size now, we can make a single buffer out of the pieces with one big alloc
   2520     result = static_cast<UChar*>(malloc(bufferLength * sizeof(UChar)));
   2521     if (!result)
   2522         goto exit;
   2523 
   2524     {
   2525         UChar* resultPos = result;
   2526         if (textSegments) {
   2527             unsigned size = textSegments->size();
   2528             for (unsigned i = 0; i < size; ++i) {
   2529                 const TextSegment& segment = textSegments->at(i);
   2530                 memcpy(resultPos, segment.first, segment.second * sizeof(UChar));
   2531                 resultPos += segment.second;
   2532             }
   2533         }
   2534         memcpy(resultPos, textBuffer.data(), textBuffer.size() * sizeof(UChar));
   2535     }
   2536 
   2537 exit:
   2538     if (textSegments) {
   2539         unsigned size = textSegments->size();
   2540         for (unsigned i = 0; i < size; ++i)
   2541             free(textSegments->at(i).first);
   2542     }
   2543 
   2544     if (isDisplayString && r->ownerDocument())
   2545         r->ownerDocument()->displayBufferModifiedByEncoding(result, bufferLength);
   2546 
   2547     return result;
   2548 }
   2549 
   2550 String plainText(const Range* r, TextIteratorBehavior defaultBehavior)
   2551 {
   2552     unsigned length;
   2553     UChar* buf = plainTextToMallocAllocatedBuffer(r, length, false, defaultBehavior);
   2554     if (!buf)
   2555         return "";
   2556     String result(buf, length);
   2557     free(buf);
   2558     return result;
   2559 }
   2560 
   2561 static inline bool isAllCollapsibleWhitespace(const String& string)
   2562 {
   2563     const UChar* characters = string.characters();
   2564     unsigned length = string.length();
   2565     for (unsigned i = 0; i < length; ++i) {
   2566         if (!isCollapsibleWhitespace(characters[i]))
   2567             return false;
   2568     }
   2569     return true;
   2570 }
   2571 
   2572 static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward)
   2573 {
   2574     ExceptionCode ec = 0;
   2575     RefPtr<Range> result = range->cloneRange(ec);
   2576     ASSERT(!ec);
   2577     result->collapse(!forward, ec);
   2578     ASSERT(!ec);
   2579     return result.release();
   2580 }
   2581 
   2582 static size_t findPlainText(CharacterIterator& it, const String& target, FindOptions options, size_t& matchStart)
   2583 {
   2584     matchStart = 0;
   2585     size_t matchLength = 0;
   2586 
   2587     SearchBuffer buffer(target, options);
   2588 
   2589     if (buffer.needsMoreContext()) {
   2590         RefPtr<Range> startRange = it.range();
   2591         RefPtr<Range> beforeStartRange = startRange->ownerDocument()->createRange();
   2592         ExceptionCode ec = 0;
   2593         beforeStartRange->setEnd(startRange->startContainer(), startRange->startOffset(), ec);
   2594         for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) {
   2595             buffer.prependContext(backwardsIterator.characters(), backwardsIterator.length());
   2596             if (!buffer.needsMoreContext())
   2597                 break;
   2598         }
   2599     }
   2600 
   2601     while (!it.atEnd()) {
   2602         it.advance(buffer.append(it.characters(), it.length()));
   2603 tryAgain:
   2604         size_t matchStartOffset;
   2605         if (size_t newMatchLength = buffer.search(matchStartOffset)) {
   2606             // Note that we found a match, and where we found it.
   2607             size_t lastCharacterInBufferOffset = it.characterOffset();
   2608             ASSERT(lastCharacterInBufferOffset >= matchStartOffset);
   2609             matchStart = lastCharacterInBufferOffset - matchStartOffset;
   2610             matchLength = newMatchLength;
   2611             // If searching forward, stop on the first match.
   2612             // If searching backward, don't stop, so we end up with the last match.
   2613             if (!(options & Backwards))
   2614                 break;
   2615             goto tryAgain;
   2616         }
   2617         if (it.atBreak() && !buffer.atBreak()) {
   2618             buffer.reachedBreak();
   2619             goto tryAgain;
   2620         }
   2621     }
   2622 
   2623     return matchLength;
   2624 }
   2625 
   2626 PassRefPtr<Range> findPlainText(const Range* range, const String& target, FindOptions options)
   2627 {
   2628     // First, find the text.
   2629     size_t matchStart;
   2630     size_t matchLength;
   2631     {
   2632         CharacterIterator findIterator(range, TextIteratorEntersTextControls);
   2633         matchLength = findPlainText(findIterator, target, options, matchStart);
   2634         if (!matchLength)
   2635             return collapsedToBoundary(range, !(options & Backwards));
   2636     }
   2637 
   2638     // Then, find the document position of the start and the end of the text.
   2639     CharacterIterator computeRangeIterator(range, TextIteratorEntersTextControls);
   2640     return characterSubrange(computeRangeIterator, matchStart, matchLength);
   2641 }
   2642 
   2643 }
   2644