Home | History | Annotate | Download | only in editing
      1 /*
      2  * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010 Apple Inc. All rights reserved.
      3  * Copyright (C) 2005 Alexey Proskuryakov.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  *
     14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
     15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
     18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25  */
     26 
     27 #include "config.h"
     28 #include "TextIterator.h"
     29 
     30 #include "Document.h"
     31 #include "Frame.h"
     32 #include "HTMLElement.h"
     33 #include "HTMLNames.h"
     34 #include "htmlediting.h"
     35 #include "InlineTextBox.h"
     36 #include "Range.h"
     37 #include "RenderTableCell.h"
     38 #include "RenderTableRow.h"
     39 #include "RenderTextControl.h"
     40 #include "RenderTextFragment.h"
     41 #include "TextBoundaries.h"
     42 #include "TextBreakIterator.h"
     43 #include "VisiblePosition.h"
     44 #include "visible_units.h"
     45 #include <wtf/unicode/CharacterNames.h>
     46 
     47 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
     48 #include "TextBreakIteratorInternalICU.h"
     49 #include <unicode/usearch.h>
     50 #endif
     51 
     52 using namespace WTF::Unicode;
     53 using namespace std;
     54 
     55 namespace WebCore {
     56 
     57 using namespace HTMLNames;
     58 
     59 // Buffer that knows how to compare with a search target.
     60 // Keeps enough of the previous text to be able to search in the future, but no more.
     61 // Non-breaking spaces are always equal to normal spaces.
     62 // Case folding is also done if the CaseInsensitive option is specified.
     63 // Matches are further filtered if the AtWordStarts option is specified, although some
     64 // matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well.
     65 class SearchBuffer {
     66     WTF_MAKE_NONCOPYABLE(SearchBuffer);
     67 public:
     68     SearchBuffer(const String& target, FindOptions);
     69     ~SearchBuffer();
     70 
     71     // Returns number of characters appended; guaranteed to be in the range [1, length].
     72     size_t append(const UChar*, size_t length);
     73     bool needsMoreContext() const;
     74     void prependContext(const UChar*, size_t length);
     75     void reachedBreak();
     76 
     77     // Result is the size in characters of what was found.
     78     // And <startOffset> is the number of characters back to the start of what was found.
     79     size_t search(size_t& startOffset);
     80     bool atBreak() const;
     81 
     82 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
     83 
     84 private:
     85     bool isBadMatch(const UChar*, size_t length) const;
     86     bool isWordStartMatch(size_t start, size_t length) const;
     87 
     88     String m_target;
     89     FindOptions m_options;
     90 
     91     Vector<UChar> m_buffer;
     92     size_t m_overlap;
     93     size_t m_prefixLength;
     94     bool m_atBreak;
     95     bool m_needsMoreContext;
     96 
     97     bool m_targetRequiresKanaWorkaround;
     98     Vector<UChar> m_normalizedTarget;
     99     mutable Vector<UChar> m_normalizedMatch;
    100 
    101 #else
    102 
    103 private:
    104     void append(UChar, bool isCharacterStart);
    105     size_t length() const;
    106 
    107     String m_target;
    108     FindOptions m_options;
    109 
    110     Vector<UChar> m_buffer;
    111     Vector<bool> m_isCharacterStartBuffer;
    112     bool m_isBufferFull;
    113     size_t m_cursor;
    114 
    115 #endif
    116 };
    117 
    118 // --------
    119 
    120 static const unsigned bitsInWord = sizeof(unsigned) * 8;
    121 static const unsigned bitInWordMask = bitsInWord - 1;
    122 
    123 BitStack::BitStack()
    124     : m_size(0)
    125 {
    126 }
    127 
    128 BitStack::~BitStack()
    129 {
    130 }
    131 
    132 void BitStack::push(bool bit)
    133 {
    134     unsigned index = m_size / bitsInWord;
    135     unsigned shift = m_size & bitInWordMask;
    136     if (!shift && index == m_words.size()) {
    137         m_words.grow(index + 1);
    138         m_words[index] = 0;
    139     }
    140     unsigned& word = m_words[index];
    141     unsigned mask = 1U << shift;
    142     if (bit)
    143         word |= mask;
    144     else
    145         word &= ~mask;
    146     ++m_size;
    147 }
    148 
    149 void BitStack::pop()
    150 {
    151     if (m_size)
    152         --m_size;
    153 }
    154 
    155 bool BitStack::top() const
    156 {
    157     if (!m_size)
    158         return false;
    159     unsigned shift = (m_size - 1) & bitInWordMask;
    160     return m_words.last() & (1U << shift);
    161 }
    162 
    163 unsigned BitStack::size() const
    164 {
    165     return m_size;
    166 }
    167 
    168 // --------
    169 
    170 #if !ASSERT_DISABLED
    171 
    172 static unsigned depthCrossingShadowBoundaries(Node* node)
    173 {
    174     unsigned depth = 0;
    175     for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
    176         ++depth;
    177     return depth;
    178 }
    179 
    180 #endif
    181 
    182 // This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees.
    183 static Node* nextInPreOrderCrossingShadowBoundaries(Node* rangeEndContainer, int rangeEndOffset)
    184 {
    185     if (!rangeEndContainer)
    186         return 0;
    187     if (rangeEndOffset >= 0 && !rangeEndContainer->offsetInCharacters()) {
    188         if (Node* next = rangeEndContainer->childNode(rangeEndOffset))
    189             return next;
    190     }
    191     for (Node* node = rangeEndContainer; node; node = node->parentOrHostNode()) {
    192         if (Node* next = node->nextSibling())
    193             return next;
    194     }
    195     return 0;
    196 }
    197 
    198 // --------
    199 
    200 static inline bool fullyClipsContents(Node* node)
    201 {
    202     RenderObject* renderer = node->renderer();
    203     if (!renderer || !renderer->isBox() || !renderer->hasOverflowClip())
    204         return false;
    205     return toRenderBox(renderer)->size().isEmpty();
    206 }
    207 
    208 static inline bool ignoresContainerClip(Node* node)
    209 {
    210     RenderObject* renderer = node->renderer();
    211     if (!renderer || renderer->isText())
    212         return false;
    213     EPosition position = renderer->style()->position();
    214     return position == AbsolutePosition || position == FixedPosition;
    215 }
    216 
    217 static void pushFullyClippedState(BitStack& stack, Node* node)
    218 {
    219     ASSERT(stack.size() == depthCrossingShadowBoundaries(node));
    220 
    221     // Push true if this node full clips its contents, or if a parent already has fully
    222     // clipped and this is not a node that ignores its container's clip.
    223     stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node)));
    224 }
    225 
    226 static void setUpFullyClippedStack(BitStack& stack, Node* node)
    227 {
    228     // Put the nodes in a vector so we can iterate in reverse order.
    229     Vector<Node*, 100> ancestry;
    230     for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
    231         ancestry.append(parent);
    232 
    233     // Call pushFullyClippedState on each node starting with the earliest ancestor.
    234     size_t size = ancestry.size();
    235     for (size_t i = 0; i < size; ++i)
    236         pushFullyClippedState(stack, ancestry[size - i - 1]);
    237     pushFullyClippedState(stack, node);
    238 
    239     ASSERT(stack.size() == 1 + depthCrossingShadowBoundaries(node));
    240 }
    241 
    242 #if OS(ANDROID)
    243 static bool checkFormControlElement(Node* startNode)
    244 {
    245     Node* node = startNode;
    246     while (node) {
    247         if (node->isElementNode() && static_cast<Element*>(node)->isFormControlElement())
    248             return true;
    249         node = node->parentNode();
    250     }
    251     return false;
    252 }
    253 #endif
    254 
    255 
    256 // --------
    257 
    258 TextIterator::TextIterator()
    259     : m_startContainer(0)
    260     , m_startOffset(0)
    261     , m_endContainer(0)
    262     , m_endOffset(0)
    263     , m_positionNode(0)
    264     , m_textCharacters(0)
    265     , m_textLength(0)
    266     , m_remainingTextBox(0)
    267     , m_firstLetterText(0)
    268     , m_lastCharacter(0)
    269     , m_emitsCharactersBetweenAllVisiblePositions(false)
    270     , m_entersTextControls(false)
    271     , m_emitsTextWithoutTranscoding(false)
    272     , m_handledFirstLetter(false)
    273     , m_ignoresStyleVisibility(false)
    274     , m_emitsObjectReplacementCharacters(false)
    275 #if OS(ANDROID)
    276     , m_stopsOnFormControls(false)
    277     , m_shouldStop(false)
    278 #endif
    279 {
    280 }
    281 
    282 TextIterator::TextIterator(const Range* r, TextIteratorBehavior behavior)
    283     : m_startContainer(0)
    284     , m_startOffset(0)
    285     , m_endContainer(0)
    286     , m_endOffset(0)
    287     , m_positionNode(0)
    288     , m_textCharacters(0)
    289     , m_textLength(0)
    290 #if OS(ANDROID)
    291     , m_needsAnotherNewline(false)
    292 #endif
    293     , m_remainingTextBox(0)
    294     , m_firstLetterText(0)
    295     , m_emitsCharactersBetweenAllVisiblePositions(behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)
    296     , m_entersTextControls(behavior & TextIteratorEntersTextControls)
    297     , m_emitsTextWithoutTranscoding(behavior & TextIteratorEmitsTextsWithoutTranscoding)
    298     , m_handledFirstLetter(false)
    299     , m_ignoresStyleVisibility(behavior & TextIteratorIgnoresStyleVisibility)
    300     , m_emitsObjectReplacementCharacters(behavior & TextIteratorEmitsObjectReplacementCharacters)
    301 #if OS(ANDROID)
    302     , m_stopsOnFormControls(behavior & TextIteratorStopsOnFormControls)
    303     , m_shouldStop(false)
    304 #endif
    305 {
    306     if (!r)
    307         return;
    308 
    309     // get and validate the range endpoints
    310     Node* startContainer = r->startContainer();
    311     if (!startContainer)
    312         return;
    313     int startOffset = r->startOffset();
    314     Node* endContainer = r->endContainer();
    315     int endOffset = r->endOffset();
    316 
    317     // Callers should be handing us well-formed ranges. If we discover that this isn't
    318     // the case, we could consider changing this assertion to an early return.
    319     ASSERT(r->boundaryPointsValid());
    320 
    321     // remember range - this does not change
    322     m_startContainer = startContainer;
    323     m_startOffset = startOffset;
    324     m_endContainer = endContainer;
    325     m_endOffset = endOffset;
    326 
    327     // set up the current node for processing
    328     m_node = r->firstNode();
    329     if (!m_node)
    330         return;
    331     setUpFullyClippedStack(m_fullyClippedStack, m_node);
    332     m_offset = m_node == m_startContainer ? m_startOffset : 0;
    333     m_handledNode = false;
    334     m_handledChildren = false;
    335 
    336     // calculate first out of bounds node
    337     m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(endContainer, endOffset);
    338 
    339     // initialize node processing state
    340     m_needsAnotherNewline = false;
    341     m_textBox = 0;
    342 
    343     // initialize record of previous node processing
    344     m_hasEmitted = false;
    345     m_lastTextNode = 0;
    346     m_lastTextNodeEndedWithCollapsedSpace = false;
    347     m_lastCharacter = 0;
    348 
    349 #ifndef NDEBUG
    350     // need this just because of the assert in advance()
    351     m_positionNode = m_node;
    352 #endif
    353 
    354     // identify the first run
    355     advance();
    356 }
    357 
    358 TextIterator::~TextIterator()
    359 {
    360 }
    361 
    362 bool TextIterator::atEnd() const
    363 {
    364 #if OS(ANDROID)
    365     return !m_positionNode || m_shouldStop;
    366 #else
    367     return !m_positionNode;
    368 #endif
    369 }
    370 
    371 void TextIterator::advance()
    372 {
    373 #if OS(ANDROID)
    374     if (m_shouldStop)
    375         return;
    376 #endif
    377     // reset the run information
    378     m_positionNode = 0;
    379     m_textLength = 0;
    380 
    381     // handle remembered node that needed a newline after the text node's newline
    382     if (m_needsAnotherNewline) {
    383         // Emit the extra newline, and position it *inside* m_node, after m_node's
    384         // contents, in case it's a block, in the same way that we position the first
    385         // newline.  The range for the emitted newline should start where the line
    386         // break begins.
    387         // FIXME: It would be cleaner if we emitted two newlines during the last
    388         // iteration, instead of using m_needsAnotherNewline.
    389         Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
    390         emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
    391         m_needsAnotherNewline = false;
    392         return;
    393     }
    394 
    395     if (!m_textBox && m_remainingTextBox) {
    396         m_textBox = m_remainingTextBox;
    397         m_remainingTextBox = 0;
    398         m_firstLetterText = 0;
    399         m_offset = 0;
    400     }
    401     // handle remembered text box
    402     if (m_textBox) {
    403         handleTextBox();
    404         if (m_positionNode)
    405             return;
    406     }
    407 
    408     while (m_node && m_node != m_pastEndNode) {
    409 #if OS(ANDROID)
    410         if (!m_shouldStop && m_stopsOnFormControls && checkFormControlElement(m_node))
    411             m_shouldStop = true;
    412 #endif
    413         // if the range ends at offset 0 of an element, represent the
    414         // position, but not the content, of that element e.g. if the
    415         // node is a blockflow element, emit a newline that
    416         // precedes the element
    417         if (m_node == m_endContainer && m_endOffset == 0) {
    418             representNodeOffsetZero();
    419             m_node = 0;
    420             return;
    421         }
    422 
    423         RenderObject* renderer = m_node->renderer();
    424         if (!renderer) {
    425             m_handledNode = true;
    426             m_handledChildren = true;
    427         } else {
    428             // handle current node according to its type
    429             if (!m_handledNode) {
    430                 if (renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) // FIXME: What about CDATA_SECTION_NODE?
    431                     m_handledNode = handleTextNode();
    432                 else if (renderer && (renderer->isImage() || renderer->isWidget() ||
    433                          (renderer->node() && renderer->node()->isElementNode() &&
    434                           static_cast<Element*>(renderer->node())->isFormControlElement())))
    435                     m_handledNode = handleReplacedElement();
    436                 else
    437                     m_handledNode = handleNonTextNode();
    438                 if (m_positionNode)
    439                     return;
    440             }
    441         }
    442 
    443         // find a new current node to handle in depth-first manner,
    444         // calling exitNode() as we come back thru a parent node
    445         Node* next = m_handledChildren ? 0 : m_node->firstChild();
    446         m_offset = 0;
    447         if (!next) {
    448             next = m_node->nextSibling();
    449             if (!next) {
    450                 bool pastEnd = m_node->traverseNextNode() == m_pastEndNode;
    451                 Node* parentNode = m_node->parentOrHostNode();
    452                 while (!next && parentNode) {
    453                     if ((pastEnd && parentNode == m_endContainer) || m_endContainer->isDescendantOf(parentNode))
    454                         return;
    455                     bool haveRenderer = m_node->renderer();
    456                     m_node = parentNode;
    457                     m_fullyClippedStack.pop();
    458                     parentNode = m_node->parentOrHostNode();
    459                     if (haveRenderer)
    460                         exitNode();
    461                     if (m_positionNode) {
    462                         m_handledNode = true;
    463                         m_handledChildren = true;
    464                         return;
    465                     }
    466                     next = m_node->nextSibling();
    467                 }
    468             }
    469             m_fullyClippedStack.pop();
    470         }
    471 
    472         // set the new current node
    473         m_node = next;
    474         if (m_node)
    475             pushFullyClippedState(m_fullyClippedStack, m_node);
    476         m_handledNode = false;
    477         m_handledChildren = false;
    478         m_handledFirstLetter = false;
    479         m_firstLetterText = 0;
    480 
    481         // how would this ever be?
    482         if (m_positionNode)
    483             return;
    484     }
    485 }
    486 
    487 bool TextIterator::handleTextNode()
    488 {
    489     if (m_fullyClippedStack.top() && !m_ignoresStyleVisibility)
    490         return false;
    491 
    492     RenderText* renderer = toRenderText(m_node->renderer());
    493 
    494     m_lastTextNode = m_node;
    495     String str = renderer->text();
    496 
    497     // handle pre-formatted text
    498     if (!renderer->style()->collapseWhiteSpace()) {
    499         int runStart = m_offset;
    500         if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) {
    501             emitCharacter(' ', m_node, 0, runStart, runStart);
    502             return false;
    503         }
    504         if (!m_handledFirstLetter && renderer->isTextFragment()) {
    505             handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
    506             if (m_firstLetterText) {
    507                 String firstLetter = m_firstLetterText->text();
    508                 emitText(m_node, m_firstLetterText, m_offset, m_offset + firstLetter.length());
    509                 m_firstLetterText = 0;
    510                 m_textBox = 0;
    511                 return false;
    512             }
    513         }
    514         if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
    515             return false;
    516         int strLength = str.length();
    517         int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX;
    518         int runEnd = min(strLength, end);
    519 
    520         if (runStart >= runEnd)
    521             return true;
    522 
    523         emitText(m_node, runStart, runEnd);
    524         return true;
    525     }
    526 
    527     if (!renderer->firstTextBox() && str.length() > 0) {
    528         if (!m_handledFirstLetter && renderer->isTextFragment()) {
    529             handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
    530             if (m_firstLetterText) {
    531                 handleTextBox();
    532                 return false;
    533             }
    534         }
    535         if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
    536             return false;
    537         m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space
    538         return true;
    539     }
    540 
    541     // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text)
    542     if (renderer->containsReversedText()) {
    543         m_sortedTextBoxes.clear();
    544         for (InlineTextBox* textBox = renderer->firstTextBox(); textBox; textBox = textBox->nextTextBox()) {
    545             m_sortedTextBoxes.append(textBox);
    546         }
    547         std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart);
    548         m_sortedTextBoxesPosition = 0;
    549     }
    550 
    551     m_textBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
    552     if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset)
    553         handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
    554     handleTextBox();
    555     return true;
    556 }
    557 
    558 void TextIterator::handleTextBox()
    559 {
    560     RenderText* renderer = m_firstLetterText ? m_firstLetterText : toRenderText(m_node->renderer());
    561     if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) {
    562         m_textBox = 0;
    563         return;
    564     }
    565     String str = renderer->text();
    566     unsigned start = m_offset;
    567     unsigned end = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX;
    568     while (m_textBox) {
    569         unsigned textBoxStart = m_textBox->start();
    570         unsigned runStart = max(textBoxStart, start);
    571 
    572         // Check for collapsed space at the start of this run.
    573         InlineTextBox* firstTextBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
    574         bool needSpace = m_lastTextNodeEndedWithCollapsedSpace
    575             || (m_textBox == firstTextBox && textBoxStart == runStart && runStart > 0);
    576         if (needSpace && !isCollapsibleWhitespace(m_lastCharacter) && m_lastCharacter) {
    577             if (m_lastTextNode == m_node && runStart > 0 && str[runStart - 1] == ' ') {
    578                 unsigned spaceRunStart = runStart - 1;
    579                 while (spaceRunStart > 0 && str[spaceRunStart - 1] == ' ')
    580                     --spaceRunStart;
    581                 emitText(m_node, renderer, spaceRunStart, spaceRunStart + 1);
    582             } else
    583                 emitCharacter(' ', m_node, 0, runStart, runStart);
    584             return;
    585         }
    586         unsigned textBoxEnd = textBoxStart + m_textBox->len();
    587         unsigned runEnd = min(textBoxEnd, end);
    588 
    589         // Determine what the next text box will be, but don't advance yet
    590         InlineTextBox* nextTextBox = 0;
    591         if (renderer->containsReversedText()) {
    592             if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size())
    593                 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1];
    594         } else
    595             nextTextBox = m_textBox->nextTextBox();
    596 
    597         if (runStart < runEnd) {
    598             // Handle either a single newline character (which becomes a space),
    599             // or a run of characters that does not include a newline.
    600             // This effectively translates newlines to spaces without copying the text.
    601             if (str[runStart] == '\n') {
    602                 emitCharacter(' ', m_node, 0, runStart, runStart + 1);
    603                 m_offset = runStart + 1;
    604             } else {
    605                 size_t subrunEnd = str.find('\n', runStart);
    606                 if (subrunEnd == notFound || subrunEnd > runEnd)
    607                     subrunEnd = runEnd;
    608 
    609                 m_offset = subrunEnd;
    610                 emitText(m_node, renderer, runStart, subrunEnd);
    611             }
    612 
    613             // If we are doing a subrun that doesn't go to the end of the text box,
    614             // come back again to finish handling this text box; don't advance to the next one.
    615             if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd)
    616                 return;
    617 
    618             // Advance and return
    619             unsigned nextRunStart = nextTextBox ? nextTextBox->start() : str.length();
    620             if (nextRunStart > runEnd)
    621                 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end
    622             m_textBox = nextTextBox;
    623             if (renderer->containsReversedText())
    624                 ++m_sortedTextBoxesPosition;
    625             return;
    626         }
    627         // Advance and continue
    628         m_textBox = nextTextBox;
    629         if (renderer->containsReversedText())
    630             ++m_sortedTextBoxesPosition;
    631     }
    632     if (!m_textBox && m_remainingTextBox) {
    633         m_textBox = m_remainingTextBox;
    634         m_remainingTextBox = 0;
    635         m_firstLetterText = 0;
    636         m_offset = 0;
    637         handleTextBox();
    638     }
    639 }
    640 
    641 void TextIterator::handleTextNodeFirstLetter(RenderTextFragment* renderer)
    642 {
    643     if (renderer->firstLetter()) {
    644         RenderObject* r = renderer->firstLetter();
    645         if (r->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
    646             return;
    647         for (RenderObject *currChild = r->firstChild(); currChild; currChild->nextSibling()) {
    648             if (currChild->isText()) {
    649                 RenderText* firstLetter = toRenderText(currChild);
    650                 m_handledFirstLetter = true;
    651                 m_remainingTextBox = m_textBox;
    652                 m_textBox = firstLetter->firstTextBox();
    653                 m_firstLetterText = firstLetter;
    654                 return;
    655             }
    656         }
    657     }
    658     m_handledFirstLetter = true;
    659 }
    660 
    661 bool TextIterator::handleReplacedElement()
    662 {
    663     if (m_fullyClippedStack.top())
    664         return false;
    665 
    666     RenderObject* renderer = m_node->renderer();
    667     if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
    668         return false;
    669 
    670     if (m_lastTextNodeEndedWithCollapsedSpace) {
    671         emitCharacter(' ', m_lastTextNode->parentNode(), m_lastTextNode, 1, 1);
    672         return false;
    673     }
    674 
    675     if (m_entersTextControls && renderer->isTextControl()) {
    676         if (HTMLElement* innerTextElement = toRenderTextControl(renderer)->innerTextElement()) {
    677             m_node = innerTextElement->shadowTreeRootNode();
    678             pushFullyClippedState(m_fullyClippedStack, m_node);
    679             m_offset = 0;
    680             return false;
    681         }
    682     }
    683 
    684     m_hasEmitted = true;
    685 
    686     if (m_emitsObjectReplacementCharacters && renderer && renderer->isReplaced()) {
    687         emitCharacter(objectReplacementCharacter, m_node->parentNode(), m_node, 0, 1);
    688         return true;
    689     }
    690 
    691     if (m_emitsCharactersBetweenAllVisiblePositions) {
    692         // We want replaced elements to behave like punctuation for boundary
    693         // finding, and to simply take up space for the selection preservation
    694         // code in moveParagraphs, so we use a comma.
    695         emitCharacter(',', m_node->parentNode(), m_node, 0, 1);
    696         return true;
    697     }
    698 
    699     m_positionNode = m_node->parentNode();
    700     m_positionOffsetBaseNode = m_node;
    701     m_positionStartOffset = 0;
    702     m_positionEndOffset = 1;
    703 
    704     m_textCharacters = 0;
    705     m_textLength = 0;
    706 
    707     m_lastCharacter = 0;
    708 
    709     return true;
    710 }
    711 
    712 bool TextIterator::hasVisibleTextNode(RenderText* renderer)
    713 {
    714     if (renderer->style()->visibility() == VISIBLE)
    715         return true;
    716     if (renderer->isTextFragment()) {
    717         RenderTextFragment* fragment = static_cast<RenderTextFragment*>(renderer);
    718         if (fragment->firstLetter() && fragment->firstLetter()->style()->visibility() == VISIBLE)
    719             return true;
    720     }
    721     return false;
    722 }
    723 
    724 static bool shouldEmitTabBeforeNode(Node* node)
    725 {
    726     RenderObject* r = node->renderer();
    727 
    728     // Table cells are delimited by tabs.
    729     if (!r || !isTableCell(node))
    730         return false;
    731 
    732     // Want a tab before every cell other than the first one
    733     RenderTableCell* rc = toRenderTableCell(r);
    734     RenderTable* t = rc->table();
    735     return t && (t->cellBefore(rc) || t->cellAbove(rc));
    736 }
    737 
    738 static bool shouldEmitNewlineForNode(Node* node)
    739 {
    740     // br elements are represented by a single newline.
    741     RenderObject* r = node->renderer();
    742     if (!r)
    743         return node->hasTagName(brTag);
    744 
    745     return r->isBR();
    746 }
    747 
    748 static bool shouldEmitNewlinesBeforeAndAfterNode(Node* node)
    749 {
    750     // Block flow (versus inline flow) is represented by having
    751     // a newline both before and after the element.
    752     RenderObject* r = node->renderer();
    753     if (!r) {
    754         return (node->hasTagName(blockquoteTag)
    755                 || node->hasTagName(ddTag)
    756                 || node->hasTagName(divTag)
    757                 || node->hasTagName(dlTag)
    758                 || node->hasTagName(dtTag)
    759                 || node->hasTagName(h1Tag)
    760                 || node->hasTagName(h2Tag)
    761                 || node->hasTagName(h3Tag)
    762                 || node->hasTagName(h4Tag)
    763                 || node->hasTagName(h5Tag)
    764                 || node->hasTagName(h6Tag)
    765                 || node->hasTagName(hrTag)
    766                 || node->hasTagName(liTag)
    767                 || node->hasTagName(listingTag)
    768                 || node->hasTagName(olTag)
    769                 || node->hasTagName(pTag)
    770                 || node->hasTagName(preTag)
    771                 || node->hasTagName(trTag)
    772                 || node->hasTagName(ulTag));
    773     }
    774 
    775     // Need to make an exception for table cells, because they are blocks, but we
    776     // want them tab-delimited rather than having newlines before and after.
    777     if (isTableCell(node))
    778         return false;
    779 
    780     // Need to make an exception for table row elements, because they are neither
    781     // "inline" or "RenderBlock", but we want newlines for them.
    782     if (r->isTableRow()) {
    783         RenderTable* t = toRenderTableRow(r)->table();
    784         if (t && !t->isInline())
    785             return true;
    786     }
    787 
    788     return !r->isInline() && r->isRenderBlock() && !r->isFloatingOrPositioned() && !r->isBody();
    789 }
    790 
    791 static bool shouldEmitNewlineAfterNode(Node* node)
    792 {
    793     // FIXME: It should be better but slower to create a VisiblePosition here.
    794     if (!shouldEmitNewlinesBeforeAndAfterNode(node))
    795         return false;
    796     // Check if this is the very last renderer in the document.
    797     // If so, then we should not emit a newline.
    798     while ((node = node->traverseNextSibling()))
    799         if (node->renderer())
    800             return true;
    801     return false;
    802 }
    803 
    804 static bool shouldEmitNewlineBeforeNode(Node* node)
    805 {
    806     return shouldEmitNewlinesBeforeAndAfterNode(node);
    807 }
    808 
    809 static bool shouldEmitExtraNewlineForNode(Node* node)
    810 {
    811     // When there is a significant collapsed bottom margin, emit an extra
    812     // newline for a more realistic result.  We end up getting the right
    813     // result even without margin collapsing. For example: <div><p>text</p></div>
    814     // will work right even if both the <div> and the <p> have bottom margins.
    815     RenderObject* r = node->renderer();
    816     if (!r || !r->isBox())
    817         return false;
    818 
    819     // NOTE: We only do this for a select set of nodes, and fwiw WinIE appears
    820     // not to do this at all
    821     if (node->hasTagName(h1Tag)
    822         || node->hasTagName(h2Tag)
    823         || node->hasTagName(h3Tag)
    824         || node->hasTagName(h4Tag)
    825         || node->hasTagName(h5Tag)
    826         || node->hasTagName(h6Tag)
    827         || node->hasTagName(pTag)) {
    828         RenderStyle* style = r->style();
    829         if (style) {
    830             int bottomMargin = toRenderBox(r)->collapsedMarginAfter();
    831             int fontSize = style->fontDescription().computedPixelSize();
    832             if (bottomMargin * 2 >= fontSize)
    833                 return true;
    834         }
    835     }
    836 
    837     return false;
    838 }
    839 
    840 static int collapsedSpaceLength(RenderText* renderer, int textEnd)
    841 {
    842     const UChar* characters = renderer->text()->characters();
    843     int length = renderer->text()->length();
    844     for (int i = textEnd; i < length; ++i) {
    845         if (!renderer->style()->isCollapsibleWhiteSpace(characters[i]))
    846             return i - textEnd;
    847     }
    848 
    849     return length - textEnd;
    850 }
    851 
    852 static int maxOffsetIncludingCollapsedSpaces(Node* node)
    853 {
    854     int offset = caretMaxOffset(node);
    855 
    856     if (node->renderer() && node->renderer()->isText())
    857         offset += collapsedSpaceLength(toRenderText(node->renderer()), offset);
    858 
    859     return offset;
    860 }
    861 
    862 // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic).
    863 bool TextIterator::shouldRepresentNodeOffsetZero()
    864 {
    865     if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isTable())
    866         return true;
    867 
    868     // Leave element positioned flush with start of a paragraph
    869     // (e.g. do not insert tab before a table cell at the start of a paragraph)
    870     if (m_lastCharacter == '\n')
    871         return false;
    872 
    873     // Otherwise, show the position if we have emitted any characters
    874     if (m_hasEmitted)
    875         return true;
    876 
    877     // We've not emitted anything yet. Generally, there is no need for any positioning then.
    878     // The only exception is when the element is visually not in the same line as
    879     // the start of the range (e.g. the range starts at the end of the previous paragraph).
    880     // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we
    881     // make quicker checks to possibly avoid that. Another check that we could make is
    882     // is whether the inline vs block flow changed since the previous visible element.
    883     // I think we're already in a special enough case that that won't be needed, tho.
    884 
    885     // No character needed if this is the first node in the range.
    886     if (m_node == m_startContainer)
    887         return false;
    888 
    889     // If we are outside the start container's subtree, assume we need to emit.
    890     // FIXME: m_startContainer could be an inline block
    891     if (!m_node->isDescendantOf(m_startContainer))
    892         return true;
    893 
    894     // If we started as m_startContainer offset 0 and the current node is a descendant of
    895     // the start container, we already had enough context to correctly decide whether to
    896     // emit after a preceding block. We chose not to emit (m_hasEmitted is false),
    897     // so don't second guess that now.
    898     // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably
    899     // immaterial since we likely would have already emitted something by now.
    900     if (m_startOffset == 0)
    901         return false;
    902 
    903     // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning.
    904     // Additionally, if the range we are iterating over contains huge sections of unrendered content,
    905     // we would create VisiblePositions on every call to this function without this check.
    906     if (!m_node->renderer() || m_node->renderer()->style()->visibility() != VISIBLE)
    907         return false;
    908 
    909     // The startPos.isNotNull() check is needed because the start could be before the body,
    910     // and in that case we'll get null. We don't want to put in newlines at the start in that case.
    911     // The currPos.isNotNull() check is needed because positions in non-HTML content
    912     // (like SVG) do not have visible positions, and we don't want to emit for them either.
    913     VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM);
    914     VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM);
    915     return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos);
    916 }
    917 
    918 bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node* node)
    919 {
    920     return node->renderer() && node->renderer()->isTable() && (node->renderer()->isInline() || m_emitsCharactersBetweenAllVisiblePositions);
    921 }
    922 
    923 void TextIterator::representNodeOffsetZero()
    924 {
    925     // Emit a character to show the positioning of m_node.
    926 
    927     // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can
    928     // create VisiblePositions, which is expensive.  So, we perform the inexpensive checks
    929     // on m_node to see if it necessitates emitting a character first and will early return
    930     // before encountering shouldRepresentNodeOffsetZero()s worse case behavior.
    931     if (shouldEmitTabBeforeNode(m_node)) {
    932         if (shouldRepresentNodeOffsetZero())
    933             emitCharacter('\t', m_node->parentNode(), m_node, 0, 0);
    934     } else if (shouldEmitNewlineBeforeNode(m_node)) {
    935         if (shouldRepresentNodeOffsetZero())
    936             emitCharacter('\n', m_node->parentNode(), m_node, 0, 0);
    937     } else if (shouldEmitSpaceBeforeAndAfterNode(m_node)) {
    938         if (shouldRepresentNodeOffsetZero())
    939             emitCharacter(' ', m_node->parentNode(), m_node, 0, 0);
    940     }
    941 }
    942 
    943 bool TextIterator::handleNonTextNode()
    944 {
    945     if (shouldEmitNewlineForNode(m_node))
    946         emitCharacter('\n', m_node->parentNode(), m_node, 0, 1);
    947     else if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isHR())
    948         emitCharacter(' ', m_node->parentNode(), m_node, 0, 1);
    949     else
    950         representNodeOffsetZero();
    951 
    952     return true;
    953 }
    954 
    955 void TextIterator::exitNode()
    956 {
    957     // prevent emitting a newline when exiting a collapsed block at beginning of the range
    958     // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could
    959     // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and
    960     // therefore look like a blank line.
    961     if (!m_hasEmitted)
    962         return;
    963 
    964     // Emit with a position *inside* m_node, after m_node's contents, in
    965     // case it is a block, because the run should start where the
    966     // emitted character is positioned visually.
    967     Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
    968     // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making
    969     // the logic in _web_attributedStringFromRange match.  We'll get that for free when we switch to use
    970     // TextIterator in _web_attributedStringFromRange.
    971     // See <rdar://problem/5428427> for an example of how this mismatch will cause problems.
    972     if (m_lastTextNode && shouldEmitNewlineAfterNode(m_node)) {
    973         // use extra newline to represent margin bottom, as needed
    974         bool addNewline = shouldEmitExtraNewlineForNode(m_node);
    975 
    976         // FIXME: We need to emit a '\n' as we leave an empty block(s) that
    977         // contain a VisiblePosition when doing selection preservation.
    978         if (m_lastCharacter != '\n') {
    979             // insert a newline with a position following this block's contents.
    980             emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
    981             // remember whether to later add a newline for the current node
    982             ASSERT(!m_needsAnotherNewline);
    983             m_needsAnotherNewline = addNewline;
    984         } else if (addNewline)
    985             // insert a newline with a position following this block's contents.
    986             emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
    987     }
    988 
    989     // If nothing was emitted, see if we need to emit a space.
    990     if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(m_node))
    991         emitCharacter(' ', baseNode->parentNode(), baseNode, 1, 1);
    992 }
    993 
    994 void TextIterator::emitCharacter(UChar c, Node* textNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset)
    995 {
    996     m_hasEmitted = true;
    997 
    998     // remember information with which to construct the TextIterator::range()
    999     // NOTE: textNode is often not a text node, so the range will specify child nodes of positionNode
   1000     m_positionNode = textNode;
   1001     m_positionOffsetBaseNode = offsetBaseNode;
   1002     m_positionStartOffset = textStartOffset;
   1003     m_positionEndOffset = textEndOffset;
   1004 
   1005     // remember information with which to construct the TextIterator::characters() and length()
   1006     m_singleCharacterBuffer = c;
   1007     m_textCharacters = &m_singleCharacterBuffer;
   1008     m_textLength = 1;
   1009 
   1010     // remember some iteration state
   1011     m_lastTextNodeEndedWithCollapsedSpace = false;
   1012     m_lastCharacter = c;
   1013 }
   1014 
   1015 void TextIterator::emitText(Node* textNode, RenderObject* renderObject, int textStartOffset, int textEndOffset)
   1016 {
   1017     RenderText* renderer = toRenderText(renderObject);
   1018     m_text = m_emitsTextWithoutTranscoding ? renderer->textWithoutTranscoding() : renderer->text();
   1019     ASSERT(m_text.characters());
   1020 
   1021     m_positionNode = textNode;
   1022     m_positionOffsetBaseNode = 0;
   1023     m_positionStartOffset = textStartOffset;
   1024     m_positionEndOffset = textEndOffset;
   1025     m_textCharacters = m_text.characters() + textStartOffset;
   1026     m_textLength = textEndOffset - textStartOffset;
   1027     m_lastCharacter = m_text[textEndOffset - 1];
   1028 
   1029     m_lastTextNodeEndedWithCollapsedSpace = false;
   1030     m_hasEmitted = true;
   1031 }
   1032 
   1033 void TextIterator::emitText(Node* textNode, int textStartOffset, int textEndOffset)
   1034 {
   1035     emitText(textNode, m_node->renderer(), textStartOffset, textEndOffset);
   1036 }
   1037 
   1038 PassRefPtr<Range> TextIterator::range() const
   1039 {
   1040     // use the current run information, if we have it
   1041     if (m_positionNode) {
   1042         if (m_positionOffsetBaseNode) {
   1043             int index = m_positionOffsetBaseNode->nodeIndex();
   1044             m_positionStartOffset += index;
   1045             m_positionEndOffset += index;
   1046             m_positionOffsetBaseNode = 0;
   1047         }
   1048         return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
   1049     }
   1050 
   1051     // otherwise, return the end of the overall range we were given
   1052     if (m_endContainer)
   1053         return Range::create(m_endContainer->document(), m_endContainer, m_endOffset, m_endContainer, m_endOffset);
   1054 
   1055     return 0;
   1056 }
   1057 
   1058 Node* TextIterator::node() const
   1059 {
   1060     RefPtr<Range> textRange = range();
   1061     if (!textRange)
   1062         return 0;
   1063 
   1064     Node* node = textRange->startContainer();
   1065     if (!node)
   1066         return 0;
   1067     if (node->offsetInCharacters())
   1068         return node;
   1069 
   1070     return node->childNode(textRange->startOffset());
   1071 }
   1072 
   1073 // --------
   1074 
   1075 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator()
   1076     : m_behavior(TextIteratorDefaultBehavior)
   1077     , m_node(0)
   1078     , m_positionNode(0)
   1079 #if OS(ANDROID)
   1080     , m_stopsOnFormControls(false)
   1081     , m_shouldStop(false)
   1082 #endif
   1083 {
   1084 }
   1085 
   1086 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range* r, TextIteratorBehavior behavior)
   1087     : m_behavior(behavior)
   1088     , m_node(0)
   1089     , m_positionNode(0)
   1090 #if OS(ANDROID)
   1091     , m_stopsOnFormControls(behavior & TextIteratorStopsOnFormControls)
   1092     , m_shouldStop(false)
   1093 #endif
   1094 {
   1095 #if OS(ANDROID)
   1096     ASSERT(m_behavior == TextIteratorDefaultBehavior || m_behavior == TextIteratorStopsOnFormControls);
   1097 #else
   1098     ASSERT(m_behavior == TextIteratorDefaultBehavior);
   1099 #endif
   1100 
   1101     if (!r)
   1102         return;
   1103 
   1104     Node* startNode = r->startContainer();
   1105     if (!startNode)
   1106         return;
   1107     Node* endNode = r->endContainer();
   1108     int startOffset = r->startOffset();
   1109     int endOffset = r->endOffset();
   1110 
   1111     if (!startNode->offsetInCharacters()) {
   1112         if (startOffset >= 0 && startOffset < static_cast<int>(startNode->childNodeCount())) {
   1113             startNode = startNode->childNode(startOffset);
   1114             startOffset = 0;
   1115         }
   1116     }
   1117     if (!endNode->offsetInCharacters()) {
   1118         if (endOffset > 0 && endOffset <= static_cast<int>(endNode->childNodeCount())) {
   1119             endNode = endNode->childNode(endOffset - 1);
   1120             endOffset = lastOffsetInNode(endNode);
   1121         }
   1122     }
   1123 
   1124     m_node = endNode;
   1125     setUpFullyClippedStack(m_fullyClippedStack, m_node);
   1126     m_offset = endOffset;
   1127     m_handledNode = false;
   1128     m_handledChildren = endOffset == 0;
   1129 
   1130     m_startNode = startNode;
   1131     m_startOffset = startOffset;
   1132     m_endNode = endNode;
   1133     m_endOffset = endOffset;
   1134 
   1135 #ifndef NDEBUG
   1136     // Need this just because of the assert.
   1137     m_positionNode = endNode;
   1138 #endif
   1139 
   1140     m_lastTextNode = 0;
   1141     m_lastCharacter = '\n';
   1142 
   1143     m_havePassedStartNode = false;
   1144 
   1145     advance();
   1146 }
   1147 
   1148 bool SimplifiedBackwardsTextIterator::atEnd() const
   1149 {
   1150 #if OS(ANDROID)
   1151     return !m_positionNode || m_shouldStop;
   1152 #else
   1153     return !m_positionNode;
   1154 #endif
   1155 }
   1156 
   1157 void SimplifiedBackwardsTextIterator::advance()
   1158 {
   1159     ASSERT(m_positionNode);
   1160 
   1161 #if OS(ANDROID)
   1162     if (m_shouldStop)
   1163         return;
   1164 
   1165     // Prevent changing the iterator position if a form control element was found and advance should stop on it.
   1166     if (m_stopsOnFormControls && checkFormControlElement(m_node)) {
   1167         m_shouldStop = true;
   1168         return;
   1169     }
   1170 #endif
   1171 
   1172     m_positionNode = 0;
   1173     m_textLength = 0;
   1174 
   1175     while (m_node && !m_havePassedStartNode) {
   1176         // Don't handle node if we start iterating at [node, 0].
   1177         if (!m_handledNode && !(m_node == m_endNode && m_endOffset == 0)) {
   1178             RenderObject* renderer = m_node->renderer();
   1179             if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) {
   1180                 // FIXME: What about CDATA_SECTION_NODE?
   1181                 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
   1182                     m_handledNode = handleTextNode();
   1183             } else if (renderer && (renderer->isImage() || renderer->isWidget())) {
   1184                 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
   1185                     m_handledNode = handleReplacedElement();
   1186             } else
   1187                 m_handledNode = handleNonTextNode();
   1188             if (m_positionNode)
   1189                 return;
   1190         }
   1191 
   1192         if (!m_handledChildren && m_node->hasChildNodes()) {
   1193             m_node = m_node->lastChild();
   1194             pushFullyClippedState(m_fullyClippedStack, m_node);
   1195         } else {
   1196             // Exit empty containers as we pass over them or containers
   1197             // where [container, 0] is where we started iterating.
   1198             if (!m_handledNode
   1199                     && canHaveChildrenForEditing(m_node)
   1200                     && m_node->parentNode()
   1201                     && (!m_node->lastChild() || (m_node == m_endNode && !m_endOffset))) {
   1202                 exitNode();
   1203                 if (m_positionNode) {
   1204                     m_handledNode = true;
   1205                     m_handledChildren = true;
   1206                     return;
   1207                 }
   1208             }
   1209 
   1210             // Exit all other containers.
   1211             while (!m_node->previousSibling()) {
   1212                 if (!advanceRespectingRange(m_node->parentOrHostNode()))
   1213                     break;
   1214                 m_fullyClippedStack.pop();
   1215                 exitNode();
   1216                 if (m_positionNode) {
   1217                     m_handledNode = true;
   1218                     m_handledChildren = true;
   1219                     return;
   1220                 }
   1221             }
   1222 
   1223             m_fullyClippedStack.pop();
   1224             if (advanceRespectingRange(m_node->previousSibling()))
   1225                 pushFullyClippedState(m_fullyClippedStack, m_node);
   1226             else
   1227                 m_node = 0;
   1228         }
   1229 
   1230         // For the purpose of word boundary detection,
   1231         // we should iterate all visible text and trailing (collapsed) whitespaces.
   1232         m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(m_node) : 0;
   1233         m_handledNode = false;
   1234         m_handledChildren = false;
   1235 
   1236         if (m_positionNode)
   1237             return;
   1238     }
   1239 }
   1240 
   1241 bool SimplifiedBackwardsTextIterator::handleTextNode()
   1242 {
   1243     m_lastTextNode = m_node;
   1244 
   1245     RenderText* renderer = toRenderText(m_node->renderer());
   1246     String str = renderer->text();
   1247 
   1248     if (!renderer->firstTextBox() && str.length() > 0)
   1249         return true;
   1250 
   1251     m_positionEndOffset = m_offset;
   1252 
   1253     m_offset = (m_node == m_startNode) ? m_startOffset : 0;
   1254     m_positionNode = m_node;
   1255     m_positionStartOffset = m_offset;
   1256     m_textLength = m_positionEndOffset - m_positionStartOffset;
   1257     m_textCharacters = str.characters() + m_positionStartOffset;
   1258 
   1259     m_lastCharacter = str[m_positionEndOffset - 1];
   1260 
   1261     return true;
   1262 }
   1263 
   1264 bool SimplifiedBackwardsTextIterator::handleReplacedElement()
   1265 {
   1266     unsigned index = m_node->nodeIndex();
   1267     // We want replaced elements to behave like punctuation for boundary
   1268     // finding, and to simply take up space for the selection preservation
   1269     // code in moveParagraphs, so we use a comma.  Unconditionally emit
   1270     // here because this iterator is only used for boundary finding.
   1271     emitCharacter(',', m_node->parentNode(), index, index + 1);
   1272     return true;
   1273 }
   1274 
   1275 bool SimplifiedBackwardsTextIterator::handleNonTextNode()
   1276 {
   1277     // We can use a linefeed in place of a tab because this simple iterator is only used to
   1278     // find boundaries, not actual content.  A linefeed breaks words, sentences, and paragraphs.
   1279     if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineAfterNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
   1280         unsigned index = m_node->nodeIndex();
   1281         // The start of this emitted range is wrong. Ensuring correctness would require
   1282         // VisiblePositions and so would be slow. previousBoundary expects this.
   1283         emitCharacter('\n', m_node->parentNode(), index + 1, index + 1);
   1284     }
   1285     return true;
   1286 }
   1287 
   1288 void SimplifiedBackwardsTextIterator::exitNode()
   1289 {
   1290     if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineBeforeNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
   1291         // The start of this emitted range is wrong. Ensuring correctness would require
   1292         // VisiblePositions and so would be slow. previousBoundary expects this.
   1293         emitCharacter('\n', m_node, 0, 0);
   1294     }
   1295 }
   1296 
   1297 void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node* node, int startOffset, int endOffset)
   1298 {
   1299     m_singleCharacterBuffer = c;
   1300     m_positionNode = node;
   1301     m_positionStartOffset = startOffset;
   1302     m_positionEndOffset = endOffset;
   1303     m_textCharacters = &m_singleCharacterBuffer;
   1304     m_textLength = 1;
   1305     m_lastCharacter = c;
   1306 }
   1307 
   1308 bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next)
   1309 {
   1310     if (!next)
   1311         return false;
   1312     m_havePassedStartNode |= m_node == m_startNode;
   1313     if (m_havePassedStartNode)
   1314         return false;
   1315     m_node = next;
   1316     return true;
   1317 }
   1318 
   1319 PassRefPtr<Range> SimplifiedBackwardsTextIterator::range() const
   1320 {
   1321     if (m_positionNode)
   1322         return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
   1323 
   1324     return Range::create(m_startNode->document(), m_startNode, m_startOffset, m_startNode, m_startOffset);
   1325 }
   1326 
   1327 // --------
   1328 
   1329 CharacterIterator::CharacterIterator()
   1330     : m_offset(0)
   1331     , m_runOffset(0)
   1332     , m_atBreak(true)
   1333 {
   1334 }
   1335 
   1336 CharacterIterator::CharacterIterator(const Range* r, TextIteratorBehavior behavior)
   1337     : m_offset(0)
   1338     , m_runOffset(0)
   1339     , m_atBreak(true)
   1340     , m_textIterator(r, behavior)
   1341 {
   1342     while (!atEnd() && m_textIterator.length() == 0)
   1343         m_textIterator.advance();
   1344 }
   1345 
   1346 PassRefPtr<Range> CharacterIterator::range() const
   1347 {
   1348     RefPtr<Range> r = m_textIterator.range();
   1349     if (!m_textIterator.atEnd()) {
   1350         if (m_textIterator.length() <= 1) {
   1351             ASSERT(m_runOffset == 0);
   1352         } else {
   1353             Node* n = r->startContainer();
   1354             ASSERT(n == r->endContainer());
   1355             int offset = r->startOffset() + m_runOffset;
   1356             ExceptionCode ec = 0;
   1357             r->setStart(n, offset, ec);
   1358             r->setEnd(n, offset + 1, ec);
   1359             ASSERT(!ec);
   1360         }
   1361     }
   1362     return r.release();
   1363 }
   1364 
   1365 void CharacterIterator::advance(int count)
   1366 {
   1367     if (count <= 0) {
   1368         ASSERT(count == 0);
   1369         return;
   1370     }
   1371 
   1372     m_atBreak = false;
   1373 
   1374     // easy if there is enough left in the current m_textIterator run
   1375     int remaining = m_textIterator.length() - m_runOffset;
   1376     if (count < remaining) {
   1377         m_runOffset += count;
   1378         m_offset += count;
   1379         return;
   1380     }
   1381 
   1382     // exhaust the current m_textIterator run
   1383     count -= remaining;
   1384     m_offset += remaining;
   1385 
   1386     // move to a subsequent m_textIterator run
   1387     for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
   1388         int runLength = m_textIterator.length();
   1389         if (runLength == 0)
   1390             m_atBreak = true;
   1391         else {
   1392             // see whether this is m_textIterator to use
   1393             if (count < runLength) {
   1394                 m_runOffset = count;
   1395                 m_offset += count;
   1396                 return;
   1397             }
   1398 
   1399             // exhaust this m_textIterator run
   1400             count -= runLength;
   1401             m_offset += runLength;
   1402         }
   1403     }
   1404 
   1405     // ran to the end of the m_textIterator... no more runs left
   1406     m_atBreak = true;
   1407     m_runOffset = 0;
   1408 }
   1409 
   1410 String CharacterIterator::string(int numChars)
   1411 {
   1412     Vector<UChar> result;
   1413     result.reserveInitialCapacity(numChars);
   1414     while (numChars > 0 && !atEnd()) {
   1415         int runSize = min(numChars, length());
   1416         result.append(characters(), runSize);
   1417         numChars -= runSize;
   1418         advance(runSize);
   1419     }
   1420     return String::adopt(result);
   1421 }
   1422 
   1423 static PassRefPtr<Range> characterSubrange(CharacterIterator& it, int offset, int length)
   1424 {
   1425     it.advance(offset);
   1426     RefPtr<Range> start = it.range();
   1427 
   1428     if (length > 1)
   1429         it.advance(length - 1);
   1430     RefPtr<Range> end = it.range();
   1431 
   1432     return Range::create(start->startContainer()->document(),
   1433         start->startContainer(), start->startOffset(),
   1434         end->endContainer(), end->endOffset());
   1435 }
   1436 
   1437 BackwardsCharacterIterator::BackwardsCharacterIterator()
   1438     : m_offset(0)
   1439     , m_runOffset(0)
   1440     , m_atBreak(true)
   1441 {
   1442 }
   1443 
   1444 BackwardsCharacterIterator::BackwardsCharacterIterator(const Range* range, TextIteratorBehavior behavior)
   1445     : m_offset(0)
   1446     , m_runOffset(0)
   1447     , m_atBreak(true)
   1448     , m_textIterator(range, behavior)
   1449 {
   1450     while (!atEnd() && !m_textIterator.length())
   1451         m_textIterator.advance();
   1452 }
   1453 
   1454 PassRefPtr<Range> BackwardsCharacterIterator::range() const
   1455 {
   1456     RefPtr<Range> r = m_textIterator.range();
   1457     if (!m_textIterator.atEnd()) {
   1458         if (m_textIterator.length() <= 1)
   1459             ASSERT(m_runOffset == 0);
   1460         else {
   1461             Node* n = r->startContainer();
   1462             ASSERT(n == r->endContainer());
   1463             int offset = r->endOffset() - m_runOffset;
   1464             ExceptionCode ec = 0;
   1465             r->setStart(n, offset - 1, ec);
   1466             r->setEnd(n, offset, ec);
   1467             ASSERT(!ec);
   1468         }
   1469     }
   1470     return r.release();
   1471 }
   1472 
   1473 void BackwardsCharacterIterator::advance(int count)
   1474 {
   1475     if (count <= 0) {
   1476         ASSERT(!count);
   1477         return;
   1478     }
   1479 
   1480     m_atBreak = false;
   1481 
   1482     int remaining = m_textIterator.length() - m_runOffset;
   1483     if (count < remaining) {
   1484         m_runOffset += count;
   1485         m_offset += count;
   1486         return;
   1487     }
   1488 
   1489     count -= remaining;
   1490     m_offset += remaining;
   1491 
   1492     for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
   1493         int runLength = m_textIterator.length();
   1494         if (runLength == 0)
   1495             m_atBreak = true;
   1496         else {
   1497             if (count < runLength) {
   1498                 m_runOffset = count;
   1499                 m_offset += count;
   1500                 return;
   1501             }
   1502 
   1503             count -= runLength;
   1504             m_offset += runLength;
   1505         }
   1506     }
   1507 
   1508     m_atBreak = true;
   1509     m_runOffset = 0;
   1510 }
   1511 
   1512 // --------
   1513 
   1514 WordAwareIterator::WordAwareIterator()
   1515     : m_previousText(0)
   1516     , m_didLookAhead(false)
   1517 {
   1518 }
   1519 
   1520 WordAwareIterator::WordAwareIterator(const Range* r)
   1521     : m_previousText(0)
   1522     , m_didLookAhead(true) // so we consider the first chunk from the text iterator
   1523     , m_textIterator(r)
   1524 {
   1525     advance(); // get in position over the first chunk of text
   1526 }
   1527 
   1528 WordAwareIterator::~WordAwareIterator()
   1529 {
   1530 }
   1531 
   1532 // We're always in one of these modes:
   1533 // - The current chunk in the text iterator is our current chunk
   1534 //      (typically its a piece of whitespace, or text that ended with whitespace)
   1535 // - The previous chunk in the text iterator is our current chunk
   1536 //      (we looked ahead to the next chunk and found a word boundary)
   1537 // - We built up our own chunk of text from many chunks from the text iterator
   1538 
   1539 // FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries.
   1540 
   1541 void WordAwareIterator::advance()
   1542 {
   1543     m_previousText = 0;
   1544     m_buffer.clear();      // toss any old buffer we built up
   1545 
   1546     // If last time we did a look-ahead, start with that looked-ahead chunk now
   1547     if (!m_didLookAhead) {
   1548         ASSERT(!m_textIterator.atEnd());
   1549         m_textIterator.advance();
   1550     }
   1551     m_didLookAhead = false;
   1552 
   1553     // Go to next non-empty chunk
   1554     while (!m_textIterator.atEnd() && m_textIterator.length() == 0)
   1555         m_textIterator.advance();
   1556     m_range = m_textIterator.range();
   1557 
   1558     if (m_textIterator.atEnd())
   1559         return;
   1560 
   1561     while (1) {
   1562         // If this chunk ends in whitespace we can just use it as our chunk.
   1563         if (isSpaceOrNewline(m_textIterator.characters()[m_textIterator.length() - 1]))
   1564             return;
   1565 
   1566         // If this is the first chunk that failed, save it in previousText before look ahead
   1567         if (m_buffer.isEmpty()) {
   1568             m_previousText = m_textIterator.characters();
   1569             m_previousLength = m_textIterator.length();
   1570         }
   1571 
   1572         // Look ahead to next chunk.  If it is whitespace or a break, we can use the previous stuff
   1573         m_textIterator.advance();
   1574         if (m_textIterator.atEnd() || m_textIterator.length() == 0 || isSpaceOrNewline(m_textIterator.characters()[0])) {
   1575             m_didLookAhead = true;
   1576             return;
   1577         }
   1578 
   1579         if (m_buffer.isEmpty()) {
   1580             // Start gobbling chunks until we get to a suitable stopping point
   1581             m_buffer.append(m_previousText, m_previousLength);
   1582             m_previousText = 0;
   1583         }
   1584         m_buffer.append(m_textIterator.characters(), m_textIterator.length());
   1585         int exception = 0;
   1586         m_range->setEnd(m_textIterator.range()->endContainer(), m_textIterator.range()->endOffset(), exception);
   1587     }
   1588 }
   1589 
   1590 int WordAwareIterator::length() const
   1591 {
   1592     if (!m_buffer.isEmpty())
   1593         return m_buffer.size();
   1594     if (m_previousText)
   1595         return m_previousLength;
   1596     return m_textIterator.length();
   1597 }
   1598 
   1599 const UChar* WordAwareIterator::characters() const
   1600 {
   1601     if (!m_buffer.isEmpty())
   1602         return m_buffer.data();
   1603     if (m_previousText)
   1604         return m_previousText;
   1605     return m_textIterator.characters();
   1606 }
   1607 
   1608 // --------
   1609 
   1610 static inline UChar foldQuoteMarkOrSoftHyphen(UChar c)
   1611 {
   1612     switch (c) {
   1613         case hebrewPunctuationGershayim:
   1614         case leftDoubleQuotationMark:
   1615         case rightDoubleQuotationMark:
   1616             return '"';
   1617         case hebrewPunctuationGeresh:
   1618         case leftSingleQuotationMark:
   1619         case rightSingleQuotationMark:
   1620             return '\'';
   1621         case softHyphen:
   1622             // Replace soft hyphen with an ignorable character so that their presence or absence will
   1623             // not affect string comparison.
   1624             return 0;
   1625         default:
   1626             return c;
   1627     }
   1628 }
   1629 
   1630 static inline void foldQuoteMarksAndSoftHyphens(String& s)
   1631 {
   1632     s.replace(hebrewPunctuationGeresh, '\'');
   1633     s.replace(hebrewPunctuationGershayim, '"');
   1634     s.replace(leftDoubleQuotationMark, '"');
   1635     s.replace(leftSingleQuotationMark, '\'');
   1636     s.replace(rightDoubleQuotationMark, '"');
   1637     s.replace(rightSingleQuotationMark, '\'');
   1638     // Replace soft hyphen with an ignorable character so that their presence or absence will
   1639     // not affect string comparison.
   1640     s.replace(softHyphen, 0);
   1641 }
   1642 
   1643 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
   1644 
   1645 static inline void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length)
   1646 {
   1647     for (size_t i = 0; i < length; ++i)
   1648         data[i] = foldQuoteMarkOrSoftHyphen(data[i]);
   1649 }
   1650 
   1651 static const size_t minimumSearchBufferSize = 8192;
   1652 
   1653 #ifndef NDEBUG
   1654 static bool searcherInUse;
   1655 #endif
   1656 
   1657 static UStringSearch* createSearcher()
   1658 {
   1659     // Provide a non-empty pattern and non-empty text so usearch_open will not fail,
   1660     // but it doesn't matter exactly what it is, since we don't perform any searches
   1661     // without setting both the pattern and the text.
   1662     UErrorCode status = U_ZERO_ERROR;
   1663     UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, currentSearchLocaleID(), 0, &status);
   1664     ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING);
   1665     return searcher;
   1666 }
   1667 
   1668 static UStringSearch* searcher()
   1669 {
   1670     static UStringSearch* searcher = createSearcher();
   1671     return searcher;
   1672 }
   1673 
   1674 static inline void lockSearcher()
   1675 {
   1676 #ifndef NDEBUG
   1677     ASSERT(!searcherInUse);
   1678     searcherInUse = true;
   1679 #endif
   1680 }
   1681 
   1682 static inline void unlockSearcher()
   1683 {
   1684 #ifndef NDEBUG
   1685     ASSERT(searcherInUse);
   1686     searcherInUse = false;
   1687 #endif
   1688 }
   1689 
   1690 // ICU's search ignores the distinction between small kana letters and ones
   1691 // that are not small, and also characters that differ only in the voicing
   1692 // marks when considering only primary collation strength diffrences.
   1693 // This is not helpful for end users, since these differences make words
   1694 // distinct, so for our purposes we need these to be considered.
   1695 // The Unicode folks do not think the collation algorithm should be
   1696 // changed. To work around this, we would like to tailor the ICU searcher,
   1697 // but we can't get that to work yet. So instead, we check for cases where
   1698 // these differences occur, and skip those matches.
   1699 
   1700 // We refer to the above technique as the "kana workaround". The next few
   1701 // functions are helper functinos for the kana workaround.
   1702 
   1703 static inline bool isKanaLetter(UChar character)
   1704 {
   1705     // Hiragana letters.
   1706     if (character >= 0x3041 && character <= 0x3096)
   1707         return true;
   1708 
   1709     // Katakana letters.
   1710     if (character >= 0x30A1 && character <= 0x30FA)
   1711         return true;
   1712     if (character >= 0x31F0 && character <= 0x31FF)
   1713         return true;
   1714 
   1715     // Halfwidth katakana letters.
   1716     if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70)
   1717         return true;
   1718 
   1719     return false;
   1720 }
   1721 
   1722 static inline bool isSmallKanaLetter(UChar character)
   1723 {
   1724     ASSERT(isKanaLetter(character));
   1725 
   1726     switch (character) {
   1727     case 0x3041: // HIRAGANA LETTER SMALL A
   1728     case 0x3043: // HIRAGANA LETTER SMALL I
   1729     case 0x3045: // HIRAGANA LETTER SMALL U
   1730     case 0x3047: // HIRAGANA LETTER SMALL E
   1731     case 0x3049: // HIRAGANA LETTER SMALL O
   1732     case 0x3063: // HIRAGANA LETTER SMALL TU
   1733     case 0x3083: // HIRAGANA LETTER SMALL YA
   1734     case 0x3085: // HIRAGANA LETTER SMALL YU
   1735     case 0x3087: // HIRAGANA LETTER SMALL YO
   1736     case 0x308E: // HIRAGANA LETTER SMALL WA
   1737     case 0x3095: // HIRAGANA LETTER SMALL KA
   1738     case 0x3096: // HIRAGANA LETTER SMALL KE
   1739     case 0x30A1: // KATAKANA LETTER SMALL A
   1740     case 0x30A3: // KATAKANA LETTER SMALL I
   1741     case 0x30A5: // KATAKANA LETTER SMALL U
   1742     case 0x30A7: // KATAKANA LETTER SMALL E
   1743     case 0x30A9: // KATAKANA LETTER SMALL O
   1744     case 0x30C3: // KATAKANA LETTER SMALL TU
   1745     case 0x30E3: // KATAKANA LETTER SMALL YA
   1746     case 0x30E5: // KATAKANA LETTER SMALL YU
   1747     case 0x30E7: // KATAKANA LETTER SMALL YO
   1748     case 0x30EE: // KATAKANA LETTER SMALL WA
   1749     case 0x30F5: // KATAKANA LETTER SMALL KA
   1750     case 0x30F6: // KATAKANA LETTER SMALL KE
   1751     case 0x31F0: // KATAKANA LETTER SMALL KU
   1752     case 0x31F1: // KATAKANA LETTER SMALL SI
   1753     case 0x31F2: // KATAKANA LETTER SMALL SU
   1754     case 0x31F3: // KATAKANA LETTER SMALL TO
   1755     case 0x31F4: // KATAKANA LETTER SMALL NU
   1756     case 0x31F5: // KATAKANA LETTER SMALL HA
   1757     case 0x31F6: // KATAKANA LETTER SMALL HI
   1758     case 0x31F7: // KATAKANA LETTER SMALL HU
   1759     case 0x31F8: // KATAKANA LETTER SMALL HE
   1760     case 0x31F9: // KATAKANA LETTER SMALL HO
   1761     case 0x31FA: // KATAKANA LETTER SMALL MU
   1762     case 0x31FB: // KATAKANA LETTER SMALL RA
   1763     case 0x31FC: // KATAKANA LETTER SMALL RI
   1764     case 0x31FD: // KATAKANA LETTER SMALL RU
   1765     case 0x31FE: // KATAKANA LETTER SMALL RE
   1766     case 0x31FF: // KATAKANA LETTER SMALL RO
   1767     case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A
   1768     case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I
   1769     case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U
   1770     case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E
   1771     case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O
   1772     case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA
   1773     case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU
   1774     case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO
   1775     case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU
   1776         return true;
   1777     }
   1778     return false;
   1779 }
   1780 
   1781 enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark };
   1782 
   1783 static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character)
   1784 {
   1785     ASSERT(isKanaLetter(character));
   1786 
   1787     switch (character) {
   1788     case 0x304C: // HIRAGANA LETTER GA
   1789     case 0x304E: // HIRAGANA LETTER GI
   1790     case 0x3050: // HIRAGANA LETTER GU
   1791     case 0x3052: // HIRAGANA LETTER GE
   1792     case 0x3054: // HIRAGANA LETTER GO
   1793     case 0x3056: // HIRAGANA LETTER ZA
   1794     case 0x3058: // HIRAGANA LETTER ZI
   1795     case 0x305A: // HIRAGANA LETTER ZU
   1796     case 0x305C: // HIRAGANA LETTER ZE
   1797     case 0x305E: // HIRAGANA LETTER ZO
   1798     case 0x3060: // HIRAGANA LETTER DA
   1799     case 0x3062: // HIRAGANA LETTER DI
   1800     case 0x3065: // HIRAGANA LETTER DU
   1801     case 0x3067: // HIRAGANA LETTER DE
   1802     case 0x3069: // HIRAGANA LETTER DO
   1803     case 0x3070: // HIRAGANA LETTER BA
   1804     case 0x3073: // HIRAGANA LETTER BI
   1805     case 0x3076: // HIRAGANA LETTER BU
   1806     case 0x3079: // HIRAGANA LETTER BE
   1807     case 0x307C: // HIRAGANA LETTER BO
   1808     case 0x3094: // HIRAGANA LETTER VU
   1809     case 0x30AC: // KATAKANA LETTER GA
   1810     case 0x30AE: // KATAKANA LETTER GI
   1811     case 0x30B0: // KATAKANA LETTER GU
   1812     case 0x30B2: // KATAKANA LETTER GE
   1813     case 0x30B4: // KATAKANA LETTER GO
   1814     case 0x30B6: // KATAKANA LETTER ZA
   1815     case 0x30B8: // KATAKANA LETTER ZI
   1816     case 0x30BA: // KATAKANA LETTER ZU
   1817     case 0x30BC: // KATAKANA LETTER ZE
   1818     case 0x30BE: // KATAKANA LETTER ZO
   1819     case 0x30C0: // KATAKANA LETTER DA
   1820     case 0x30C2: // KATAKANA LETTER DI
   1821     case 0x30C5: // KATAKANA LETTER DU
   1822     case 0x30C7: // KATAKANA LETTER DE
   1823     case 0x30C9: // KATAKANA LETTER DO
   1824     case 0x30D0: // KATAKANA LETTER BA
   1825     case 0x30D3: // KATAKANA LETTER BI
   1826     case 0x30D6: // KATAKANA LETTER BU
   1827     case 0x30D9: // KATAKANA LETTER BE
   1828     case 0x30DC: // KATAKANA LETTER BO
   1829     case 0x30F4: // KATAKANA LETTER VU
   1830     case 0x30F7: // KATAKANA LETTER VA
   1831     case 0x30F8: // KATAKANA LETTER VI
   1832     case 0x30F9: // KATAKANA LETTER VE
   1833     case 0x30FA: // KATAKANA LETTER VO
   1834         return VoicedSoundMark;
   1835     case 0x3071: // HIRAGANA LETTER PA
   1836     case 0x3074: // HIRAGANA LETTER PI
   1837     case 0x3077: // HIRAGANA LETTER PU
   1838     case 0x307A: // HIRAGANA LETTER PE
   1839     case 0x307D: // HIRAGANA LETTER PO
   1840     case 0x30D1: // KATAKANA LETTER PA
   1841     case 0x30D4: // KATAKANA LETTER PI
   1842     case 0x30D7: // KATAKANA LETTER PU
   1843     case 0x30DA: // KATAKANA LETTER PE
   1844     case 0x30DD: // KATAKANA LETTER PO
   1845         return SemiVoicedSoundMark;
   1846     }
   1847     return NoVoicedSoundMark;
   1848 }
   1849 
   1850 static inline bool isCombiningVoicedSoundMark(UChar character)
   1851 {
   1852     switch (character) {
   1853     case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
   1854     case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
   1855         return true;
   1856     }
   1857     return false;
   1858 }
   1859 
   1860 static inline bool containsKanaLetters(const String& pattern)
   1861 {
   1862     const UChar* characters = pattern.characters();
   1863     unsigned length = pattern.length();
   1864     for (unsigned i = 0; i < length; ++i) {
   1865         if (isKanaLetter(characters[i]))
   1866             return true;
   1867     }
   1868     return false;
   1869 }
   1870 
   1871 static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer)
   1872 {
   1873     ASSERT(length);
   1874 
   1875     buffer.resize(length);
   1876 
   1877     UErrorCode status = U_ZERO_ERROR;
   1878     size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status);
   1879     ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR);
   1880     ASSERT(bufferSize);
   1881 
   1882     buffer.resize(bufferSize);
   1883 
   1884     if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING)
   1885         return;
   1886 
   1887     status = U_ZERO_ERROR;
   1888     unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status);
   1889     ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
   1890 }
   1891 
   1892 static bool isNonLatin1Separator(UChar32 character)
   1893 {
   1894     ASSERT_ARG(character, character >= 256);
   1895 
   1896     return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK);
   1897 }
   1898 
   1899 static inline bool isSeparator(UChar32 character)
   1900 {
   1901     static const bool latin1SeparatorTable[256] = {
   1902         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1903         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1904         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . /
   1905         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, //                         : ; < = > ?
   1906         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   @
   1907         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, //                         [ \ ] ^ _
   1908         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   `
   1909         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, //                           { | } ~
   1910         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1911         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1912         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
   1913         1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
   1914         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1915         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
   1916         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1917         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
   1918     };
   1919 
   1920     if (character < 256)
   1921         return latin1SeparatorTable[character];
   1922 
   1923     return isNonLatin1Separator(character);
   1924 }
   1925 
   1926 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
   1927     : m_target(target)
   1928     , m_options(options)
   1929     , m_prefixLength(0)
   1930     , m_atBreak(true)
   1931     , m_needsMoreContext(options & AtWordStarts)
   1932     , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target))
   1933 {
   1934     ASSERT(!m_target.isEmpty());
   1935 
   1936     // FIXME: We'd like to tailor the searcher to fold quote marks for us instead
   1937     // of doing it in a separate replacement pass here, but ICU doesn't offer a way
   1938     // to add tailoring on top of the locale-specific tailoring as of this writing.
   1939     foldQuoteMarksAndSoftHyphens(m_target);
   1940 
   1941     size_t targetLength = m_target.length();
   1942     m_buffer.reserveInitialCapacity(max(targetLength * 8, minimumSearchBufferSize));
   1943     m_overlap = m_buffer.capacity() / 4;
   1944 
   1945     if ((m_options & AtWordStarts) && targetLength) {
   1946         UChar32 targetFirstCharacter;
   1947         U16_GET(m_target.characters(), 0, 0, targetLength, targetFirstCharacter);
   1948         // Characters in the separator category never really occur at the beginning of a word,
   1949         // so if the target begins with such a character, we just ignore the AtWordStart option.
   1950         if (isSeparator(targetFirstCharacter)) {
   1951             m_options &= ~AtWordStarts;
   1952             m_needsMoreContext = false;
   1953         }
   1954     }
   1955 
   1956     // Grab the single global searcher.
   1957     // If we ever have a reason to do more than once search buffer at once, we'll have
   1958     // to move to multiple searchers.
   1959     lockSearcher();
   1960 
   1961     UStringSearch* searcher = WebCore::searcher();
   1962     UCollator* collator = usearch_getCollator(searcher);
   1963 
   1964     UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY;
   1965     if (ucol_getStrength(collator) != strength) {
   1966         ucol_setStrength(collator, strength);
   1967         usearch_reset(searcher);
   1968     }
   1969 
   1970     UErrorCode status = U_ZERO_ERROR;
   1971     usearch_setPattern(searcher, m_target.characters(), targetLength, &status);
   1972     ASSERT(status == U_ZERO_ERROR);
   1973 
   1974     // The kana workaround requires a normalized copy of the target string.
   1975     if (m_targetRequiresKanaWorkaround)
   1976         normalizeCharacters(m_target.characters(), m_target.length(), m_normalizedTarget);
   1977 }
   1978 
   1979 inline SearchBuffer::~SearchBuffer()
   1980 {
   1981     // Leave the static object pointing to a valid string.
   1982     UErrorCode status = U_ZERO_ERROR;
   1983     usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status);
   1984     ASSERT(status == U_ZERO_ERROR);
   1985 
   1986     unlockSearcher();
   1987 }
   1988 
   1989 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
   1990 {
   1991     ASSERT(length);
   1992 
   1993     if (m_atBreak) {
   1994         m_buffer.shrink(0);
   1995         m_prefixLength = 0;
   1996         m_atBreak = false;
   1997     } else if (m_buffer.size() == m_buffer.capacity()) {
   1998         memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
   1999         m_prefixLength -= min(m_prefixLength, m_buffer.size() - m_overlap);
   2000         m_buffer.shrink(m_overlap);
   2001     }
   2002 
   2003     size_t oldLength = m_buffer.size();
   2004     size_t usableLength = min(m_buffer.capacity() - oldLength, length);
   2005     ASSERT(usableLength);
   2006     m_buffer.append(characters, usableLength);
   2007     foldQuoteMarksAndSoftHyphens(m_buffer.data() + oldLength, usableLength);
   2008     return usableLength;
   2009 }
   2010 
   2011 inline bool SearchBuffer::needsMoreContext() const
   2012 {
   2013     return m_needsMoreContext;
   2014 }
   2015 
   2016 inline void SearchBuffer::prependContext(const UChar* characters, size_t length)
   2017 {
   2018     ASSERT(m_needsMoreContext);
   2019     ASSERT(m_prefixLength == m_buffer.size());
   2020 
   2021     if (!length)
   2022         return;
   2023 
   2024     m_atBreak = false;
   2025 
   2026     size_t wordBoundaryContextStart = length;
   2027     if (wordBoundaryContextStart) {
   2028         U16_BACK_1(characters, 0, wordBoundaryContextStart);
   2029         wordBoundaryContextStart = startOfLastWordBoundaryContext(characters, wordBoundaryContextStart);
   2030     }
   2031 
   2032     size_t usableLength = min(m_buffer.capacity() - m_prefixLength, length - wordBoundaryContextStart);
   2033     m_buffer.prepend(characters + length - usableLength, usableLength);
   2034     m_prefixLength += usableLength;
   2035 
   2036     if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity())
   2037         m_needsMoreContext = false;
   2038 }
   2039 
   2040 inline bool SearchBuffer::atBreak() const
   2041 {
   2042     return m_atBreak;
   2043 }
   2044 
   2045 inline void SearchBuffer::reachedBreak()
   2046 {
   2047     m_atBreak = true;
   2048 }
   2049 
   2050 inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const
   2051 {
   2052     // This function implements the kana workaround. If usearch treats
   2053     // it as a match, but we do not want to, then it's a "bad match".
   2054     if (!m_targetRequiresKanaWorkaround)
   2055         return false;
   2056 
   2057     // Normalize into a match buffer. We reuse a single buffer rather than
   2058     // creating a new one each time.
   2059     normalizeCharacters(match, matchLength, m_normalizedMatch);
   2060 
   2061     const UChar* a = m_normalizedTarget.begin();
   2062     const UChar* aEnd = m_normalizedTarget.end();
   2063 
   2064     const UChar* b = m_normalizedMatch.begin();
   2065     const UChar* bEnd = m_normalizedMatch.end();
   2066 
   2067     while (true) {
   2068         // Skip runs of non-kana-letter characters. This is necessary so we can
   2069         // correctly handle strings where the target and match have different-length
   2070         // runs of characters that match, while still double checking the correctness
   2071         // of matches of kana letters with other kana letters.
   2072         while (a != aEnd && !isKanaLetter(*a))
   2073             ++a;
   2074         while (b != bEnd && !isKanaLetter(*b))
   2075             ++b;
   2076 
   2077         // If we reached the end of either the target or the match, we should have
   2078         // reached the end of both; both should have the same number of kana letters.
   2079         if (a == aEnd || b == bEnd) {
   2080             ASSERT(a == aEnd);
   2081             ASSERT(b == bEnd);
   2082             return false;
   2083         }
   2084 
   2085         // Check for differences in the kana letter character itself.
   2086         if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b))
   2087             return true;
   2088         if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b))
   2089             return true;
   2090         ++a;
   2091         ++b;
   2092 
   2093         // Check for differences in combining voiced sound marks found after the letter.
   2094         while (1) {
   2095             if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) {
   2096                 if (b != bEnd && isCombiningVoicedSoundMark(*b))
   2097                     return true;
   2098                 break;
   2099             }
   2100             if (!(b != bEnd && isCombiningVoicedSoundMark(*b)))
   2101                 return true;
   2102             if (*a != *b)
   2103                 return true;
   2104             ++a;
   2105             ++b;
   2106         }
   2107     }
   2108 }
   2109 
   2110 inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
   2111 {
   2112     ASSERT(m_options & AtWordStarts);
   2113 
   2114     if (!start)
   2115         return true;
   2116 
   2117     if (m_options & TreatMedialCapitalAsWordStart) {
   2118         int size = m_buffer.size();
   2119         int offset = start;
   2120         UChar32 firstCharacter;
   2121         U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);
   2122         UChar32 previousCharacter;
   2123         U16_PREV(m_buffer.data(), 0, offset, previousCharacter);
   2124 
   2125         if (isSeparator(firstCharacter)) {
   2126             // The start of a separator run is a word start (".org" in "webkit.org").
   2127             if (!isSeparator(previousCharacter))
   2128                 return true;
   2129         } else if (isASCIIUpper(firstCharacter)) {
   2130             // The start of an uppercase run is a word start ("Kit" in "WebKit").
   2131             if (!isASCIIUpper(previousCharacter))
   2132                 return true;
   2133             // The last character of an uppercase run followed by a non-separator, non-digit
   2134             // is a word start ("Request" in "XMLHTTPRequest").
   2135             offset = start;
   2136             U16_FWD_1(m_buffer.data(), offset, size);
   2137             UChar32 nextCharacter = 0;
   2138             if (offset < size)
   2139                 U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
   2140             if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
   2141                 return true;
   2142         } else if (isASCIIDigit(firstCharacter)) {
   2143             // The start of a digit run is a word start ("2" in "WebKit2").
   2144             if (!isASCIIDigit(previousCharacter))
   2145                 return true;
   2146         } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) {
   2147             // The start of a non-separator, non-uppercase, non-digit run is a word start,
   2148             // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
   2149             return true;
   2150         }
   2151     }
   2152 
   2153     size_t wordBreakSearchStart = start + length;
   2154     while (wordBreakSearchStart > start)
   2155         wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */);
   2156     return wordBreakSearchStart == start;
   2157 }
   2158 
   2159 inline size_t SearchBuffer::search(size_t& start)
   2160 {
   2161     size_t size = m_buffer.size();
   2162     if (m_atBreak) {
   2163         if (!size)
   2164             return 0;
   2165     } else {
   2166         if (size != m_buffer.capacity())
   2167             return 0;
   2168     }
   2169 
   2170     UStringSearch* searcher = WebCore::searcher();
   2171 
   2172     UErrorCode status = U_ZERO_ERROR;
   2173     usearch_setText(searcher, m_buffer.data(), size, &status);
   2174     ASSERT(status == U_ZERO_ERROR);
   2175 
   2176     usearch_setOffset(searcher, m_prefixLength, &status);
   2177     ASSERT(status == U_ZERO_ERROR);
   2178 
   2179     int matchStart = usearch_next(searcher, &status);
   2180     ASSERT(status == U_ZERO_ERROR);
   2181 
   2182 nextMatch:
   2183     if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) {
   2184         ASSERT(matchStart == USEARCH_DONE);
   2185         return 0;
   2186     }
   2187 
   2188     // Matches that start in the overlap area are only tentative.
   2189     // The same match may appear later, matching more characters,
   2190     // possibly including a combining character that's not yet in the buffer.
   2191     if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
   2192         size_t overlap = m_overlap;
   2193         if (m_options & AtWordStarts) {
   2194             // Ensure that there is sufficient context before matchStart the next time around for
   2195             // determining if it is at a word boundary.
   2196             int wordBoundaryContextStart = matchStart;
   2197             U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart);
   2198             wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart);
   2199             overlap = min(size - 1, max(overlap, size - wordBoundaryContextStart));
   2200         }
   2201         memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar));
   2202         m_prefixLength -= min(m_prefixLength, size - overlap);
   2203         m_buffer.shrink(overlap);
   2204         return 0;
   2205     }
   2206 
   2207     size_t matchedLength = usearch_getMatchedLength(searcher);
   2208     ASSERT(matchStart + matchedLength <= size);
   2209 
   2210     // If this match is "bad", move on to the next match.
   2211     if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) {
   2212         matchStart = usearch_next(searcher, &status);
   2213         ASSERT(status == U_ZERO_ERROR);
   2214         goto nextMatch;
   2215     }
   2216 
   2217     size_t newSize = size - (matchStart + 1);
   2218     memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar));
   2219     m_prefixLength -= min<size_t>(m_prefixLength, matchStart + 1);
   2220     m_buffer.shrink(newSize);
   2221 
   2222     start = size - matchStart;
   2223     return matchedLength;
   2224 }
   2225 
   2226 #else // !ICU_UNICODE
   2227 
   2228 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
   2229     : m_target(options & CaseInsensitive ? target.foldCase() : target)
   2230     , m_options(options)
   2231     , m_buffer(m_target.length())
   2232     , m_isCharacterStartBuffer(m_target.length())
   2233     , m_isBufferFull(false)
   2234     , m_cursor(0)
   2235 {
   2236     ASSERT(!m_target.isEmpty());
   2237     m_target.replace(noBreakSpace, ' ');
   2238     foldQuoteMarksAndSoftHyphens(m_target);
   2239 }
   2240 
   2241 inline SearchBuffer::~SearchBuffer()
   2242 {
   2243 }
   2244 
   2245 inline void SearchBuffer::reachedBreak()
   2246 {
   2247     m_cursor = 0;
   2248     m_isBufferFull = false;
   2249 }
   2250 
   2251 inline bool SearchBuffer::atBreak() const
   2252 {
   2253     return !m_cursor && !m_isBufferFull;
   2254 }
   2255 
   2256 inline void SearchBuffer::append(UChar c, bool isStart)
   2257 {
   2258     m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMarkOrSoftHyphen(c);
   2259     m_isCharacterStartBuffer[m_cursor] = isStart;
   2260     if (++m_cursor == m_target.length()) {
   2261         m_cursor = 0;
   2262         m_isBufferFull = true;
   2263     }
   2264 }
   2265 
   2266 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
   2267 {
   2268     ASSERT(length);
   2269     if (!(m_options & CaseInsensitive)) {
   2270         append(characters[0], true);
   2271         return 1;
   2272     }
   2273     const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough
   2274     UChar foldedCharacters[maxFoldedCharacters];
   2275     bool error;
   2276     int numFoldedCharacters = foldCase(foldedCharacters, maxFoldedCharacters, characters, 1, &error);
   2277     ASSERT(!error);
   2278     ASSERT(numFoldedCharacters);
   2279     ASSERT(numFoldedCharacters <= maxFoldedCharacters);
   2280     if (!error && numFoldedCharacters) {
   2281         numFoldedCharacters = min(numFoldedCharacters, maxFoldedCharacters);
   2282         append(foldedCharacters[0], true);
   2283         for (int i = 1; i < numFoldedCharacters; ++i)
   2284             append(foldedCharacters[i], false);
   2285     }
   2286     return 1;
   2287 }
   2288 
   2289 inline bool SearchBuffer::needsMoreContext() const
   2290 {
   2291     return false;
   2292 }
   2293 
   2294 void SearchBuffer::prependContext(const UChar*, size_t)
   2295 {
   2296     ASSERT_NOT_REACHED();
   2297 }
   2298 
   2299 inline size_t SearchBuffer::search(size_t& start)
   2300 {
   2301     if (!m_isBufferFull)
   2302         return 0;
   2303     if (!m_isCharacterStartBuffer[m_cursor])
   2304         return 0;
   2305 
   2306     size_t tailSpace = m_target.length() - m_cursor;
   2307     if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0)
   2308         return 0;
   2309     if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0)
   2310         return 0;
   2311 
   2312     start = length();
   2313 
   2314     // Now that we've found a match once, we don't want to find it again, because those
   2315     // are the SearchBuffer semantics, allowing for a buffer where you append more than one
   2316     // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if
   2317     // we want to get rid of that in the future we could track this with a separate boolean
   2318     // or even move the characters to the start of the buffer and set m_isBufferFull to false.
   2319     m_isCharacterStartBuffer[m_cursor] = false;
   2320 
   2321     return start;
   2322 }
   2323 
   2324 // Returns the number of characters that were appended to the buffer (what we are searching in).
   2325 // That's not necessarily the same length as the passed-in target string, because case folding
   2326 // can make two strings match even though they're not the same length.
   2327 size_t SearchBuffer::length() const
   2328 {
   2329     size_t bufferSize = m_target.length();
   2330     size_t length = 0;
   2331     for (size_t i = 0; i < bufferSize; ++i)
   2332         length += m_isCharacterStartBuffer[i];
   2333     return length;
   2334 }
   2335 
   2336 #endif // !ICU_UNICODE
   2337 
   2338 // --------
   2339 
   2340 int TextIterator::rangeLength(const Range* r, bool forSelectionPreservation)
   2341 {
   2342     int length = 0;
   2343     for (TextIterator it(r, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance())
   2344         length += it.length();
   2345 
   2346     return length;
   2347 }
   2348 
   2349 PassRefPtr<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount)
   2350 {
   2351     CharacterIterator entireRangeIterator(entireRange);
   2352     return characterSubrange(entireRangeIterator, characterOffset, characterCount);
   2353 }
   2354 
   2355 PassRefPtr<Range> TextIterator::rangeFromLocationAndLength(Element* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation)
   2356 {
   2357     RefPtr<Range> resultRange = scope->document()->createRange();
   2358 
   2359     int docTextPosition = 0;
   2360     int rangeEnd = rangeLocation + rangeLength;
   2361     bool startRangeFound = false;
   2362 
   2363     RefPtr<Range> textRunRange;
   2364 
   2365     TextIterator it(rangeOfContents(scope).get(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior);
   2366 
   2367     // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>.
   2368     if (rangeLocation == 0 && rangeLength == 0 && it.atEnd()) {
   2369         textRunRange = it.range();
   2370 
   2371         ExceptionCode ec = 0;
   2372         resultRange->setStart(textRunRange->startContainer(), 0, ec);
   2373         ASSERT(!ec);
   2374         resultRange->setEnd(textRunRange->startContainer(), 0, ec);
   2375         ASSERT(!ec);
   2376 
   2377         return resultRange.release();
   2378     }
   2379 
   2380     for (; !it.atEnd(); it.advance()) {
   2381         int len = it.length();
   2382         textRunRange = it.range();
   2383 
   2384         bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + len;
   2385         bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + len;
   2386 
   2387         // Fix textRunRange->endPosition(), but only if foundStart || foundEnd, because it is only
   2388         // in those cases that textRunRange is used.
   2389         if (foundEnd) {
   2390             // FIXME: This is a workaround for the fact that the end of a run is often at the wrong
   2391             // position for emitted '\n's.
   2392             if (len == 1 && it.characters()[0] == '\n') {
   2393                 scope->document()->updateLayoutIgnorePendingStylesheets();
   2394                 it.advance();
   2395                 if (!it.atEnd()) {
   2396                     RefPtr<Range> range = it.range();
   2397                     ExceptionCode ec = 0;
   2398                     textRunRange->setEnd(range->startContainer(), range->startOffset(), ec);
   2399                     ASSERT(!ec);
   2400                 } else {
   2401                     Position runStart = textRunRange->startPosition();
   2402                     Position runEnd = VisiblePosition(runStart).next().deepEquivalent();
   2403                     if (runEnd.isNotNull()) {
   2404                         ExceptionCode ec = 0;
   2405                         textRunRange->setEnd(runEnd.containerNode(), runEnd.computeOffsetInContainerNode(), ec);
   2406                         ASSERT(!ec);
   2407                     }
   2408                 }
   2409             }
   2410         }
   2411 
   2412         if (foundStart) {
   2413             startRangeFound = true;
   2414             int exception = 0;
   2415             if (textRunRange->startContainer()->isTextNode()) {
   2416                 int offset = rangeLocation - docTextPosition;
   2417                 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
   2418             } else {
   2419                 if (rangeLocation == docTextPosition)
   2420                     resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset(), exception);
   2421                 else
   2422                     resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset(), exception);
   2423             }
   2424         }
   2425 
   2426         if (foundEnd) {
   2427             int exception = 0;
   2428             if (textRunRange->startContainer()->isTextNode()) {
   2429                 int offset = rangeEnd - docTextPosition;
   2430                 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
   2431             } else {
   2432                 if (rangeEnd == docTextPosition)
   2433                     resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset(), exception);
   2434                 else
   2435                     resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
   2436             }
   2437             docTextPosition += len;
   2438             break;
   2439         }
   2440         docTextPosition += len;
   2441     }
   2442 
   2443     if (!startRangeFound)
   2444         return 0;
   2445 
   2446     if (rangeLength != 0 && rangeEnd > docTextPosition) { // rangeEnd is out of bounds
   2447         int exception = 0;
   2448         resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
   2449     }
   2450 
   2451     return resultRange.release();
   2452 }
   2453 
   2454 bool TextIterator::locationAndLengthFromRange(const Range* range, size_t& location, size_t& length)
   2455 {
   2456     location = notFound;
   2457     length = 0;
   2458 
   2459     if (!range->startContainer())
   2460         return false;
   2461 
   2462     Element* selectionRoot = range->ownerDocument()->frame()->selection()->rootEditableElement();
   2463     Element* scope = selectionRoot ? selectionRoot : range->ownerDocument()->documentElement();
   2464 
   2465     // The critical assumption is that this only gets called with ranges that
   2466     // concentrate on a given area containing the selection root. This is done
   2467     // because of text fields and textareas. The DOM for those is not
   2468     // directly in the document DOM, so ensure that the range does not cross a
   2469     // boundary of one of those.
   2470     if (range->startContainer() != scope && !range->startContainer()->isDescendantOf(scope))
   2471         return false;
   2472     if (range->endContainer() != scope && !range->endContainer()->isDescendantOf(scope))
   2473         return false;
   2474 
   2475     RefPtr<Range> testRange = Range::create(scope->document(), scope, 0, range->startContainer(), range->startOffset());
   2476     ASSERT(testRange->startContainer() == scope);
   2477     location = TextIterator::rangeLength(testRange.get());
   2478 
   2479     ExceptionCode ec;
   2480     testRange->setEnd(range->endContainer(), range->endOffset(), ec);
   2481     ASSERT(testRange->startContainer() == scope);
   2482     length = TextIterator::rangeLength(testRange.get()) - location;
   2483     return true;
   2484 }
   2485 
   2486 // --------
   2487 
   2488 UChar* plainTextToMallocAllocatedBuffer(const Range* r, unsigned& bufferLength, bool isDisplayString, TextIteratorBehavior defaultBehavior)
   2489 {
   2490     UChar* result = 0;
   2491 
   2492     // Do this in pieces to avoid massive reallocations if there is a large amount of text.
   2493     // Use system malloc for buffers since they can consume lots of memory and current TCMalloc is unable return it back to OS.
   2494     static const unsigned cMaxSegmentSize = 1 << 16;
   2495     bufferLength = 0;
   2496     typedef pair<UChar*, unsigned> TextSegment;
   2497     OwnPtr<Vector<TextSegment> > textSegments;
   2498     Vector<UChar> textBuffer;
   2499     textBuffer.reserveInitialCapacity(cMaxSegmentSize);
   2500     TextIteratorBehavior behavior = defaultBehavior;
   2501     if (!isDisplayString)
   2502         behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding);
   2503 
   2504     for (TextIterator it(r, behavior); !it.atEnd(); it.advance()) {
   2505         if (textBuffer.size() && textBuffer.size() + it.length() > cMaxSegmentSize) {
   2506             UChar* newSegmentBuffer = static_cast<UChar*>(malloc(textBuffer.size() * sizeof(UChar)));
   2507             if (!newSegmentBuffer)
   2508                 goto exit;
   2509             memcpy(newSegmentBuffer, textBuffer.data(), textBuffer.size() * sizeof(UChar));
   2510             if (!textSegments)
   2511                 textSegments = adoptPtr(new Vector<TextSegment>);
   2512             textSegments->append(make_pair(newSegmentBuffer, (unsigned)textBuffer.size()));
   2513             textBuffer.clear();
   2514         }
   2515         textBuffer.append(it.characters(), it.length());
   2516         bufferLength += it.length();
   2517     }
   2518 
   2519     if (!bufferLength)
   2520         return 0;
   2521 
   2522     // Since we know the size now, we can make a single buffer out of the pieces with one big alloc
   2523     result = static_cast<UChar*>(malloc(bufferLength * sizeof(UChar)));
   2524     if (!result)
   2525         goto exit;
   2526 
   2527     {
   2528         UChar* resultPos = result;
   2529         if (textSegments) {
   2530             unsigned size = textSegments->size();
   2531             for (unsigned i = 0; i < size; ++i) {
   2532                 const TextSegment& segment = textSegments->at(i);
   2533                 memcpy(resultPos, segment.first, segment.second * sizeof(UChar));
   2534                 resultPos += segment.second;
   2535             }
   2536         }
   2537         memcpy(resultPos, textBuffer.data(), textBuffer.size() * sizeof(UChar));
   2538     }
   2539 
   2540 exit:
   2541     if (textSegments) {
   2542         unsigned size = textSegments->size();
   2543         for (unsigned i = 0; i < size; ++i)
   2544             free(textSegments->at(i).first);
   2545     }
   2546 
   2547     if (isDisplayString && r->ownerDocument())
   2548         r->ownerDocument()->displayBufferModifiedByEncoding(result, bufferLength);
   2549 
   2550     return result;
   2551 }
   2552 
   2553 String plainText(const Range* r, TextIteratorBehavior defaultBehavior)
   2554 {
   2555     unsigned length;
   2556     UChar* buf = plainTextToMallocAllocatedBuffer(r, length, false, defaultBehavior);
   2557     if (!buf)
   2558         return "";
   2559     String result(buf, length);
   2560     free(buf);
   2561     return result;
   2562 }
   2563 
   2564 static inline bool isAllCollapsibleWhitespace(const String& string)
   2565 {
   2566     const UChar* characters = string.characters();
   2567     unsigned length = string.length();
   2568     for (unsigned i = 0; i < length; ++i) {
   2569         if (!isCollapsibleWhitespace(characters[i]))
   2570             return false;
   2571     }
   2572     return true;
   2573 }
   2574 
   2575 static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward)
   2576 {
   2577     ExceptionCode ec = 0;
   2578     RefPtr<Range> result = range->cloneRange(ec);
   2579     ASSERT(!ec);
   2580     result->collapse(!forward, ec);
   2581     ASSERT(!ec);
   2582     return result.release();
   2583 }
   2584 
   2585 static size_t findPlainText(CharacterIterator& it, const String& target, FindOptions options, size_t& matchStart)
   2586 {
   2587     matchStart = 0;
   2588     size_t matchLength = 0;
   2589 
   2590     SearchBuffer buffer(target, options);
   2591 
   2592     if (buffer.needsMoreContext()) {
   2593         RefPtr<Range> startRange = it.range();
   2594         RefPtr<Range> beforeStartRange = startRange->ownerDocument()->createRange();
   2595         ExceptionCode ec = 0;
   2596         beforeStartRange->setEnd(startRange->startContainer(), startRange->startOffset(), ec);
   2597         for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) {
   2598             buffer.prependContext(backwardsIterator.characters(), backwardsIterator.length());
   2599             if (!buffer.needsMoreContext())
   2600                 break;
   2601         }
   2602     }
   2603 
   2604     while (!it.atEnd()) {
   2605         it.advance(buffer.append(it.characters(), it.length()));
   2606 tryAgain:
   2607         size_t matchStartOffset;
   2608         if (size_t newMatchLength = buffer.search(matchStartOffset)) {
   2609             // Note that we found a match, and where we found it.
   2610             size_t lastCharacterInBufferOffset = it.characterOffset();
   2611             ASSERT(lastCharacterInBufferOffset >= matchStartOffset);
   2612             matchStart = lastCharacterInBufferOffset - matchStartOffset;
   2613             matchLength = newMatchLength;
   2614             // If searching forward, stop on the first match.
   2615             // If searching backward, don't stop, so we end up with the last match.
   2616             if (!(options & Backwards))
   2617                 break;
   2618             goto tryAgain;
   2619         }
   2620         if (it.atBreak() && !buffer.atBreak()) {
   2621             buffer.reachedBreak();
   2622             goto tryAgain;
   2623         }
   2624     }
   2625 
   2626     return matchLength;
   2627 }
   2628 
   2629 PassRefPtr<Range> findPlainText(const Range* range, const String& target, FindOptions options)
   2630 {
   2631     // First, find the text.
   2632     size_t matchStart;
   2633     size_t matchLength;
   2634     {
   2635         CharacterIterator findIterator(range, TextIteratorEntersTextControls);
   2636         matchLength = findPlainText(findIterator, target, options, matchStart);
   2637         if (!matchLength)
   2638             return collapsedToBoundary(range, !(options & Backwards));
   2639     }
   2640 
   2641     // Then, find the document position of the start and the end of the text.
   2642     CharacterIterator computeRangeIterator(range, TextIteratorEntersTextControls);
   2643     return characterSubrange(computeRangeIterator, matchStart, matchLength);
   2644 }
   2645 
   2646 }
   2647