Home | History | Annotate | Download | only in editing
      1 /*
      2  * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010 Apple Inc. All rights reserved.
      3  * Copyright (C) 2005 Alexey Proskuryakov.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  *
     14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
     15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
     18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25  */
     26 
     27 #include "config.h"
     28 #include "TextIterator.h"
     29 
     30 #include "Document.h"
     31 #include "Frame.h"
     32 #include "HTMLElement.h"
     33 #include "HTMLNames.h"
     34 #include "htmlediting.h"
     35 #include "InlineTextBox.h"
     36 #include "Range.h"
     37 #include "RenderTableCell.h"
     38 #include "RenderTableRow.h"
     39 #include "RenderTextControl.h"
     40 #include "RenderTextFragment.h"
     41 #include "TextBoundaries.h"
     42 #include "TextBreakIterator.h"
     43 #include "VisiblePosition.h"
     44 #include "visible_units.h"
     45 #include <wtf/unicode/CharacterNames.h>
     46 
     47 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
     48 #include "TextBreakIteratorInternalICU.h"
     49 #include <unicode/usearch.h>
     50 #endif
     51 
     52 using namespace WTF::Unicode;
     53 using namespace std;
     54 
     55 namespace WebCore {
     56 
     57 using namespace HTMLNames;
     58 
     59 // Buffer that knows how to compare with a search target.
     60 // Keeps enough of the previous text to be able to search in the future, but no more.
     61 // Non-breaking spaces are always equal to normal spaces.
     62 // Case folding is also done if the CaseInsensitive option is specified.
     63 // Matches are further filtered if the AtWordStarts option is specified, although some
     64 // matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well.
     65 class SearchBuffer {
     66     WTF_MAKE_NONCOPYABLE(SearchBuffer);
     67 public:
     68     SearchBuffer(const String& target, FindOptions);
     69     ~SearchBuffer();
     70 
     71     // Returns number of characters appended; guaranteed to be in the range [1, length].
     72     size_t append(const UChar*, size_t length);
     73     bool needsMoreContext() const;
     74     void prependContext(const UChar*, size_t length);
     75     void reachedBreak();
     76 
     77     // Result is the size in characters of what was found.
     78     // And <startOffset> is the number of characters back to the start of what was found.
     79     size_t search(size_t& startOffset);
     80     bool atBreak() const;
     81 
     82 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
     83 
     84 private:
     85     bool isBadMatch(const UChar*, size_t length) const;
     86     bool isWordStartMatch(size_t start, size_t length) const;
     87 
     88     String m_target;
     89     FindOptions m_options;
     90 
     91     Vector<UChar> m_buffer;
     92     size_t m_overlap;
     93     size_t m_prefixLength;
     94     bool m_atBreak;
     95     bool m_needsMoreContext;
     96 
     97     bool m_targetRequiresKanaWorkaround;
     98     Vector<UChar> m_normalizedTarget;
     99     mutable Vector<UChar> m_normalizedMatch;
    100 
    101 #else
    102 
    103 private:
    104     void append(UChar, bool isCharacterStart);
    105     size_t length() const;
    106 
    107     String m_target;
    108     FindOptions m_options;
    109 
    110     Vector<UChar> m_buffer;
    111     Vector<bool> m_isCharacterStartBuffer;
    112     bool m_isBufferFull;
    113     size_t m_cursor;
    114 
    115 #endif
    116 };
    117 
    118 // --------
    119 
    120 static const unsigned bitsInWord = sizeof(unsigned) * 8;
    121 static const unsigned bitInWordMask = bitsInWord - 1;
    122 
    123 BitStack::BitStack()
    124     : m_size(0)
    125 {
    126 }
    127 
    128 BitStack::~BitStack()
    129 {
    130 }
    131 
    132 void BitStack::push(bool bit)
    133 {
    134     unsigned index = m_size / bitsInWord;
    135     unsigned shift = m_size & bitInWordMask;
    136     if (!shift && index == m_words.size()) {
    137         m_words.grow(index + 1);
    138         m_words[index] = 0;
    139     }
    140     unsigned& word = m_words[index];
    141     unsigned mask = 1U << shift;
    142     if (bit)
    143         word |= mask;
    144     else
    145         word &= ~mask;
    146     ++m_size;
    147 }
    148 
    149 void BitStack::pop()
    150 {
    151     if (m_size)
    152         --m_size;
    153 }
    154 
    155 bool BitStack::top() const
    156 {
    157     if (!m_size)
    158         return false;
    159     unsigned shift = (m_size - 1) & bitInWordMask;
    160     return m_words.last() & (1U << shift);
    161 }
    162 
    163 unsigned BitStack::size() const
    164 {
    165     return m_size;
    166 }
    167 
    168 // --------
    169 
    170 #if !ASSERT_DISABLED
    171 
    172 static unsigned depthCrossingShadowBoundaries(Node* node)
    173 {
    174     unsigned depth = 0;
    175     for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
    176         ++depth;
    177     return depth;
    178 }
    179 
    180 #endif
    181 
    182 // This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees.
    183 static Node* nextInPreOrderCrossingShadowBoundaries(Node* rangeEndContainer, int rangeEndOffset)
    184 {
    185     if (!rangeEndContainer)
    186         return 0;
    187     if (rangeEndOffset >= 0 && !rangeEndContainer->offsetInCharacters()) {
    188         if (Node* next = rangeEndContainer->childNode(rangeEndOffset))
    189             return next;
    190     }
    191     for (Node* node = rangeEndContainer; node; node = node->parentOrHostNode()) {
    192         if (Node* next = node->nextSibling())
    193             return next;
    194     }
    195     return 0;
    196 }
    197 
    198 // --------
    199 
    200 static inline bool fullyClipsContents(Node* node)
    201 {
    202     RenderObject* renderer = node->renderer();
    203     if (!renderer || !renderer->isBox() || !renderer->hasOverflowClip())
    204         return false;
    205     return toRenderBox(renderer)->size().isEmpty();
    206 }
    207 
    208 static inline bool ignoresContainerClip(Node* node)
    209 {
    210     RenderObject* renderer = node->renderer();
    211     if (!renderer || renderer->isText())
    212         return false;
    213     EPosition position = renderer->style()->position();
    214     return position == AbsolutePosition || position == FixedPosition;
    215 }
    216 
    217 static void pushFullyClippedState(BitStack& stack, Node* node)
    218 {
    219     ASSERT(stack.size() == depthCrossingShadowBoundaries(node));
    220 
    221     // Push true if this node full clips its contents, or if a parent already has fully
    222     // clipped and this is not a node that ignores its container's clip.
    223     stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node)));
    224 }
    225 
    226 static void setUpFullyClippedStack(BitStack& stack, Node* node)
    227 {
    228     // Put the nodes in a vector so we can iterate in reverse order.
    229     Vector<Node*, 100> ancestry;
    230     for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
    231         ancestry.append(parent);
    232 
    233     // Call pushFullyClippedState on each node starting with the earliest ancestor.
    234     size_t size = ancestry.size();
    235     for (size_t i = 0; i < size; ++i)
    236         pushFullyClippedState(stack, ancestry[size - i - 1]);
    237     pushFullyClippedState(stack, node);
    238 
    239     ASSERT(stack.size() == 1 + depthCrossingShadowBoundaries(node));
    240 }
    241 
    242 #if OS(ANDROID)
    243 static bool checkFormControlElement(Node* startNode)
    244 {
    245     Node* node = startNode;
    246     while (node) {
    247         if (node->isElementNode() && static_cast<Element*>(node)->isFormControlElement())
    248             return true;
    249         node = node->parentNode();
    250     }
    251     return false;
    252 }
    253 #endif
    254 
    255 
    256 // --------
    257 
    258 TextIterator::TextIterator()
    259     : m_startContainer(0)
    260     , m_startOffset(0)
    261     , m_endContainer(0)
    262     , m_endOffset(0)
    263     , m_positionNode(0)
    264     , m_textCharacters(0)
    265     , m_textLength(0)
    266     , m_remainingTextBox(0)
    267     , m_firstLetterText(0)
    268     , m_lastCharacter(0)
    269     , m_emitsCharactersBetweenAllVisiblePositions(false)
    270     , m_entersTextControls(false)
    271     , m_emitsTextWithoutTranscoding(false)
    272     , m_handledFirstLetter(false)
    273     , m_ignoresStyleVisibility(false)
    274     , m_emitsObjectReplacementCharacters(false)
    275 #if OS(ANDROID)
    276     , m_stopsOnFormControls(false)
    277     , m_shouldStop(false)
    278 #endif
    279 {
    280 }
    281 
    282 TextIterator::TextIterator(const Range* r, TextIteratorBehavior behavior)
    283     : m_startContainer(0)
    284     , m_startOffset(0)
    285     , m_endContainer(0)
    286     , m_endOffset(0)
    287     , m_positionNode(0)
    288     , m_textCharacters(0)
    289     , m_textLength(0)
    290     , m_remainingTextBox(0)
    291     , m_firstLetterText(0)
    292     , m_emitsCharactersBetweenAllVisiblePositions(behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)
    293     , m_entersTextControls(behavior & TextIteratorEntersTextControls)
    294     , m_emitsTextWithoutTranscoding(behavior & TextIteratorEmitsTextsWithoutTranscoding)
    295     , m_handledFirstLetter(false)
    296     , m_ignoresStyleVisibility(behavior & TextIteratorIgnoresStyleVisibility)
    297     , m_emitsObjectReplacementCharacters(behavior & TextIteratorEmitsObjectReplacementCharacters)
    298 #if OS(ANDROID)
    299     , m_stopsOnFormControls(behavior & TextIteratorStopsOnFormControls)
    300     , m_shouldStop(false)
    301     , m_needsAnotherNewline(false)
    302 #endif
    303 {
    304     if (!r)
    305         return;
    306 
    307     // get and validate the range endpoints
    308     Node* startContainer = r->startContainer();
    309     if (!startContainer)
    310         return;
    311     int startOffset = r->startOffset();
    312     Node* endContainer = r->endContainer();
    313     int endOffset = r->endOffset();
    314 
    315     // Callers should be handing us well-formed ranges. If we discover that this isn't
    316     // the case, we could consider changing this assertion to an early return.
    317     ASSERT(r->boundaryPointsValid());
    318 
    319     // remember range - this does not change
    320     m_startContainer = startContainer;
    321     m_startOffset = startOffset;
    322     m_endContainer = endContainer;
    323     m_endOffset = endOffset;
    324 
    325     // set up the current node for processing
    326     m_node = r->firstNode();
    327     if (!m_node)
    328         return;
    329     setUpFullyClippedStack(m_fullyClippedStack, m_node);
    330     m_offset = m_node == m_startContainer ? m_startOffset : 0;
    331     m_handledNode = false;
    332     m_handledChildren = false;
    333 
    334     // calculate first out of bounds node
    335     m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(endContainer, endOffset);
    336 
    337     // initialize node processing state
    338     m_needsAnotherNewline = false;
    339     m_textBox = 0;
    340 
    341     // initialize record of previous node processing
    342     m_hasEmitted = false;
    343     m_lastTextNode = 0;
    344     m_lastTextNodeEndedWithCollapsedSpace = false;
    345     m_lastCharacter = 0;
    346 
    347 #ifndef NDEBUG
    348     // need this just because of the assert in advance()
    349     m_positionNode = m_node;
    350 #endif
    351 
    352     // identify the first run
    353     advance();
    354 }
    355 
    356 TextIterator::~TextIterator()
    357 {
    358 }
    359 
    360 bool TextIterator::atEnd() const
    361 {
    362 #if OS(ANDROID)
    363     return !m_positionNode || m_shouldStop;
    364 #else
    365     return !m_positionNode;
    366 #endif
    367 }
    368 
    369 void TextIterator::advance()
    370 {
    371 #if OS(ANDROID)
    372     if (m_shouldStop)
    373         return;
    374 #endif
    375     // reset the run information
    376     m_positionNode = 0;
    377     m_textLength = 0;
    378 
    379     // handle remembered node that needed a newline after the text node's newline
    380     if (m_needsAnotherNewline) {
    381         // Emit the extra newline, and position it *inside* m_node, after m_node's
    382         // contents, in case it's a block, in the same way that we position the first
    383         // newline.  The range for the emitted newline should start where the line
    384         // break begins.
    385         // FIXME: It would be cleaner if we emitted two newlines during the last
    386         // iteration, instead of using m_needsAnotherNewline.
    387         Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
    388         emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
    389         m_needsAnotherNewline = false;
    390         return;
    391     }
    392 
    393     if (!m_textBox && m_remainingTextBox) {
    394         m_textBox = m_remainingTextBox;
    395         m_remainingTextBox = 0;
    396         m_firstLetterText = 0;
    397         m_offset = 0;
    398     }
    399     // handle remembered text box
    400     if (m_textBox) {
    401         handleTextBox();
    402         if (m_positionNode)
    403             return;
    404     }
    405 
    406     while (m_node && m_node != m_pastEndNode) {
    407 #if OS(ANDROID)
    408         if (!m_shouldStop && m_stopsOnFormControls && checkFormControlElement(m_node))
    409             m_shouldStop = true;
    410 #endif
    411         // if the range ends at offset 0 of an element, represent the
    412         // position, but not the content, of that element e.g. if the
    413         // node is a blockflow element, emit a newline that
    414         // precedes the element
    415         if (m_node == m_endContainer && m_endOffset == 0) {
    416             representNodeOffsetZero();
    417             m_node = 0;
    418             return;
    419         }
    420 
    421         RenderObject* renderer = m_node->renderer();
    422         if (!renderer) {
    423             m_handledNode = true;
    424             m_handledChildren = true;
    425         } else {
    426             // handle current node according to its type
    427             if (!m_handledNode) {
    428                 if (renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) // FIXME: What about CDATA_SECTION_NODE?
    429                     m_handledNode = handleTextNode();
    430                 else if (renderer && (renderer->isImage() || renderer->isWidget() ||
    431                          (renderer->node() && renderer->node()->isElementNode() &&
    432                           static_cast<Element*>(renderer->node())->isFormControlElement())))
    433                     m_handledNode = handleReplacedElement();
    434                 else
    435                     m_handledNode = handleNonTextNode();
    436                 if (m_positionNode)
    437                     return;
    438             }
    439         }
    440 
    441         // find a new current node to handle in depth-first manner,
    442         // calling exitNode() as we come back thru a parent node
    443         Node* next = m_handledChildren ? 0 : m_node->firstChild();
    444         m_offset = 0;
    445         if (!next) {
    446             next = m_node->nextSibling();
    447             if (!next) {
    448                 bool pastEnd = m_node->traverseNextNode() == m_pastEndNode;
    449                 Node* parentNode = m_node->parentOrHostNode();
    450                 while (!next && parentNode) {
    451                     if ((pastEnd && parentNode == m_endContainer) || m_endContainer->isDescendantOf(parentNode))
    452                         return;
    453                     bool haveRenderer = m_node->renderer();
    454                     m_node = parentNode;
    455                     m_fullyClippedStack.pop();
    456                     parentNode = m_node->parentOrHostNode();
    457                     if (haveRenderer)
    458                         exitNode();
    459                     if (m_positionNode) {
    460                         m_handledNode = true;
    461                         m_handledChildren = true;
    462                         return;
    463                     }
    464                     next = m_node->nextSibling();
    465                 }
    466             }
    467             m_fullyClippedStack.pop();
    468         }
    469 
    470         // set the new current node
    471         m_node = next;
    472         if (m_node)
    473             pushFullyClippedState(m_fullyClippedStack, m_node);
    474         m_handledNode = false;
    475         m_handledChildren = false;
    476         m_handledFirstLetter = false;
    477         m_firstLetterText = 0;
    478 
    479         // how would this ever be?
    480         if (m_positionNode)
    481             return;
    482     }
    483 }
    484 
    485 bool TextIterator::handleTextNode()
    486 {
    487     if (m_fullyClippedStack.top() && !m_ignoresStyleVisibility)
    488         return false;
    489 
    490     RenderText* renderer = toRenderText(m_node->renderer());
    491 
    492     m_lastTextNode = m_node;
    493     String str = renderer->text();
    494 
    495     // handle pre-formatted text
    496     if (!renderer->style()->collapseWhiteSpace()) {
    497         int runStart = m_offset;
    498         if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) {
    499             emitCharacter(' ', m_node, 0, runStart, runStart);
    500             return false;
    501         }
    502         if (!m_handledFirstLetter && renderer->isTextFragment()) {
    503             handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
    504             if (m_firstLetterText) {
    505                 String firstLetter = m_firstLetterText->text();
    506                 emitText(m_node, m_firstLetterText, m_offset, m_offset + firstLetter.length());
    507                 m_firstLetterText = 0;
    508                 m_textBox = 0;
    509                 return false;
    510             }
    511         }
    512         if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
    513             return false;
    514         int strLength = str.length();
    515         int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX;
    516         int runEnd = min(strLength, end);
    517 
    518         if (runStart >= runEnd)
    519             return true;
    520 
    521         emitText(m_node, runStart, runEnd);
    522         return true;
    523     }
    524 
    525     if (!renderer->firstTextBox() && str.length() > 0) {
    526         if (!m_handledFirstLetter && renderer->isTextFragment()) {
    527             handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
    528             if (m_firstLetterText) {
    529                 handleTextBox();
    530                 return false;
    531             }
    532         }
    533         if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
    534             return false;
    535         m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space
    536         return true;
    537     }
    538 
    539     // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text)
    540     if (renderer->containsReversedText()) {
    541         m_sortedTextBoxes.clear();
    542         for (InlineTextBox* textBox = renderer->firstTextBox(); textBox; textBox = textBox->nextTextBox()) {
    543             m_sortedTextBoxes.append(textBox);
    544         }
    545         std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart);
    546         m_sortedTextBoxesPosition = 0;
    547     }
    548 
    549     m_textBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
    550     if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset)
    551         handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
    552     handleTextBox();
    553     return true;
    554 }
    555 
    556 void TextIterator::handleTextBox()
    557 {
    558     RenderText* renderer = m_firstLetterText ? m_firstLetterText : toRenderText(m_node->renderer());
    559     if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) {
    560         m_textBox = 0;
    561         return;
    562     }
    563     String str = renderer->text();
    564     unsigned start = m_offset;
    565     unsigned end = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX;
    566     while (m_textBox) {
    567         unsigned textBoxStart = m_textBox->start();
    568         unsigned runStart = max(textBoxStart, start);
    569 
    570         // Check for collapsed space at the start of this run.
    571         InlineTextBox* firstTextBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
    572         bool needSpace = m_lastTextNodeEndedWithCollapsedSpace
    573             || (m_textBox == firstTextBox && textBoxStart == runStart && runStart > 0);
    574         if (needSpace && !isCollapsibleWhitespace(m_lastCharacter) && m_lastCharacter) {
    575             if (m_lastTextNode == m_node && runStart > 0 && str[runStart - 1] == ' ') {
    576                 unsigned spaceRunStart = runStart - 1;
    577                 while (spaceRunStart > 0 && str[spaceRunStart - 1] == ' ')
    578                     --spaceRunStart;
    579                 emitText(m_node, renderer, spaceRunStart, spaceRunStart + 1);
    580             } else
    581                 emitCharacter(' ', m_node, 0, runStart, runStart);
    582             return;
    583         }
    584         unsigned textBoxEnd = textBoxStart + m_textBox->len();
    585         unsigned runEnd = min(textBoxEnd, end);
    586 
    587         // Determine what the next text box will be, but don't advance yet
    588         InlineTextBox* nextTextBox = 0;
    589         if (renderer->containsReversedText()) {
    590             if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size())
    591                 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1];
    592         } else
    593             nextTextBox = m_textBox->nextTextBox();
    594 
    595         if (runStart < runEnd) {
    596             // Handle either a single newline character (which becomes a space),
    597             // or a run of characters that does not include a newline.
    598             // This effectively translates newlines to spaces without copying the text.
    599             if (str[runStart] == '\n') {
    600                 emitCharacter(' ', m_node, 0, runStart, runStart + 1);
    601                 m_offset = runStart + 1;
    602             } else {
    603                 size_t subrunEnd = str.find('\n', runStart);
    604                 if (subrunEnd == notFound || subrunEnd > runEnd)
    605                     subrunEnd = runEnd;
    606 
    607                 m_offset = subrunEnd;
    608                 emitText(m_node, renderer, runStart, subrunEnd);
    609             }
    610 
    611             // If we are doing a subrun that doesn't go to the end of the text box,
    612             // come back again to finish handling this text box; don't advance to the next one.
    613             if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd)
    614                 return;
    615 
    616             // Advance and return
    617             unsigned nextRunStart = nextTextBox ? nextTextBox->start() : str.length();
    618             if (nextRunStart > runEnd)
    619                 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end
    620             m_textBox = nextTextBox;
    621             if (renderer->containsReversedText())
    622                 ++m_sortedTextBoxesPosition;
    623             return;
    624         }
    625         // Advance and continue
    626         m_textBox = nextTextBox;
    627         if (renderer->containsReversedText())
    628             ++m_sortedTextBoxesPosition;
    629     }
    630     if (!m_textBox && m_remainingTextBox) {
    631         m_textBox = m_remainingTextBox;
    632         m_remainingTextBox = 0;
    633         m_firstLetterText = 0;
    634         m_offset = 0;
    635         handleTextBox();
    636     }
    637 }
    638 
    639 void TextIterator::handleTextNodeFirstLetter(RenderTextFragment* renderer)
    640 {
    641     if (renderer->firstLetter()) {
    642         RenderObject* r = renderer->firstLetter();
    643         if (r->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
    644             return;
    645         for (RenderObject *currChild = r->firstChild(); currChild; currChild->nextSibling()) {
    646             if (currChild->isText()) {
    647                 RenderText* firstLetter = toRenderText(currChild);
    648                 m_handledFirstLetter = true;
    649                 m_remainingTextBox = m_textBox;
    650                 m_textBox = firstLetter->firstTextBox();
    651                 m_firstLetterText = firstLetter;
    652                 return;
    653             }
    654         }
    655     }
    656     m_handledFirstLetter = true;
    657 }
    658 
    659 bool TextIterator::handleReplacedElement()
    660 {
    661     if (m_fullyClippedStack.top())
    662         return false;
    663 
    664     RenderObject* renderer = m_node->renderer();
    665     if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
    666         return false;
    667 
    668     if (m_lastTextNodeEndedWithCollapsedSpace) {
    669         emitCharacter(' ', m_lastTextNode->parentNode(), m_lastTextNode, 1, 1);
    670         return false;
    671     }
    672 
    673     if (m_entersTextControls && renderer->isTextControl()) {
    674         if (HTMLElement* innerTextElement = toRenderTextControl(renderer)->innerTextElement()) {
    675             m_node = innerTextElement->shadowTreeRootNode();
    676             pushFullyClippedState(m_fullyClippedStack, m_node);
    677             m_offset = 0;
    678             return false;
    679         }
    680     }
    681 
    682     m_hasEmitted = true;
    683 
    684     if (m_emitsObjectReplacementCharacters && renderer && renderer->isReplaced()) {
    685         emitCharacter(objectReplacementCharacter, m_node->parentNode(), m_node, 0, 1);
    686         return true;
    687     }
    688 
    689     if (m_emitsCharactersBetweenAllVisiblePositions) {
    690         // We want replaced elements to behave like punctuation for boundary
    691         // finding, and to simply take up space for the selection preservation
    692         // code in moveParagraphs, so we use a comma.
    693         emitCharacter(',', m_node->parentNode(), m_node, 0, 1);
    694         return true;
    695     }
    696 
    697     m_positionNode = m_node->parentNode();
    698     m_positionOffsetBaseNode = m_node;
    699     m_positionStartOffset = 0;
    700     m_positionEndOffset = 1;
    701 
    702     m_textCharacters = 0;
    703     m_textLength = 0;
    704 
    705     m_lastCharacter = 0;
    706 
    707     return true;
    708 }
    709 
    710 bool TextIterator::hasVisibleTextNode(RenderText* renderer)
    711 {
    712     if (renderer->style()->visibility() == VISIBLE)
    713         return true;
    714     if (renderer->isTextFragment()) {
    715         RenderTextFragment* fragment = static_cast<RenderTextFragment*>(renderer);
    716         if (fragment->firstLetter() && fragment->firstLetter()->style()->visibility() == VISIBLE)
    717             return true;
    718     }
    719     return false;
    720 }
    721 
    722 static bool shouldEmitTabBeforeNode(Node* node)
    723 {
    724     RenderObject* r = node->renderer();
    725 
    726     // Table cells are delimited by tabs.
    727     if (!r || !isTableCell(node))
    728         return false;
    729 
    730     // Want a tab before every cell other than the first one
    731     RenderTableCell* rc = toRenderTableCell(r);
    732     RenderTable* t = rc->table();
    733     return t && (t->cellBefore(rc) || t->cellAbove(rc));
    734 }
    735 
    736 static bool shouldEmitNewlineForNode(Node* node)
    737 {
    738     // br elements are represented by a single newline.
    739     RenderObject* r = node->renderer();
    740     if (!r)
    741         return node->hasTagName(brTag);
    742 
    743     return r->isBR();
    744 }
    745 
    746 static bool shouldEmitNewlinesBeforeAndAfterNode(Node* node)
    747 {
    748     // Block flow (versus inline flow) is represented by having
    749     // a newline both before and after the element.
    750     RenderObject* r = node->renderer();
    751     if (!r) {
    752         return (node->hasTagName(blockquoteTag)
    753                 || node->hasTagName(ddTag)
    754                 || node->hasTagName(divTag)
    755                 || node->hasTagName(dlTag)
    756                 || node->hasTagName(dtTag)
    757                 || node->hasTagName(h1Tag)
    758                 || node->hasTagName(h2Tag)
    759                 || node->hasTagName(h3Tag)
    760                 || node->hasTagName(h4Tag)
    761                 || node->hasTagName(h5Tag)
    762                 || node->hasTagName(h6Tag)
    763                 || node->hasTagName(hrTag)
    764                 || node->hasTagName(liTag)
    765                 || node->hasTagName(listingTag)
    766                 || node->hasTagName(olTag)
    767                 || node->hasTagName(pTag)
    768                 || node->hasTagName(preTag)
    769                 || node->hasTagName(trTag)
    770                 || node->hasTagName(ulTag));
    771     }
    772 
    773     // Need to make an exception for table cells, because they are blocks, but we
    774     // want them tab-delimited rather than having newlines before and after.
    775     if (isTableCell(node))
    776         return false;
    777 
    778     // Need to make an exception for table row elements, because they are neither
    779     // "inline" or "RenderBlock", but we want newlines for them.
    780     if (r->isTableRow()) {
    781         RenderTable* t = toRenderTableRow(r)->table();
    782         if (t && !t->isInline())
    783             return true;
    784     }
    785 
    786     return !r->isInline() && r->isRenderBlock() && !r->isFloatingOrPositioned() && !r->isBody();
    787 }
    788 
    789 static bool shouldEmitNewlineAfterNode(Node* node)
    790 {
    791     // FIXME: It should be better but slower to create a VisiblePosition here.
    792     if (!shouldEmitNewlinesBeforeAndAfterNode(node))
    793         return false;
    794     // Check if this is the very last renderer in the document.
    795     // If so, then we should not emit a newline.
    796     while ((node = node->traverseNextSibling()))
    797         if (node->renderer())
    798             return true;
    799     return false;
    800 }
    801 
    802 static bool shouldEmitNewlineBeforeNode(Node* node)
    803 {
    804     return shouldEmitNewlinesBeforeAndAfterNode(node);
    805 }
    806 
    807 static bool shouldEmitExtraNewlineForNode(Node* node)
    808 {
    809     // When there is a significant collapsed bottom margin, emit an extra
    810     // newline for a more realistic result.  We end up getting the right
    811     // result even without margin collapsing. For example: <div><p>text</p></div>
    812     // will work right even if both the <div> and the <p> have bottom margins.
    813     RenderObject* r = node->renderer();
    814     if (!r || !r->isBox())
    815         return false;
    816 
    817     // NOTE: We only do this for a select set of nodes, and fwiw WinIE appears
    818     // not to do this at all
    819     if (node->hasTagName(h1Tag)
    820         || node->hasTagName(h2Tag)
    821         || node->hasTagName(h3Tag)
    822         || node->hasTagName(h4Tag)
    823         || node->hasTagName(h5Tag)
    824         || node->hasTagName(h6Tag)
    825         || node->hasTagName(pTag)) {
    826         RenderStyle* style = r->style();
    827         if (style) {
    828             int bottomMargin = toRenderBox(r)->collapsedMarginAfter();
    829             int fontSize = style->fontDescription().computedPixelSize();
    830             if (bottomMargin * 2 >= fontSize)
    831                 return true;
    832         }
    833     }
    834 
    835     return false;
    836 }
    837 
    838 static int collapsedSpaceLength(RenderText* renderer, int textEnd)
    839 {
    840     const UChar* characters = renderer->text()->characters();
    841     int length = renderer->text()->length();
    842     for (int i = textEnd; i < length; ++i) {
    843         if (!renderer->style()->isCollapsibleWhiteSpace(characters[i]))
    844             return i - textEnd;
    845     }
    846 
    847     return length - textEnd;
    848 }
    849 
    850 static int maxOffsetIncludingCollapsedSpaces(Node* node)
    851 {
    852     int offset = caretMaxOffset(node);
    853 
    854     if (node->renderer() && node->renderer()->isText())
    855         offset += collapsedSpaceLength(toRenderText(node->renderer()), offset);
    856 
    857     return offset;
    858 }
    859 
    860 // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic).
    861 bool TextIterator::shouldRepresentNodeOffsetZero()
    862 {
    863     if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isTable())
    864         return true;
    865 
    866     // Leave element positioned flush with start of a paragraph
    867     // (e.g. do not insert tab before a table cell at the start of a paragraph)
    868     if (m_lastCharacter == '\n')
    869         return false;
    870 
    871     // Otherwise, show the position if we have emitted any characters
    872     if (m_hasEmitted)
    873         return true;
    874 
    875     // We've not emitted anything yet. Generally, there is no need for any positioning then.
    876     // The only exception is when the element is visually not in the same line as
    877     // the start of the range (e.g. the range starts at the end of the previous paragraph).
    878     // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we
    879     // make quicker checks to possibly avoid that. Another check that we could make is
    880     // is whether the inline vs block flow changed since the previous visible element.
    881     // I think we're already in a special enough case that that won't be needed, tho.
    882 
    883     // No character needed if this is the first node in the range.
    884     if (m_node == m_startContainer)
    885         return false;
    886 
    887     // If we are outside the start container's subtree, assume we need to emit.
    888     // FIXME: m_startContainer could be an inline block
    889     if (!m_node->isDescendantOf(m_startContainer))
    890         return true;
    891 
    892     // If we started as m_startContainer offset 0 and the current node is a descendant of
    893     // the start container, we already had enough context to correctly decide whether to
    894     // emit after a preceding block. We chose not to emit (m_hasEmitted is false),
    895     // so don't second guess that now.
    896     // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably
    897     // immaterial since we likely would have already emitted something by now.
    898     if (m_startOffset == 0)
    899         return false;
    900 
    901     // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning.
    902     // Additionally, if the range we are iterating over contains huge sections of unrendered content,
    903     // we would create VisiblePositions on every call to this function without this check.
    904     if (!m_node->renderer() || m_node->renderer()->style()->visibility() != VISIBLE)
    905         return false;
    906 
    907     // The startPos.isNotNull() check is needed because the start could be before the body,
    908     // and in that case we'll get null. We don't want to put in newlines at the start in that case.
    909     // The currPos.isNotNull() check is needed because positions in non-HTML content
    910     // (like SVG) do not have visible positions, and we don't want to emit for them either.
    911     VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM);
    912     VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM);
    913     return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos);
    914 }
    915 
    916 bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node* node)
    917 {
    918     return node->renderer() && node->renderer()->isTable() && (node->renderer()->isInline() || m_emitsCharactersBetweenAllVisiblePositions);
    919 }
    920 
    921 void TextIterator::representNodeOffsetZero()
    922 {
    923     // Emit a character to show the positioning of m_node.
    924 
    925     // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can
    926     // create VisiblePositions, which is expensive.  So, we perform the inexpensive checks
    927     // on m_node to see if it necessitates emitting a character first and will early return
    928     // before encountering shouldRepresentNodeOffsetZero()s worse case behavior.
    929     if (shouldEmitTabBeforeNode(m_node)) {
    930         if (shouldRepresentNodeOffsetZero())
    931             emitCharacter('\t', m_node->parentNode(), m_node, 0, 0);
    932     } else if (shouldEmitNewlineBeforeNode(m_node)) {
    933         if (shouldRepresentNodeOffsetZero())
    934             emitCharacter('\n', m_node->parentNode(), m_node, 0, 0);
    935     } else if (shouldEmitSpaceBeforeAndAfterNode(m_node)) {
    936         if (shouldRepresentNodeOffsetZero())
    937             emitCharacter(' ', m_node->parentNode(), m_node, 0, 0);
    938     }
    939 }
    940 
    941 bool TextIterator::handleNonTextNode()
    942 {
    943     if (shouldEmitNewlineForNode(m_node))
    944         emitCharacter('\n', m_node->parentNode(), m_node, 0, 1);
    945     else if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isHR())
    946         emitCharacter(' ', m_node->parentNode(), m_node, 0, 1);
    947     else
    948         representNodeOffsetZero();
    949 
    950     return true;
    951 }
    952 
    953 void TextIterator::exitNode()
    954 {
    955     // prevent emitting a newline when exiting a collapsed block at beginning of the range
    956     // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could
    957     // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and
    958     // therefore look like a blank line.
    959     if (!m_hasEmitted)
    960         return;
    961 
    962     // Emit with a position *inside* m_node, after m_node's contents, in
    963     // case it is a block, because the run should start where the
    964     // emitted character is positioned visually.
    965     Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
    966     // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making
    967     // the logic in _web_attributedStringFromRange match.  We'll get that for free when we switch to use
    968     // TextIterator in _web_attributedStringFromRange.
    969     // See <rdar://problem/5428427> for an example of how this mismatch will cause problems.
    970     if (m_lastTextNode && shouldEmitNewlineAfterNode(m_node)) {
    971         // use extra newline to represent margin bottom, as needed
    972         bool addNewline = shouldEmitExtraNewlineForNode(m_node);
    973 
    974         // FIXME: We need to emit a '\n' as we leave an empty block(s) that
    975         // contain a VisiblePosition when doing selection preservation.
    976         if (m_lastCharacter != '\n') {
    977             // insert a newline with a position following this block's contents.
    978             emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
    979             // remember whether to later add a newline for the current node
    980             ASSERT(!m_needsAnotherNewline);
    981             m_needsAnotherNewline = addNewline;
    982         } else if (addNewline)
    983             // insert a newline with a position following this block's contents.
    984             emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
    985     }
    986 
    987     // If nothing was emitted, see if we need to emit a space.
    988     if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(m_node))
    989         emitCharacter(' ', baseNode->parentNode(), baseNode, 1, 1);
    990 }
    991 
    992 void TextIterator::emitCharacter(UChar c, Node* textNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset)
    993 {
    994     m_hasEmitted = true;
    995 
    996     // remember information with which to construct the TextIterator::range()
    997     // NOTE: textNode is often not a text node, so the range will specify child nodes of positionNode
    998     m_positionNode = textNode;
    999     m_positionOffsetBaseNode = offsetBaseNode;
   1000     m_positionStartOffset = textStartOffset;
   1001     m_positionEndOffset = textEndOffset;
   1002 
   1003     // remember information with which to construct the TextIterator::characters() and length()
   1004     m_singleCharacterBuffer = c;
   1005     m_textCharacters = &m_singleCharacterBuffer;
   1006     m_textLength = 1;
   1007 
   1008     // remember some iteration state
   1009     m_lastTextNodeEndedWithCollapsedSpace = false;
   1010     m_lastCharacter = c;
   1011 }
   1012 
   1013 void TextIterator::emitText(Node* textNode, RenderObject* renderObject, int textStartOffset, int textEndOffset)
   1014 {
   1015     RenderText* renderer = toRenderText(renderObject);
   1016     m_text = m_emitsTextWithoutTranscoding ? renderer->textWithoutTranscoding() : renderer->text();
   1017     ASSERT(m_text.characters());
   1018 
   1019     m_positionNode = textNode;
   1020     m_positionOffsetBaseNode = 0;
   1021     m_positionStartOffset = textStartOffset;
   1022     m_positionEndOffset = textEndOffset;
   1023     m_textCharacters = m_text.characters() + textStartOffset;
   1024     m_textLength = textEndOffset - textStartOffset;
   1025     m_lastCharacter = m_text[textEndOffset - 1];
   1026 
   1027     m_lastTextNodeEndedWithCollapsedSpace = false;
   1028     m_hasEmitted = true;
   1029 }
   1030 
   1031 void TextIterator::emitText(Node* textNode, int textStartOffset, int textEndOffset)
   1032 {
   1033     emitText(textNode, m_node->renderer(), textStartOffset, textEndOffset);
   1034 }
   1035 
   1036 PassRefPtr<Range> TextIterator::range() const
   1037 {
   1038     // use the current run information, if we have it
   1039     if (m_positionNode) {
   1040         if (m_positionOffsetBaseNode) {
   1041             int index = m_positionOffsetBaseNode->nodeIndex();
   1042             m_positionStartOffset += index;
   1043             m_positionEndOffset += index;
   1044             m_positionOffsetBaseNode = 0;
   1045         }
   1046         return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
   1047     }
   1048 
   1049     // otherwise, return the end of the overall range we were given
   1050     if (m_endContainer)
   1051         return Range::create(m_endContainer->document(), m_endContainer, m_endOffset, m_endContainer, m_endOffset);
   1052 
   1053     return 0;
   1054 }
   1055 
   1056 Node* TextIterator::node() const
   1057 {
   1058     RefPtr<Range> textRange = range();
   1059     if (!textRange)
   1060         return 0;
   1061 
   1062     Node* node = textRange->startContainer();
   1063     if (!node)
   1064         return 0;
   1065     if (node->offsetInCharacters())
   1066         return node;
   1067 
   1068     return node->childNode(textRange->startOffset());
   1069 }
   1070 
   1071 // --------
   1072 
   1073 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator()
   1074     : m_behavior(TextIteratorDefaultBehavior)
   1075     , m_node(0)
   1076     , m_positionNode(0)
   1077 #if OS(ANDROID)
   1078     , m_stopsOnFormControls(false)
   1079     , m_shouldStop(false)
   1080 #endif
   1081 {
   1082 }
   1083 
   1084 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range* r, TextIteratorBehavior behavior)
   1085     : m_behavior(behavior)
   1086     , m_node(0)
   1087     , m_positionNode(0)
   1088 #if OS(ANDROID)
   1089     , m_stopsOnFormControls(behavior & TextIteratorStopsOnFormControls)
   1090     , m_shouldStop(false)
   1091 #endif
   1092 {
   1093 #if OS(ANDROID)
   1094     ASSERT(m_behavior == TextIteratorDefaultBehavior || m_behavior == TextIteratorStopsOnFormControls);
   1095 #else
   1096     ASSERT(m_behavior == TextIteratorDefaultBehavior);
   1097 #endif
   1098 
   1099     if (!r)
   1100         return;
   1101 
   1102     Node* startNode = r->startContainer();
   1103     if (!startNode)
   1104         return;
   1105     Node* endNode = r->endContainer();
   1106     int startOffset = r->startOffset();
   1107     int endOffset = r->endOffset();
   1108 
   1109     if (!startNode->offsetInCharacters()) {
   1110         if (startOffset >= 0 && startOffset < static_cast<int>(startNode->childNodeCount())) {
   1111             startNode = startNode->childNode(startOffset);
   1112             startOffset = 0;
   1113         }
   1114     }
   1115     if (!endNode->offsetInCharacters()) {
   1116         if (endOffset > 0 && endOffset <= static_cast<int>(endNode->childNodeCount())) {
   1117             endNode = endNode->childNode(endOffset - 1);
   1118             endOffset = lastOffsetInNode(endNode);
   1119         }
   1120     }
   1121 
   1122     m_node = endNode;
   1123     setUpFullyClippedStack(m_fullyClippedStack, m_node);
   1124     m_offset = endOffset;
   1125     m_handledNode = false;
   1126     m_handledChildren = endOffset == 0;
   1127 
   1128     m_startNode = startNode;
   1129     m_startOffset = startOffset;
   1130     m_endNode = endNode;
   1131     m_endOffset = endOffset;
   1132 
   1133 #ifndef NDEBUG
   1134     // Need this just because of the assert.
   1135     m_positionNode = endNode;
   1136 #endif
   1137 
   1138     m_lastTextNode = 0;
   1139     m_lastCharacter = '\n';
   1140 
   1141     m_havePassedStartNode = false;
   1142 
   1143     advance();
   1144 }
   1145 
   1146 bool SimplifiedBackwardsTextIterator::atEnd() const
   1147 {
   1148 #if OS(ANDROID)
   1149     return !m_positionNode || m_shouldStop;
   1150 #else
   1151     return !m_positionNode;
   1152 #endif
   1153 }
   1154 
   1155 void SimplifiedBackwardsTextIterator::advance()
   1156 {
   1157     ASSERT(m_positionNode);
   1158 
   1159 #if OS(ANDROID)
   1160     if (m_shouldStop)
   1161         return;
   1162 
   1163     // Prevent changing the iterator position if a form control element was found and advance should stop on it.
   1164     if (m_stopsOnFormControls && checkFormControlElement(m_node)) {
   1165         m_shouldStop = true;
   1166         return;
   1167     }
   1168 #endif
   1169 
   1170     m_positionNode = 0;
   1171     m_textLength = 0;
   1172 
   1173     while (m_node && !m_havePassedStartNode) {
   1174         // Don't handle node if we start iterating at [node, 0].
   1175         if (!m_handledNode && !(m_node == m_endNode && m_endOffset == 0)) {
   1176             RenderObject* renderer = m_node->renderer();
   1177             if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) {
   1178                 // FIXME: What about CDATA_SECTION_NODE?
   1179                 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
   1180                     m_handledNode = handleTextNode();
   1181             } else if (renderer && (renderer->isImage() || renderer->isWidget())) {
   1182                 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
   1183                     m_handledNode = handleReplacedElement();
   1184             } else
   1185                 m_handledNode = handleNonTextNode();
   1186             if (m_positionNode)
   1187                 return;
   1188         }
   1189 
   1190         if (!m_handledChildren && m_node->hasChildNodes()) {
   1191             m_node = m_node->lastChild();
   1192             pushFullyClippedState(m_fullyClippedStack, m_node);
   1193         } else {
   1194             // Exit empty containers as we pass over them or containers
   1195             // where [container, 0] is where we started iterating.
   1196             if (!m_handledNode
   1197                     && canHaveChildrenForEditing(m_node)
   1198                     && m_node->parentNode()
   1199                     && (!m_node->lastChild() || (m_node == m_endNode && !m_endOffset))) {
   1200                 exitNode();
   1201                 if (m_positionNode) {
   1202                     m_handledNode = true;
   1203                     m_handledChildren = true;
   1204                     return;
   1205                 }
   1206             }
   1207 
   1208             // Exit all other containers.
   1209             while (!m_node->previousSibling()) {
   1210                 if (!advanceRespectingRange(m_node->parentOrHostNode()))
   1211                     break;
   1212                 m_fullyClippedStack.pop();
   1213                 exitNode();
   1214                 if (m_positionNode) {
   1215                     m_handledNode = true;
   1216                     m_handledChildren = true;
   1217                     return;
   1218                 }
   1219             }
   1220 
   1221             m_fullyClippedStack.pop();
   1222             if (advanceRespectingRange(m_node->previousSibling()))
   1223                 pushFullyClippedState(m_fullyClippedStack, m_node);
   1224             else
   1225                 m_node = 0;
   1226         }
   1227 
   1228         // For the purpose of word boundary detection,
   1229         // we should iterate all visible text and trailing (collapsed) whitespaces.
   1230         m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(m_node) : 0;
   1231         m_handledNode = false;
   1232         m_handledChildren = false;
   1233 
   1234         if (m_positionNode)
   1235             return;
   1236     }
   1237 }
   1238 
   1239 bool SimplifiedBackwardsTextIterator::handleTextNode()
   1240 {
   1241     m_lastTextNode = m_node;
   1242 
   1243     RenderText* renderer = toRenderText(m_node->renderer());
   1244     String str = renderer->text();
   1245 
   1246     if (!renderer->firstTextBox() && str.length() > 0)
   1247         return true;
   1248 
   1249     m_positionEndOffset = m_offset;
   1250 
   1251     m_offset = (m_node == m_startNode) ? m_startOffset : 0;
   1252     m_positionNode = m_node;
   1253     m_positionStartOffset = m_offset;
   1254     m_textLength = m_positionEndOffset - m_positionStartOffset;
   1255     m_textCharacters = str.characters() + m_positionStartOffset;
   1256 
   1257     m_lastCharacter = str[m_positionEndOffset - 1];
   1258 
   1259     return true;
   1260 }
   1261 
   1262 bool SimplifiedBackwardsTextIterator::handleReplacedElement()
   1263 {
   1264     unsigned index = m_node->nodeIndex();
   1265     // We want replaced elements to behave like punctuation for boundary
   1266     // finding, and to simply take up space for the selection preservation
   1267     // code in moveParagraphs, so we use a comma.  Unconditionally emit
   1268     // here because this iterator is only used for boundary finding.
   1269     emitCharacter(',', m_node->parentNode(), index, index + 1);
   1270     return true;
   1271 }
   1272 
   1273 bool SimplifiedBackwardsTextIterator::handleNonTextNode()
   1274 {
   1275     // We can use a linefeed in place of a tab because this simple iterator is only used to
   1276     // find boundaries, not actual content.  A linefeed breaks words, sentences, and paragraphs.
   1277     if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineAfterNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
   1278         unsigned index = m_node->nodeIndex();
   1279         // The start of this emitted range is wrong. Ensuring correctness would require
   1280         // VisiblePositions and so would be slow. previousBoundary expects this.
   1281         emitCharacter('\n', m_node->parentNode(), index + 1, index + 1);
   1282     }
   1283     return true;
   1284 }
   1285 
   1286 void SimplifiedBackwardsTextIterator::exitNode()
   1287 {
   1288     if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineBeforeNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
   1289         // The start of this emitted range is wrong. Ensuring correctness would require
   1290         // VisiblePositions and so would be slow. previousBoundary expects this.
   1291         emitCharacter('\n', m_node, 0, 0);
   1292     }
   1293 }
   1294 
   1295 void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node* node, int startOffset, int endOffset)
   1296 {
   1297     m_singleCharacterBuffer = c;
   1298     m_positionNode = node;
   1299     m_positionStartOffset = startOffset;
   1300     m_positionEndOffset = endOffset;
   1301     m_textCharacters = &m_singleCharacterBuffer;
   1302     m_textLength = 1;
   1303     m_lastCharacter = c;
   1304 }
   1305 
   1306 bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next)
   1307 {
   1308     if (!next)
   1309         return false;
   1310     m_havePassedStartNode |= m_node == m_startNode;
   1311     if (m_havePassedStartNode)
   1312         return false;
   1313     m_node = next;
   1314     return true;
   1315 }
   1316 
   1317 PassRefPtr<Range> SimplifiedBackwardsTextIterator::range() const
   1318 {
   1319     if (m_positionNode)
   1320         return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
   1321 
   1322     return Range::create(m_startNode->document(), m_startNode, m_startOffset, m_startNode, m_startOffset);
   1323 }
   1324 
   1325 // --------
   1326 
   1327 CharacterIterator::CharacterIterator()
   1328     : m_offset(0)
   1329     , m_runOffset(0)
   1330     , m_atBreak(true)
   1331 {
   1332 }
   1333 
   1334 CharacterIterator::CharacterIterator(const Range* r, TextIteratorBehavior behavior)
   1335     : m_offset(0)
   1336     , m_runOffset(0)
   1337     , m_atBreak(true)
   1338     , m_textIterator(r, behavior)
   1339 {
   1340     while (!atEnd() && m_textIterator.length() == 0)
   1341         m_textIterator.advance();
   1342 }
   1343 
   1344 PassRefPtr<Range> CharacterIterator::range() const
   1345 {
   1346     RefPtr<Range> r = m_textIterator.range();
   1347     if (!m_textIterator.atEnd()) {
   1348         if (m_textIterator.length() <= 1) {
   1349             ASSERT(m_runOffset == 0);
   1350         } else {
   1351             Node* n = r->startContainer();
   1352             ASSERT(n == r->endContainer());
   1353             int offset = r->startOffset() + m_runOffset;
   1354             ExceptionCode ec = 0;
   1355             r->setStart(n, offset, ec);
   1356             r->setEnd(n, offset + 1, ec);
   1357             ASSERT(!ec);
   1358         }
   1359     }
   1360     return r.release();
   1361 }
   1362 
   1363 void CharacterIterator::advance(int count)
   1364 {
   1365     if (count <= 0) {
   1366         ASSERT(count == 0);
   1367         return;
   1368     }
   1369 
   1370     m_atBreak = false;
   1371 
   1372     // easy if there is enough left in the current m_textIterator run
   1373     int remaining = m_textIterator.length() - m_runOffset;
   1374     if (count < remaining) {
   1375         m_runOffset += count;
   1376         m_offset += count;
   1377         return;
   1378     }
   1379 
   1380     // exhaust the current m_textIterator run
   1381     count -= remaining;
   1382     m_offset += remaining;
   1383 
   1384     // move to a subsequent m_textIterator run
   1385     for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
   1386         int runLength = m_textIterator.length();
   1387         if (runLength == 0)
   1388             m_atBreak = true;
   1389         else {
   1390             // see whether this is m_textIterator to use
   1391             if (count < runLength) {
   1392                 m_runOffset = count;
   1393                 m_offset += count;
   1394                 return;
   1395             }
   1396 
   1397             // exhaust this m_textIterator run
   1398             count -= runLength;
   1399             m_offset += runLength;
   1400         }
   1401     }
   1402 
   1403     // ran to the end of the m_textIterator... no more runs left
   1404     m_atBreak = true;
   1405     m_runOffset = 0;
   1406 }
   1407 
   1408 String CharacterIterator::string(int numChars)
   1409 {
   1410     Vector<UChar> result;
   1411     result.reserveInitialCapacity(numChars);
   1412     while (numChars > 0 && !atEnd()) {
   1413         int runSize = min(numChars, length());
   1414         result.append(characters(), runSize);
   1415         numChars -= runSize;
   1416         advance(runSize);
   1417     }
   1418     return String::adopt(result);
   1419 }
   1420 
   1421 static PassRefPtr<Range> characterSubrange(CharacterIterator& it, int offset, int length)
   1422 {
   1423     it.advance(offset);
   1424     RefPtr<Range> start = it.range();
   1425 
   1426     if (length > 1)
   1427         it.advance(length - 1);
   1428     RefPtr<Range> end = it.range();
   1429 
   1430     return Range::create(start->startContainer()->document(),
   1431         start->startContainer(), start->startOffset(),
   1432         end->endContainer(), end->endOffset());
   1433 }
   1434 
   1435 BackwardsCharacterIterator::BackwardsCharacterIterator()
   1436     : m_offset(0)
   1437     , m_runOffset(0)
   1438     , m_atBreak(true)
   1439 {
   1440 }
   1441 
   1442 BackwardsCharacterIterator::BackwardsCharacterIterator(const Range* range, TextIteratorBehavior behavior)
   1443     : m_offset(0)
   1444     , m_runOffset(0)
   1445     , m_atBreak(true)
   1446     , m_textIterator(range, behavior)
   1447 {
   1448     while (!atEnd() && !m_textIterator.length())
   1449         m_textIterator.advance();
   1450 }
   1451 
   1452 PassRefPtr<Range> BackwardsCharacterIterator::range() const
   1453 {
   1454     RefPtr<Range> r = m_textIterator.range();
   1455     if (!m_textIterator.atEnd()) {
   1456         if (m_textIterator.length() <= 1)
   1457             ASSERT(m_runOffset == 0);
   1458         else {
   1459             Node* n = r->startContainer();
   1460             ASSERT(n == r->endContainer());
   1461             int offset = r->endOffset() - m_runOffset;
   1462             ExceptionCode ec = 0;
   1463             r->setStart(n, offset - 1, ec);
   1464             r->setEnd(n, offset, ec);
   1465             ASSERT(!ec);
   1466         }
   1467     }
   1468     return r.release();
   1469 }
   1470 
   1471 void BackwardsCharacterIterator::advance(int count)
   1472 {
   1473     if (count <= 0) {
   1474         ASSERT(!count);
   1475         return;
   1476     }
   1477 
   1478     m_atBreak = false;
   1479 
   1480     int remaining = m_textIterator.length() - m_runOffset;
   1481     if (count < remaining) {
   1482         m_runOffset += count;
   1483         m_offset += count;
   1484         return;
   1485     }
   1486 
   1487     count -= remaining;
   1488     m_offset += remaining;
   1489 
   1490     for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
   1491         int runLength = m_textIterator.length();
   1492         if (runLength == 0)
   1493             m_atBreak = true;
   1494         else {
   1495             if (count < runLength) {
   1496                 m_runOffset = count;
   1497                 m_offset += count;
   1498                 return;
   1499             }
   1500 
   1501             count -= runLength;
   1502             m_offset += runLength;
   1503         }
   1504     }
   1505 
   1506     m_atBreak = true;
   1507     m_runOffset = 0;
   1508 }
   1509 
   1510 // --------
   1511 
   1512 WordAwareIterator::WordAwareIterator()
   1513     : m_previousText(0)
   1514     , m_didLookAhead(false)
   1515 {
   1516 }
   1517 
   1518 WordAwareIterator::WordAwareIterator(const Range* r)
   1519     : m_previousText(0)
   1520     , m_didLookAhead(true) // so we consider the first chunk from the text iterator
   1521     , m_textIterator(r)
   1522 {
   1523     advance(); // get in position over the first chunk of text
   1524 }
   1525 
   1526 WordAwareIterator::~WordAwareIterator()
   1527 {
   1528 }
   1529 
   1530 // We're always in one of these modes:
   1531 // - The current chunk in the text iterator is our current chunk
   1532 //      (typically its a piece of whitespace, or text that ended with whitespace)
   1533 // - The previous chunk in the text iterator is our current chunk
   1534 //      (we looked ahead to the next chunk and found a word boundary)
   1535 // - We built up our own chunk of text from many chunks from the text iterator
   1536 
   1537 // FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries.
   1538 
   1539 void WordAwareIterator::advance()
   1540 {
   1541     m_previousText = 0;
   1542     m_buffer.clear();      // toss any old buffer we built up
   1543 
   1544     // If last time we did a look-ahead, start with that looked-ahead chunk now
   1545     if (!m_didLookAhead) {
   1546         ASSERT(!m_textIterator.atEnd());
   1547         m_textIterator.advance();
   1548     }
   1549     m_didLookAhead = false;
   1550 
   1551     // Go to next non-empty chunk
   1552     while (!m_textIterator.atEnd() && m_textIterator.length() == 0)
   1553         m_textIterator.advance();
   1554     m_range = m_textIterator.range();
   1555 
   1556     if (m_textIterator.atEnd())
   1557         return;
   1558 
   1559     while (1) {
   1560         // If this chunk ends in whitespace we can just use it as our chunk.
   1561         if (isSpaceOrNewline(m_textIterator.characters()[m_textIterator.length() - 1]))
   1562             return;
   1563 
   1564         // If this is the first chunk that failed, save it in previousText before look ahead
   1565         if (m_buffer.isEmpty()) {
   1566             m_previousText = m_textIterator.characters();
   1567             m_previousLength = m_textIterator.length();
   1568         }
   1569 
   1570         // Look ahead to next chunk.  If it is whitespace or a break, we can use the previous stuff
   1571         m_textIterator.advance();
   1572         if (m_textIterator.atEnd() || m_textIterator.length() == 0 || isSpaceOrNewline(m_textIterator.characters()[0])) {
   1573             m_didLookAhead = true;
   1574             return;
   1575         }
   1576 
   1577         if (m_buffer.isEmpty()) {
   1578             // Start gobbling chunks until we get to a suitable stopping point
   1579             m_buffer.append(m_previousText, m_previousLength);
   1580             m_previousText = 0;
   1581         }
   1582         m_buffer.append(m_textIterator.characters(), m_textIterator.length());
   1583         int exception = 0;
   1584         m_range->setEnd(m_textIterator.range()->endContainer(), m_textIterator.range()->endOffset(), exception);
   1585     }
   1586 }
   1587 
   1588 int WordAwareIterator::length() const
   1589 {
   1590     if (!m_buffer.isEmpty())
   1591         return m_buffer.size();
   1592     if (m_previousText)
   1593         return m_previousLength;
   1594     return m_textIterator.length();
   1595 }
   1596 
   1597 const UChar* WordAwareIterator::characters() const
   1598 {
   1599     if (!m_buffer.isEmpty())
   1600         return m_buffer.data();
   1601     if (m_previousText)
   1602         return m_previousText;
   1603     return m_textIterator.characters();
   1604 }
   1605 
   1606 // --------
   1607 
   1608 static inline UChar foldQuoteMarkOrSoftHyphen(UChar c)
   1609 {
   1610     switch (c) {
   1611         case hebrewPunctuationGershayim:
   1612         case leftDoubleQuotationMark:
   1613         case rightDoubleQuotationMark:
   1614             return '"';
   1615         case hebrewPunctuationGeresh:
   1616         case leftSingleQuotationMark:
   1617         case rightSingleQuotationMark:
   1618             return '\'';
   1619         case softHyphen:
   1620             // Replace soft hyphen with an ignorable character so that their presence or absence will
   1621             // not affect string comparison.
   1622             return 0;
   1623         default:
   1624             return c;
   1625     }
   1626 }
   1627 
   1628 static inline void foldQuoteMarksAndSoftHyphens(String& s)
   1629 {
   1630     s.replace(hebrewPunctuationGeresh, '\'');
   1631     s.replace(hebrewPunctuationGershayim, '"');
   1632     s.replace(leftDoubleQuotationMark, '"');
   1633     s.replace(leftSingleQuotationMark, '\'');
   1634     s.replace(rightDoubleQuotationMark, '"');
   1635     s.replace(rightSingleQuotationMark, '\'');
   1636     // Replace soft hyphen with an ignorable character so that their presence or absence will
   1637     // not affect string comparison.
   1638     s.replace(softHyphen, 0);
   1639 }
   1640 
   1641 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
   1642 
   1643 static inline void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length)
   1644 {
   1645     for (size_t i = 0; i < length; ++i)
   1646         data[i] = foldQuoteMarkOrSoftHyphen(data[i]);
   1647 }
   1648 
   1649 static const size_t minimumSearchBufferSize = 8192;
   1650 
   1651 #ifndef NDEBUG
   1652 static bool searcherInUse;
   1653 #endif
   1654 
   1655 static UStringSearch* createSearcher()
   1656 {
   1657     // Provide a non-empty pattern and non-empty text so usearch_open will not fail,
   1658     // but it doesn't matter exactly what it is, since we don't perform any searches
   1659     // without setting both the pattern and the text.
   1660     UErrorCode status = U_ZERO_ERROR;
   1661     UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, currentSearchLocaleID(), 0, &status);
   1662     ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING);
   1663     return searcher;
   1664 }
   1665 
   1666 static UStringSearch* searcher()
   1667 {
   1668     static UStringSearch* searcher = createSearcher();
   1669     return searcher;
   1670 }
   1671 
   1672 static inline void lockSearcher()
   1673 {
   1674 #ifndef NDEBUG
   1675     ASSERT(!searcherInUse);
   1676     searcherInUse = true;
   1677 #endif
   1678 }
   1679 
   1680 static inline void unlockSearcher()
   1681 {
   1682 #ifndef NDEBUG
   1683     ASSERT(searcherInUse);
   1684     searcherInUse = false;
   1685 #endif
   1686 }
   1687 
   1688 // ICU's search ignores the distinction between small kana letters and ones
   1689 // that are not small, and also characters that differ only in the voicing
   1690 // marks when considering only primary collation strength diffrences.
   1691 // This is not helpful for end users, since these differences make words
   1692 // distinct, so for our purposes we need these to be considered.
   1693 // The Unicode folks do not think the collation algorithm should be
   1694 // changed. To work around this, we would like to tailor the ICU searcher,
   1695 // but we can't get that to work yet. So instead, we check for cases where
   1696 // these differences occur, and skip those matches.
   1697 
   1698 // We refer to the above technique as the "kana workaround". The next few
   1699 // functions are helper functinos for the kana workaround.
   1700 
   1701 static inline bool isKanaLetter(UChar character)
   1702 {
   1703     // Hiragana letters.
   1704     if (character >= 0x3041 && character <= 0x3096)
   1705         return true;
   1706 
   1707     // Katakana letters.
   1708     if (character >= 0x30A1 && character <= 0x30FA)
   1709         return true;
   1710     if (character >= 0x31F0 && character <= 0x31FF)
   1711         return true;
   1712 
   1713     // Halfwidth katakana letters.
   1714     if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70)
   1715         return true;
   1716 
   1717     return false;
   1718 }
   1719 
   1720 static inline bool isSmallKanaLetter(UChar character)
   1721 {
   1722     ASSERT(isKanaLetter(character));
   1723 
   1724     switch (character) {
   1725     case 0x3041: // HIRAGANA LETTER SMALL A
   1726     case 0x3043: // HIRAGANA LETTER SMALL I
   1727     case 0x3045: // HIRAGANA LETTER SMALL U
   1728     case 0x3047: // HIRAGANA LETTER SMALL E
   1729     case 0x3049: // HIRAGANA LETTER SMALL O
   1730     case 0x3063: // HIRAGANA LETTER SMALL TU
   1731     case 0x3083: // HIRAGANA LETTER SMALL YA
   1732     case 0x3085: // HIRAGANA LETTER SMALL YU
   1733     case 0x3087: // HIRAGANA LETTER SMALL YO
   1734     case 0x308E: // HIRAGANA LETTER SMALL WA
   1735     case 0x3095: // HIRAGANA LETTER SMALL KA
   1736     case 0x3096: // HIRAGANA LETTER SMALL KE
   1737     case 0x30A1: // KATAKANA LETTER SMALL A
   1738     case 0x30A3: // KATAKANA LETTER SMALL I
   1739     case 0x30A5: // KATAKANA LETTER SMALL U
   1740     case 0x30A7: // KATAKANA LETTER SMALL E
   1741     case 0x30A9: // KATAKANA LETTER SMALL O
   1742     case 0x30C3: // KATAKANA LETTER SMALL TU
   1743     case 0x30E3: // KATAKANA LETTER SMALL YA
   1744     case 0x30E5: // KATAKANA LETTER SMALL YU
   1745     case 0x30E7: // KATAKANA LETTER SMALL YO
   1746     case 0x30EE: // KATAKANA LETTER SMALL WA
   1747     case 0x30F5: // KATAKANA LETTER SMALL KA
   1748     case 0x30F6: // KATAKANA LETTER SMALL KE
   1749     case 0x31F0: // KATAKANA LETTER SMALL KU
   1750     case 0x31F1: // KATAKANA LETTER SMALL SI
   1751     case 0x31F2: // KATAKANA LETTER SMALL SU
   1752     case 0x31F3: // KATAKANA LETTER SMALL TO
   1753     case 0x31F4: // KATAKANA LETTER SMALL NU
   1754     case 0x31F5: // KATAKANA LETTER SMALL HA
   1755     case 0x31F6: // KATAKANA LETTER SMALL HI
   1756     case 0x31F7: // KATAKANA LETTER SMALL HU
   1757     case 0x31F8: // KATAKANA LETTER SMALL HE
   1758     case 0x31F9: // KATAKANA LETTER SMALL HO
   1759     case 0x31FA: // KATAKANA LETTER SMALL MU
   1760     case 0x31FB: // KATAKANA LETTER SMALL RA
   1761     case 0x31FC: // KATAKANA LETTER SMALL RI
   1762     case 0x31FD: // KATAKANA LETTER SMALL RU
   1763     case 0x31FE: // KATAKANA LETTER SMALL RE
   1764     case 0x31FF: // KATAKANA LETTER SMALL RO
   1765     case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A
   1766     case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I
   1767     case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U
   1768     case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E
   1769     case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O
   1770     case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA
   1771     case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU
   1772     case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO
   1773     case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU
   1774         return true;
   1775     }
   1776     return false;
   1777 }
   1778 
   1779 enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark };
   1780 
   1781 static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character)
   1782 {
   1783     ASSERT(isKanaLetter(character));
   1784 
   1785     switch (character) {
   1786     case 0x304C: // HIRAGANA LETTER GA
   1787     case 0x304E: // HIRAGANA LETTER GI
   1788     case 0x3050: // HIRAGANA LETTER GU
   1789     case 0x3052: // HIRAGANA LETTER GE
   1790     case 0x3054: // HIRAGANA LETTER GO
   1791     case 0x3056: // HIRAGANA LETTER ZA
   1792     case 0x3058: // HIRAGANA LETTER ZI
   1793     case 0x305A: // HIRAGANA LETTER ZU
   1794     case 0x305C: // HIRAGANA LETTER ZE
   1795     case 0x305E: // HIRAGANA LETTER ZO
   1796     case 0x3060: // HIRAGANA LETTER DA
   1797     case 0x3062: // HIRAGANA LETTER DI
   1798     case 0x3065: // HIRAGANA LETTER DU
   1799     case 0x3067: // HIRAGANA LETTER DE
   1800     case 0x3069: // HIRAGANA LETTER DO
   1801     case 0x3070: // HIRAGANA LETTER BA
   1802     case 0x3073: // HIRAGANA LETTER BI
   1803     case 0x3076: // HIRAGANA LETTER BU
   1804     case 0x3079: // HIRAGANA LETTER BE
   1805     case 0x307C: // HIRAGANA LETTER BO
   1806     case 0x3094: // HIRAGANA LETTER VU
   1807     case 0x30AC: // KATAKANA LETTER GA
   1808     case 0x30AE: // KATAKANA LETTER GI
   1809     case 0x30B0: // KATAKANA LETTER GU
   1810     case 0x30B2: // KATAKANA LETTER GE
   1811     case 0x30B4: // KATAKANA LETTER GO
   1812     case 0x30B6: // KATAKANA LETTER ZA
   1813     case 0x30B8: // KATAKANA LETTER ZI
   1814     case 0x30BA: // KATAKANA LETTER ZU
   1815     case 0x30BC: // KATAKANA LETTER ZE
   1816     case 0x30BE: // KATAKANA LETTER ZO
   1817     case 0x30C0: // KATAKANA LETTER DA
   1818     case 0x30C2: // KATAKANA LETTER DI
   1819     case 0x30C5: // KATAKANA LETTER DU
   1820     case 0x30C7: // KATAKANA LETTER DE
   1821     case 0x30C9: // KATAKANA LETTER DO
   1822     case 0x30D0: // KATAKANA LETTER BA
   1823     case 0x30D3: // KATAKANA LETTER BI
   1824     case 0x30D6: // KATAKANA LETTER BU
   1825     case 0x30D9: // KATAKANA LETTER BE
   1826     case 0x30DC: // KATAKANA LETTER BO
   1827     case 0x30F4: // KATAKANA LETTER VU
   1828     case 0x30F7: // KATAKANA LETTER VA
   1829     case 0x30F8: // KATAKANA LETTER VI
   1830     case 0x30F9: // KATAKANA LETTER VE
   1831     case 0x30FA: // KATAKANA LETTER VO
   1832         return VoicedSoundMark;
   1833     case 0x3071: // HIRAGANA LETTER PA
   1834     case 0x3074: // HIRAGANA LETTER PI
   1835     case 0x3077: // HIRAGANA LETTER PU
   1836     case 0x307A: // HIRAGANA LETTER PE
   1837     case 0x307D: // HIRAGANA LETTER PO
   1838     case 0x30D1: // KATAKANA LETTER PA
   1839     case 0x30D4: // KATAKANA LETTER PI
   1840     case 0x30D7: // KATAKANA LETTER PU
   1841     case 0x30DA: // KATAKANA LETTER PE
   1842     case 0x30DD: // KATAKANA LETTER PO
   1843         return SemiVoicedSoundMark;
   1844     }
   1845     return NoVoicedSoundMark;
   1846 }
   1847 
   1848 static inline bool isCombiningVoicedSoundMark(UChar character)
   1849 {
   1850     switch (character) {
   1851     case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
   1852     case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
   1853         return true;
   1854     }
   1855     return false;
   1856 }
   1857 
   1858 static inline bool containsKanaLetters(const String& pattern)
   1859 {
   1860     const UChar* characters = pattern.characters();
   1861     unsigned length = pattern.length();
   1862     for (unsigned i = 0; i < length; ++i) {
   1863         if (isKanaLetter(characters[i]))
   1864             return true;
   1865     }
   1866     return false;
   1867 }
   1868 
   1869 static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer)
   1870 {
   1871     ASSERT(length);
   1872 
   1873     buffer.resize(length);
   1874 
   1875     UErrorCode status = U_ZERO_ERROR;
   1876     size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status);
   1877     ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR);
   1878     ASSERT(bufferSize);
   1879 
   1880     buffer.resize(bufferSize);
   1881 
   1882     if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING)
   1883         return;
   1884 
   1885     status = U_ZERO_ERROR;
   1886     unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status);
   1887     ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
   1888 }
   1889 
   1890 static bool isNonLatin1Separator(UChar32 character)
   1891 {
   1892     ASSERT_ARG(character, character >= 256);
   1893 
   1894     return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK);
   1895 }
   1896 
   1897 static inline bool isSeparator(UChar32 character)
   1898 {
   1899     static const bool latin1SeparatorTable[256] = {
   1900         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1901         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1902         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . /
   1903         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, //                         : ; < = > ?
   1904         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   @
   1905         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, //                         [ \ ] ^ _
   1906         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   `
   1907         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, //                           { | } ~
   1908         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1909         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1910         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
   1911         1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
   1912         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1913         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
   1914         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1915         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
   1916     };
   1917 
   1918     if (character < 256)
   1919         return latin1SeparatorTable[character];
   1920 
   1921     return isNonLatin1Separator(character);
   1922 }
   1923 
   1924 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
   1925     : m_target(target)
   1926     , m_options(options)
   1927     , m_prefixLength(0)
   1928     , m_atBreak(true)
   1929     , m_needsMoreContext(options & AtWordStarts)
   1930     , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target))
   1931 {
   1932     ASSERT(!m_target.isEmpty());
   1933 
   1934     // FIXME: We'd like to tailor the searcher to fold quote marks for us instead
   1935     // of doing it in a separate replacement pass here, but ICU doesn't offer a way
   1936     // to add tailoring on top of the locale-specific tailoring as of this writing.
   1937     foldQuoteMarksAndSoftHyphens(m_target);
   1938 
   1939     size_t targetLength = m_target.length();
   1940     m_buffer.reserveInitialCapacity(max(targetLength * 8, minimumSearchBufferSize));
   1941     m_overlap = m_buffer.capacity() / 4;
   1942 
   1943     if ((m_options & AtWordStarts) && targetLength) {
   1944         UChar32 targetFirstCharacter;
   1945         U16_GET(m_target.characters(), 0, 0, targetLength, targetFirstCharacter);
   1946         // Characters in the separator category never really occur at the beginning of a word,
   1947         // so if the target begins with such a character, we just ignore the AtWordStart option.
   1948         if (isSeparator(targetFirstCharacter)) {
   1949             m_options &= ~AtWordStarts;
   1950             m_needsMoreContext = false;
   1951         }
   1952     }
   1953 
   1954     // Grab the single global searcher.
   1955     // If we ever have a reason to do more than once search buffer at once, we'll have
   1956     // to move to multiple searchers.
   1957     lockSearcher();
   1958 
   1959     UStringSearch* searcher = WebCore::searcher();
   1960     UCollator* collator = usearch_getCollator(searcher);
   1961 
   1962     UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY;
   1963     if (ucol_getStrength(collator) != strength) {
   1964         ucol_setStrength(collator, strength);
   1965         usearch_reset(searcher);
   1966     }
   1967 
   1968     UErrorCode status = U_ZERO_ERROR;
   1969     usearch_setPattern(searcher, m_target.characters(), targetLength, &status);
   1970     ASSERT(status == U_ZERO_ERROR);
   1971 
   1972     // The kana workaround requires a normalized copy of the target string.
   1973     if (m_targetRequiresKanaWorkaround)
   1974         normalizeCharacters(m_target.characters(), m_target.length(), m_normalizedTarget);
   1975 }
   1976 
   1977 inline SearchBuffer::~SearchBuffer()
   1978 {
   1979     // Leave the static object pointing to a valid string.
   1980     UErrorCode status = U_ZERO_ERROR;
   1981     usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status);
   1982     ASSERT(status == U_ZERO_ERROR);
   1983 
   1984     unlockSearcher();
   1985 }
   1986 
   1987 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
   1988 {
   1989     ASSERT(length);
   1990 
   1991     if (m_atBreak) {
   1992         m_buffer.shrink(0);
   1993         m_prefixLength = 0;
   1994         m_atBreak = false;
   1995     } else if (m_buffer.size() == m_buffer.capacity()) {
   1996         memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
   1997         m_prefixLength -= min(m_prefixLength, m_buffer.size() - m_overlap);
   1998         m_buffer.shrink(m_overlap);
   1999     }
   2000 
   2001     size_t oldLength = m_buffer.size();
   2002     size_t usableLength = min(m_buffer.capacity() - oldLength, length);
   2003     ASSERT(usableLength);
   2004     m_buffer.append(characters, usableLength);
   2005     foldQuoteMarksAndSoftHyphens(m_buffer.data() + oldLength, usableLength);
   2006     return usableLength;
   2007 }
   2008 
   2009 inline bool SearchBuffer::needsMoreContext() const
   2010 {
   2011     return m_needsMoreContext;
   2012 }
   2013 
   2014 inline void SearchBuffer::prependContext(const UChar* characters, size_t length)
   2015 {
   2016     ASSERT(m_needsMoreContext);
   2017     ASSERT(m_prefixLength == m_buffer.size());
   2018 
   2019     if (!length)
   2020         return;
   2021 
   2022     m_atBreak = false;
   2023 
   2024     size_t wordBoundaryContextStart = length;
   2025     if (wordBoundaryContextStart) {
   2026         U16_BACK_1(characters, 0, wordBoundaryContextStart);
   2027         wordBoundaryContextStart = startOfLastWordBoundaryContext(characters, wordBoundaryContextStart);
   2028     }
   2029 
   2030     size_t usableLength = min(m_buffer.capacity() - m_prefixLength, length - wordBoundaryContextStart);
   2031     m_buffer.prepend(characters + length - usableLength, usableLength);
   2032     m_prefixLength += usableLength;
   2033 
   2034     if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity())
   2035         m_needsMoreContext = false;
   2036 }
   2037 
   2038 inline bool SearchBuffer::atBreak() const
   2039 {
   2040     return m_atBreak;
   2041 }
   2042 
   2043 inline void SearchBuffer::reachedBreak()
   2044 {
   2045     m_atBreak = true;
   2046 }
   2047 
   2048 inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const
   2049 {
   2050     // This function implements the kana workaround. If usearch treats
   2051     // it as a match, but we do not want to, then it's a "bad match".
   2052     if (!m_targetRequiresKanaWorkaround)
   2053         return false;
   2054 
   2055     // Normalize into a match buffer. We reuse a single buffer rather than
   2056     // creating a new one each time.
   2057     normalizeCharacters(match, matchLength, m_normalizedMatch);
   2058 
   2059     const UChar* a = m_normalizedTarget.begin();
   2060     const UChar* aEnd = m_normalizedTarget.end();
   2061 
   2062     const UChar* b = m_normalizedMatch.begin();
   2063     const UChar* bEnd = m_normalizedMatch.end();
   2064 
   2065     while (true) {
   2066         // Skip runs of non-kana-letter characters. This is necessary so we can
   2067         // correctly handle strings where the target and match have different-length
   2068         // runs of characters that match, while still double checking the correctness
   2069         // of matches of kana letters with other kana letters.
   2070         while (a != aEnd && !isKanaLetter(*a))
   2071             ++a;
   2072         while (b != bEnd && !isKanaLetter(*b))
   2073             ++b;
   2074 
   2075         // If we reached the end of either the target or the match, we should have
   2076         // reached the end of both; both should have the same number of kana letters.
   2077         if (a == aEnd || b == bEnd) {
   2078             ASSERT(a == aEnd);
   2079             ASSERT(b == bEnd);
   2080             return false;
   2081         }
   2082 
   2083         // Check for differences in the kana letter character itself.
   2084         if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b))
   2085             return true;
   2086         if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b))
   2087             return true;
   2088         ++a;
   2089         ++b;
   2090 
   2091         // Check for differences in combining voiced sound marks found after the letter.
   2092         while (1) {
   2093             if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) {
   2094                 if (b != bEnd && isCombiningVoicedSoundMark(*b))
   2095                     return true;
   2096                 break;
   2097             }
   2098             if (!(b != bEnd && isCombiningVoicedSoundMark(*b)))
   2099                 return true;
   2100             if (*a != *b)
   2101                 return true;
   2102             ++a;
   2103             ++b;
   2104         }
   2105     }
   2106 }
   2107 
   2108 inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
   2109 {
   2110     ASSERT(m_options & AtWordStarts);
   2111 
   2112     if (!start)
   2113         return true;
   2114 
   2115     if (m_options & TreatMedialCapitalAsWordStart) {
   2116         int size = m_buffer.size();
   2117         int offset = start;
   2118         UChar32 firstCharacter;
   2119         U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);
   2120         UChar32 previousCharacter;
   2121         U16_PREV(m_buffer.data(), 0, offset, previousCharacter);
   2122 
   2123         if (isSeparator(firstCharacter)) {
   2124             // The start of a separator run is a word start (".org" in "webkit.org").
   2125             if (!isSeparator(previousCharacter))
   2126                 return true;
   2127         } else if (isASCIIUpper(firstCharacter)) {
   2128             // The start of an uppercase run is a word start ("Kit" in "WebKit").
   2129             if (!isASCIIUpper(previousCharacter))
   2130                 return true;
   2131             // The last character of an uppercase run followed by a non-separator, non-digit
   2132             // is a word start ("Request" in "XMLHTTPRequest").
   2133             offset = start;
   2134             U16_FWD_1(m_buffer.data(), offset, size);
   2135             UChar32 nextCharacter = 0;
   2136             if (offset < size)
   2137                 U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
   2138             if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
   2139                 return true;
   2140         } else if (isASCIIDigit(firstCharacter)) {
   2141             // The start of a digit run is a word start ("2" in "WebKit2").
   2142             if (!isASCIIDigit(previousCharacter))
   2143                 return true;
   2144         } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) {
   2145             // The start of a non-separator, non-uppercase, non-digit run is a word start,
   2146             // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
   2147             return true;
   2148         }
   2149     }
   2150 
   2151     size_t wordBreakSearchStart = start + length;
   2152     while (wordBreakSearchStart > start)
   2153         wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */);
   2154     return wordBreakSearchStart == start;
   2155 }
   2156 
   2157 inline size_t SearchBuffer::search(size_t& start)
   2158 {
   2159     size_t size = m_buffer.size();
   2160     if (m_atBreak) {
   2161         if (!size)
   2162             return 0;
   2163     } else {
   2164         if (size != m_buffer.capacity())
   2165             return 0;
   2166     }
   2167 
   2168     UStringSearch* searcher = WebCore::searcher();
   2169 
   2170     UErrorCode status = U_ZERO_ERROR;
   2171     usearch_setText(searcher, m_buffer.data(), size, &status);
   2172     ASSERT(status == U_ZERO_ERROR);
   2173 
   2174     usearch_setOffset(searcher, m_prefixLength, &status);
   2175     ASSERT(status == U_ZERO_ERROR);
   2176 
   2177     int matchStart = usearch_next(searcher, &status);
   2178     ASSERT(status == U_ZERO_ERROR);
   2179 
   2180 nextMatch:
   2181     if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) {
   2182         ASSERT(matchStart == USEARCH_DONE);
   2183         return 0;
   2184     }
   2185 
   2186     // Matches that start in the overlap area are only tentative.
   2187     // The same match may appear later, matching more characters,
   2188     // possibly including a combining character that's not yet in the buffer.
   2189     if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
   2190         size_t overlap = m_overlap;
   2191         if (m_options & AtWordStarts) {
   2192             // Ensure that there is sufficient context before matchStart the next time around for
   2193             // determining if it is at a word boundary.
   2194             int wordBoundaryContextStart = matchStart;
   2195             U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart);
   2196             wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart);
   2197             overlap = min(size - 1, max(overlap, size - wordBoundaryContextStart));
   2198         }
   2199         memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar));
   2200         m_prefixLength -= min(m_prefixLength, size - overlap);
   2201         m_buffer.shrink(overlap);
   2202         return 0;
   2203     }
   2204 
   2205     size_t matchedLength = usearch_getMatchedLength(searcher);
   2206     ASSERT(matchStart + matchedLength <= size);
   2207 
   2208     // If this match is "bad", move on to the next match.
   2209     if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) {
   2210         matchStart = usearch_next(searcher, &status);
   2211         ASSERT(status == U_ZERO_ERROR);
   2212         goto nextMatch;
   2213     }
   2214 
   2215     size_t newSize = size - (matchStart + 1);
   2216     memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar));
   2217     m_prefixLength -= min<size_t>(m_prefixLength, matchStart + 1);
   2218     m_buffer.shrink(newSize);
   2219 
   2220     start = size - matchStart;
   2221     return matchedLength;
   2222 }
   2223 
   2224 #else // !ICU_UNICODE
   2225 
   2226 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
   2227     : m_target(options & CaseInsensitive ? target.foldCase() : target)
   2228     , m_options(options)
   2229     , m_buffer(m_target.length())
   2230     , m_isCharacterStartBuffer(m_target.length())
   2231     , m_isBufferFull(false)
   2232     , m_cursor(0)
   2233 {
   2234     ASSERT(!m_target.isEmpty());
   2235     m_target.replace(noBreakSpace, ' ');
   2236     foldQuoteMarksAndSoftHyphens(m_target);
   2237 }
   2238 
   2239 inline SearchBuffer::~SearchBuffer()
   2240 {
   2241 }
   2242 
   2243 inline void SearchBuffer::reachedBreak()
   2244 {
   2245     m_cursor = 0;
   2246     m_isBufferFull = false;
   2247 }
   2248 
   2249 inline bool SearchBuffer::atBreak() const
   2250 {
   2251     return !m_cursor && !m_isBufferFull;
   2252 }
   2253 
   2254 inline void SearchBuffer::append(UChar c, bool isStart)
   2255 {
   2256     m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMarkOrSoftHyphen(c);
   2257     m_isCharacterStartBuffer[m_cursor] = isStart;
   2258     if (++m_cursor == m_target.length()) {
   2259         m_cursor = 0;
   2260         m_isBufferFull = true;
   2261     }
   2262 }
   2263 
   2264 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
   2265 {
   2266     ASSERT(length);
   2267     if (!(m_options & CaseInsensitive)) {
   2268         append(characters[0], true);
   2269         return 1;
   2270     }
   2271     const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough
   2272     UChar foldedCharacters[maxFoldedCharacters];
   2273     bool error;
   2274     int numFoldedCharacters = foldCase(foldedCharacters, maxFoldedCharacters, characters, 1, &error);
   2275     ASSERT(!error);
   2276     ASSERT(numFoldedCharacters);
   2277     ASSERT(numFoldedCharacters <= maxFoldedCharacters);
   2278     if (!error && numFoldedCharacters) {
   2279         numFoldedCharacters = min(numFoldedCharacters, maxFoldedCharacters);
   2280         append(foldedCharacters[0], true);
   2281         for (int i = 1; i < numFoldedCharacters; ++i)
   2282             append(foldedCharacters[i], false);
   2283     }
   2284     return 1;
   2285 }
   2286 
   2287 inline bool SearchBuffer::needsMoreContext() const
   2288 {
   2289     return false;
   2290 }
   2291 
   2292 void SearchBuffer::prependContext(const UChar*, size_t)
   2293 {
   2294     ASSERT_NOT_REACHED();
   2295 }
   2296 
   2297 inline size_t SearchBuffer::search(size_t& start)
   2298 {
   2299     if (!m_isBufferFull)
   2300         return 0;
   2301     if (!m_isCharacterStartBuffer[m_cursor])
   2302         return 0;
   2303 
   2304     size_t tailSpace = m_target.length() - m_cursor;
   2305     if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0)
   2306         return 0;
   2307     if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0)
   2308         return 0;
   2309 
   2310     start = length();
   2311 
   2312     // Now that we've found a match once, we don't want to find it again, because those
   2313     // are the SearchBuffer semantics, allowing for a buffer where you append more than one
   2314     // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if
   2315     // we want to get rid of that in the future we could track this with a separate boolean
   2316     // or even move the characters to the start of the buffer and set m_isBufferFull to false.
   2317     m_isCharacterStartBuffer[m_cursor] = false;
   2318 
   2319     return start;
   2320 }
   2321 
   2322 // Returns the number of characters that were appended to the buffer (what we are searching in).
   2323 // That's not necessarily the same length as the passed-in target string, because case folding
   2324 // can make two strings match even though they're not the same length.
   2325 size_t SearchBuffer::length() const
   2326 {
   2327     size_t bufferSize = m_target.length();
   2328     size_t length = 0;
   2329     for (size_t i = 0; i < bufferSize; ++i)
   2330         length += m_isCharacterStartBuffer[i];
   2331     return length;
   2332 }
   2333 
   2334 #endif // !ICU_UNICODE
   2335 
   2336 // --------
   2337 
   2338 int TextIterator::rangeLength(const Range* r, bool forSelectionPreservation)
   2339 {
   2340     int length = 0;
   2341     for (TextIterator it(r, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance())
   2342         length += it.length();
   2343 
   2344     return length;
   2345 }
   2346 
   2347 PassRefPtr<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount)
   2348 {
   2349     CharacterIterator entireRangeIterator(entireRange);
   2350     return characterSubrange(entireRangeIterator, characterOffset, characterCount);
   2351 }
   2352 
   2353 PassRefPtr<Range> TextIterator::rangeFromLocationAndLength(Element* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation)
   2354 {
   2355     RefPtr<Range> resultRange = scope->document()->createRange();
   2356 
   2357     int docTextPosition = 0;
   2358     int rangeEnd = rangeLocation + rangeLength;
   2359     bool startRangeFound = false;
   2360 
   2361     RefPtr<Range> textRunRange;
   2362 
   2363     TextIterator it(rangeOfContents(scope).get(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior);
   2364 
   2365     // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>.
   2366     if (rangeLocation == 0 && rangeLength == 0 && it.atEnd()) {
   2367         textRunRange = it.range();
   2368 
   2369         ExceptionCode ec = 0;
   2370         resultRange->setStart(textRunRange->startContainer(), 0, ec);
   2371         ASSERT(!ec);
   2372         resultRange->setEnd(textRunRange->startContainer(), 0, ec);
   2373         ASSERT(!ec);
   2374 
   2375         return resultRange.release();
   2376     }
   2377 
   2378     for (; !it.atEnd(); it.advance()) {
   2379         int len = it.length();
   2380         textRunRange = it.range();
   2381 
   2382         bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + len;
   2383         bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + len;
   2384 
   2385         // Fix textRunRange->endPosition(), but only if foundStart || foundEnd, because it is only
   2386         // in those cases that textRunRange is used.
   2387         if (foundEnd) {
   2388             // FIXME: This is a workaround for the fact that the end of a run is often at the wrong
   2389             // position for emitted '\n's.
   2390             if (len == 1 && it.characters()[0] == '\n') {
   2391                 scope->document()->updateLayoutIgnorePendingStylesheets();
   2392                 it.advance();
   2393                 if (!it.atEnd()) {
   2394                     RefPtr<Range> range = it.range();
   2395                     ExceptionCode ec = 0;
   2396                     textRunRange->setEnd(range->startContainer(), range->startOffset(), ec);
   2397                     ASSERT(!ec);
   2398                 } else {
   2399                     Position runStart = textRunRange->startPosition();
   2400                     Position runEnd = VisiblePosition(runStart).next().deepEquivalent();
   2401                     if (runEnd.isNotNull()) {
   2402                         ExceptionCode ec = 0;
   2403                         textRunRange->setEnd(runEnd.containerNode(), runEnd.computeOffsetInContainerNode(), ec);
   2404                         ASSERT(!ec);
   2405                     }
   2406                 }
   2407             }
   2408         }
   2409 
   2410         if (foundStart) {
   2411             startRangeFound = true;
   2412             int exception = 0;
   2413             if (textRunRange->startContainer()->isTextNode()) {
   2414                 int offset = rangeLocation - docTextPosition;
   2415                 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
   2416             } else {
   2417                 if (rangeLocation == docTextPosition)
   2418                     resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset(), exception);
   2419                 else
   2420                     resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset(), exception);
   2421             }
   2422         }
   2423 
   2424         if (foundEnd) {
   2425             int exception = 0;
   2426             if (textRunRange->startContainer()->isTextNode()) {
   2427                 int offset = rangeEnd - docTextPosition;
   2428                 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
   2429             } else {
   2430                 if (rangeEnd == docTextPosition)
   2431                     resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset(), exception);
   2432                 else
   2433                     resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
   2434             }
   2435             docTextPosition += len;
   2436             break;
   2437         }
   2438         docTextPosition += len;
   2439     }
   2440 
   2441     if (!startRangeFound)
   2442         return 0;
   2443 
   2444     if (rangeLength != 0 && rangeEnd > docTextPosition) { // rangeEnd is out of bounds
   2445         int exception = 0;
   2446         resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
   2447     }
   2448 
   2449     return resultRange.release();
   2450 }
   2451 
   2452 bool TextIterator::locationAndLengthFromRange(const Range* range, size_t& location, size_t& length)
   2453 {
   2454     location = notFound;
   2455     length = 0;
   2456 
   2457     if (!range->startContainer())
   2458         return false;
   2459 
   2460     Element* selectionRoot = range->ownerDocument()->frame()->selection()->rootEditableElement();
   2461     Element* scope = selectionRoot ? selectionRoot : range->ownerDocument()->documentElement();
   2462 
   2463     // The critical assumption is that this only gets called with ranges that
   2464     // concentrate on a given area containing the selection root. This is done
   2465     // because of text fields and textareas. The DOM for those is not
   2466     // directly in the document DOM, so ensure that the range does not cross a
   2467     // boundary of one of those.
   2468     if (range->startContainer() != scope && !range->startContainer()->isDescendantOf(scope))
   2469         return false;
   2470     if (range->endContainer() != scope && !range->endContainer()->isDescendantOf(scope))
   2471         return false;
   2472 
   2473     RefPtr<Range> testRange = Range::create(scope->document(), scope, 0, range->startContainer(), range->startOffset());
   2474     ASSERT(testRange->startContainer() == scope);
   2475     location = TextIterator::rangeLength(testRange.get());
   2476 
   2477     ExceptionCode ec;
   2478     testRange->setEnd(range->endContainer(), range->endOffset(), ec);
   2479     ASSERT(testRange->startContainer() == scope);
   2480     length = TextIterator::rangeLength(testRange.get()) - location;
   2481     return true;
   2482 }
   2483 
   2484 // --------
   2485 
   2486 UChar* plainTextToMallocAllocatedBuffer(const Range* r, unsigned& bufferLength, bool isDisplayString, TextIteratorBehavior defaultBehavior)
   2487 {
   2488     UChar* result = 0;
   2489 
   2490     // Do this in pieces to avoid massive reallocations if there is a large amount of text.
   2491     // Use system malloc for buffers since they can consume lots of memory and current TCMalloc is unable return it back to OS.
   2492     static const unsigned cMaxSegmentSize = 1 << 16;
   2493     bufferLength = 0;
   2494     typedef pair<UChar*, unsigned> TextSegment;
   2495     OwnPtr<Vector<TextSegment> > textSegments;
   2496     Vector<UChar> textBuffer;
   2497     textBuffer.reserveInitialCapacity(cMaxSegmentSize);
   2498     TextIteratorBehavior behavior = defaultBehavior;
   2499     if (!isDisplayString)
   2500         behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding);
   2501 
   2502     for (TextIterator it(r, behavior); !it.atEnd(); it.advance()) {
   2503         if (textBuffer.size() && textBuffer.size() + it.length() > cMaxSegmentSize) {
   2504             UChar* newSegmentBuffer = static_cast<UChar*>(malloc(textBuffer.size() * sizeof(UChar)));
   2505             if (!newSegmentBuffer)
   2506                 goto exit;
   2507             memcpy(newSegmentBuffer, textBuffer.data(), textBuffer.size() * sizeof(UChar));
   2508             if (!textSegments)
   2509                 textSegments = adoptPtr(new Vector<TextSegment>);
   2510             textSegments->append(make_pair(newSegmentBuffer, (unsigned)textBuffer.size()));
   2511             textBuffer.clear();
   2512         }
   2513         textBuffer.append(it.characters(), it.length());
   2514         bufferLength += it.length();
   2515     }
   2516 
   2517     if (!bufferLength)
   2518         return 0;
   2519 
   2520     // Since we know the size now, we can make a single buffer out of the pieces with one big alloc
   2521     result = static_cast<UChar*>(malloc(bufferLength * sizeof(UChar)));
   2522     if (!result)
   2523         goto exit;
   2524 
   2525     {
   2526         UChar* resultPos = result;
   2527         if (textSegments) {
   2528             unsigned size = textSegments->size();
   2529             for (unsigned i = 0; i < size; ++i) {
   2530                 const TextSegment& segment = textSegments->at(i);
   2531                 memcpy(resultPos, segment.first, segment.second * sizeof(UChar));
   2532                 resultPos += segment.second;
   2533             }
   2534         }
   2535         memcpy(resultPos, textBuffer.data(), textBuffer.size() * sizeof(UChar));
   2536     }
   2537 
   2538 exit:
   2539     if (textSegments) {
   2540         unsigned size = textSegments->size();
   2541         for (unsigned i = 0; i < size; ++i)
   2542             free(textSegments->at(i).first);
   2543     }
   2544 
   2545     if (isDisplayString && r->ownerDocument())
   2546         r->ownerDocument()->displayBufferModifiedByEncoding(result, bufferLength);
   2547 
   2548     return result;
   2549 }
   2550 
   2551 String plainText(const Range* r, TextIteratorBehavior defaultBehavior)
   2552 {
   2553     unsigned length;
   2554     UChar* buf = plainTextToMallocAllocatedBuffer(r, length, false, defaultBehavior);
   2555     if (!buf)
   2556         return "";
   2557     String result(buf, length);
   2558     free(buf);
   2559     return result;
   2560 }
   2561 
   2562 static inline bool isAllCollapsibleWhitespace(const String& string)
   2563 {
   2564     const UChar* characters = string.characters();
   2565     unsigned length = string.length();
   2566     for (unsigned i = 0; i < length; ++i) {
   2567         if (!isCollapsibleWhitespace(characters[i]))
   2568             return false;
   2569     }
   2570     return true;
   2571 }
   2572 
   2573 static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward)
   2574 {
   2575     ExceptionCode ec = 0;
   2576     RefPtr<Range> result = range->cloneRange(ec);
   2577     ASSERT(!ec);
   2578     result->collapse(!forward, ec);
   2579     ASSERT(!ec);
   2580     return result.release();
   2581 }
   2582 
   2583 static size_t findPlainText(CharacterIterator& it, const String& target, FindOptions options, size_t& matchStart)
   2584 {
   2585     matchStart = 0;
   2586     size_t matchLength = 0;
   2587 
   2588     SearchBuffer buffer(target, options);
   2589 
   2590     if (buffer.needsMoreContext()) {
   2591         RefPtr<Range> startRange = it.range();
   2592         RefPtr<Range> beforeStartRange = startRange->ownerDocument()->createRange();
   2593         ExceptionCode ec = 0;
   2594         beforeStartRange->setEnd(startRange->startContainer(), startRange->startOffset(), ec);
   2595         for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) {
   2596             buffer.prependContext(backwardsIterator.characters(), backwardsIterator.length());
   2597             if (!buffer.needsMoreContext())
   2598                 break;
   2599         }
   2600     }
   2601 
   2602     while (!it.atEnd()) {
   2603         it.advance(buffer.append(it.characters(), it.length()));
   2604 tryAgain:
   2605         size_t matchStartOffset;
   2606         if (size_t newMatchLength = buffer.search(matchStartOffset)) {
   2607             // Note that we found a match, and where we found it.
   2608             size_t lastCharacterInBufferOffset = it.characterOffset();
   2609             ASSERT(lastCharacterInBufferOffset >= matchStartOffset);
   2610             matchStart = lastCharacterInBufferOffset - matchStartOffset;
   2611             matchLength = newMatchLength;
   2612             // If searching forward, stop on the first match.
   2613             // If searching backward, don't stop, so we end up with the last match.
   2614             if (!(options & Backwards))
   2615                 break;
   2616             goto tryAgain;
   2617         }
   2618         if (it.atBreak() && !buffer.atBreak()) {
   2619             buffer.reachedBreak();
   2620             goto tryAgain;
   2621         }
   2622     }
   2623 
   2624     return matchLength;
   2625 }
   2626 
   2627 PassRefPtr<Range> findPlainText(const Range* range, const String& target, FindOptions options)
   2628 {
   2629     // First, find the text.
   2630     size_t matchStart;
   2631     size_t matchLength;
   2632     {
   2633         CharacterIterator findIterator(range, TextIteratorEntersTextControls);
   2634         matchLength = findPlainText(findIterator, target, options, matchStart);
   2635         if (!matchLength)
   2636             return collapsedToBoundary(range, !(options & Backwards));
   2637     }
   2638 
   2639     // Then, find the document position of the start and the end of the text.
   2640     CharacterIterator computeRangeIterator(range, TextIteratorEntersTextControls);
   2641     return characterSubrange(computeRangeIterator, matchStart, matchLength);
   2642 }
   2643 
   2644 }
   2645