1 /* 2 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010 Apple Inc. All rights reserved. 3 * Copyright (C) 2005 Alexey Proskuryakov. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include "config.h" 28 #include "TextIterator.h" 29 30 #include "Document.h" 31 #include "Frame.h" 32 #include "HTMLElement.h" 33 #include "HTMLNames.h" 34 #include "htmlediting.h" 35 #include "InlineTextBox.h" 36 #include "Range.h" 37 #include "RenderTableCell.h" 38 #include "RenderTableRow.h" 39 #include "RenderTextControl.h" 40 #include "RenderTextFragment.h" 41 #include "TextBoundaries.h" 42 #include "TextBreakIterator.h" 43 #include "VisiblePosition.h" 44 #include "visible_units.h" 45 #include <wtf/unicode/CharacterNames.h> 46 47 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION 48 #include "TextBreakIteratorInternalICU.h" 49 #include <unicode/usearch.h> 50 #endif 51 52 using namespace WTF::Unicode; 53 using namespace std; 54 55 namespace WebCore { 56 57 using namespace HTMLNames; 58 59 // Buffer that knows how to compare with a search target. 60 // Keeps enough of the previous text to be able to search in the future, but no more. 61 // Non-breaking spaces are always equal to normal spaces. 62 // Case folding is also done if the CaseInsensitive option is specified. 63 // Matches are further filtered if the AtWordStarts option is specified, although some 64 // matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well. 65 class SearchBuffer { 66 WTF_MAKE_NONCOPYABLE(SearchBuffer); 67 public: 68 SearchBuffer(const String& target, FindOptions); 69 ~SearchBuffer(); 70 71 // Returns number of characters appended; guaranteed to be in the range [1, length]. 72 size_t append(const UChar*, size_t length); 73 bool needsMoreContext() const; 74 void prependContext(const UChar*, size_t length); 75 void reachedBreak(); 76 77 // Result is the size in characters of what was found. 78 // And <startOffset> is the number of characters back to the start of what was found. 79 size_t search(size_t& startOffset); 80 bool atBreak() const; 81 82 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION 83 84 private: 85 bool isBadMatch(const UChar*, size_t length) const; 86 bool isWordStartMatch(size_t start, size_t length) const; 87 88 String m_target; 89 FindOptions m_options; 90 91 Vector<UChar> m_buffer; 92 size_t m_overlap; 93 size_t m_prefixLength; 94 bool m_atBreak; 95 bool m_needsMoreContext; 96 97 bool m_targetRequiresKanaWorkaround; 98 Vector<UChar> m_normalizedTarget; 99 mutable Vector<UChar> m_normalizedMatch; 100 101 #else 102 103 private: 104 void append(UChar, bool isCharacterStart); 105 size_t length() const; 106 107 String m_target; 108 FindOptions m_options; 109 110 Vector<UChar> m_buffer; 111 Vector<bool> m_isCharacterStartBuffer; 112 bool m_isBufferFull; 113 size_t m_cursor; 114 115 #endif 116 }; 117 118 // -------- 119 120 static const unsigned bitsInWord = sizeof(unsigned) * 8; 121 static const unsigned bitInWordMask = bitsInWord - 1; 122 123 BitStack::BitStack() 124 : m_size(0) 125 { 126 } 127 128 BitStack::~BitStack() 129 { 130 } 131 132 void BitStack::push(bool bit) 133 { 134 unsigned index = m_size / bitsInWord; 135 unsigned shift = m_size & bitInWordMask; 136 if (!shift && index == m_words.size()) { 137 m_words.grow(index + 1); 138 m_words[index] = 0; 139 } 140 unsigned& word = m_words[index]; 141 unsigned mask = 1U << shift; 142 if (bit) 143 word |= mask; 144 else 145 word &= ~mask; 146 ++m_size; 147 } 148 149 void BitStack::pop() 150 { 151 if (m_size) 152 --m_size; 153 } 154 155 bool BitStack::top() const 156 { 157 if (!m_size) 158 return false; 159 unsigned shift = (m_size - 1) & bitInWordMask; 160 return m_words.last() & (1U << shift); 161 } 162 163 unsigned BitStack::size() const 164 { 165 return m_size; 166 } 167 168 // -------- 169 170 #if !ASSERT_DISABLED 171 172 static unsigned depthCrossingShadowBoundaries(Node* node) 173 { 174 unsigned depth = 0; 175 for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode()) 176 ++depth; 177 return depth; 178 } 179 180 #endif 181 182 // This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees. 183 static Node* nextInPreOrderCrossingShadowBoundaries(Node* rangeEndContainer, int rangeEndOffset) 184 { 185 if (!rangeEndContainer) 186 return 0; 187 if (rangeEndOffset >= 0 && !rangeEndContainer->offsetInCharacters()) { 188 if (Node* next = rangeEndContainer->childNode(rangeEndOffset)) 189 return next; 190 } 191 for (Node* node = rangeEndContainer; node; node = node->parentOrHostNode()) { 192 if (Node* next = node->nextSibling()) 193 return next; 194 } 195 return 0; 196 } 197 198 // -------- 199 200 static inline bool fullyClipsContents(Node* node) 201 { 202 RenderObject* renderer = node->renderer(); 203 if (!renderer || !renderer->isBox() || !renderer->hasOverflowClip()) 204 return false; 205 return toRenderBox(renderer)->size().isEmpty(); 206 } 207 208 static inline bool ignoresContainerClip(Node* node) 209 { 210 RenderObject* renderer = node->renderer(); 211 if (!renderer || renderer->isText()) 212 return false; 213 EPosition position = renderer->style()->position(); 214 return position == AbsolutePosition || position == FixedPosition; 215 } 216 217 static void pushFullyClippedState(BitStack& stack, Node* node) 218 { 219 ASSERT(stack.size() == depthCrossingShadowBoundaries(node)); 220 221 // Push true if this node full clips its contents, or if a parent already has fully 222 // clipped and this is not a node that ignores its container's clip. 223 stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node))); 224 } 225 226 static void setUpFullyClippedStack(BitStack& stack, Node* node) 227 { 228 // Put the nodes in a vector so we can iterate in reverse order. 229 Vector<Node*, 100> ancestry; 230 for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode()) 231 ancestry.append(parent); 232 233 // Call pushFullyClippedState on each node starting with the earliest ancestor. 234 size_t size = ancestry.size(); 235 for (size_t i = 0; i < size; ++i) 236 pushFullyClippedState(stack, ancestry[size - i - 1]); 237 pushFullyClippedState(stack, node); 238 239 ASSERT(stack.size() == 1 + depthCrossingShadowBoundaries(node)); 240 } 241 242 // -------- 243 244 TextIterator::TextIterator() 245 : m_startContainer(0) 246 , m_startOffset(0) 247 , m_endContainer(0) 248 , m_endOffset(0) 249 , m_positionNode(0) 250 , m_textCharacters(0) 251 , m_textLength(0) 252 , m_remainingTextBox(0) 253 , m_firstLetterText(0) 254 , m_lastCharacter(0) 255 , m_emitsCharactersBetweenAllVisiblePositions(false) 256 , m_entersTextControls(false) 257 , m_emitsTextWithoutTranscoding(false) 258 , m_handledFirstLetter(false) 259 , m_ignoresStyleVisibility(false) 260 , m_emitsObjectReplacementCharacters(false) 261 { 262 } 263 264 TextIterator::TextIterator(const Range* r, TextIteratorBehavior behavior) 265 : m_startContainer(0) 266 , m_startOffset(0) 267 , m_endContainer(0) 268 , m_endOffset(0) 269 , m_positionNode(0) 270 , m_textCharacters(0) 271 , m_textLength(0) 272 , m_remainingTextBox(0) 273 , m_firstLetterText(0) 274 , m_emitsCharactersBetweenAllVisiblePositions(behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions) 275 , m_entersTextControls(behavior & TextIteratorEntersTextControls) 276 , m_emitsTextWithoutTranscoding(behavior & TextIteratorEmitsTextsWithoutTranscoding) 277 , m_handledFirstLetter(false) 278 , m_ignoresStyleVisibility(behavior & TextIteratorIgnoresStyleVisibility) 279 , m_emitsObjectReplacementCharacters(behavior & TextIteratorEmitsObjectReplacementCharacters) 280 { 281 if (!r) 282 return; 283 284 // get and validate the range endpoints 285 Node* startContainer = r->startContainer(); 286 if (!startContainer) 287 return; 288 int startOffset = r->startOffset(); 289 Node* endContainer = r->endContainer(); 290 int endOffset = r->endOffset(); 291 292 // Callers should be handing us well-formed ranges. If we discover that this isn't 293 // the case, we could consider changing this assertion to an early return. 294 ASSERT(r->boundaryPointsValid()); 295 296 // remember range - this does not change 297 m_startContainer = startContainer; 298 m_startOffset = startOffset; 299 m_endContainer = endContainer; 300 m_endOffset = endOffset; 301 302 // set up the current node for processing 303 m_node = r->firstNode(); 304 if (!m_node) 305 return; 306 setUpFullyClippedStack(m_fullyClippedStack, m_node); 307 m_offset = m_node == m_startContainer ? m_startOffset : 0; 308 m_handledNode = false; 309 m_handledChildren = false; 310 311 // calculate first out of bounds node 312 m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(endContainer, endOffset); 313 314 // initialize node processing state 315 m_needsAnotherNewline = false; 316 m_textBox = 0; 317 318 // initialize record of previous node processing 319 m_hasEmitted = false; 320 m_lastTextNode = 0; 321 m_lastTextNodeEndedWithCollapsedSpace = false; 322 m_lastCharacter = 0; 323 324 #ifndef NDEBUG 325 // need this just because of the assert in advance() 326 m_positionNode = m_node; 327 #endif 328 329 // identify the first run 330 advance(); 331 } 332 333 TextIterator::~TextIterator() 334 { 335 } 336 337 void TextIterator::advance() 338 { 339 // reset the run information 340 m_positionNode = 0; 341 m_textLength = 0; 342 343 // handle remembered node that needed a newline after the text node's newline 344 if (m_needsAnotherNewline) { 345 // Emit the extra newline, and position it *inside* m_node, after m_node's 346 // contents, in case it's a block, in the same way that we position the first 347 // newline. The range for the emitted newline should start where the line 348 // break begins. 349 // FIXME: It would be cleaner if we emitted two newlines during the last 350 // iteration, instead of using m_needsAnotherNewline. 351 Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node; 352 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1); 353 m_needsAnotherNewline = false; 354 return; 355 } 356 357 if (!m_textBox && m_remainingTextBox) { 358 m_textBox = m_remainingTextBox; 359 m_remainingTextBox = 0; 360 m_firstLetterText = 0; 361 m_offset = 0; 362 } 363 // handle remembered text box 364 if (m_textBox) { 365 handleTextBox(); 366 if (m_positionNode) 367 return; 368 } 369 370 while (m_node && m_node != m_pastEndNode) { 371 // if the range ends at offset 0 of an element, represent the 372 // position, but not the content, of that element e.g. if the 373 // node is a blockflow element, emit a newline that 374 // precedes the element 375 if (m_node == m_endContainer && m_endOffset == 0) { 376 representNodeOffsetZero(); 377 m_node = 0; 378 return; 379 } 380 381 RenderObject* renderer = m_node->renderer(); 382 if (!renderer) { 383 m_handledNode = true; 384 m_handledChildren = true; 385 } else { 386 // handle current node according to its type 387 if (!m_handledNode) { 388 if (renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) // FIXME: What about CDATA_SECTION_NODE? 389 m_handledNode = handleTextNode(); 390 else if (renderer && (renderer->isImage() || renderer->isWidget() || 391 (renderer->node() && renderer->node()->isElementNode() && 392 static_cast<Element*>(renderer->node())->isFormControlElement()))) 393 m_handledNode = handleReplacedElement(); 394 else 395 m_handledNode = handleNonTextNode(); 396 if (m_positionNode) 397 return; 398 } 399 } 400 401 // find a new current node to handle in depth-first manner, 402 // calling exitNode() as we come back thru a parent node 403 Node* next = m_handledChildren ? 0 : m_node->firstChild(); 404 m_offset = 0; 405 if (!next) { 406 next = m_node->nextSibling(); 407 if (!next) { 408 bool pastEnd = m_node->traverseNextNode() == m_pastEndNode; 409 Node* parentNode = m_node->parentOrHostNode(); 410 while (!next && parentNode) { 411 if ((pastEnd && parentNode == m_endContainer) || m_endContainer->isDescendantOf(parentNode)) 412 return; 413 bool haveRenderer = m_node->renderer(); 414 m_node = parentNode; 415 m_fullyClippedStack.pop(); 416 parentNode = m_node->parentOrHostNode(); 417 if (haveRenderer) 418 exitNode(); 419 if (m_positionNode) { 420 m_handledNode = true; 421 m_handledChildren = true; 422 return; 423 } 424 next = m_node->nextSibling(); 425 } 426 } 427 m_fullyClippedStack.pop(); 428 } 429 430 // set the new current node 431 m_node = next; 432 if (m_node) 433 pushFullyClippedState(m_fullyClippedStack, m_node); 434 m_handledNode = false; 435 m_handledChildren = false; 436 m_handledFirstLetter = false; 437 m_firstLetterText = 0; 438 439 // how would this ever be? 440 if (m_positionNode) 441 return; 442 } 443 } 444 445 bool TextIterator::handleTextNode() 446 { 447 if (m_fullyClippedStack.top() && !m_ignoresStyleVisibility) 448 return false; 449 450 RenderText* renderer = toRenderText(m_node->renderer()); 451 452 m_lastTextNode = m_node; 453 String str = renderer->text(); 454 455 // handle pre-formatted text 456 if (!renderer->style()->collapseWhiteSpace()) { 457 int runStart = m_offset; 458 if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) { 459 emitCharacter(' ', m_node, 0, runStart, runStart); 460 return false; 461 } 462 if (!m_handledFirstLetter && renderer->isTextFragment()) { 463 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer)); 464 if (m_firstLetterText) { 465 String firstLetter = m_firstLetterText->text(); 466 emitText(m_node, m_firstLetterText, m_offset, m_offset + firstLetter.length()); 467 m_firstLetterText = 0; 468 m_textBox = 0; 469 return false; 470 } 471 } 472 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) 473 return false; 474 int strLength = str.length(); 475 int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX; 476 int runEnd = min(strLength, end); 477 478 if (runStart >= runEnd) 479 return true; 480 481 emitText(m_node, runStart, runEnd); 482 return true; 483 } 484 485 if (!renderer->firstTextBox() && str.length() > 0) { 486 if (!m_handledFirstLetter && renderer->isTextFragment()) { 487 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer)); 488 if (m_firstLetterText) { 489 handleTextBox(); 490 return false; 491 } 492 } 493 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) 494 return false; 495 m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space 496 return true; 497 } 498 499 // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text) 500 if (renderer->containsReversedText()) { 501 m_sortedTextBoxes.clear(); 502 for (InlineTextBox* textBox = renderer->firstTextBox(); textBox; textBox = textBox->nextTextBox()) { 503 m_sortedTextBoxes.append(textBox); 504 } 505 std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart); 506 m_sortedTextBoxesPosition = 0; 507 } 508 509 m_textBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox(); 510 if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset) 511 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer)); 512 handleTextBox(); 513 return true; 514 } 515 516 void TextIterator::handleTextBox() 517 { 518 RenderText* renderer = m_firstLetterText ? m_firstLetterText : toRenderText(m_node->renderer()); 519 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) { 520 m_textBox = 0; 521 return; 522 } 523 String str = renderer->text(); 524 unsigned start = m_offset; 525 unsigned end = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX; 526 while (m_textBox) { 527 unsigned textBoxStart = m_textBox->start(); 528 unsigned runStart = max(textBoxStart, start); 529 530 // Check for collapsed space at the start of this run. 531 InlineTextBox* firstTextBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox(); 532 bool needSpace = m_lastTextNodeEndedWithCollapsedSpace 533 || (m_textBox == firstTextBox && textBoxStart == runStart && runStart > 0); 534 if (needSpace && !isCollapsibleWhitespace(m_lastCharacter) && m_lastCharacter) { 535 if (m_lastTextNode == m_node && runStart > 0 && str[runStart - 1] == ' ') { 536 unsigned spaceRunStart = runStart - 1; 537 while (spaceRunStart > 0 && str[spaceRunStart - 1] == ' ') 538 --spaceRunStart; 539 emitText(m_node, renderer, spaceRunStart, spaceRunStart + 1); 540 } else 541 emitCharacter(' ', m_node, 0, runStart, runStart); 542 return; 543 } 544 unsigned textBoxEnd = textBoxStart + m_textBox->len(); 545 unsigned runEnd = min(textBoxEnd, end); 546 547 // Determine what the next text box will be, but don't advance yet 548 InlineTextBox* nextTextBox = 0; 549 if (renderer->containsReversedText()) { 550 if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size()) 551 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1]; 552 } else 553 nextTextBox = m_textBox->nextTextBox(); 554 555 if (runStart < runEnd) { 556 // Handle either a single newline character (which becomes a space), 557 // or a run of characters that does not include a newline. 558 // This effectively translates newlines to spaces without copying the text. 559 if (str[runStart] == '\n') { 560 emitCharacter(' ', m_node, 0, runStart, runStart + 1); 561 m_offset = runStart + 1; 562 } else { 563 size_t subrunEnd = str.find('\n', runStart); 564 if (subrunEnd == notFound || subrunEnd > runEnd) 565 subrunEnd = runEnd; 566 567 m_offset = subrunEnd; 568 emitText(m_node, renderer, runStart, subrunEnd); 569 } 570 571 // If we are doing a subrun that doesn't go to the end of the text box, 572 // come back again to finish handling this text box; don't advance to the next one. 573 if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd) 574 return; 575 576 // Advance and return 577 unsigned nextRunStart = nextTextBox ? nextTextBox->start() : str.length(); 578 if (nextRunStart > runEnd) 579 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end 580 m_textBox = nextTextBox; 581 if (renderer->containsReversedText()) 582 ++m_sortedTextBoxesPosition; 583 return; 584 } 585 // Advance and continue 586 m_textBox = nextTextBox; 587 if (renderer->containsReversedText()) 588 ++m_sortedTextBoxesPosition; 589 } 590 if (!m_textBox && m_remainingTextBox) { 591 m_textBox = m_remainingTextBox; 592 m_remainingTextBox = 0; 593 m_firstLetterText = 0; 594 m_offset = 0; 595 handleTextBox(); 596 } 597 } 598 599 void TextIterator::handleTextNodeFirstLetter(RenderTextFragment* renderer) 600 { 601 if (renderer->firstLetter()) { 602 RenderObject* r = renderer->firstLetter(); 603 if (r->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) 604 return; 605 for (RenderObject *currChild = r->firstChild(); currChild; currChild->nextSibling()) { 606 if (currChild->isText()) { 607 RenderText* firstLetter = toRenderText(currChild); 608 m_handledFirstLetter = true; 609 m_remainingTextBox = m_textBox; 610 m_textBox = firstLetter->firstTextBox(); 611 m_firstLetterText = firstLetter; 612 return; 613 } 614 } 615 } 616 m_handledFirstLetter = true; 617 } 618 619 bool TextIterator::handleReplacedElement() 620 { 621 if (m_fullyClippedStack.top()) 622 return false; 623 624 RenderObject* renderer = m_node->renderer(); 625 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) 626 return false; 627 628 if (m_lastTextNodeEndedWithCollapsedSpace) { 629 emitCharacter(' ', m_lastTextNode->parentNode(), m_lastTextNode, 1, 1); 630 return false; 631 } 632 633 if (m_entersTextControls && renderer->isTextControl()) { 634 if (HTMLElement* innerTextElement = toRenderTextControl(renderer)->innerTextElement()) { 635 m_node = innerTextElement->shadowTreeRootNode(); 636 pushFullyClippedState(m_fullyClippedStack, m_node); 637 m_offset = 0; 638 return false; 639 } 640 } 641 642 m_hasEmitted = true; 643 644 if (m_emitsObjectReplacementCharacters && renderer && renderer->isReplaced()) { 645 emitCharacter(objectReplacementCharacter, m_node->parentNode(), m_node, 0, 1); 646 return true; 647 } 648 649 if (m_emitsCharactersBetweenAllVisiblePositions) { 650 // We want replaced elements to behave like punctuation for boundary 651 // finding, and to simply take up space for the selection preservation 652 // code in moveParagraphs, so we use a comma. 653 emitCharacter(',', m_node->parentNode(), m_node, 0, 1); 654 return true; 655 } 656 657 m_positionNode = m_node->parentNode(); 658 m_positionOffsetBaseNode = m_node; 659 m_positionStartOffset = 0; 660 m_positionEndOffset = 1; 661 662 m_textCharacters = 0; 663 m_textLength = 0; 664 665 m_lastCharacter = 0; 666 667 return true; 668 } 669 670 bool TextIterator::hasVisibleTextNode(RenderText* renderer) 671 { 672 if (renderer->style()->visibility() == VISIBLE) 673 return true; 674 if (renderer->isTextFragment()) { 675 RenderTextFragment* fragment = static_cast<RenderTextFragment*>(renderer); 676 if (fragment->firstLetter() && fragment->firstLetter()->style()->visibility() == VISIBLE) 677 return true; 678 } 679 return false; 680 } 681 682 static bool shouldEmitTabBeforeNode(Node* node) 683 { 684 RenderObject* r = node->renderer(); 685 686 // Table cells are delimited by tabs. 687 if (!r || !isTableCell(node)) 688 return false; 689 690 // Want a tab before every cell other than the first one 691 RenderTableCell* rc = toRenderTableCell(r); 692 RenderTable* t = rc->table(); 693 return t && (t->cellBefore(rc) || t->cellAbove(rc)); 694 } 695 696 static bool shouldEmitNewlineForNode(Node* node) 697 { 698 // br elements are represented by a single newline. 699 RenderObject* r = node->renderer(); 700 if (!r) 701 return node->hasTagName(brTag); 702 703 return r->isBR(); 704 } 705 706 static bool shouldEmitNewlinesBeforeAndAfterNode(Node* node) 707 { 708 // Block flow (versus inline flow) is represented by having 709 // a newline both before and after the element. 710 RenderObject* r = node->renderer(); 711 if (!r) { 712 return (node->hasTagName(blockquoteTag) 713 || node->hasTagName(ddTag) 714 || node->hasTagName(divTag) 715 || node->hasTagName(dlTag) 716 || node->hasTagName(dtTag) 717 || node->hasTagName(h1Tag) 718 || node->hasTagName(h2Tag) 719 || node->hasTagName(h3Tag) 720 || node->hasTagName(h4Tag) 721 || node->hasTagName(h5Tag) 722 || node->hasTagName(h6Tag) 723 || node->hasTagName(hrTag) 724 || node->hasTagName(liTag) 725 || node->hasTagName(listingTag) 726 || node->hasTagName(olTag) 727 || node->hasTagName(pTag) 728 || node->hasTagName(preTag) 729 || node->hasTagName(trTag) 730 || node->hasTagName(ulTag)); 731 } 732 733 // Need to make an exception for table cells, because they are blocks, but we 734 // want them tab-delimited rather than having newlines before and after. 735 if (isTableCell(node)) 736 return false; 737 738 // Need to make an exception for table row elements, because they are neither 739 // "inline" or "RenderBlock", but we want newlines for them. 740 if (r->isTableRow()) { 741 RenderTable* t = toRenderTableRow(r)->table(); 742 if (t && !t->isInline()) 743 return true; 744 } 745 746 return !r->isInline() && r->isRenderBlock() && !r->isFloatingOrPositioned() && !r->isBody(); 747 } 748 749 static bool shouldEmitNewlineAfterNode(Node* node) 750 { 751 // FIXME: It should be better but slower to create a VisiblePosition here. 752 if (!shouldEmitNewlinesBeforeAndAfterNode(node)) 753 return false; 754 // Check if this is the very last renderer in the document. 755 // If so, then we should not emit a newline. 756 while ((node = node->traverseNextSibling())) 757 if (node->renderer()) 758 return true; 759 return false; 760 } 761 762 static bool shouldEmitNewlineBeforeNode(Node* node) 763 { 764 return shouldEmitNewlinesBeforeAndAfterNode(node); 765 } 766 767 static bool shouldEmitExtraNewlineForNode(Node* node) 768 { 769 // When there is a significant collapsed bottom margin, emit an extra 770 // newline for a more realistic result. We end up getting the right 771 // result even without margin collapsing. For example: <div><p>text</p></div> 772 // will work right even if both the <div> and the <p> have bottom margins. 773 RenderObject* r = node->renderer(); 774 if (!r || !r->isBox()) 775 return false; 776 777 // NOTE: We only do this for a select set of nodes, and fwiw WinIE appears 778 // not to do this at all 779 if (node->hasTagName(h1Tag) 780 || node->hasTagName(h2Tag) 781 || node->hasTagName(h3Tag) 782 || node->hasTagName(h4Tag) 783 || node->hasTagName(h5Tag) 784 || node->hasTagName(h6Tag) 785 || node->hasTagName(pTag)) { 786 RenderStyle* style = r->style(); 787 if (style) { 788 int bottomMargin = toRenderBox(r)->collapsedMarginAfter(); 789 int fontSize = style->fontDescription().computedPixelSize(); 790 if (bottomMargin * 2 >= fontSize) 791 return true; 792 } 793 } 794 795 return false; 796 } 797 798 static int collapsedSpaceLength(RenderText* renderer, int textEnd) 799 { 800 const UChar* characters = renderer->text()->characters(); 801 int length = renderer->text()->length(); 802 for (int i = textEnd; i < length; ++i) { 803 if (!renderer->style()->isCollapsibleWhiteSpace(characters[i])) 804 return i - textEnd; 805 } 806 807 return length - textEnd; 808 } 809 810 static int maxOffsetIncludingCollapsedSpaces(Node* node) 811 { 812 int offset = caretMaxOffset(node); 813 814 if (node->renderer() && node->renderer()->isText()) 815 offset += collapsedSpaceLength(toRenderText(node->renderer()), offset); 816 817 return offset; 818 } 819 820 // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic). 821 bool TextIterator::shouldRepresentNodeOffsetZero() 822 { 823 if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isTable()) 824 return true; 825 826 // Leave element positioned flush with start of a paragraph 827 // (e.g. do not insert tab before a table cell at the start of a paragraph) 828 if (m_lastCharacter == '\n') 829 return false; 830 831 // Otherwise, show the position if we have emitted any characters 832 if (m_hasEmitted) 833 return true; 834 835 // We've not emitted anything yet. Generally, there is no need for any positioning then. 836 // The only exception is when the element is visually not in the same line as 837 // the start of the range (e.g. the range starts at the end of the previous paragraph). 838 // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we 839 // make quicker checks to possibly avoid that. Another check that we could make is 840 // is whether the inline vs block flow changed since the previous visible element. 841 // I think we're already in a special enough case that that won't be needed, tho. 842 843 // No character needed if this is the first node in the range. 844 if (m_node == m_startContainer) 845 return false; 846 847 // If we are outside the start container's subtree, assume we need to emit. 848 // FIXME: m_startContainer could be an inline block 849 if (!m_node->isDescendantOf(m_startContainer)) 850 return true; 851 852 // If we started as m_startContainer offset 0 and the current node is a descendant of 853 // the start container, we already had enough context to correctly decide whether to 854 // emit after a preceding block. We chose not to emit (m_hasEmitted is false), 855 // so don't second guess that now. 856 // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably 857 // immaterial since we likely would have already emitted something by now. 858 if (m_startOffset == 0) 859 return false; 860 861 // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning. 862 // Additionally, if the range we are iterating over contains huge sections of unrendered content, 863 // we would create VisiblePositions on every call to this function without this check. 864 if (!m_node->renderer() || m_node->renderer()->style()->visibility() != VISIBLE) 865 return false; 866 867 // The startPos.isNotNull() check is needed because the start could be before the body, 868 // and in that case we'll get null. We don't want to put in newlines at the start in that case. 869 // The currPos.isNotNull() check is needed because positions in non-HTML content 870 // (like SVG) do not have visible positions, and we don't want to emit for them either. 871 VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM); 872 VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM); 873 return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos); 874 } 875 876 bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node* node) 877 { 878 return node->renderer() && node->renderer()->isTable() && (node->renderer()->isInline() || m_emitsCharactersBetweenAllVisiblePositions); 879 } 880 881 void TextIterator::representNodeOffsetZero() 882 { 883 // Emit a character to show the positioning of m_node. 884 885 // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can 886 // create VisiblePositions, which is expensive. So, we perform the inexpensive checks 887 // on m_node to see if it necessitates emitting a character first and will early return 888 // before encountering shouldRepresentNodeOffsetZero()s worse case behavior. 889 if (shouldEmitTabBeforeNode(m_node)) { 890 if (shouldRepresentNodeOffsetZero()) 891 emitCharacter('\t', m_node->parentNode(), m_node, 0, 0); 892 } else if (shouldEmitNewlineBeforeNode(m_node)) { 893 if (shouldRepresentNodeOffsetZero()) 894 emitCharacter('\n', m_node->parentNode(), m_node, 0, 0); 895 } else if (shouldEmitSpaceBeforeAndAfterNode(m_node)) { 896 if (shouldRepresentNodeOffsetZero()) 897 emitCharacter(' ', m_node->parentNode(), m_node, 0, 0); 898 } 899 } 900 901 bool TextIterator::handleNonTextNode() 902 { 903 if (shouldEmitNewlineForNode(m_node)) 904 emitCharacter('\n', m_node->parentNode(), m_node, 0, 1); 905 else if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isHR()) 906 emitCharacter(' ', m_node->parentNode(), m_node, 0, 1); 907 else 908 representNodeOffsetZero(); 909 910 return true; 911 } 912 913 void TextIterator::exitNode() 914 { 915 // prevent emitting a newline when exiting a collapsed block at beginning of the range 916 // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could 917 // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and 918 // therefore look like a blank line. 919 if (!m_hasEmitted) 920 return; 921 922 // Emit with a position *inside* m_node, after m_node's contents, in 923 // case it is a block, because the run should start where the 924 // emitted character is positioned visually. 925 Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node; 926 // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making 927 // the logic in _web_attributedStringFromRange match. We'll get that for free when we switch to use 928 // TextIterator in _web_attributedStringFromRange. 929 // See <rdar://problem/5428427> for an example of how this mismatch will cause problems. 930 if (m_lastTextNode && shouldEmitNewlineAfterNode(m_node)) { 931 // use extra newline to represent margin bottom, as needed 932 bool addNewline = shouldEmitExtraNewlineForNode(m_node); 933 934 // FIXME: We need to emit a '\n' as we leave an empty block(s) that 935 // contain a VisiblePosition when doing selection preservation. 936 if (m_lastCharacter != '\n') { 937 // insert a newline with a position following this block's contents. 938 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1); 939 // remember whether to later add a newline for the current node 940 ASSERT(!m_needsAnotherNewline); 941 m_needsAnotherNewline = addNewline; 942 } else if (addNewline) 943 // insert a newline with a position following this block's contents. 944 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1); 945 } 946 947 // If nothing was emitted, see if we need to emit a space. 948 if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(m_node)) 949 emitCharacter(' ', baseNode->parentNode(), baseNode, 1, 1); 950 } 951 952 void TextIterator::emitCharacter(UChar c, Node* textNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset) 953 { 954 m_hasEmitted = true; 955 956 // remember information with which to construct the TextIterator::range() 957 // NOTE: textNode is often not a text node, so the range will specify child nodes of positionNode 958 m_positionNode = textNode; 959 m_positionOffsetBaseNode = offsetBaseNode; 960 m_positionStartOffset = textStartOffset; 961 m_positionEndOffset = textEndOffset; 962 963 // remember information with which to construct the TextIterator::characters() and length() 964 m_singleCharacterBuffer = c; 965 m_textCharacters = &m_singleCharacterBuffer; 966 m_textLength = 1; 967 968 // remember some iteration state 969 m_lastTextNodeEndedWithCollapsedSpace = false; 970 m_lastCharacter = c; 971 } 972 973 void TextIterator::emitText(Node* textNode, RenderObject* renderObject, int textStartOffset, int textEndOffset) 974 { 975 RenderText* renderer = toRenderText(renderObject); 976 m_text = m_emitsTextWithoutTranscoding ? renderer->textWithoutTranscoding() : renderer->text(); 977 ASSERT(m_text.characters()); 978 979 m_positionNode = textNode; 980 m_positionOffsetBaseNode = 0; 981 m_positionStartOffset = textStartOffset; 982 m_positionEndOffset = textEndOffset; 983 m_textCharacters = m_text.characters() + textStartOffset; 984 m_textLength = textEndOffset - textStartOffset; 985 m_lastCharacter = m_text[textEndOffset - 1]; 986 987 m_lastTextNodeEndedWithCollapsedSpace = false; 988 m_hasEmitted = true; 989 } 990 991 void TextIterator::emitText(Node* textNode, int textStartOffset, int textEndOffset) 992 { 993 emitText(textNode, m_node->renderer(), textStartOffset, textEndOffset); 994 } 995 996 PassRefPtr<Range> TextIterator::range() const 997 { 998 // use the current run information, if we have it 999 if (m_positionNode) { 1000 if (m_positionOffsetBaseNode) { 1001 int index = m_positionOffsetBaseNode->nodeIndex(); 1002 m_positionStartOffset += index; 1003 m_positionEndOffset += index; 1004 m_positionOffsetBaseNode = 0; 1005 } 1006 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset); 1007 } 1008 1009 // otherwise, return the end of the overall range we were given 1010 if (m_endContainer) 1011 return Range::create(m_endContainer->document(), m_endContainer, m_endOffset, m_endContainer, m_endOffset); 1012 1013 return 0; 1014 } 1015 1016 Node* TextIterator::node() const 1017 { 1018 RefPtr<Range> textRange = range(); 1019 if (!textRange) 1020 return 0; 1021 1022 Node* node = textRange->startContainer(); 1023 if (!node) 1024 return 0; 1025 if (node->offsetInCharacters()) 1026 return node; 1027 1028 return node->childNode(textRange->startOffset()); 1029 } 1030 1031 // -------- 1032 1033 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator() 1034 : m_behavior(TextIteratorDefaultBehavior) 1035 , m_node(0) 1036 , m_positionNode(0) 1037 { 1038 } 1039 1040 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range* r, TextIteratorBehavior behavior) 1041 : m_behavior(behavior) 1042 , m_node(0) 1043 , m_positionNode(0) 1044 { 1045 ASSERT(m_behavior == TextIteratorDefaultBehavior); 1046 1047 if (!r) 1048 return; 1049 1050 Node* startNode = r->startContainer(); 1051 if (!startNode) 1052 return; 1053 Node* endNode = r->endContainer(); 1054 int startOffset = r->startOffset(); 1055 int endOffset = r->endOffset(); 1056 1057 if (!startNode->offsetInCharacters()) { 1058 if (startOffset >= 0 && startOffset < static_cast<int>(startNode->childNodeCount())) { 1059 startNode = startNode->childNode(startOffset); 1060 startOffset = 0; 1061 } 1062 } 1063 if (!endNode->offsetInCharacters()) { 1064 if (endOffset > 0 && endOffset <= static_cast<int>(endNode->childNodeCount())) { 1065 endNode = endNode->childNode(endOffset - 1); 1066 endOffset = lastOffsetInNode(endNode); 1067 } 1068 } 1069 1070 m_node = endNode; 1071 setUpFullyClippedStack(m_fullyClippedStack, m_node); 1072 m_offset = endOffset; 1073 m_handledNode = false; 1074 m_handledChildren = endOffset == 0; 1075 1076 m_startNode = startNode; 1077 m_startOffset = startOffset; 1078 m_endNode = endNode; 1079 m_endOffset = endOffset; 1080 1081 #ifndef NDEBUG 1082 // Need this just because of the assert. 1083 m_positionNode = endNode; 1084 #endif 1085 1086 m_lastTextNode = 0; 1087 m_lastCharacter = '\n'; 1088 1089 m_havePassedStartNode = false; 1090 1091 advance(); 1092 } 1093 1094 void SimplifiedBackwardsTextIterator::advance() 1095 { 1096 ASSERT(m_positionNode); 1097 1098 m_positionNode = 0; 1099 m_textLength = 0; 1100 1101 while (m_node && !m_havePassedStartNode) { 1102 // Don't handle node if we start iterating at [node, 0]. 1103 if (!m_handledNode && !(m_node == m_endNode && m_endOffset == 0)) { 1104 RenderObject* renderer = m_node->renderer(); 1105 if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) { 1106 // FIXME: What about CDATA_SECTION_NODE? 1107 if (renderer->style()->visibility() == VISIBLE && m_offset > 0) 1108 m_handledNode = handleTextNode(); 1109 } else if (renderer && (renderer->isImage() || renderer->isWidget())) { 1110 if (renderer->style()->visibility() == VISIBLE && m_offset > 0) 1111 m_handledNode = handleReplacedElement(); 1112 } else 1113 m_handledNode = handleNonTextNode(); 1114 if (m_positionNode) 1115 return; 1116 } 1117 1118 if (!m_handledChildren && m_node->hasChildNodes()) { 1119 m_node = m_node->lastChild(); 1120 pushFullyClippedState(m_fullyClippedStack, m_node); 1121 } else { 1122 // Exit empty containers as we pass over them or containers 1123 // where [container, 0] is where we started iterating. 1124 if (!m_handledNode 1125 && canHaveChildrenForEditing(m_node) 1126 && m_node->parentNode() 1127 && (!m_node->lastChild() || (m_node == m_endNode && !m_endOffset))) { 1128 exitNode(); 1129 if (m_positionNode) { 1130 m_handledNode = true; 1131 m_handledChildren = true; 1132 return; 1133 } 1134 } 1135 1136 // Exit all other containers. 1137 while (!m_node->previousSibling()) { 1138 if (!advanceRespectingRange(m_node->parentOrHostNode())) 1139 break; 1140 m_fullyClippedStack.pop(); 1141 exitNode(); 1142 if (m_positionNode) { 1143 m_handledNode = true; 1144 m_handledChildren = true; 1145 return; 1146 } 1147 } 1148 1149 m_fullyClippedStack.pop(); 1150 if (advanceRespectingRange(m_node->previousSibling())) 1151 pushFullyClippedState(m_fullyClippedStack, m_node); 1152 else 1153 m_node = 0; 1154 } 1155 1156 // For the purpose of word boundary detection, 1157 // we should iterate all visible text and trailing (collapsed) whitespaces. 1158 m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(m_node) : 0; 1159 m_handledNode = false; 1160 m_handledChildren = false; 1161 1162 if (m_positionNode) 1163 return; 1164 } 1165 } 1166 1167 bool SimplifiedBackwardsTextIterator::handleTextNode() 1168 { 1169 m_lastTextNode = m_node; 1170 1171 RenderText* renderer = toRenderText(m_node->renderer()); 1172 String str = renderer->text(); 1173 1174 if (!renderer->firstTextBox() && str.length() > 0) 1175 return true; 1176 1177 m_positionEndOffset = m_offset; 1178 1179 m_offset = (m_node == m_startNode) ? m_startOffset : 0; 1180 m_positionNode = m_node; 1181 m_positionStartOffset = m_offset; 1182 m_textLength = m_positionEndOffset - m_positionStartOffset; 1183 m_textCharacters = str.characters() + m_positionStartOffset; 1184 1185 m_lastCharacter = str[m_positionEndOffset - 1]; 1186 1187 return true; 1188 } 1189 1190 bool SimplifiedBackwardsTextIterator::handleReplacedElement() 1191 { 1192 unsigned index = m_node->nodeIndex(); 1193 // We want replaced elements to behave like punctuation for boundary 1194 // finding, and to simply take up space for the selection preservation 1195 // code in moveParagraphs, so we use a comma. Unconditionally emit 1196 // here because this iterator is only used for boundary finding. 1197 emitCharacter(',', m_node->parentNode(), index, index + 1); 1198 return true; 1199 } 1200 1201 bool SimplifiedBackwardsTextIterator::handleNonTextNode() 1202 { 1203 // We can use a linefeed in place of a tab because this simple iterator is only used to 1204 // find boundaries, not actual content. A linefeed breaks words, sentences, and paragraphs. 1205 if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineAfterNode(m_node) || shouldEmitTabBeforeNode(m_node)) { 1206 unsigned index = m_node->nodeIndex(); 1207 // The start of this emitted range is wrong. Ensuring correctness would require 1208 // VisiblePositions and so would be slow. previousBoundary expects this. 1209 emitCharacter('\n', m_node->parentNode(), index + 1, index + 1); 1210 } 1211 return true; 1212 } 1213 1214 void SimplifiedBackwardsTextIterator::exitNode() 1215 { 1216 if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineBeforeNode(m_node) || shouldEmitTabBeforeNode(m_node)) { 1217 // The start of this emitted range is wrong. Ensuring correctness would require 1218 // VisiblePositions and so would be slow. previousBoundary expects this. 1219 emitCharacter('\n', m_node, 0, 0); 1220 } 1221 } 1222 1223 void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node* node, int startOffset, int endOffset) 1224 { 1225 m_singleCharacterBuffer = c; 1226 m_positionNode = node; 1227 m_positionStartOffset = startOffset; 1228 m_positionEndOffset = endOffset; 1229 m_textCharacters = &m_singleCharacterBuffer; 1230 m_textLength = 1; 1231 m_lastCharacter = c; 1232 } 1233 1234 bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next) 1235 { 1236 if (!next) 1237 return false; 1238 m_havePassedStartNode |= m_node == m_startNode; 1239 if (m_havePassedStartNode) 1240 return false; 1241 m_node = next; 1242 return true; 1243 } 1244 1245 PassRefPtr<Range> SimplifiedBackwardsTextIterator::range() const 1246 { 1247 if (m_positionNode) 1248 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset); 1249 1250 return Range::create(m_startNode->document(), m_startNode, m_startOffset, m_startNode, m_startOffset); 1251 } 1252 1253 // -------- 1254 1255 CharacterIterator::CharacterIterator() 1256 : m_offset(0) 1257 , m_runOffset(0) 1258 , m_atBreak(true) 1259 { 1260 } 1261 1262 CharacterIterator::CharacterIterator(const Range* r, TextIteratorBehavior behavior) 1263 : m_offset(0) 1264 , m_runOffset(0) 1265 , m_atBreak(true) 1266 , m_textIterator(r, behavior) 1267 { 1268 while (!atEnd() && m_textIterator.length() == 0) 1269 m_textIterator.advance(); 1270 } 1271 1272 PassRefPtr<Range> CharacterIterator::range() const 1273 { 1274 RefPtr<Range> r = m_textIterator.range(); 1275 if (!m_textIterator.atEnd()) { 1276 if (m_textIterator.length() <= 1) { 1277 ASSERT(m_runOffset == 0); 1278 } else { 1279 Node* n = r->startContainer(); 1280 ASSERT(n == r->endContainer()); 1281 int offset = r->startOffset() + m_runOffset; 1282 ExceptionCode ec = 0; 1283 r->setStart(n, offset, ec); 1284 r->setEnd(n, offset + 1, ec); 1285 ASSERT(!ec); 1286 } 1287 } 1288 return r.release(); 1289 } 1290 1291 void CharacterIterator::advance(int count) 1292 { 1293 if (count <= 0) { 1294 ASSERT(count == 0); 1295 return; 1296 } 1297 1298 m_atBreak = false; 1299 1300 // easy if there is enough left in the current m_textIterator run 1301 int remaining = m_textIterator.length() - m_runOffset; 1302 if (count < remaining) { 1303 m_runOffset += count; 1304 m_offset += count; 1305 return; 1306 } 1307 1308 // exhaust the current m_textIterator run 1309 count -= remaining; 1310 m_offset += remaining; 1311 1312 // move to a subsequent m_textIterator run 1313 for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) { 1314 int runLength = m_textIterator.length(); 1315 if (runLength == 0) 1316 m_atBreak = true; 1317 else { 1318 // see whether this is m_textIterator to use 1319 if (count < runLength) { 1320 m_runOffset = count; 1321 m_offset += count; 1322 return; 1323 } 1324 1325 // exhaust this m_textIterator run 1326 count -= runLength; 1327 m_offset += runLength; 1328 } 1329 } 1330 1331 // ran to the end of the m_textIterator... no more runs left 1332 m_atBreak = true; 1333 m_runOffset = 0; 1334 } 1335 1336 String CharacterIterator::string(int numChars) 1337 { 1338 Vector<UChar> result; 1339 result.reserveInitialCapacity(numChars); 1340 while (numChars > 0 && !atEnd()) { 1341 int runSize = min(numChars, length()); 1342 result.append(characters(), runSize); 1343 numChars -= runSize; 1344 advance(runSize); 1345 } 1346 return String::adopt(result); 1347 } 1348 1349 static PassRefPtr<Range> characterSubrange(CharacterIterator& it, int offset, int length) 1350 { 1351 it.advance(offset); 1352 RefPtr<Range> start = it.range(); 1353 1354 if (length > 1) 1355 it.advance(length - 1); 1356 RefPtr<Range> end = it.range(); 1357 1358 return Range::create(start->startContainer()->document(), 1359 start->startContainer(), start->startOffset(), 1360 end->endContainer(), end->endOffset()); 1361 } 1362 1363 BackwardsCharacterIterator::BackwardsCharacterIterator() 1364 : m_offset(0) 1365 , m_runOffset(0) 1366 , m_atBreak(true) 1367 { 1368 } 1369 1370 BackwardsCharacterIterator::BackwardsCharacterIterator(const Range* range, TextIteratorBehavior behavior) 1371 : m_offset(0) 1372 , m_runOffset(0) 1373 , m_atBreak(true) 1374 , m_textIterator(range, behavior) 1375 { 1376 while (!atEnd() && !m_textIterator.length()) 1377 m_textIterator.advance(); 1378 } 1379 1380 PassRefPtr<Range> BackwardsCharacterIterator::range() const 1381 { 1382 RefPtr<Range> r = m_textIterator.range(); 1383 if (!m_textIterator.atEnd()) { 1384 if (m_textIterator.length() <= 1) 1385 ASSERT(m_runOffset == 0); 1386 else { 1387 Node* n = r->startContainer(); 1388 ASSERT(n == r->endContainer()); 1389 int offset = r->endOffset() - m_runOffset; 1390 ExceptionCode ec = 0; 1391 r->setStart(n, offset - 1, ec); 1392 r->setEnd(n, offset, ec); 1393 ASSERT(!ec); 1394 } 1395 } 1396 return r.release(); 1397 } 1398 1399 void BackwardsCharacterIterator::advance(int count) 1400 { 1401 if (count <= 0) { 1402 ASSERT(!count); 1403 return; 1404 } 1405 1406 m_atBreak = false; 1407 1408 int remaining = m_textIterator.length() - m_runOffset; 1409 if (count < remaining) { 1410 m_runOffset += count; 1411 m_offset += count; 1412 return; 1413 } 1414 1415 count -= remaining; 1416 m_offset += remaining; 1417 1418 for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) { 1419 int runLength = m_textIterator.length(); 1420 if (runLength == 0) 1421 m_atBreak = true; 1422 else { 1423 if (count < runLength) { 1424 m_runOffset = count; 1425 m_offset += count; 1426 return; 1427 } 1428 1429 count -= runLength; 1430 m_offset += runLength; 1431 } 1432 } 1433 1434 m_atBreak = true; 1435 m_runOffset = 0; 1436 } 1437 1438 // -------- 1439 1440 WordAwareIterator::WordAwareIterator() 1441 : m_previousText(0) 1442 , m_didLookAhead(false) 1443 { 1444 } 1445 1446 WordAwareIterator::WordAwareIterator(const Range* r) 1447 : m_previousText(0) 1448 , m_didLookAhead(true) // so we consider the first chunk from the text iterator 1449 , m_textIterator(r) 1450 { 1451 advance(); // get in position over the first chunk of text 1452 } 1453 1454 WordAwareIterator::~WordAwareIterator() 1455 { 1456 } 1457 1458 // We're always in one of these modes: 1459 // - The current chunk in the text iterator is our current chunk 1460 // (typically its a piece of whitespace, or text that ended with whitespace) 1461 // - The previous chunk in the text iterator is our current chunk 1462 // (we looked ahead to the next chunk and found a word boundary) 1463 // - We built up our own chunk of text from many chunks from the text iterator 1464 1465 // FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries. 1466 1467 void WordAwareIterator::advance() 1468 { 1469 m_previousText = 0; 1470 m_buffer.clear(); // toss any old buffer we built up 1471 1472 // If last time we did a look-ahead, start with that looked-ahead chunk now 1473 if (!m_didLookAhead) { 1474 ASSERT(!m_textIterator.atEnd()); 1475 m_textIterator.advance(); 1476 } 1477 m_didLookAhead = false; 1478 1479 // Go to next non-empty chunk 1480 while (!m_textIterator.atEnd() && m_textIterator.length() == 0) 1481 m_textIterator.advance(); 1482 m_range = m_textIterator.range(); 1483 1484 if (m_textIterator.atEnd()) 1485 return; 1486 1487 while (1) { 1488 // If this chunk ends in whitespace we can just use it as our chunk. 1489 if (isSpaceOrNewline(m_textIterator.characters()[m_textIterator.length() - 1])) 1490 return; 1491 1492 // If this is the first chunk that failed, save it in previousText before look ahead 1493 if (m_buffer.isEmpty()) { 1494 m_previousText = m_textIterator.characters(); 1495 m_previousLength = m_textIterator.length(); 1496 } 1497 1498 // Look ahead to next chunk. If it is whitespace or a break, we can use the previous stuff 1499 m_textIterator.advance(); 1500 if (m_textIterator.atEnd() || m_textIterator.length() == 0 || isSpaceOrNewline(m_textIterator.characters()[0])) { 1501 m_didLookAhead = true; 1502 return; 1503 } 1504 1505 if (m_buffer.isEmpty()) { 1506 // Start gobbling chunks until we get to a suitable stopping point 1507 m_buffer.append(m_previousText, m_previousLength); 1508 m_previousText = 0; 1509 } 1510 m_buffer.append(m_textIterator.characters(), m_textIterator.length()); 1511 int exception = 0; 1512 m_range->setEnd(m_textIterator.range()->endContainer(), m_textIterator.range()->endOffset(), exception); 1513 } 1514 } 1515 1516 int WordAwareIterator::length() const 1517 { 1518 if (!m_buffer.isEmpty()) 1519 return m_buffer.size(); 1520 if (m_previousText) 1521 return m_previousLength; 1522 return m_textIterator.length(); 1523 } 1524 1525 const UChar* WordAwareIterator::characters() const 1526 { 1527 if (!m_buffer.isEmpty()) 1528 return m_buffer.data(); 1529 if (m_previousText) 1530 return m_previousText; 1531 return m_textIterator.characters(); 1532 } 1533 1534 // -------- 1535 1536 static inline UChar foldQuoteMarkOrSoftHyphen(UChar c) 1537 { 1538 switch (c) { 1539 case hebrewPunctuationGershayim: 1540 case leftDoubleQuotationMark: 1541 case rightDoubleQuotationMark: 1542 return '"'; 1543 case hebrewPunctuationGeresh: 1544 case leftSingleQuotationMark: 1545 case rightSingleQuotationMark: 1546 return '\''; 1547 case softHyphen: 1548 // Replace soft hyphen with an ignorable character so that their presence or absence will 1549 // not affect string comparison. 1550 return 0; 1551 default: 1552 return c; 1553 } 1554 } 1555 1556 static inline void foldQuoteMarksAndSoftHyphens(String& s) 1557 { 1558 s.replace(hebrewPunctuationGeresh, '\''); 1559 s.replace(hebrewPunctuationGershayim, '"'); 1560 s.replace(leftDoubleQuotationMark, '"'); 1561 s.replace(leftSingleQuotationMark, '\''); 1562 s.replace(rightDoubleQuotationMark, '"'); 1563 s.replace(rightSingleQuotationMark, '\''); 1564 // Replace soft hyphen with an ignorable character so that their presence or absence will 1565 // not affect string comparison. 1566 s.replace(softHyphen, 0); 1567 } 1568 1569 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION 1570 1571 static inline void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length) 1572 { 1573 for (size_t i = 0; i < length; ++i) 1574 data[i] = foldQuoteMarkOrSoftHyphen(data[i]); 1575 } 1576 1577 static const size_t minimumSearchBufferSize = 8192; 1578 1579 #ifndef NDEBUG 1580 static bool searcherInUse; 1581 #endif 1582 1583 static UStringSearch* createSearcher() 1584 { 1585 // Provide a non-empty pattern and non-empty text so usearch_open will not fail, 1586 // but it doesn't matter exactly what it is, since we don't perform any searches 1587 // without setting both the pattern and the text. 1588 UErrorCode status = U_ZERO_ERROR; 1589 UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, currentSearchLocaleID(), 0, &status); 1590 ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING); 1591 return searcher; 1592 } 1593 1594 static UStringSearch* searcher() 1595 { 1596 static UStringSearch* searcher = createSearcher(); 1597 return searcher; 1598 } 1599 1600 static inline void lockSearcher() 1601 { 1602 #ifndef NDEBUG 1603 ASSERT(!searcherInUse); 1604 searcherInUse = true; 1605 #endif 1606 } 1607 1608 static inline void unlockSearcher() 1609 { 1610 #ifndef NDEBUG 1611 ASSERT(searcherInUse); 1612 searcherInUse = false; 1613 #endif 1614 } 1615 1616 // ICU's search ignores the distinction between small kana letters and ones 1617 // that are not small, and also characters that differ only in the voicing 1618 // marks when considering only primary collation strength diffrences. 1619 // This is not helpful for end users, since these differences make words 1620 // distinct, so for our purposes we need these to be considered. 1621 // The Unicode folks do not think the collation algorithm should be 1622 // changed. To work around this, we would like to tailor the ICU searcher, 1623 // but we can't get that to work yet. So instead, we check for cases where 1624 // these differences occur, and skip those matches. 1625 1626 // We refer to the above technique as the "kana workaround". The next few 1627 // functions are helper functinos for the kana workaround. 1628 1629 static inline bool isKanaLetter(UChar character) 1630 { 1631 // Hiragana letters. 1632 if (character >= 0x3041 && character <= 0x3096) 1633 return true; 1634 1635 // Katakana letters. 1636 if (character >= 0x30A1 && character <= 0x30FA) 1637 return true; 1638 if (character >= 0x31F0 && character <= 0x31FF) 1639 return true; 1640 1641 // Halfwidth katakana letters. 1642 if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70) 1643 return true; 1644 1645 return false; 1646 } 1647 1648 static inline bool isSmallKanaLetter(UChar character) 1649 { 1650 ASSERT(isKanaLetter(character)); 1651 1652 switch (character) { 1653 case 0x3041: // HIRAGANA LETTER SMALL A 1654 case 0x3043: // HIRAGANA LETTER SMALL I 1655 case 0x3045: // HIRAGANA LETTER SMALL U 1656 case 0x3047: // HIRAGANA LETTER SMALL E 1657 case 0x3049: // HIRAGANA LETTER SMALL O 1658 case 0x3063: // HIRAGANA LETTER SMALL TU 1659 case 0x3083: // HIRAGANA LETTER SMALL YA 1660 case 0x3085: // HIRAGANA LETTER SMALL YU 1661 case 0x3087: // HIRAGANA LETTER SMALL YO 1662 case 0x308E: // HIRAGANA LETTER SMALL WA 1663 case 0x3095: // HIRAGANA LETTER SMALL KA 1664 case 0x3096: // HIRAGANA LETTER SMALL KE 1665 case 0x30A1: // KATAKANA LETTER SMALL A 1666 case 0x30A3: // KATAKANA LETTER SMALL I 1667 case 0x30A5: // KATAKANA LETTER SMALL U 1668 case 0x30A7: // KATAKANA LETTER SMALL E 1669 case 0x30A9: // KATAKANA LETTER SMALL O 1670 case 0x30C3: // KATAKANA LETTER SMALL TU 1671 case 0x30E3: // KATAKANA LETTER SMALL YA 1672 case 0x30E5: // KATAKANA LETTER SMALL YU 1673 case 0x30E7: // KATAKANA LETTER SMALL YO 1674 case 0x30EE: // KATAKANA LETTER SMALL WA 1675 case 0x30F5: // KATAKANA LETTER SMALL KA 1676 case 0x30F6: // KATAKANA LETTER SMALL KE 1677 case 0x31F0: // KATAKANA LETTER SMALL KU 1678 case 0x31F1: // KATAKANA LETTER SMALL SI 1679 case 0x31F2: // KATAKANA LETTER SMALL SU 1680 case 0x31F3: // KATAKANA LETTER SMALL TO 1681 case 0x31F4: // KATAKANA LETTER SMALL NU 1682 case 0x31F5: // KATAKANA LETTER SMALL HA 1683 case 0x31F6: // KATAKANA LETTER SMALL HI 1684 case 0x31F7: // KATAKANA LETTER SMALL HU 1685 case 0x31F8: // KATAKANA LETTER SMALL HE 1686 case 0x31F9: // KATAKANA LETTER SMALL HO 1687 case 0x31FA: // KATAKANA LETTER SMALL MU 1688 case 0x31FB: // KATAKANA LETTER SMALL RA 1689 case 0x31FC: // KATAKANA LETTER SMALL RI 1690 case 0x31FD: // KATAKANA LETTER SMALL RU 1691 case 0x31FE: // KATAKANA LETTER SMALL RE 1692 case 0x31FF: // KATAKANA LETTER SMALL RO 1693 case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A 1694 case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I 1695 case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U 1696 case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E 1697 case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O 1698 case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA 1699 case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU 1700 case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO 1701 case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU 1702 return true; 1703 } 1704 return false; 1705 } 1706 1707 enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark }; 1708 1709 static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character) 1710 { 1711 ASSERT(isKanaLetter(character)); 1712 1713 switch (character) { 1714 case 0x304C: // HIRAGANA LETTER GA 1715 case 0x304E: // HIRAGANA LETTER GI 1716 case 0x3050: // HIRAGANA LETTER GU 1717 case 0x3052: // HIRAGANA LETTER GE 1718 case 0x3054: // HIRAGANA LETTER GO 1719 case 0x3056: // HIRAGANA LETTER ZA 1720 case 0x3058: // HIRAGANA LETTER ZI 1721 case 0x305A: // HIRAGANA LETTER ZU 1722 case 0x305C: // HIRAGANA LETTER ZE 1723 case 0x305E: // HIRAGANA LETTER ZO 1724 case 0x3060: // HIRAGANA LETTER DA 1725 case 0x3062: // HIRAGANA LETTER DI 1726 case 0x3065: // HIRAGANA LETTER DU 1727 case 0x3067: // HIRAGANA LETTER DE 1728 case 0x3069: // HIRAGANA LETTER DO 1729 case 0x3070: // HIRAGANA LETTER BA 1730 case 0x3073: // HIRAGANA LETTER BI 1731 case 0x3076: // HIRAGANA LETTER BU 1732 case 0x3079: // HIRAGANA LETTER BE 1733 case 0x307C: // HIRAGANA LETTER BO 1734 case 0x3094: // HIRAGANA LETTER VU 1735 case 0x30AC: // KATAKANA LETTER GA 1736 case 0x30AE: // KATAKANA LETTER GI 1737 case 0x30B0: // KATAKANA LETTER GU 1738 case 0x30B2: // KATAKANA LETTER GE 1739 case 0x30B4: // KATAKANA LETTER GO 1740 case 0x30B6: // KATAKANA LETTER ZA 1741 case 0x30B8: // KATAKANA LETTER ZI 1742 case 0x30BA: // KATAKANA LETTER ZU 1743 case 0x30BC: // KATAKANA LETTER ZE 1744 case 0x30BE: // KATAKANA LETTER ZO 1745 case 0x30C0: // KATAKANA LETTER DA 1746 case 0x30C2: // KATAKANA LETTER DI 1747 case 0x30C5: // KATAKANA LETTER DU 1748 case 0x30C7: // KATAKANA LETTER DE 1749 case 0x30C9: // KATAKANA LETTER DO 1750 case 0x30D0: // KATAKANA LETTER BA 1751 case 0x30D3: // KATAKANA LETTER BI 1752 case 0x30D6: // KATAKANA LETTER BU 1753 case 0x30D9: // KATAKANA LETTER BE 1754 case 0x30DC: // KATAKANA LETTER BO 1755 case 0x30F4: // KATAKANA LETTER VU 1756 case 0x30F7: // KATAKANA LETTER VA 1757 case 0x30F8: // KATAKANA LETTER VI 1758 case 0x30F9: // KATAKANA LETTER VE 1759 case 0x30FA: // KATAKANA LETTER VO 1760 return VoicedSoundMark; 1761 case 0x3071: // HIRAGANA LETTER PA 1762 case 0x3074: // HIRAGANA LETTER PI 1763 case 0x3077: // HIRAGANA LETTER PU 1764 case 0x307A: // HIRAGANA LETTER PE 1765 case 0x307D: // HIRAGANA LETTER PO 1766 case 0x30D1: // KATAKANA LETTER PA 1767 case 0x30D4: // KATAKANA LETTER PI 1768 case 0x30D7: // KATAKANA LETTER PU 1769 case 0x30DA: // KATAKANA LETTER PE 1770 case 0x30DD: // KATAKANA LETTER PO 1771 return SemiVoicedSoundMark; 1772 } 1773 return NoVoicedSoundMark; 1774 } 1775 1776 static inline bool isCombiningVoicedSoundMark(UChar character) 1777 { 1778 switch (character) { 1779 case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK 1780 case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK 1781 return true; 1782 } 1783 return false; 1784 } 1785 1786 static inline bool containsKanaLetters(const String& pattern) 1787 { 1788 const UChar* characters = pattern.characters(); 1789 unsigned length = pattern.length(); 1790 for (unsigned i = 0; i < length; ++i) { 1791 if (isKanaLetter(characters[i])) 1792 return true; 1793 } 1794 return false; 1795 } 1796 1797 static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer) 1798 { 1799 ASSERT(length); 1800 1801 buffer.resize(length); 1802 1803 UErrorCode status = U_ZERO_ERROR; 1804 size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status); 1805 ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR); 1806 ASSERT(bufferSize); 1807 1808 buffer.resize(bufferSize); 1809 1810 if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) 1811 return; 1812 1813 status = U_ZERO_ERROR; 1814 unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status); 1815 ASSERT(status == U_STRING_NOT_TERMINATED_WARNING); 1816 } 1817 1818 static bool isNonLatin1Separator(UChar32 character) 1819 { 1820 ASSERT_ARG(character, character >= 256); 1821 1822 return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK); 1823 } 1824 1825 static inline bool isSeparator(UChar32 character) 1826 { 1827 static const bool latin1SeparatorTable[256] = { 1828 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1829 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1830 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . / 1831 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, // : ; < = > ? 1832 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // @ 1833 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, // [ \ ] ^ _ 1834 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ` 1835 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // { | } ~ 1836 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1837 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1838 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1839 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1840 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1841 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1842 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1843 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 1844 }; 1845 1846 if (character < 256) 1847 return latin1SeparatorTable[character]; 1848 1849 return isNonLatin1Separator(character); 1850 } 1851 1852 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options) 1853 : m_target(target) 1854 , m_options(options) 1855 , m_prefixLength(0) 1856 , m_atBreak(true) 1857 , m_needsMoreContext(options & AtWordStarts) 1858 , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target)) 1859 { 1860 ASSERT(!m_target.isEmpty()); 1861 1862 // FIXME: We'd like to tailor the searcher to fold quote marks for us instead 1863 // of doing it in a separate replacement pass here, but ICU doesn't offer a way 1864 // to add tailoring on top of the locale-specific tailoring as of this writing. 1865 foldQuoteMarksAndSoftHyphens(m_target); 1866 1867 size_t targetLength = m_target.length(); 1868 m_buffer.reserveInitialCapacity(max(targetLength * 8, minimumSearchBufferSize)); 1869 m_overlap = m_buffer.capacity() / 4; 1870 1871 if ((m_options & AtWordStarts) && targetLength) { 1872 UChar32 targetFirstCharacter; 1873 U16_GET(m_target.characters(), 0, 0, targetLength, targetFirstCharacter); 1874 // Characters in the separator category never really occur at the beginning of a word, 1875 // so if the target begins with such a character, we just ignore the AtWordStart option. 1876 if (isSeparator(targetFirstCharacter)) { 1877 m_options &= ~AtWordStarts; 1878 m_needsMoreContext = false; 1879 } 1880 } 1881 1882 // Grab the single global searcher. 1883 // If we ever have a reason to do more than once search buffer at once, we'll have 1884 // to move to multiple searchers. 1885 lockSearcher(); 1886 1887 UStringSearch* searcher = WebCore::searcher(); 1888 UCollator* collator = usearch_getCollator(searcher); 1889 1890 UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY; 1891 if (ucol_getStrength(collator) != strength) { 1892 ucol_setStrength(collator, strength); 1893 usearch_reset(searcher); 1894 } 1895 1896 UErrorCode status = U_ZERO_ERROR; 1897 usearch_setPattern(searcher, m_target.characters(), targetLength, &status); 1898 ASSERT(status == U_ZERO_ERROR); 1899 1900 // The kana workaround requires a normalized copy of the target string. 1901 if (m_targetRequiresKanaWorkaround) 1902 normalizeCharacters(m_target.characters(), m_target.length(), m_normalizedTarget); 1903 } 1904 1905 inline SearchBuffer::~SearchBuffer() 1906 { 1907 // Leave the static object pointing to a valid string. 1908 UErrorCode status = U_ZERO_ERROR; 1909 usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status); 1910 ASSERT(status == U_ZERO_ERROR); 1911 1912 unlockSearcher(); 1913 } 1914 1915 inline size_t SearchBuffer::append(const UChar* characters, size_t length) 1916 { 1917 ASSERT(length); 1918 1919 if (m_atBreak) { 1920 m_buffer.shrink(0); 1921 m_prefixLength = 0; 1922 m_atBreak = false; 1923 } else if (m_buffer.size() == m_buffer.capacity()) { 1924 memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar)); 1925 m_prefixLength -= min(m_prefixLength, m_buffer.size() - m_overlap); 1926 m_buffer.shrink(m_overlap); 1927 } 1928 1929 size_t oldLength = m_buffer.size(); 1930 size_t usableLength = min(m_buffer.capacity() - oldLength, length); 1931 ASSERT(usableLength); 1932 m_buffer.append(characters, usableLength); 1933 foldQuoteMarksAndSoftHyphens(m_buffer.data() + oldLength, usableLength); 1934 return usableLength; 1935 } 1936 1937 inline bool SearchBuffer::needsMoreContext() const 1938 { 1939 return m_needsMoreContext; 1940 } 1941 1942 inline void SearchBuffer::prependContext(const UChar* characters, size_t length) 1943 { 1944 ASSERT(m_needsMoreContext); 1945 ASSERT(m_prefixLength == m_buffer.size()); 1946 1947 if (!length) 1948 return; 1949 1950 m_atBreak = false; 1951 1952 size_t wordBoundaryContextStart = length; 1953 if (wordBoundaryContextStart) { 1954 U16_BACK_1(characters, 0, wordBoundaryContextStart); 1955 wordBoundaryContextStart = startOfLastWordBoundaryContext(characters, wordBoundaryContextStart); 1956 } 1957 1958 size_t usableLength = min(m_buffer.capacity() - m_prefixLength, length - wordBoundaryContextStart); 1959 m_buffer.prepend(characters + length - usableLength, usableLength); 1960 m_prefixLength += usableLength; 1961 1962 if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity()) 1963 m_needsMoreContext = false; 1964 } 1965 1966 inline bool SearchBuffer::atBreak() const 1967 { 1968 return m_atBreak; 1969 } 1970 1971 inline void SearchBuffer::reachedBreak() 1972 { 1973 m_atBreak = true; 1974 } 1975 1976 inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const 1977 { 1978 // This function implements the kana workaround. If usearch treats 1979 // it as a match, but we do not want to, then it's a "bad match". 1980 if (!m_targetRequiresKanaWorkaround) 1981 return false; 1982 1983 // Normalize into a match buffer. We reuse a single buffer rather than 1984 // creating a new one each time. 1985 normalizeCharacters(match, matchLength, m_normalizedMatch); 1986 1987 const UChar* a = m_normalizedTarget.begin(); 1988 const UChar* aEnd = m_normalizedTarget.end(); 1989 1990 const UChar* b = m_normalizedMatch.begin(); 1991 const UChar* bEnd = m_normalizedMatch.end(); 1992 1993 while (true) { 1994 // Skip runs of non-kana-letter characters. This is necessary so we can 1995 // correctly handle strings where the target and match have different-length 1996 // runs of characters that match, while still double checking the correctness 1997 // of matches of kana letters with other kana letters. 1998 while (a != aEnd && !isKanaLetter(*a)) 1999 ++a; 2000 while (b != bEnd && !isKanaLetter(*b)) 2001 ++b; 2002 2003 // If we reached the end of either the target or the match, we should have 2004 // reached the end of both; both should have the same number of kana letters. 2005 if (a == aEnd || b == bEnd) { 2006 ASSERT(a == aEnd); 2007 ASSERT(b == bEnd); 2008 return false; 2009 } 2010 2011 // Check for differences in the kana letter character itself. 2012 if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b)) 2013 return true; 2014 if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b)) 2015 return true; 2016 ++a; 2017 ++b; 2018 2019 // Check for differences in combining voiced sound marks found after the letter. 2020 while (1) { 2021 if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) { 2022 if (b != bEnd && isCombiningVoicedSoundMark(*b)) 2023 return true; 2024 break; 2025 } 2026 if (!(b != bEnd && isCombiningVoicedSoundMark(*b))) 2027 return true; 2028 if (*a != *b) 2029 return true; 2030 ++a; 2031 ++b; 2032 } 2033 } 2034 } 2035 2036 inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const 2037 { 2038 ASSERT(m_options & AtWordStarts); 2039 2040 if (!start) 2041 return true; 2042 2043 if (m_options & TreatMedialCapitalAsWordStart) { 2044 int size = m_buffer.size(); 2045 int offset = start; 2046 UChar32 firstCharacter; 2047 U16_GET(m_buffer.data(), 0, offset, size, firstCharacter); 2048 UChar32 previousCharacter; 2049 U16_PREV(m_buffer.data(), 0, offset, previousCharacter); 2050 2051 if (isSeparator(firstCharacter)) { 2052 // The start of a separator run is a word start (".org" in "webkit.org"). 2053 if (!isSeparator(previousCharacter)) 2054 return true; 2055 } else if (isASCIIUpper(firstCharacter)) { 2056 // The start of an uppercase run is a word start ("Kit" in "WebKit"). 2057 if (!isASCIIUpper(previousCharacter)) 2058 return true; 2059 // The last character of an uppercase run followed by a non-separator, non-digit 2060 // is a word start ("Request" in "XMLHTTPRequest"). 2061 offset = start; 2062 U16_FWD_1(m_buffer.data(), offset, size); 2063 UChar32 nextCharacter = 0; 2064 if (offset < size) 2065 U16_GET(m_buffer.data(), 0, offset, size, nextCharacter); 2066 if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter)) 2067 return true; 2068 } else if (isASCIIDigit(firstCharacter)) { 2069 // The start of a digit run is a word start ("2" in "WebKit2"). 2070 if (!isASCIIDigit(previousCharacter)) 2071 return true; 2072 } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) { 2073 // The start of a non-separator, non-uppercase, non-digit run is a word start, 2074 // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore"). 2075 return true; 2076 } 2077 } 2078 2079 size_t wordBreakSearchStart = start + length; 2080 while (wordBreakSearchStart > start) 2081 wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */); 2082 return wordBreakSearchStart == start; 2083 } 2084 2085 inline size_t SearchBuffer::search(size_t& start) 2086 { 2087 size_t size = m_buffer.size(); 2088 if (m_atBreak) { 2089 if (!size) 2090 return 0; 2091 } else { 2092 if (size != m_buffer.capacity()) 2093 return 0; 2094 } 2095 2096 UStringSearch* searcher = WebCore::searcher(); 2097 2098 UErrorCode status = U_ZERO_ERROR; 2099 usearch_setText(searcher, m_buffer.data(), size, &status); 2100 ASSERT(status == U_ZERO_ERROR); 2101 2102 usearch_setOffset(searcher, m_prefixLength, &status); 2103 ASSERT(status == U_ZERO_ERROR); 2104 2105 int matchStart = usearch_next(searcher, &status); 2106 ASSERT(status == U_ZERO_ERROR); 2107 2108 nextMatch: 2109 if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) { 2110 ASSERT(matchStart == USEARCH_DONE); 2111 return 0; 2112 } 2113 2114 // Matches that start in the overlap area are only tentative. 2115 // The same match may appear later, matching more characters, 2116 // possibly including a combining character that's not yet in the buffer. 2117 if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) { 2118 size_t overlap = m_overlap; 2119 if (m_options & AtWordStarts) { 2120 // Ensure that there is sufficient context before matchStart the next time around for 2121 // determining if it is at a word boundary. 2122 int wordBoundaryContextStart = matchStart; 2123 U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart); 2124 wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart); 2125 overlap = min(size - 1, max(overlap, size - wordBoundaryContextStart)); 2126 } 2127 memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar)); 2128 m_prefixLength -= min(m_prefixLength, size - overlap); 2129 m_buffer.shrink(overlap); 2130 return 0; 2131 } 2132 2133 size_t matchedLength = usearch_getMatchedLength(searcher); 2134 ASSERT(matchStart + matchedLength <= size); 2135 2136 // If this match is "bad", move on to the next match. 2137 if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) { 2138 matchStart = usearch_next(searcher, &status); 2139 ASSERT(status == U_ZERO_ERROR); 2140 goto nextMatch; 2141 } 2142 2143 size_t newSize = size - (matchStart + 1); 2144 memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar)); 2145 m_prefixLength -= min<size_t>(m_prefixLength, matchStart + 1); 2146 m_buffer.shrink(newSize); 2147 2148 start = size - matchStart; 2149 return matchedLength; 2150 } 2151 2152 #else // !ICU_UNICODE 2153 2154 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options) 2155 : m_target(options & CaseInsensitive ? target.foldCase() : target) 2156 , m_options(options) 2157 , m_buffer(m_target.length()) 2158 , m_isCharacterStartBuffer(m_target.length()) 2159 , m_isBufferFull(false) 2160 , m_cursor(0) 2161 { 2162 ASSERT(!m_target.isEmpty()); 2163 m_target.replace(noBreakSpace, ' '); 2164 foldQuoteMarksAndSoftHyphens(m_target); 2165 } 2166 2167 inline SearchBuffer::~SearchBuffer() 2168 { 2169 } 2170 2171 inline void SearchBuffer::reachedBreak() 2172 { 2173 m_cursor = 0; 2174 m_isBufferFull = false; 2175 } 2176 2177 inline bool SearchBuffer::atBreak() const 2178 { 2179 return !m_cursor && !m_isBufferFull; 2180 } 2181 2182 inline void SearchBuffer::append(UChar c, bool isStart) 2183 { 2184 m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMarkOrSoftHyphen(c); 2185 m_isCharacterStartBuffer[m_cursor] = isStart; 2186 if (++m_cursor == m_target.length()) { 2187 m_cursor = 0; 2188 m_isBufferFull = true; 2189 } 2190 } 2191 2192 inline size_t SearchBuffer::append(const UChar* characters, size_t length) 2193 { 2194 ASSERT(length); 2195 if (!(m_options & CaseInsensitive)) { 2196 append(characters[0], true); 2197 return 1; 2198 } 2199 const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough 2200 UChar foldedCharacters[maxFoldedCharacters]; 2201 bool error; 2202 int numFoldedCharacters = foldCase(foldedCharacters, maxFoldedCharacters, characters, 1, &error); 2203 ASSERT(!error); 2204 ASSERT(numFoldedCharacters); 2205 ASSERT(numFoldedCharacters <= maxFoldedCharacters); 2206 if (!error && numFoldedCharacters) { 2207 numFoldedCharacters = min(numFoldedCharacters, maxFoldedCharacters); 2208 append(foldedCharacters[0], true); 2209 for (int i = 1; i < numFoldedCharacters; ++i) 2210 append(foldedCharacters[i], false); 2211 } 2212 return 1; 2213 } 2214 2215 inline bool SearchBuffer::needsMoreContext() const 2216 { 2217 return false; 2218 } 2219 2220 void SearchBuffer::prependContext(const UChar*, size_t) 2221 { 2222 ASSERT_NOT_REACHED(); 2223 } 2224 2225 inline size_t SearchBuffer::search(size_t& start) 2226 { 2227 if (!m_isBufferFull) 2228 return 0; 2229 if (!m_isCharacterStartBuffer[m_cursor]) 2230 return 0; 2231 2232 size_t tailSpace = m_target.length() - m_cursor; 2233 if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0) 2234 return 0; 2235 if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0) 2236 return 0; 2237 2238 start = length(); 2239 2240 // Now that we've found a match once, we don't want to find it again, because those 2241 // are the SearchBuffer semantics, allowing for a buffer where you append more than one 2242 // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if 2243 // we want to get rid of that in the future we could track this with a separate boolean 2244 // or even move the characters to the start of the buffer and set m_isBufferFull to false. 2245 m_isCharacterStartBuffer[m_cursor] = false; 2246 2247 return start; 2248 } 2249 2250 // Returns the number of characters that were appended to the buffer (what we are searching in). 2251 // That's not necessarily the same length as the passed-in target string, because case folding 2252 // can make two strings match even though they're not the same length. 2253 size_t SearchBuffer::length() const 2254 { 2255 size_t bufferSize = m_target.length(); 2256 size_t length = 0; 2257 for (size_t i = 0; i < bufferSize; ++i) 2258 length += m_isCharacterStartBuffer[i]; 2259 return length; 2260 } 2261 2262 #endif // !ICU_UNICODE 2263 2264 // -------- 2265 2266 int TextIterator::rangeLength(const Range* r, bool forSelectionPreservation) 2267 { 2268 int length = 0; 2269 for (TextIterator it(r, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance()) 2270 length += it.length(); 2271 2272 return length; 2273 } 2274 2275 PassRefPtr<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount) 2276 { 2277 CharacterIterator entireRangeIterator(entireRange); 2278 return characterSubrange(entireRangeIterator, characterOffset, characterCount); 2279 } 2280 2281 PassRefPtr<Range> TextIterator::rangeFromLocationAndLength(Element* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation) 2282 { 2283 RefPtr<Range> resultRange = scope->document()->createRange(); 2284 2285 int docTextPosition = 0; 2286 int rangeEnd = rangeLocation + rangeLength; 2287 bool startRangeFound = false; 2288 2289 RefPtr<Range> textRunRange; 2290 2291 TextIterator it(rangeOfContents(scope).get(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); 2292 2293 // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>. 2294 if (rangeLocation == 0 && rangeLength == 0 && it.atEnd()) { 2295 textRunRange = it.range(); 2296 2297 ExceptionCode ec = 0; 2298 resultRange->setStart(textRunRange->startContainer(), 0, ec); 2299 ASSERT(!ec); 2300 resultRange->setEnd(textRunRange->startContainer(), 0, ec); 2301 ASSERT(!ec); 2302 2303 return resultRange.release(); 2304 } 2305 2306 for (; !it.atEnd(); it.advance()) { 2307 int len = it.length(); 2308 textRunRange = it.range(); 2309 2310 bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + len; 2311 bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + len; 2312 2313 // Fix textRunRange->endPosition(), but only if foundStart || foundEnd, because it is only 2314 // in those cases that textRunRange is used. 2315 if (foundEnd) { 2316 // FIXME: This is a workaround for the fact that the end of a run is often at the wrong 2317 // position for emitted '\n's. 2318 if (len == 1 && it.characters()[0] == '\n') { 2319 scope->document()->updateLayoutIgnorePendingStylesheets(); 2320 it.advance(); 2321 if (!it.atEnd()) { 2322 RefPtr<Range> range = it.range(); 2323 ExceptionCode ec = 0; 2324 textRunRange->setEnd(range->startContainer(), range->startOffset(), ec); 2325 ASSERT(!ec); 2326 } else { 2327 Position runStart = textRunRange->startPosition(); 2328 Position runEnd = VisiblePosition(runStart).next().deepEquivalent(); 2329 if (runEnd.isNotNull()) { 2330 ExceptionCode ec = 0; 2331 textRunRange->setEnd(runEnd.containerNode(), runEnd.computeOffsetInContainerNode(), ec); 2332 ASSERT(!ec); 2333 } 2334 } 2335 } 2336 } 2337 2338 if (foundStart) { 2339 startRangeFound = true; 2340 int exception = 0; 2341 if (textRunRange->startContainer()->isTextNode()) { 2342 int offset = rangeLocation - docTextPosition; 2343 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception); 2344 } else { 2345 if (rangeLocation == docTextPosition) 2346 resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset(), exception); 2347 else 2348 resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset(), exception); 2349 } 2350 } 2351 2352 if (foundEnd) { 2353 int exception = 0; 2354 if (textRunRange->startContainer()->isTextNode()) { 2355 int offset = rangeEnd - docTextPosition; 2356 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception); 2357 } else { 2358 if (rangeEnd == docTextPosition) 2359 resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset(), exception); 2360 else 2361 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception); 2362 } 2363 docTextPosition += len; 2364 break; 2365 } 2366 docTextPosition += len; 2367 } 2368 2369 if (!startRangeFound) 2370 return 0; 2371 2372 if (rangeLength != 0 && rangeEnd > docTextPosition) { // rangeEnd is out of bounds 2373 int exception = 0; 2374 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception); 2375 } 2376 2377 return resultRange.release(); 2378 } 2379 2380 bool TextIterator::locationAndLengthFromRange(const Range* range, size_t& location, size_t& length) 2381 { 2382 location = notFound; 2383 length = 0; 2384 2385 if (!range->startContainer()) 2386 return false; 2387 2388 Element* selectionRoot = range->ownerDocument()->frame()->selection()->rootEditableElement(); 2389 Element* scope = selectionRoot ? selectionRoot : range->ownerDocument()->documentElement(); 2390 2391 // The critical assumption is that this only gets called with ranges that 2392 // concentrate on a given area containing the selection root. This is done 2393 // because of text fields and textareas. The DOM for those is not 2394 // directly in the document DOM, so ensure that the range does not cross a 2395 // boundary of one of those. 2396 if (range->startContainer() != scope && !range->startContainer()->isDescendantOf(scope)) 2397 return false; 2398 if (range->endContainer() != scope && !range->endContainer()->isDescendantOf(scope)) 2399 return false; 2400 2401 RefPtr<Range> testRange = Range::create(scope->document(), scope, 0, range->startContainer(), range->startOffset()); 2402 ASSERT(testRange->startContainer() == scope); 2403 location = TextIterator::rangeLength(testRange.get()); 2404 2405 ExceptionCode ec; 2406 testRange->setEnd(range->endContainer(), range->endOffset(), ec); 2407 ASSERT(testRange->startContainer() == scope); 2408 length = TextIterator::rangeLength(testRange.get()) - location; 2409 return true; 2410 } 2411 2412 // -------- 2413 2414 UChar* plainTextToMallocAllocatedBuffer(const Range* r, unsigned& bufferLength, bool isDisplayString, TextIteratorBehavior defaultBehavior) 2415 { 2416 UChar* result = 0; 2417 2418 // Do this in pieces to avoid massive reallocations if there is a large amount of text. 2419 // Use system malloc for buffers since they can consume lots of memory and current TCMalloc is unable return it back to OS. 2420 static const unsigned cMaxSegmentSize = 1 << 16; 2421 bufferLength = 0; 2422 typedef pair<UChar*, unsigned> TextSegment; 2423 OwnPtr<Vector<TextSegment> > textSegments; 2424 Vector<UChar> textBuffer; 2425 textBuffer.reserveInitialCapacity(cMaxSegmentSize); 2426 TextIteratorBehavior behavior = defaultBehavior; 2427 if (!isDisplayString) 2428 behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding); 2429 2430 for (TextIterator it(r, behavior); !it.atEnd(); it.advance()) { 2431 if (textBuffer.size() && textBuffer.size() + it.length() > cMaxSegmentSize) { 2432 UChar* newSegmentBuffer = static_cast<UChar*>(malloc(textBuffer.size() * sizeof(UChar))); 2433 if (!newSegmentBuffer) 2434 goto exit; 2435 memcpy(newSegmentBuffer, textBuffer.data(), textBuffer.size() * sizeof(UChar)); 2436 if (!textSegments) 2437 textSegments = adoptPtr(new Vector<TextSegment>); 2438 textSegments->append(make_pair(newSegmentBuffer, (unsigned)textBuffer.size())); 2439 textBuffer.clear(); 2440 } 2441 textBuffer.append(it.characters(), it.length()); 2442 bufferLength += it.length(); 2443 } 2444 2445 if (!bufferLength) 2446 return 0; 2447 2448 // Since we know the size now, we can make a single buffer out of the pieces with one big alloc 2449 result = static_cast<UChar*>(malloc(bufferLength * sizeof(UChar))); 2450 if (!result) 2451 goto exit; 2452 2453 { 2454 UChar* resultPos = result; 2455 if (textSegments) { 2456 unsigned size = textSegments->size(); 2457 for (unsigned i = 0; i < size; ++i) { 2458 const TextSegment& segment = textSegments->at(i); 2459 memcpy(resultPos, segment.first, segment.second * sizeof(UChar)); 2460 resultPos += segment.second; 2461 } 2462 } 2463 memcpy(resultPos, textBuffer.data(), textBuffer.size() * sizeof(UChar)); 2464 } 2465 2466 exit: 2467 if (textSegments) { 2468 unsigned size = textSegments->size(); 2469 for (unsigned i = 0; i < size; ++i) 2470 free(textSegments->at(i).first); 2471 } 2472 2473 if (isDisplayString && r->ownerDocument()) 2474 r->ownerDocument()->displayBufferModifiedByEncoding(result, bufferLength); 2475 2476 return result; 2477 } 2478 2479 String plainText(const Range* r, TextIteratorBehavior defaultBehavior) 2480 { 2481 unsigned length; 2482 UChar* buf = plainTextToMallocAllocatedBuffer(r, length, false, defaultBehavior); 2483 if (!buf) 2484 return ""; 2485 String result(buf, length); 2486 free(buf); 2487 return result; 2488 } 2489 2490 static inline bool isAllCollapsibleWhitespace(const String& string) 2491 { 2492 const UChar* characters = string.characters(); 2493 unsigned length = string.length(); 2494 for (unsigned i = 0; i < length; ++i) { 2495 if (!isCollapsibleWhitespace(characters[i])) 2496 return false; 2497 } 2498 return true; 2499 } 2500 2501 static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward) 2502 { 2503 ExceptionCode ec = 0; 2504 RefPtr<Range> result = range->cloneRange(ec); 2505 ASSERT(!ec); 2506 result->collapse(!forward, ec); 2507 ASSERT(!ec); 2508 return result.release(); 2509 } 2510 2511 static size_t findPlainText(CharacterIterator& it, const String& target, FindOptions options, size_t& matchStart) 2512 { 2513 matchStart = 0; 2514 size_t matchLength = 0; 2515 2516 SearchBuffer buffer(target, options); 2517 2518 if (buffer.needsMoreContext()) { 2519 RefPtr<Range> startRange = it.range(); 2520 RefPtr<Range> beforeStartRange = startRange->ownerDocument()->createRange(); 2521 ExceptionCode ec = 0; 2522 beforeStartRange->setEnd(startRange->startContainer(), startRange->startOffset(), ec); 2523 for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) { 2524 buffer.prependContext(backwardsIterator.characters(), backwardsIterator.length()); 2525 if (!buffer.needsMoreContext()) 2526 break; 2527 } 2528 } 2529 2530 while (!it.atEnd()) { 2531 it.advance(buffer.append(it.characters(), it.length())); 2532 tryAgain: 2533 size_t matchStartOffset; 2534 if (size_t newMatchLength = buffer.search(matchStartOffset)) { 2535 // Note that we found a match, and where we found it. 2536 size_t lastCharacterInBufferOffset = it.characterOffset(); 2537 ASSERT(lastCharacterInBufferOffset >= matchStartOffset); 2538 matchStart = lastCharacterInBufferOffset - matchStartOffset; 2539 matchLength = newMatchLength; 2540 // If searching forward, stop on the first match. 2541 // If searching backward, don't stop, so we end up with the last match. 2542 if (!(options & Backwards)) 2543 break; 2544 goto tryAgain; 2545 } 2546 if (it.atBreak() && !buffer.atBreak()) { 2547 buffer.reachedBreak(); 2548 goto tryAgain; 2549 } 2550 } 2551 2552 return matchLength; 2553 } 2554 2555 PassRefPtr<Range> findPlainText(const Range* range, const String& target, FindOptions options) 2556 { 2557 // First, find the text. 2558 size_t matchStart; 2559 size_t matchLength; 2560 { 2561 CharacterIterator findIterator(range, TextIteratorEntersTextControls); 2562 matchLength = findPlainText(findIterator, target, options, matchStart); 2563 if (!matchLength) 2564 return collapsedToBoundary(range, !(options & Backwards)); 2565 } 2566 2567 // Then, find the document position of the start and the end of the text. 2568 CharacterIterator computeRangeIterator(range, TextIteratorEntersTextControls); 2569 return characterSubrange(computeRangeIterator, matchStart, matchLength); 2570 } 2571 2572 } 2573