1 /* 2 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010 Apple Inc. All rights reserved. 3 * Copyright (C) 2005 Alexey Proskuryakov. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include "config.h" 28 #include "TextIterator.h" 29 30 #include "CharacterNames.h" 31 #include "Document.h" 32 #include "HTMLElement.h" 33 #include "HTMLNames.h" 34 #include "htmlediting.h" 35 #include "InlineTextBox.h" 36 #include "Range.h" 37 #include "RenderTableCell.h" 38 #include "RenderTableRow.h" 39 #include "RenderTextControl.h" 40 #include "VisiblePosition.h" 41 #include "visible_units.h" 42 43 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION 44 #include "TextBreakIteratorInternalICU.h" 45 #include <unicode/usearch.h> 46 #endif 47 48 using namespace WTF::Unicode; 49 using namespace std; 50 51 namespace WebCore { 52 53 using namespace HTMLNames; 54 55 // Buffer that knows how to compare with a search target. 56 // Keeps enough of the previous text to be able to search in the future, but no more. 57 // Non-breaking spaces are always equal to normal spaces. 58 // Case folding is also done if <isCaseSensitive> is false. 59 class SearchBuffer : public Noncopyable { 60 public: 61 SearchBuffer(const String& target, bool isCaseSensitive); 62 ~SearchBuffer(); 63 64 // Returns number of characters appended; guaranteed to be in the range [1, length]. 65 size_t append(const UChar*, size_t length); 66 void reachedBreak(); 67 68 // Result is the size in characters of what was found. 69 // And <startOffset> is the number of characters back to the start of what was found. 70 size_t search(size_t& startOffset); 71 bool atBreak() const; 72 73 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION 74 75 private: 76 bool isBadMatch(const UChar*, size_t length) const; 77 78 String m_target; 79 Vector<UChar> m_buffer; 80 size_t m_overlap; 81 bool m_atBreak; 82 83 bool m_targetRequiresKanaWorkaround; 84 Vector<UChar> m_normalizedTarget; 85 mutable Vector<UChar> m_normalizedMatch; 86 87 #else 88 89 private: 90 void append(UChar, bool isCharacterStart); 91 size_t length() const; 92 93 String m_target; 94 bool m_isCaseSensitive; 95 96 Vector<UChar> m_buffer; 97 Vector<bool> m_isCharacterStartBuffer; 98 bool m_isBufferFull; 99 size_t m_cursor; 100 101 #endif 102 }; 103 104 // -------- 105 106 static const unsigned bitsInWord = sizeof(unsigned) * 8; 107 static const unsigned bitInWordMask = bitsInWord - 1; 108 109 BitStack::BitStack() 110 : m_size(0) 111 { 112 } 113 114 void BitStack::push(bool bit) 115 { 116 unsigned index = m_size / bitsInWord; 117 unsigned shift = m_size & bitInWordMask; 118 if (!shift && index == m_words.size()) { 119 m_words.grow(index + 1); 120 m_words[index] = 0; 121 } 122 unsigned& word = m_words[index]; 123 unsigned mask = 1U << shift; 124 if (bit) 125 word |= mask; 126 else 127 word &= ~mask; 128 ++m_size; 129 } 130 131 void BitStack::pop() 132 { 133 if (m_size) 134 --m_size; 135 } 136 137 bool BitStack::top() const 138 { 139 if (!m_size) 140 return false; 141 unsigned shift = (m_size - 1) & bitInWordMask; 142 return m_words.last() & (1U << shift); 143 } 144 145 unsigned BitStack::size() const 146 { 147 return m_size; 148 } 149 150 // -------- 151 152 static inline Node* parentCrossingShadowBoundaries(Node* node) 153 { 154 if (Node* parent = node->parentNode()) 155 return parent; 156 return node->shadowParentNode(); 157 } 158 159 #if !ASSERT_DISABLED 160 161 static unsigned depthCrossingShadowBoundaries(Node* node) 162 { 163 unsigned depth = 0; 164 for (Node* parent = parentCrossingShadowBoundaries(node); parent; parent = parentCrossingShadowBoundaries(parent)) 165 ++depth; 166 return depth; 167 } 168 169 #endif 170 171 // This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees. 172 static Node* nextInPreOrderCrossingShadowBoundaries(Node* rangeEndContainer, int rangeEndOffset) 173 { 174 if (!rangeEndContainer) 175 return 0; 176 if (rangeEndOffset >= 0 && !rangeEndContainer->offsetInCharacters()) { 177 if (Node* next = rangeEndContainer->childNode(rangeEndOffset)) 178 return next; 179 } 180 for (Node* node = rangeEndContainer; node; node = parentCrossingShadowBoundaries(node)) { 181 if (Node* next = node->nextSibling()) 182 return next; 183 } 184 return 0; 185 } 186 187 static Node* previousInPostOrderCrossingShadowBoundaries(Node* rangeStartContainer, int rangeStartOffset) 188 { 189 if (!rangeStartContainer) 190 return 0; 191 if (rangeStartOffset > 0 && !rangeStartContainer->offsetInCharacters()) { 192 if (Node* previous = rangeStartContainer->childNode(rangeStartOffset - 1)) 193 return previous; 194 } 195 for (Node* node = rangeStartContainer; node; node = parentCrossingShadowBoundaries(node)) { 196 if (Node* previous = node->previousSibling()) 197 return previous; 198 } 199 return 0; 200 } 201 202 // -------- 203 204 static inline bool fullyClipsContents(Node* node) 205 { 206 RenderObject* renderer = node->renderer(); 207 if (!renderer || !renderer->isBox() || !renderer->hasOverflowClip()) 208 return false; 209 return toRenderBox(renderer)->size().isEmpty(); 210 } 211 212 static inline bool ignoresContainerClip(Node* node) 213 { 214 RenderObject* renderer = node->renderer(); 215 if (!renderer || renderer->isText()) 216 return false; 217 EPosition position = renderer->style()->position(); 218 return position == AbsolutePosition || position == FixedPosition; 219 } 220 221 static void pushFullyClippedState(BitStack& stack, Node* node) 222 { 223 ASSERT(stack.size() == depthCrossingShadowBoundaries(node)); 224 225 // Push true if this node full clips its contents, or if a parent already has fully 226 // clipped and this is not a node that ignores its container's clip. 227 stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node))); 228 } 229 230 static void setUpFullyClippedStack(BitStack& stack, Node* node) 231 { 232 // Put the nodes in a vector so we can iterate in reverse order. 233 Vector<Node*, 100> ancestry; 234 for (Node* parent = parentCrossingShadowBoundaries(node); parent; parent = parentCrossingShadowBoundaries(parent)) 235 ancestry.append(parent); 236 237 // Call pushFullyClippedState on each node starting with the earliest ancestor. 238 size_t size = ancestry.size(); 239 for (size_t i = 0; i < size; ++i) 240 pushFullyClippedState(stack, ancestry[size - i - 1]); 241 pushFullyClippedState(stack, node); 242 243 ASSERT(stack.size() == 1 + depthCrossingShadowBoundaries(node)); 244 } 245 246 // -------- 247 248 TextIterator::TextIterator() 249 : m_startContainer(0) 250 , m_startOffset(0) 251 , m_endContainer(0) 252 , m_endOffset(0) 253 , m_positionNode(0) 254 , m_textCharacters(0) 255 , m_textLength(0) 256 , m_lastCharacter(0) 257 , m_emitCharactersBetweenAllVisiblePositions(false) 258 , m_enterTextControls(false) 259 { 260 } 261 262 TextIterator::TextIterator(const Range* r, bool emitCharactersBetweenAllVisiblePositions, bool enterTextControls) 263 : m_startContainer(0) 264 , m_startOffset(0) 265 , m_endContainer(0) 266 , m_endOffset(0) 267 , m_positionNode(0) 268 , m_textCharacters(0) 269 , m_textLength(0) 270 , m_emitCharactersBetweenAllVisiblePositions(emitCharactersBetweenAllVisiblePositions) 271 , m_enterTextControls(enterTextControls) 272 { 273 if (!r) 274 return; 275 276 // get and validate the range endpoints 277 Node* startContainer = r->startContainer(); 278 if (!startContainer) 279 return; 280 int startOffset = r->startOffset(); 281 Node* endContainer = r->endContainer(); 282 int endOffset = r->endOffset(); 283 284 // Callers should be handing us well-formed ranges. If we discover that this isn't 285 // the case, we could consider changing this assertion to an early return. 286 ASSERT(r->boundaryPointsValid()); 287 288 // remember range - this does not change 289 m_startContainer = startContainer; 290 m_startOffset = startOffset; 291 m_endContainer = endContainer; 292 m_endOffset = endOffset; 293 294 // set up the current node for processing 295 m_node = r->firstNode(); 296 if (!m_node) 297 return; 298 setUpFullyClippedStack(m_fullyClippedStack, m_node); 299 m_offset = m_node == m_startContainer ? m_startOffset : 0; 300 m_handledNode = false; 301 m_handledChildren = false; 302 303 // calculate first out of bounds node 304 m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(endContainer, endOffset); 305 306 // initialize node processing state 307 m_needAnotherNewline = false; 308 m_textBox = 0; 309 310 // initialize record of previous node processing 311 m_haveEmitted = false; 312 m_lastTextNode = 0; 313 m_lastTextNodeEndedWithCollapsedSpace = false; 314 m_lastCharacter = 0; 315 316 #ifndef NDEBUG 317 // need this just because of the assert in advance() 318 m_positionNode = m_node; 319 #endif 320 321 // identify the first run 322 advance(); 323 } 324 325 void TextIterator::advance() 326 { 327 // reset the run information 328 m_positionNode = 0; 329 m_textLength = 0; 330 331 // handle remembered node that needed a newline after the text node's newline 332 if (m_needAnotherNewline) { 333 // Emit the extra newline, and position it *inside* m_node, after m_node's 334 // contents, in case it's a block, in the same way that we position the first 335 // newline. The range for the emitted newline should start where the line 336 // break begins. 337 // FIXME: It would be cleaner if we emitted two newlines during the last 338 // iteration, instead of using m_needAnotherNewline. 339 Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node; 340 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1); 341 m_needAnotherNewline = false; 342 return; 343 } 344 345 // handle remembered text box 346 if (m_textBox) { 347 handleTextBox(); 348 if (m_positionNode) 349 return; 350 } 351 352 while (m_node && m_node != m_pastEndNode) { 353 // if the range ends at offset 0 of an element, represent the 354 // position, but not the content, of that element e.g. if the 355 // node is a blockflow element, emit a newline that 356 // precedes the element 357 if (m_node == m_endContainer && m_endOffset == 0) { 358 representNodeOffsetZero(); 359 m_node = 0; 360 return; 361 } 362 363 RenderObject* renderer = m_node->renderer(); 364 if (!renderer) { 365 m_handledNode = true; 366 m_handledChildren = true; 367 } else { 368 // handle current node according to its type 369 if (!m_handledNode) { 370 if (renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) // FIXME: What about CDATA_SECTION_NODE? 371 m_handledNode = handleTextNode(); 372 else if (renderer && (renderer->isImage() || renderer->isWidget() || 373 (renderer->node() && renderer->node()->isElementNode() && 374 static_cast<Element*>(renderer->node())->isFormControlElement()))) 375 m_handledNode = handleReplacedElement(); 376 else 377 m_handledNode = handleNonTextNode(); 378 if (m_positionNode) 379 return; 380 } 381 } 382 383 // find a new current node to handle in depth-first manner, 384 // calling exitNode() as we come back thru a parent node 385 Node* next = m_handledChildren ? 0 : m_node->firstChild(); 386 m_offset = 0; 387 if (!next) { 388 next = m_node->nextSibling(); 389 if (!next) { 390 bool pastEnd = m_node->traverseNextNode() == m_pastEndNode; 391 Node* parentNode = parentCrossingShadowBoundaries(m_node); 392 while (!next && parentNode) { 393 if ((pastEnd && parentNode == m_endContainer) || m_endContainer->isDescendantOf(parentNode)) 394 return; 395 bool haveRenderer = m_node->renderer(); 396 m_node = parentNode; 397 m_fullyClippedStack.pop(); 398 parentNode = parentCrossingShadowBoundaries(m_node); 399 if (haveRenderer) 400 exitNode(); 401 if (m_positionNode) { 402 m_handledNode = true; 403 m_handledChildren = true; 404 return; 405 } 406 next = m_node->nextSibling(); 407 } 408 } 409 m_fullyClippedStack.pop(); 410 } 411 412 // set the new current node 413 m_node = next; 414 if (m_node) 415 pushFullyClippedState(m_fullyClippedStack, m_node); 416 m_handledNode = false; 417 m_handledChildren = false; 418 419 // how would this ever be? 420 if (m_positionNode) 421 return; 422 } 423 } 424 425 static inline bool compareBoxStart(const InlineTextBox* first, const InlineTextBox* second) 426 { 427 return first->start() < second->start(); 428 } 429 430 bool TextIterator::handleTextNode() 431 { 432 if (m_fullyClippedStack.top()) 433 return false; 434 435 RenderText* renderer = toRenderText(m_node->renderer()); 436 if (renderer->style()->visibility() != VISIBLE) 437 return false; 438 439 m_lastTextNode = m_node; 440 String str = renderer->text(); 441 442 // handle pre-formatted text 443 if (!renderer->style()->collapseWhiteSpace()) { 444 int runStart = m_offset; 445 if (m_lastTextNodeEndedWithCollapsedSpace) { 446 emitCharacter(' ', m_node, 0, runStart, runStart); 447 return false; 448 } 449 int strLength = str.length(); 450 int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX; 451 int runEnd = min(strLength, end); 452 453 if (runStart >= runEnd) 454 return true; 455 456 emitText(m_node, runStart, runEnd); 457 return true; 458 } 459 460 if (!renderer->firstTextBox() && str.length() > 0) { 461 m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space 462 return true; 463 } 464 465 // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text) 466 if (renderer->containsReversedText()) { 467 m_sortedTextBoxes.clear(); 468 for (InlineTextBox* textBox = renderer->firstTextBox(); textBox; textBox = textBox->nextTextBox()) { 469 m_sortedTextBoxes.append(textBox); 470 } 471 std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), compareBoxStart); 472 m_sortedTextBoxesPosition = 0; 473 } 474 475 m_textBox = renderer->containsReversedText() ? m_sortedTextBoxes[0] : renderer->firstTextBox(); 476 handleTextBox(); 477 return true; 478 } 479 480 void TextIterator::handleTextBox() 481 { 482 RenderText* renderer = toRenderText(m_node->renderer()); 483 String str = renderer->text(); 484 int start = m_offset; 485 int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX; 486 while (m_textBox) { 487 int textBoxStart = m_textBox->start(); 488 int runStart = max(textBoxStart, start); 489 490 // Check for collapsed space at the start of this run. 491 InlineTextBox* firstTextBox = renderer->containsReversedText() ? m_sortedTextBoxes[0] : renderer->firstTextBox(); 492 bool needSpace = m_lastTextNodeEndedWithCollapsedSpace 493 || (m_textBox == firstTextBox && textBoxStart == runStart && runStart > 0); 494 if (needSpace && !isCollapsibleWhitespace(m_lastCharacter) && m_lastCharacter) { 495 if (m_lastTextNode == m_node && runStart > 0 && str[runStart - 1] == ' ') { 496 unsigned spaceRunStart = runStart - 1; 497 while (spaceRunStart > 0 && str[spaceRunStart - 1] == ' ') 498 --spaceRunStart; 499 emitText(m_node, spaceRunStart, spaceRunStart + 1); 500 } else 501 emitCharacter(' ', m_node, 0, runStart, runStart); 502 return; 503 } 504 int textBoxEnd = textBoxStart + m_textBox->len(); 505 int runEnd = min(textBoxEnd, end); 506 507 // Determine what the next text box will be, but don't advance yet 508 InlineTextBox* nextTextBox = 0; 509 if (renderer->containsReversedText()) { 510 if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size()) 511 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1]; 512 } else 513 nextTextBox = m_textBox->nextTextBox(); 514 515 if (runStart < runEnd) { 516 // Handle either a single newline character (which becomes a space), 517 // or a run of characters that does not include a newline. 518 // This effectively translates newlines to spaces without copying the text. 519 if (str[runStart] == '\n') { 520 emitCharacter(' ', m_node, 0, runStart, runStart + 1); 521 m_offset = runStart + 1; 522 } else { 523 int subrunEnd = str.find('\n', runStart); 524 if (subrunEnd == -1 || subrunEnd > runEnd) 525 subrunEnd = runEnd; 526 527 m_offset = subrunEnd; 528 emitText(m_node, runStart, subrunEnd); 529 } 530 531 // If we are doing a subrun that doesn't go to the end of the text box, 532 // come back again to finish handling this text box; don't advance to the next one. 533 if (m_positionEndOffset < textBoxEnd) 534 return; 535 536 // Advance and return 537 int nextRunStart = nextTextBox ? nextTextBox->start() : str.length(); 538 if (nextRunStart > runEnd) 539 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end 540 m_textBox = nextTextBox; 541 if (renderer->containsReversedText()) 542 ++m_sortedTextBoxesPosition; 543 return; 544 } 545 // Advance and continue 546 m_textBox = nextTextBox; 547 if (renderer->containsReversedText()) 548 ++m_sortedTextBoxesPosition; 549 } 550 } 551 552 bool TextIterator::handleReplacedElement() 553 { 554 if (m_fullyClippedStack.top()) 555 return false; 556 557 RenderObject* renderer = m_node->renderer(); 558 if (renderer->style()->visibility() != VISIBLE) 559 return false; 560 561 if (m_lastTextNodeEndedWithCollapsedSpace) { 562 emitCharacter(' ', m_lastTextNode->parentNode(), m_lastTextNode, 1, 1); 563 return false; 564 } 565 566 if (m_enterTextControls && renderer->isTextControl()) { 567 if (HTMLElement* innerTextElement = toRenderTextControl(renderer)->innerTextElement()) { 568 m_node = innerTextElement->shadowTreeRootNode(); 569 pushFullyClippedState(m_fullyClippedStack, m_node); 570 m_offset = 0; 571 return false; 572 } 573 } 574 575 m_haveEmitted = true; 576 577 if (m_emitCharactersBetweenAllVisiblePositions) { 578 // We want replaced elements to behave like punctuation for boundary 579 // finding, and to simply take up space for the selection preservation 580 // code in moveParagraphs, so we use a comma. 581 emitCharacter(',', m_node->parentNode(), m_node, 0, 1); 582 return true; 583 } 584 585 m_positionNode = m_node->parentNode(); 586 m_positionOffsetBaseNode = m_node; 587 m_positionStartOffset = 0; 588 m_positionEndOffset = 1; 589 590 m_textCharacters = 0; 591 m_textLength = 0; 592 593 m_lastCharacter = 0; 594 595 return true; 596 } 597 598 static bool shouldEmitTabBeforeNode(Node* node) 599 { 600 RenderObject* r = node->renderer(); 601 602 // Table cells are delimited by tabs. 603 if (!r || !isTableCell(node)) 604 return false; 605 606 // Want a tab before every cell other than the first one 607 RenderTableCell* rc = toRenderTableCell(r); 608 RenderTable* t = rc->table(); 609 return t && (t->cellBefore(rc) || t->cellAbove(rc)); 610 } 611 612 static bool shouldEmitNewlineForNode(Node* node) 613 { 614 // br elements are represented by a single newline. 615 RenderObject* r = node->renderer(); 616 if (!r) 617 return node->hasTagName(brTag); 618 619 return r->isBR(); 620 } 621 622 static bool shouldEmitNewlinesBeforeAndAfterNode(Node* node) 623 { 624 // Block flow (versus inline flow) is represented by having 625 // a newline both before and after the element. 626 RenderObject* r = node->renderer(); 627 if (!r) { 628 return (node->hasTagName(blockquoteTag) 629 || node->hasTagName(ddTag) 630 || node->hasTagName(divTag) 631 || node->hasTagName(dlTag) 632 || node->hasTagName(dtTag) 633 || node->hasTagName(h1Tag) 634 || node->hasTagName(h2Tag) 635 || node->hasTagName(h3Tag) 636 || node->hasTagName(h4Tag) 637 || node->hasTagName(h5Tag) 638 || node->hasTagName(h6Tag) 639 || node->hasTagName(hrTag) 640 || node->hasTagName(liTag) 641 || node->hasTagName(listingTag) 642 || node->hasTagName(olTag) 643 || node->hasTagName(pTag) 644 || node->hasTagName(preTag) 645 || node->hasTagName(trTag) 646 || node->hasTagName(ulTag)); 647 } 648 649 // Need to make an exception for table cells, because they are blocks, but we 650 // want them tab-delimited rather than having newlines before and after. 651 if (isTableCell(node)) 652 return false; 653 654 // Need to make an exception for table row elements, because they are neither 655 // "inline" or "RenderBlock", but we want newlines for them. 656 if (r->isTableRow()) { 657 RenderTable* t = toRenderTableRow(r)->table(); 658 if (t && !t->isInline()) 659 return true; 660 } 661 662 return !r->isInline() && r->isRenderBlock() && !r->isFloatingOrPositioned() && !r->isBody(); 663 } 664 665 static bool shouldEmitNewlineAfterNode(Node* node) 666 { 667 // FIXME: It should be better but slower to create a VisiblePosition here. 668 if (!shouldEmitNewlinesBeforeAndAfterNode(node)) 669 return false; 670 // Check if this is the very last renderer in the document. 671 // If so, then we should not emit a newline. 672 while ((node = node->traverseNextSibling())) 673 if (node->renderer()) 674 return true; 675 return false; 676 } 677 678 static bool shouldEmitNewlineBeforeNode(Node* node) 679 { 680 return shouldEmitNewlinesBeforeAndAfterNode(node); 681 } 682 683 static bool shouldEmitExtraNewlineForNode(Node* node) 684 { 685 // When there is a significant collapsed bottom margin, emit an extra 686 // newline for a more realistic result. We end up getting the right 687 // result even without margin collapsing. For example: <div><p>text</p></div> 688 // will work right even if both the <div> and the <p> have bottom margins. 689 RenderObject* r = node->renderer(); 690 if (!r || !r->isBox()) 691 return false; 692 693 // NOTE: We only do this for a select set of nodes, and fwiw WinIE appears 694 // not to do this at all 695 if (node->hasTagName(h1Tag) 696 || node->hasTagName(h2Tag) 697 || node->hasTagName(h3Tag) 698 || node->hasTagName(h4Tag) 699 || node->hasTagName(h5Tag) 700 || node->hasTagName(h6Tag) 701 || node->hasTagName(pTag)) { 702 RenderStyle* style = r->style(); 703 if (style) { 704 int bottomMargin = toRenderBox(r)->collapsedMarginBottom(); 705 int fontSize = style->fontDescription().computedPixelSize(); 706 if (bottomMargin * 2 >= fontSize) 707 return true; 708 } 709 } 710 711 return false; 712 } 713 714 static int collapsedSpaceLength(RenderText* renderer, int textEnd) 715 { 716 const UChar* characters = renderer->text()->characters(); 717 int length = renderer->text()->length(); 718 for (int i = textEnd; i < length; ++i) { 719 if (!renderer->style()->isCollapsibleWhiteSpace(characters[i])) 720 return i - textEnd; 721 } 722 723 return length - textEnd; 724 } 725 726 static int maxOffsetIncludingCollapsedSpaces(Node* node) 727 { 728 int offset = caretMaxOffset(node); 729 730 if (node->renderer() && node->renderer()->isText()) 731 offset += collapsedSpaceLength(toRenderText(node->renderer()), offset); 732 733 return offset; 734 } 735 736 // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic). 737 bool TextIterator::shouldRepresentNodeOffsetZero() 738 { 739 if (m_emitCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isTable()) 740 return true; 741 742 // Leave element positioned flush with start of a paragraph 743 // (e.g. do not insert tab before a table cell at the start of a paragraph) 744 if (m_lastCharacter == '\n') 745 return false; 746 747 // Otherwise, show the position if we have emitted any characters 748 if (m_haveEmitted) 749 return true; 750 751 // We've not emitted anything yet. Generally, there is no need for any positioning then. 752 // The only exception is when the element is visually not in the same line as 753 // the start of the range (e.g. the range starts at the end of the previous paragraph). 754 // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we 755 // make quicker checks to possibly avoid that. Another check that we could make is 756 // is whether the inline vs block flow changed since the previous visible element. 757 // I think we're already in a special enough case that that won't be needed, tho. 758 759 // No character needed if this is the first node in the range. 760 if (m_node == m_startContainer) 761 return false; 762 763 // If we are outside the start container's subtree, assume we need to emit. 764 // FIXME: m_startContainer could be an inline block 765 if (!m_node->isDescendantOf(m_startContainer)) 766 return true; 767 768 // If we started as m_startContainer offset 0 and the current node is a descendant of 769 // the start container, we already had enough context to correctly decide whether to 770 // emit after a preceding block. We chose not to emit (m_haveEmitted is false), 771 // so don't second guess that now. 772 // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably 773 // immaterial since we likely would have already emitted something by now. 774 if (m_startOffset == 0) 775 return false; 776 777 // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning. 778 // Additionally, if the range we are iterating over contains huge sections of unrendered content, 779 // we would create VisiblePositions on every call to this function without this check. 780 if (!m_node->renderer() || m_node->renderer()->style()->visibility() != VISIBLE) 781 return false; 782 783 // The startPos.isNotNull() check is needed because the start could be before the body, 784 // and in that case we'll get null. We don't want to put in newlines at the start in that case. 785 // The currPos.isNotNull() check is needed because positions in non-HTML content 786 // (like SVG) do not have visible positions, and we don't want to emit for them either. 787 VisiblePosition startPos = VisiblePosition(m_startContainer, m_startOffset, DOWNSTREAM); 788 VisiblePosition currPos = VisiblePosition(m_node, 0, DOWNSTREAM); 789 return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos); 790 } 791 792 bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node* node) 793 { 794 return node->renderer() && node->renderer()->isTable() && (node->renderer()->isInline() || m_emitCharactersBetweenAllVisiblePositions); 795 } 796 797 void TextIterator::representNodeOffsetZero() 798 { 799 // Emit a character to show the positioning of m_node. 800 801 // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can 802 // create VisiblePositions, which is expensive. So, we perform the inexpensive checks 803 // on m_node to see if it necessitates emitting a character first and will early return 804 // before encountering shouldRepresentNodeOffsetZero()s worse case behavior. 805 if (shouldEmitTabBeforeNode(m_node)) { 806 if (shouldRepresentNodeOffsetZero()) 807 emitCharacter('\t', m_node->parentNode(), m_node, 0, 0); 808 } else if (shouldEmitNewlineBeforeNode(m_node)) { 809 if (shouldRepresentNodeOffsetZero()) 810 emitCharacter('\n', m_node->parentNode(), m_node, 0, 0); 811 } else if (shouldEmitSpaceBeforeAndAfterNode(m_node)) { 812 if (shouldRepresentNodeOffsetZero()) 813 emitCharacter(' ', m_node->parentNode(), m_node, 0, 0); 814 } 815 } 816 817 bool TextIterator::handleNonTextNode() 818 { 819 if (shouldEmitNewlineForNode(m_node)) 820 emitCharacter('\n', m_node->parentNode(), m_node, 0, 1); 821 else if (m_emitCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isHR()) 822 emitCharacter(' ', m_node->parentNode(), m_node, 0, 1); 823 else 824 representNodeOffsetZero(); 825 826 return true; 827 } 828 829 void TextIterator::exitNode() 830 { 831 // prevent emitting a newline when exiting a collapsed block at beginning of the range 832 // FIXME: !m_haveEmitted does not necessarily mean there was a collapsed block... it could 833 // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and 834 // therefore look like a blank line. 835 if (!m_haveEmitted) 836 return; 837 838 // Emit with a position *inside* m_node, after m_node's contents, in 839 // case it is a block, because the run should start where the 840 // emitted character is positioned visually. 841 Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node; 842 // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making 843 // the logic in _web_attributedStringFromRange match. We'll get that for free when we switch to use 844 // TextIterator in _web_attributedStringFromRange. 845 // See <rdar://problem/5428427> for an example of how this mismatch will cause problems. 846 if (m_lastTextNode && shouldEmitNewlineAfterNode(m_node)) { 847 // use extra newline to represent margin bottom, as needed 848 bool addNewline = shouldEmitExtraNewlineForNode(m_node); 849 850 // FIXME: We need to emit a '\n' as we leave an empty block(s) that 851 // contain a VisiblePosition when doing selection preservation. 852 if (m_lastCharacter != '\n') { 853 // insert a newline with a position following this block's contents. 854 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1); 855 // remember whether to later add a newline for the current node 856 ASSERT(!m_needAnotherNewline); 857 m_needAnotherNewline = addNewline; 858 } else if (addNewline) 859 // insert a newline with a position following this block's contents. 860 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1); 861 } 862 863 // If nothing was emitted, see if we need to emit a space. 864 if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(m_node)) 865 emitCharacter(' ', baseNode->parentNode(), baseNode, 1, 1); 866 } 867 868 void TextIterator::emitCharacter(UChar c, Node* textNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset) 869 { 870 m_haveEmitted = true; 871 872 // remember information with which to construct the TextIterator::range() 873 // NOTE: textNode is often not a text node, so the range will specify child nodes of positionNode 874 m_positionNode = textNode; 875 m_positionOffsetBaseNode = offsetBaseNode; 876 m_positionStartOffset = textStartOffset; 877 m_positionEndOffset = textEndOffset; 878 879 // remember information with which to construct the TextIterator::characters() and length() 880 m_singleCharacterBuffer = c; 881 m_textCharacters = &m_singleCharacterBuffer; 882 m_textLength = 1; 883 884 // remember some iteration state 885 m_lastTextNodeEndedWithCollapsedSpace = false; 886 m_lastCharacter = c; 887 } 888 889 void TextIterator::emitText(Node* textNode, int textStartOffset, int textEndOffset) 890 { 891 RenderText* renderer = toRenderText(m_node->renderer()); 892 String str = renderer->text(); 893 ASSERT(str.characters()); 894 895 m_positionNode = textNode; 896 m_positionOffsetBaseNode = 0; 897 m_positionStartOffset = textStartOffset; 898 m_positionEndOffset = textEndOffset; 899 m_textCharacters = str.characters() + textStartOffset; 900 m_textLength = textEndOffset - textStartOffset; 901 m_lastCharacter = str[textEndOffset - 1]; 902 903 m_lastTextNodeEndedWithCollapsedSpace = false; 904 m_haveEmitted = true; 905 } 906 907 PassRefPtr<Range> TextIterator::range() const 908 { 909 // use the current run information, if we have it 910 if (m_positionNode) { 911 if (m_positionOffsetBaseNode) { 912 int index = m_positionOffsetBaseNode->nodeIndex(); 913 m_positionStartOffset += index; 914 m_positionEndOffset += index; 915 m_positionOffsetBaseNode = 0; 916 } 917 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset); 918 } 919 920 // otherwise, return the end of the overall range we were given 921 if (m_endContainer) 922 return Range::create(m_endContainer->document(), m_endContainer, m_endOffset, m_endContainer, m_endOffset); 923 924 return 0; 925 } 926 927 Node* TextIterator::node() const 928 { 929 RefPtr<Range> textRange = range(); 930 if (!textRange) 931 return 0; 932 933 Node* node = textRange->startContainer(); 934 if (!node) 935 return 0; 936 if (node->offsetInCharacters()) 937 return node; 938 939 return node->childNode(textRange->startOffset()); 940 } 941 942 // -------- 943 944 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator() 945 : m_positionNode(0) 946 { 947 } 948 949 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range* r) 950 : m_positionNode(0) 951 { 952 if (!r) 953 return; 954 955 Node* startNode = r->startContainer(); 956 if (!startNode) 957 return; 958 Node* endNode = r->endContainer(); 959 int startOffset = r->startOffset(); 960 int endOffset = r->endOffset(); 961 962 if (!startNode->offsetInCharacters()) { 963 if (startOffset >= 0 && startOffset < static_cast<int>(startNode->childNodeCount())) { 964 startNode = startNode->childNode(startOffset); 965 startOffset = 0; 966 } 967 } 968 if (!endNode->offsetInCharacters()) { 969 if (endOffset > 0 && endOffset <= static_cast<int>(endNode->childNodeCount())) { 970 endNode = endNode->childNode(endOffset - 1); 971 endOffset = lastOffsetInNode(endNode); 972 } 973 } 974 975 m_node = endNode; 976 setUpFullyClippedStack(m_fullyClippedStack, m_node); 977 m_offset = endOffset; 978 m_handledNode = false; 979 m_handledChildren = endOffset == 0; 980 981 m_startNode = startNode; 982 m_startOffset = startOffset; 983 m_endNode = endNode; 984 m_endOffset = endOffset; 985 986 #ifndef NDEBUG 987 // Need this just because of the assert. 988 m_positionNode = endNode; 989 #endif 990 991 m_lastTextNode = 0; 992 m_lastCharacter = '\n'; 993 994 m_pastStartNode = previousInPostOrderCrossingShadowBoundaries(startNode, startOffset); 995 996 advance(); 997 } 998 999 void SimplifiedBackwardsTextIterator::advance() 1000 { 1001 ASSERT(m_positionNode); 1002 1003 m_positionNode = 0; 1004 m_textLength = 0; 1005 1006 while (m_node && m_node != m_pastStartNode) { 1007 // Don't handle node if we start iterating at [node, 0]. 1008 if (!m_handledNode && !(m_node == m_endNode && m_endOffset == 0)) { 1009 RenderObject* renderer = m_node->renderer(); 1010 if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) { 1011 // FIXME: What about CDATA_SECTION_NODE? 1012 if (renderer->style()->visibility() == VISIBLE && m_offset > 0) 1013 m_handledNode = handleTextNode(); 1014 } else if (renderer && (renderer->isImage() || renderer->isWidget())) { 1015 if (renderer->style()->visibility() == VISIBLE && m_offset > 0) 1016 m_handledNode = handleReplacedElement(); 1017 } else 1018 m_handledNode = handleNonTextNode(); 1019 if (m_positionNode) 1020 return; 1021 } 1022 1023 Node* next = m_handledChildren ? 0 : m_node->lastChild(); 1024 if (!next) { 1025 // Exit empty containers as we pass over them or containers 1026 // where [container, 0] is where we started iterating. 1027 if (!m_handledNode && 1028 canHaveChildrenForEditing(m_node) && 1029 m_node->parentNode() && 1030 (!m_node->lastChild() || (m_node == m_endNode && m_endOffset == 0))) { 1031 exitNode(); 1032 if (m_positionNode) { 1033 m_handledNode = true; 1034 m_handledChildren = true; 1035 return; 1036 } 1037 } 1038 // Exit all other containers. 1039 next = m_node->previousSibling(); 1040 while (!next) { 1041 Node* parentNode = parentCrossingShadowBoundaries(m_node); 1042 if (!parentNode) 1043 break; 1044 m_node = parentNode; 1045 m_fullyClippedStack.pop(); 1046 exitNode(); 1047 if (m_positionNode) { 1048 m_handledNode = true; 1049 m_handledChildren = true; 1050 return; 1051 } 1052 next = m_node->previousSibling(); 1053 } 1054 m_fullyClippedStack.pop(); 1055 } 1056 1057 m_node = next; 1058 if (m_node) 1059 pushFullyClippedState(m_fullyClippedStack, m_node); 1060 // For the purpose of word boundary detection, 1061 // we should iterate all visible text and trailing (collapsed) whitespaces. 1062 m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(m_node) : 0; 1063 m_handledNode = false; 1064 m_handledChildren = false; 1065 1066 if (m_positionNode) 1067 return; 1068 } 1069 } 1070 1071 bool SimplifiedBackwardsTextIterator::handleTextNode() 1072 { 1073 m_lastTextNode = m_node; 1074 1075 RenderText* renderer = toRenderText(m_node->renderer()); 1076 String str = renderer->text(); 1077 1078 if (!renderer->firstTextBox() && str.length() > 0) 1079 return true; 1080 1081 m_positionEndOffset = m_offset; 1082 1083 m_offset = (m_node == m_startNode) ? m_startOffset : 0; 1084 m_positionNode = m_node; 1085 m_positionStartOffset = m_offset; 1086 m_textLength = m_positionEndOffset - m_positionStartOffset; 1087 m_textCharacters = str.characters() + m_positionStartOffset; 1088 1089 m_lastCharacter = str[m_positionEndOffset - 1]; 1090 1091 return true; 1092 } 1093 1094 bool SimplifiedBackwardsTextIterator::handleReplacedElement() 1095 { 1096 unsigned index = m_node->nodeIndex(); 1097 // We want replaced elements to behave like punctuation for boundary 1098 // finding, and to simply take up space for the selection preservation 1099 // code in moveParagraphs, so we use a comma. Unconditionally emit 1100 // here because this iterator is only used for boundary finding. 1101 emitCharacter(',', m_node->parentNode(), index, index + 1); 1102 return true; 1103 } 1104 1105 bool SimplifiedBackwardsTextIterator::handleNonTextNode() 1106 { 1107 // We can use a linefeed in place of a tab because this simple iterator is only used to 1108 // find boundaries, not actual content. A linefeed breaks words, sentences, and paragraphs. 1109 if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineAfterNode(m_node) || shouldEmitTabBeforeNode(m_node)) { 1110 unsigned index = m_node->nodeIndex(); 1111 // The start of this emitted range is wrong. Ensuring correctness would require 1112 // VisiblePositions and so would be slow. previousBoundary expects this. 1113 emitCharacter('\n', m_node->parentNode(), index + 1, index + 1); 1114 } 1115 return true; 1116 } 1117 1118 void SimplifiedBackwardsTextIterator::exitNode() 1119 { 1120 if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineBeforeNode(m_node) || shouldEmitTabBeforeNode(m_node)) { 1121 // The start of this emitted range is wrong. Ensuring correctness would require 1122 // VisiblePositions and so would be slow. previousBoundary expects this. 1123 emitCharacter('\n', m_node, 0, 0); 1124 } 1125 } 1126 1127 void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node* node, int startOffset, int endOffset) 1128 { 1129 m_singleCharacterBuffer = c; 1130 m_positionNode = node; 1131 m_positionStartOffset = startOffset; 1132 m_positionEndOffset = endOffset; 1133 m_textCharacters = &m_singleCharacterBuffer; 1134 m_textLength = 1; 1135 m_lastCharacter = c; 1136 } 1137 1138 PassRefPtr<Range> SimplifiedBackwardsTextIterator::range() const 1139 { 1140 if (m_positionNode) 1141 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset); 1142 1143 return Range::create(m_startNode->document(), m_startNode, m_startOffset, m_startNode, m_startOffset); 1144 } 1145 1146 // -------- 1147 1148 CharacterIterator::CharacterIterator() 1149 : m_offset(0) 1150 , m_runOffset(0) 1151 , m_atBreak(true) 1152 { 1153 } 1154 1155 CharacterIterator::CharacterIterator(const Range* r, bool emitCharactersBetweenAllVisiblePositions, bool enterTextControls) 1156 : m_offset(0) 1157 , m_runOffset(0) 1158 , m_atBreak(true) 1159 , m_textIterator(r, emitCharactersBetweenAllVisiblePositions, enterTextControls) 1160 { 1161 while (!atEnd() && m_textIterator.length() == 0) 1162 m_textIterator.advance(); 1163 } 1164 1165 PassRefPtr<Range> CharacterIterator::range() const 1166 { 1167 RefPtr<Range> r = m_textIterator.range(); 1168 if (!m_textIterator.atEnd()) { 1169 if (m_textIterator.length() <= 1) { 1170 ASSERT(m_runOffset == 0); 1171 } else { 1172 Node* n = r->startContainer(); 1173 ASSERT(n == r->endContainer()); 1174 int offset = r->startOffset() + m_runOffset; 1175 ExceptionCode ec = 0; 1176 r->setStart(n, offset, ec); 1177 r->setEnd(n, offset + 1, ec); 1178 ASSERT(!ec); 1179 } 1180 } 1181 return r.release(); 1182 } 1183 1184 void CharacterIterator::advance(int count) 1185 { 1186 if (count <= 0) { 1187 ASSERT(count == 0); 1188 return; 1189 } 1190 1191 m_atBreak = false; 1192 1193 // easy if there is enough left in the current m_textIterator run 1194 int remaining = m_textIterator.length() - m_runOffset; 1195 if (count < remaining) { 1196 m_runOffset += count; 1197 m_offset += count; 1198 return; 1199 } 1200 1201 // exhaust the current m_textIterator run 1202 count -= remaining; 1203 m_offset += remaining; 1204 1205 // move to a subsequent m_textIterator run 1206 for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) { 1207 int runLength = m_textIterator.length(); 1208 if (runLength == 0) 1209 m_atBreak = true; 1210 else { 1211 // see whether this is m_textIterator to use 1212 if (count < runLength) { 1213 m_runOffset = count; 1214 m_offset += count; 1215 return; 1216 } 1217 1218 // exhaust this m_textIterator run 1219 count -= runLength; 1220 m_offset += runLength; 1221 } 1222 } 1223 1224 // ran to the end of the m_textIterator... no more runs left 1225 m_atBreak = true; 1226 m_runOffset = 0; 1227 } 1228 1229 String CharacterIterator::string(int numChars) 1230 { 1231 Vector<UChar> result; 1232 result.reserveInitialCapacity(numChars); 1233 while (numChars > 0 && !atEnd()) { 1234 int runSize = min(numChars, length()); 1235 result.append(characters(), runSize); 1236 numChars -= runSize; 1237 advance(runSize); 1238 } 1239 return String::adopt(result); 1240 } 1241 1242 static PassRefPtr<Range> characterSubrange(CharacterIterator& it, int offset, int length) 1243 { 1244 it.advance(offset); 1245 RefPtr<Range> start = it.range(); 1246 1247 if (length > 1) 1248 it.advance(length - 1); 1249 RefPtr<Range> end = it.range(); 1250 1251 return Range::create(start->startContainer()->document(), 1252 start->startContainer(), start->startOffset(), 1253 end->endContainer(), end->endOffset()); 1254 } 1255 1256 BackwardsCharacterIterator::BackwardsCharacterIterator() 1257 : m_offset(0) 1258 , m_runOffset(0) 1259 , m_atBreak(true) 1260 { 1261 } 1262 1263 BackwardsCharacterIterator::BackwardsCharacterIterator(const Range* range) 1264 : m_offset(0) 1265 , m_runOffset(0) 1266 , m_atBreak(true) 1267 , m_textIterator(range) 1268 { 1269 while (!atEnd() && !m_textIterator.length()) 1270 m_textIterator.advance(); 1271 } 1272 1273 PassRefPtr<Range> BackwardsCharacterIterator::range() const 1274 { 1275 RefPtr<Range> r = m_textIterator.range(); 1276 if (!m_textIterator.atEnd()) { 1277 if (m_textIterator.length() <= 1) 1278 ASSERT(m_runOffset == 0); 1279 else { 1280 Node* n = r->startContainer(); 1281 ASSERT(n == r->endContainer()); 1282 int offset = r->endOffset() - m_runOffset; 1283 ExceptionCode ec = 0; 1284 r->setStart(n, offset - 1, ec); 1285 r->setEnd(n, offset, ec); 1286 ASSERT(!ec); 1287 } 1288 } 1289 return r.release(); 1290 } 1291 1292 void BackwardsCharacterIterator::advance(int count) 1293 { 1294 if (count <= 0) { 1295 ASSERT(!count); 1296 return; 1297 } 1298 1299 m_atBreak = false; 1300 1301 int remaining = m_textIterator.length() - m_runOffset; 1302 if (count < remaining) { 1303 m_runOffset += count; 1304 m_offset += count; 1305 return; 1306 } 1307 1308 count -= remaining; 1309 m_offset += remaining; 1310 1311 for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) { 1312 int runLength = m_textIterator.length(); 1313 if (runLength == 0) 1314 m_atBreak = true; 1315 else { 1316 if (count < runLength) { 1317 m_runOffset = count; 1318 m_offset += count; 1319 return; 1320 } 1321 1322 count -= runLength; 1323 m_offset += runLength; 1324 } 1325 } 1326 1327 m_atBreak = true; 1328 m_runOffset = 0; 1329 } 1330 1331 // -------- 1332 1333 WordAwareIterator::WordAwareIterator() 1334 : m_previousText(0) 1335 , m_didLookAhead(false) 1336 { 1337 } 1338 1339 WordAwareIterator::WordAwareIterator(const Range* r) 1340 : m_previousText(0) 1341 , m_didLookAhead(true) // so we consider the first chunk from the text iterator 1342 , m_textIterator(r) 1343 { 1344 advance(); // get in position over the first chunk of text 1345 } 1346 1347 // We're always in one of these modes: 1348 // - The current chunk in the text iterator is our current chunk 1349 // (typically its a piece of whitespace, or text that ended with whitespace) 1350 // - The previous chunk in the text iterator is our current chunk 1351 // (we looked ahead to the next chunk and found a word boundary) 1352 // - We built up our own chunk of text from many chunks from the text iterator 1353 1354 // FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries. 1355 1356 void WordAwareIterator::advance() 1357 { 1358 m_previousText = 0; 1359 m_buffer.clear(); // toss any old buffer we built up 1360 1361 // If last time we did a look-ahead, start with that looked-ahead chunk now 1362 if (!m_didLookAhead) { 1363 ASSERT(!m_textIterator.atEnd()); 1364 m_textIterator.advance(); 1365 } 1366 m_didLookAhead = false; 1367 1368 // Go to next non-empty chunk 1369 while (!m_textIterator.atEnd() && m_textIterator.length() == 0) 1370 m_textIterator.advance(); 1371 m_range = m_textIterator.range(); 1372 1373 if (m_textIterator.atEnd()) 1374 return; 1375 1376 while (1) { 1377 // If this chunk ends in whitespace we can just use it as our chunk. 1378 if (isSpaceOrNewline(m_textIterator.characters()[m_textIterator.length() - 1])) 1379 return; 1380 1381 // If this is the first chunk that failed, save it in previousText before look ahead 1382 if (m_buffer.isEmpty()) { 1383 m_previousText = m_textIterator.characters(); 1384 m_previousLength = m_textIterator.length(); 1385 } 1386 1387 // Look ahead to next chunk. If it is whitespace or a break, we can use the previous stuff 1388 m_textIterator.advance(); 1389 if (m_textIterator.atEnd() || m_textIterator.length() == 0 || isSpaceOrNewline(m_textIterator.characters()[0])) { 1390 m_didLookAhead = true; 1391 return; 1392 } 1393 1394 if (m_buffer.isEmpty()) { 1395 // Start gobbling chunks until we get to a suitable stopping point 1396 m_buffer.append(m_previousText, m_previousLength); 1397 m_previousText = 0; 1398 } 1399 m_buffer.append(m_textIterator.characters(), m_textIterator.length()); 1400 int exception = 0; 1401 m_range->setEnd(m_textIterator.range()->endContainer(), m_textIterator.range()->endOffset(), exception); 1402 } 1403 } 1404 1405 int WordAwareIterator::length() const 1406 { 1407 if (!m_buffer.isEmpty()) 1408 return m_buffer.size(); 1409 if (m_previousText) 1410 return m_previousLength; 1411 return m_textIterator.length(); 1412 } 1413 1414 const UChar* WordAwareIterator::characters() const 1415 { 1416 if (!m_buffer.isEmpty()) 1417 return m_buffer.data(); 1418 if (m_previousText) 1419 return m_previousText; 1420 return m_textIterator.characters(); 1421 } 1422 1423 // -------- 1424 1425 static inline UChar foldQuoteMark(UChar c) 1426 { 1427 switch (c) { 1428 case hebrewPunctuationGershayim: 1429 case leftDoubleQuotationMark: 1430 case rightDoubleQuotationMark: 1431 return '"'; 1432 case hebrewPunctuationGeresh: 1433 case leftSingleQuotationMark: 1434 case rightSingleQuotationMark: 1435 return '\''; 1436 default: 1437 return c; 1438 } 1439 } 1440 1441 static inline void foldQuoteMarks(String& s) 1442 { 1443 s.replace(hebrewPunctuationGeresh, '\''); 1444 s.replace(hebrewPunctuationGershayim, '"'); 1445 s.replace(leftDoubleQuotationMark, '"'); 1446 s.replace(leftSingleQuotationMark, '\''); 1447 s.replace(rightDoubleQuotationMark, '"'); 1448 s.replace(rightSingleQuotationMark, '\''); 1449 } 1450 1451 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION 1452 1453 static inline void foldQuoteMarks(UChar* data, size_t length) 1454 { 1455 for (size_t i = 0; i < length; ++i) 1456 data[i] = foldQuoteMark(data[i]); 1457 } 1458 1459 static const size_t minimumSearchBufferSize = 8192; 1460 1461 #ifndef NDEBUG 1462 static bool searcherInUse; 1463 #endif 1464 1465 static UStringSearch* createSearcher() 1466 { 1467 // Provide a non-empty pattern and non-empty text so usearch_open will not fail, 1468 // but it doesn't matter exactly what it is, since we don't perform any searches 1469 // without setting both the pattern and the text. 1470 UErrorCode status = U_ZERO_ERROR; 1471 UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, currentSearchLocaleID(), 0, &status); 1472 ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING); 1473 return searcher; 1474 } 1475 1476 static UStringSearch* searcher() 1477 { 1478 static UStringSearch* searcher = createSearcher(); 1479 return searcher; 1480 } 1481 1482 static inline void lockSearcher() 1483 { 1484 #ifndef NDEBUG 1485 ASSERT(!searcherInUse); 1486 searcherInUse = true; 1487 #endif 1488 } 1489 1490 static inline void unlockSearcher() 1491 { 1492 #ifndef NDEBUG 1493 ASSERT(searcherInUse); 1494 searcherInUse = false; 1495 #endif 1496 } 1497 1498 // ICU's search ignores the distinction between small kana letters and ones 1499 // that are not small, and also characters that differ only in the voicing 1500 // marks when considering only primary collation strength diffrences. 1501 // This is not helpful for end users, since these differences make words 1502 // distinct, so for our purposes we need these to be considered. 1503 // The Unicode folks do not think the collation algorithm should be 1504 // changed. To work around this, we would like to tailor the ICU searcher, 1505 // but we can't get that to work yet. So instead, we check for cases where 1506 // these differences occur, and skip those matches. 1507 1508 // We refer to the above technique as the "kana workaround". The next few 1509 // functions are helper functinos for the kana workaround. 1510 1511 static inline bool isKanaLetter(UChar character) 1512 { 1513 // Hiragana letters. 1514 if (character >= 0x3041 && character <= 0x3096) 1515 return true; 1516 1517 // Katakana letters. 1518 if (character >= 0x30A1 && character <= 0x30FA) 1519 return true; 1520 if (character >= 0x31F0 && character <= 0x31FF) 1521 return true; 1522 1523 // Halfwidth katakana letters. 1524 if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70) 1525 return true; 1526 1527 return false; 1528 } 1529 1530 static inline bool isSmallKanaLetter(UChar character) 1531 { 1532 ASSERT(isKanaLetter(character)); 1533 1534 switch (character) { 1535 case 0x3041: // HIRAGANA LETTER SMALL A 1536 case 0x3043: // HIRAGANA LETTER SMALL I 1537 case 0x3045: // HIRAGANA LETTER SMALL U 1538 case 0x3047: // HIRAGANA LETTER SMALL E 1539 case 0x3049: // HIRAGANA LETTER SMALL O 1540 case 0x3063: // HIRAGANA LETTER SMALL TU 1541 case 0x3083: // HIRAGANA LETTER SMALL YA 1542 case 0x3085: // HIRAGANA LETTER SMALL YU 1543 case 0x3087: // HIRAGANA LETTER SMALL YO 1544 case 0x308E: // HIRAGANA LETTER SMALL WA 1545 case 0x3095: // HIRAGANA LETTER SMALL KA 1546 case 0x3096: // HIRAGANA LETTER SMALL KE 1547 case 0x30A1: // KATAKANA LETTER SMALL A 1548 case 0x30A3: // KATAKANA LETTER SMALL I 1549 case 0x30A5: // KATAKANA LETTER SMALL U 1550 case 0x30A7: // KATAKANA LETTER SMALL E 1551 case 0x30A9: // KATAKANA LETTER SMALL O 1552 case 0x30C3: // KATAKANA LETTER SMALL TU 1553 case 0x30E3: // KATAKANA LETTER SMALL YA 1554 case 0x30E5: // KATAKANA LETTER SMALL YU 1555 case 0x30E7: // KATAKANA LETTER SMALL YO 1556 case 0x30EE: // KATAKANA LETTER SMALL WA 1557 case 0x30F5: // KATAKANA LETTER SMALL KA 1558 case 0x30F6: // KATAKANA LETTER SMALL KE 1559 case 0x31F0: // KATAKANA LETTER SMALL KU 1560 case 0x31F1: // KATAKANA LETTER SMALL SI 1561 case 0x31F2: // KATAKANA LETTER SMALL SU 1562 case 0x31F3: // KATAKANA LETTER SMALL TO 1563 case 0x31F4: // KATAKANA LETTER SMALL NU 1564 case 0x31F5: // KATAKANA LETTER SMALL HA 1565 case 0x31F6: // KATAKANA LETTER SMALL HI 1566 case 0x31F7: // KATAKANA LETTER SMALL HU 1567 case 0x31F8: // KATAKANA LETTER SMALL HE 1568 case 0x31F9: // KATAKANA LETTER SMALL HO 1569 case 0x31FA: // KATAKANA LETTER SMALL MU 1570 case 0x31FB: // KATAKANA LETTER SMALL RA 1571 case 0x31FC: // KATAKANA LETTER SMALL RI 1572 case 0x31FD: // KATAKANA LETTER SMALL RU 1573 case 0x31FE: // KATAKANA LETTER SMALL RE 1574 case 0x31FF: // KATAKANA LETTER SMALL RO 1575 case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A 1576 case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I 1577 case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U 1578 case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E 1579 case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O 1580 case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA 1581 case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU 1582 case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO 1583 case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU 1584 return true; 1585 } 1586 return false; 1587 } 1588 1589 enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark }; 1590 1591 static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character) 1592 { 1593 ASSERT(isKanaLetter(character)); 1594 1595 switch (character) { 1596 case 0x304C: // HIRAGANA LETTER GA 1597 case 0x304E: // HIRAGANA LETTER GI 1598 case 0x3050: // HIRAGANA LETTER GU 1599 case 0x3052: // HIRAGANA LETTER GE 1600 case 0x3054: // HIRAGANA LETTER GO 1601 case 0x3056: // HIRAGANA LETTER ZA 1602 case 0x3058: // HIRAGANA LETTER ZI 1603 case 0x305A: // HIRAGANA LETTER ZU 1604 case 0x305C: // HIRAGANA LETTER ZE 1605 case 0x305E: // HIRAGANA LETTER ZO 1606 case 0x3060: // HIRAGANA LETTER DA 1607 case 0x3062: // HIRAGANA LETTER DI 1608 case 0x3065: // HIRAGANA LETTER DU 1609 case 0x3067: // HIRAGANA LETTER DE 1610 case 0x3069: // HIRAGANA LETTER DO 1611 case 0x3070: // HIRAGANA LETTER BA 1612 case 0x3073: // HIRAGANA LETTER BI 1613 case 0x3076: // HIRAGANA LETTER BU 1614 case 0x3079: // HIRAGANA LETTER BE 1615 case 0x307C: // HIRAGANA LETTER BO 1616 case 0x3094: // HIRAGANA LETTER VU 1617 case 0x30AC: // KATAKANA LETTER GA 1618 case 0x30AE: // KATAKANA LETTER GI 1619 case 0x30B0: // KATAKANA LETTER GU 1620 case 0x30B2: // KATAKANA LETTER GE 1621 case 0x30B4: // KATAKANA LETTER GO 1622 case 0x30B6: // KATAKANA LETTER ZA 1623 case 0x30B8: // KATAKANA LETTER ZI 1624 case 0x30BA: // KATAKANA LETTER ZU 1625 case 0x30BC: // KATAKANA LETTER ZE 1626 case 0x30BE: // KATAKANA LETTER ZO 1627 case 0x30C0: // KATAKANA LETTER DA 1628 case 0x30C2: // KATAKANA LETTER DI 1629 case 0x30C5: // KATAKANA LETTER DU 1630 case 0x30C7: // KATAKANA LETTER DE 1631 case 0x30C9: // KATAKANA LETTER DO 1632 case 0x30D0: // KATAKANA LETTER BA 1633 case 0x30D3: // KATAKANA LETTER BI 1634 case 0x30D6: // KATAKANA LETTER BU 1635 case 0x30D9: // KATAKANA LETTER BE 1636 case 0x30DC: // KATAKANA LETTER BO 1637 case 0x30F4: // KATAKANA LETTER VU 1638 case 0x30F7: // KATAKANA LETTER VA 1639 case 0x30F8: // KATAKANA LETTER VI 1640 case 0x30F9: // KATAKANA LETTER VE 1641 case 0x30FA: // KATAKANA LETTER VO 1642 return VoicedSoundMark; 1643 case 0x3071: // HIRAGANA LETTER PA 1644 case 0x3074: // HIRAGANA LETTER PI 1645 case 0x3077: // HIRAGANA LETTER PU 1646 case 0x307A: // HIRAGANA LETTER PE 1647 case 0x307D: // HIRAGANA LETTER PO 1648 case 0x30D1: // KATAKANA LETTER PA 1649 case 0x30D4: // KATAKANA LETTER PI 1650 case 0x30D7: // KATAKANA LETTER PU 1651 case 0x30DA: // KATAKANA LETTER PE 1652 case 0x30DD: // KATAKANA LETTER PO 1653 return SemiVoicedSoundMark; 1654 } 1655 return NoVoicedSoundMark; 1656 } 1657 1658 static inline bool isCombiningVoicedSoundMark(UChar character) 1659 { 1660 switch (character) { 1661 case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK 1662 case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK 1663 return true; 1664 } 1665 return false; 1666 } 1667 1668 static inline bool containsKanaLetters(const String& pattern) 1669 { 1670 const UChar* characters = pattern.characters(); 1671 unsigned length = pattern.length(); 1672 for (unsigned i = 0; i < length; ++i) { 1673 if (isKanaLetter(characters[i])) 1674 return true; 1675 } 1676 return false; 1677 } 1678 1679 static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer) 1680 { 1681 ASSERT(length); 1682 1683 buffer.resize(length); 1684 1685 UErrorCode status = U_ZERO_ERROR; 1686 size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status); 1687 ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR); 1688 ASSERT(bufferSize); 1689 1690 buffer.resize(bufferSize); 1691 1692 if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) 1693 return; 1694 1695 status = U_ZERO_ERROR; 1696 unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status); 1697 ASSERT(status == U_STRING_NOT_TERMINATED_WARNING); 1698 } 1699 1700 inline SearchBuffer::SearchBuffer(const String& target, bool isCaseSensitive) 1701 : m_target(target) 1702 , m_atBreak(true) 1703 , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target)) 1704 { 1705 ASSERT(!m_target.isEmpty()); 1706 1707 // FIXME: We'd like to tailor the searcher to fold quote marks for us instead 1708 // of doing it in a separate replacement pass here, but ICU doesn't offer a way 1709 // to add tailoring on top of the locale-specific tailoring as of this writing. 1710 foldQuoteMarks(m_target); 1711 1712 size_t targetLength = m_target.length(); 1713 m_buffer.reserveInitialCapacity(max(targetLength * 8, minimumSearchBufferSize)); 1714 m_overlap = m_buffer.capacity() / 4; 1715 1716 // Grab the single global searcher. 1717 // If we ever have a reason to do more than once search buffer at once, we'll have 1718 // to move to multiple searchers. 1719 lockSearcher(); 1720 1721 UStringSearch* searcher = WebCore::searcher(); 1722 UCollator* collator = usearch_getCollator(searcher); 1723 1724 UCollationStrength strength = isCaseSensitive ? UCOL_TERTIARY : UCOL_PRIMARY; 1725 if (ucol_getStrength(collator) != strength) { 1726 ucol_setStrength(collator, strength); 1727 usearch_reset(searcher); 1728 } 1729 1730 UErrorCode status = U_ZERO_ERROR; 1731 usearch_setPattern(searcher, m_target.characters(), targetLength, &status); 1732 ASSERT(status == U_ZERO_ERROR); 1733 1734 // The kana workaround requires a normalized copy of the target string. 1735 if (m_targetRequiresKanaWorkaround) 1736 normalizeCharacters(m_target.characters(), m_target.length(), m_normalizedTarget); 1737 } 1738 1739 inline SearchBuffer::~SearchBuffer() 1740 { 1741 unlockSearcher(); 1742 } 1743 1744 inline size_t SearchBuffer::append(const UChar* characters, size_t length) 1745 { 1746 ASSERT(length); 1747 1748 if (m_atBreak) { 1749 m_buffer.shrink(0); 1750 m_atBreak = false; 1751 } else if (m_buffer.size() == m_buffer.capacity()) { 1752 memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar)); 1753 m_buffer.shrink(m_overlap); 1754 } 1755 1756 size_t oldLength = m_buffer.size(); 1757 size_t usableLength = min(m_buffer.capacity() - oldLength, length); 1758 ASSERT(usableLength); 1759 m_buffer.append(characters, usableLength); 1760 foldQuoteMarks(m_buffer.data() + oldLength, usableLength); 1761 return usableLength; 1762 } 1763 1764 inline bool SearchBuffer::atBreak() const 1765 { 1766 return m_atBreak; 1767 } 1768 1769 inline void SearchBuffer::reachedBreak() 1770 { 1771 m_atBreak = true; 1772 } 1773 1774 inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const 1775 { 1776 // This function implements the kana workaround. If usearch treats 1777 // it as a match, but we do not want to, then it's a "bad match". 1778 if (!m_targetRequiresKanaWorkaround) 1779 return false; 1780 1781 // Normalize into a match buffer. We reuse a single buffer rather than 1782 // creating a new one each time. 1783 normalizeCharacters(match, matchLength, m_normalizedMatch); 1784 1785 const UChar* a = m_normalizedTarget.begin(); 1786 const UChar* aEnd = m_normalizedTarget.end(); 1787 1788 const UChar* b = m_normalizedMatch.begin(); 1789 const UChar* bEnd = m_normalizedMatch.end(); 1790 1791 while (true) { 1792 // Skip runs of non-kana-letter characters. This is necessary so we can 1793 // correctly handle strings where the target and match have different-length 1794 // runs of characters that match, while still double checking the correctness 1795 // of matches of kana letters with other kana letters. 1796 while (a != aEnd && !isKanaLetter(*a)) 1797 ++a; 1798 while (b != bEnd && !isKanaLetter(*b)) 1799 ++b; 1800 1801 // If we reached the end of either the target or the match, we should have 1802 // reached the end of both; both should have the same number of kana letters. 1803 if (a == aEnd || b == bEnd) { 1804 ASSERT(a == aEnd); 1805 ASSERT(b == bEnd); 1806 return false; 1807 } 1808 1809 // Check for differences in the kana letter character itself. 1810 if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b)) 1811 return true; 1812 if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b)) 1813 return true; 1814 ++a; 1815 ++b; 1816 1817 // Check for differences in combining voiced sound marks found after the letter. 1818 while (1) { 1819 if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) { 1820 if (b != bEnd && isCombiningVoicedSoundMark(*b)) 1821 return true; 1822 break; 1823 } 1824 if (!(b != bEnd && isCombiningVoicedSoundMark(*b))) 1825 return true; 1826 if (*a != *b) 1827 return true; 1828 ++a; 1829 ++b; 1830 } 1831 } 1832 } 1833 1834 inline size_t SearchBuffer::search(size_t& start) 1835 { 1836 size_t size = m_buffer.size(); 1837 if (m_atBreak) { 1838 if (!size) 1839 return 0; 1840 } else { 1841 if (size != m_buffer.capacity()) 1842 return 0; 1843 } 1844 1845 UStringSearch* searcher = WebCore::searcher(); 1846 1847 UErrorCode status = U_ZERO_ERROR; 1848 usearch_setText(searcher, m_buffer.data(), size, &status); 1849 ASSERT(status == U_ZERO_ERROR); 1850 1851 int matchStart = usearch_first(searcher, &status); 1852 ASSERT(status == U_ZERO_ERROR); 1853 1854 nextMatch: 1855 if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) { 1856 ASSERT(matchStart == USEARCH_DONE); 1857 return 0; 1858 } 1859 1860 // Matches that start in the overlap area are only tentative. 1861 // The same match may appear later, matching more characters, 1862 // possibly including a combining character that's not yet in the buffer. 1863 if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) { 1864 memcpy(m_buffer.data(), m_buffer.data() + size - m_overlap, m_overlap * sizeof(UChar)); 1865 m_buffer.shrink(m_overlap); 1866 return 0; 1867 } 1868 1869 size_t matchedLength = usearch_getMatchedLength(searcher); 1870 ASSERT(matchStart + matchedLength <= size); 1871 1872 // If this match is "bad", move on to the next match. 1873 if (isBadMatch(m_buffer.data() + matchStart, matchedLength)) { 1874 matchStart = usearch_next(searcher, &status); 1875 ASSERT(status == U_ZERO_ERROR); 1876 goto nextMatch; 1877 } 1878 1879 size_t newSize = size - (matchStart + 1); 1880 memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar)); 1881 m_buffer.shrink(newSize); 1882 1883 start = size - matchStart; 1884 return matchedLength; 1885 } 1886 1887 #else // !ICU_UNICODE 1888 1889 inline SearchBuffer::SearchBuffer(const String& target, bool isCaseSensitive) 1890 : m_target(isCaseSensitive ? target : target.foldCase()) 1891 , m_isCaseSensitive(isCaseSensitive) 1892 , m_buffer(m_target.length()) 1893 , m_isCharacterStartBuffer(m_target.length()) 1894 , m_isBufferFull(false) 1895 , m_cursor(0) 1896 { 1897 ASSERT(!m_target.isEmpty()); 1898 m_target.replace(noBreakSpace, ' '); 1899 foldQuoteMarks(m_target); 1900 } 1901 1902 inline SearchBuffer::~SearchBuffer() 1903 { 1904 } 1905 1906 inline void SearchBuffer::reachedBreak() 1907 { 1908 m_cursor = 0; 1909 m_isBufferFull = false; 1910 } 1911 1912 inline bool SearchBuffer::atBreak() const 1913 { 1914 return !m_cursor && !m_isBufferFull; 1915 } 1916 1917 inline void SearchBuffer::append(UChar c, bool isStart) 1918 { 1919 m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMark(c); 1920 m_isCharacterStartBuffer[m_cursor] = isStart; 1921 if (++m_cursor == m_target.length()) { 1922 m_cursor = 0; 1923 m_isBufferFull = true; 1924 } 1925 } 1926 1927 inline size_t SearchBuffer::append(const UChar* characters, size_t length) 1928 { 1929 ASSERT(length); 1930 if (m_isCaseSensitive) { 1931 append(characters[0], true); 1932 return 1; 1933 } 1934 const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough 1935 UChar foldedCharacters[maxFoldedCharacters]; 1936 bool error; 1937 int numFoldedCharacters = foldCase(foldedCharacters, maxFoldedCharacters, characters, 1, &error); 1938 ASSERT(!error); 1939 ASSERT(numFoldedCharacters); 1940 ASSERT(numFoldedCharacters <= maxFoldedCharacters); 1941 if (!error && numFoldedCharacters) { 1942 numFoldedCharacters = min(numFoldedCharacters, maxFoldedCharacters); 1943 append(foldedCharacters[0], true); 1944 for (int i = 1; i < numFoldedCharacters; ++i) 1945 append(foldedCharacters[i], false); 1946 } 1947 return 1; 1948 } 1949 1950 inline size_t SearchBuffer::search(size_t& start) 1951 { 1952 if (!m_isBufferFull) 1953 return 0; 1954 if (!m_isCharacterStartBuffer[m_cursor]) 1955 return 0; 1956 1957 size_t tailSpace = m_target.length() - m_cursor; 1958 if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0) 1959 return 0; 1960 if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0) 1961 return 0; 1962 1963 start = length(); 1964 1965 // Now that we've found a match once, we don't want to find it again, because those 1966 // are the SearchBuffer semantics, allowing for a buffer where you append more than one 1967 // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if 1968 // we want to get rid of that in the future we could track this with a separate boolean 1969 // or even move the characters to the start of the buffer and set m_isBufferFull to false. 1970 m_isCharacterStartBuffer[m_cursor] = false; 1971 1972 return start; 1973 } 1974 1975 // Returns the number of characters that were appended to the buffer (what we are searching in). 1976 // That's not necessarily the same length as the passed-in target string, because case folding 1977 // can make two strings match even though they're not the same length. 1978 size_t SearchBuffer::length() const 1979 { 1980 size_t bufferSize = m_target.length(); 1981 size_t length = 0; 1982 for (size_t i = 0; i < bufferSize; ++i) 1983 length += m_isCharacterStartBuffer[i]; 1984 return length; 1985 } 1986 1987 #endif // !ICU_UNICODE 1988 1989 // -------- 1990 1991 int TextIterator::rangeLength(const Range* r, bool forSelectionPreservation) 1992 { 1993 int length = 0; 1994 for (TextIterator it(r, forSelectionPreservation); !it.atEnd(); it.advance()) 1995 length += it.length(); 1996 1997 return length; 1998 } 1999 2000 PassRefPtr<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount) 2001 { 2002 CharacterIterator entireRangeIterator(entireRange); 2003 return characterSubrange(entireRangeIterator, characterOffset, characterCount); 2004 } 2005 2006 PassRefPtr<Range> TextIterator::rangeFromLocationAndLength(Element* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation) 2007 { 2008 RefPtr<Range> resultRange = scope->document()->createRange(); 2009 2010 int docTextPosition = 0; 2011 int rangeEnd = rangeLocation + rangeLength; 2012 bool startRangeFound = false; 2013 2014 RefPtr<Range> textRunRange; 2015 2016 TextIterator it(rangeOfContents(scope).get(), forSelectionPreservation); 2017 2018 // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>. 2019 if (rangeLocation == 0 && rangeLength == 0 && it.atEnd()) { 2020 textRunRange = it.range(); 2021 2022 ExceptionCode ec = 0; 2023 resultRange->setStart(textRunRange->startContainer(), 0, ec); 2024 ASSERT(!ec); 2025 resultRange->setEnd(textRunRange->startContainer(), 0, ec); 2026 ASSERT(!ec); 2027 2028 return resultRange.release(); 2029 } 2030 2031 for (; !it.atEnd(); it.advance()) { 2032 int len = it.length(); 2033 textRunRange = it.range(); 2034 2035 bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + len; 2036 bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + len; 2037 2038 // Fix textRunRange->endPosition(), but only if foundStart || foundEnd, because it is only 2039 // in those cases that textRunRange is used. 2040 if (foundStart || foundEnd) { 2041 // FIXME: This is a workaround for the fact that the end of a run is often at the wrong 2042 // position for emitted '\n's. 2043 if (len == 1 && it.characters()[0] == '\n') { 2044 Position runStart = textRunRange->startPosition(); 2045 Position runEnd = VisiblePosition(runStart).next().deepEquivalent(); 2046 if (runEnd.isNotNull()) { 2047 ExceptionCode ec = 0; 2048 textRunRange->setEnd(runEnd.node(), runEnd.deprecatedEditingOffset(), ec); 2049 ASSERT(!ec); 2050 } 2051 } 2052 } 2053 2054 if (foundStart) { 2055 startRangeFound = true; 2056 int exception = 0; 2057 if (textRunRange->startContainer()->isTextNode()) { 2058 int offset = rangeLocation - docTextPosition; 2059 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception); 2060 } else { 2061 if (rangeLocation == docTextPosition) 2062 resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset(), exception); 2063 else 2064 resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset(), exception); 2065 } 2066 } 2067 2068 if (foundEnd) { 2069 int exception = 0; 2070 if (textRunRange->startContainer()->isTextNode()) { 2071 int offset = rangeEnd - docTextPosition; 2072 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception); 2073 } else { 2074 if (rangeEnd == docTextPosition) 2075 resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset(), exception); 2076 else 2077 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception); 2078 } 2079 docTextPosition += len; 2080 break; 2081 } 2082 docTextPosition += len; 2083 } 2084 2085 if (!startRangeFound) 2086 return 0; 2087 2088 if (rangeLength != 0 && rangeEnd > docTextPosition) { // rangeEnd is out of bounds 2089 int exception = 0; 2090 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception); 2091 } 2092 2093 return resultRange.release(); 2094 } 2095 2096 // -------- 2097 2098 UChar* plainTextToMallocAllocatedBuffer(const Range* r, unsigned& bufferLength, bool isDisplayString) 2099 { 2100 UChar* result = 0; 2101 2102 // Do this in pieces to avoid massive reallocations if there is a large amount of text. 2103 // Use system malloc for buffers since they can consume lots of memory and current TCMalloc is unable return it back to OS. 2104 static const unsigned cMaxSegmentSize = 1 << 16; 2105 bufferLength = 0; 2106 typedef pair<UChar*, unsigned> TextSegment; 2107 Vector<TextSegment>* textSegments = 0; 2108 Vector<UChar> textBuffer; 2109 textBuffer.reserveInitialCapacity(cMaxSegmentSize); 2110 for (TextIterator it(r); !it.atEnd(); it.advance()) { 2111 if (textBuffer.size() && textBuffer.size() + it.length() > cMaxSegmentSize) { 2112 UChar* newSegmentBuffer = static_cast<UChar*>(malloc(textBuffer.size() * sizeof(UChar))); 2113 if (!newSegmentBuffer) 2114 goto exit; 2115 memcpy(newSegmentBuffer, textBuffer.data(), textBuffer.size() * sizeof(UChar)); 2116 if (!textSegments) 2117 textSegments = new Vector<TextSegment>; 2118 textSegments->append(make_pair(newSegmentBuffer, (unsigned)textBuffer.size())); 2119 textBuffer.clear(); 2120 } 2121 textBuffer.append(it.characters(), it.length()); 2122 bufferLength += it.length(); 2123 } 2124 2125 if (!bufferLength) 2126 return 0; 2127 2128 // Since we know the size now, we can make a single buffer out of the pieces with one big alloc 2129 result = static_cast<UChar*>(malloc(bufferLength * sizeof(UChar))); 2130 if (!result) 2131 goto exit; 2132 2133 { 2134 UChar* resultPos = result; 2135 if (textSegments) { 2136 unsigned size = textSegments->size(); 2137 for (unsigned i = 0; i < size; ++i) { 2138 const TextSegment& segment = textSegments->at(i); 2139 memcpy(resultPos, segment.first, segment.second * sizeof(UChar)); 2140 resultPos += segment.second; 2141 } 2142 } 2143 memcpy(resultPos, textBuffer.data(), textBuffer.size() * sizeof(UChar)); 2144 } 2145 2146 exit: 2147 if (textSegments) { 2148 unsigned size = textSegments->size(); 2149 for (unsigned i = 0; i < size; ++i) 2150 free(textSegments->at(i).first); 2151 delete textSegments; 2152 } 2153 2154 if (isDisplayString && r->ownerDocument()) 2155 r->ownerDocument()->displayBufferModifiedByEncoding(result, bufferLength); 2156 2157 return result; 2158 } 2159 2160 String plainText(const Range* r) 2161 { 2162 unsigned length; 2163 UChar* buf = plainTextToMallocAllocatedBuffer(r, length, false); 2164 if (!buf) 2165 return ""; 2166 String result(buf, length); 2167 free(buf); 2168 return result; 2169 } 2170 2171 static inline bool isAllCollapsibleWhitespace(const String& string) 2172 { 2173 const UChar* characters = string.characters(); 2174 unsigned length = string.length(); 2175 for (unsigned i = 0; i < length; ++i) { 2176 if (!isCollapsibleWhitespace(characters[i])) 2177 return false; 2178 } 2179 return true; 2180 } 2181 2182 static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward) 2183 { 2184 ExceptionCode ec = 0; 2185 RefPtr<Range> result = range->cloneRange(ec); 2186 ASSERT(!ec); 2187 result->collapse(!forward, ec); 2188 ASSERT(!ec); 2189 return result.release(); 2190 } 2191 2192 static size_t findPlainText(CharacterIterator& it, const String& target, bool forward, bool caseSensitive, size_t& matchStart) 2193 { 2194 matchStart = 0; 2195 size_t matchLength = 0; 2196 2197 SearchBuffer buffer(target, caseSensitive); 2198 2199 while (!it.atEnd()) { 2200 it.advance(buffer.append(it.characters(), it.length())); 2201 tryAgain: 2202 size_t matchStartOffset; 2203 if (size_t newMatchLength = buffer.search(matchStartOffset)) { 2204 // Note that we found a match, and where we found it. 2205 size_t lastCharacterInBufferOffset = it.characterOffset(); 2206 ASSERT(lastCharacterInBufferOffset >= matchStartOffset); 2207 matchStart = lastCharacterInBufferOffset - matchStartOffset; 2208 matchLength = newMatchLength; 2209 // If searching forward, stop on the first match. 2210 // If searching backward, don't stop, so we end up with the last match. 2211 if (forward) 2212 break; 2213 goto tryAgain; 2214 } 2215 if (it.atBreak() && !buffer.atBreak()) { 2216 buffer.reachedBreak(); 2217 goto tryAgain; 2218 } 2219 } 2220 2221 return matchLength; 2222 } 2223 2224 PassRefPtr<Range> findPlainText(const Range* range, const String& target, bool forward, bool caseSensitive) 2225 { 2226 // First, find the text. 2227 size_t matchStart; 2228 size_t matchLength; 2229 { 2230 CharacterIterator findIterator(range, false, true); 2231 matchLength = findPlainText(findIterator, target, forward, caseSensitive, matchStart); 2232 if (!matchLength) 2233 return collapsedToBoundary(range, forward); 2234 } 2235 2236 // Then, find the document position of the start and the end of the text. 2237 CharacterIterator computeRangeIterator(range, false, true); 2238 return characterSubrange(computeRangeIterator, matchStart, matchLength); 2239 } 2240 2241 } 2242