1 /* 2 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010 Apple Inc. All rights reserved. 3 * Copyright (C) 2005 Alexey Proskuryakov. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include "config.h" 28 #include "TextIterator.h" 29 30 #include "Document.h" 31 #include "Frame.h" 32 #include "HTMLElement.h" 33 #include "HTMLNames.h" 34 #include "htmlediting.h" 35 #include "InlineTextBox.h" 36 #include "Range.h" 37 #include "RenderTableCell.h" 38 #include "RenderTableRow.h" 39 #include "RenderTextControl.h" 40 #include "RenderTextFragment.h" 41 #include "TextBoundaries.h" 42 #include "TextBreakIterator.h" 43 #include "VisiblePosition.h" 44 #include "visible_units.h" 45 #include <wtf/unicode/CharacterNames.h> 46 47 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION 48 #include "TextBreakIteratorInternalICU.h" 49 #include <unicode/usearch.h> 50 #endif 51 52 using namespace WTF::Unicode; 53 using namespace std; 54 55 namespace WebCore { 56 57 using namespace HTMLNames; 58 59 // Buffer that knows how to compare with a search target. 60 // Keeps enough of the previous text to be able to search in the future, but no more. 61 // Non-breaking spaces are always equal to normal spaces. 62 // Case folding is also done if the CaseInsensitive option is specified. 63 // Matches are further filtered if the AtWordStarts option is specified, although some 64 // matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well. 65 class SearchBuffer { 66 WTF_MAKE_NONCOPYABLE(SearchBuffer); 67 public: 68 SearchBuffer(const String& target, FindOptions); 69 ~SearchBuffer(); 70 71 // Returns number of characters appended; guaranteed to be in the range [1, length]. 72 size_t append(const UChar*, size_t length); 73 bool needsMoreContext() const; 74 void prependContext(const UChar*, size_t length); 75 void reachedBreak(); 76 77 // Result is the size in characters of what was found. 78 // And <startOffset> is the number of characters back to the start of what was found. 79 size_t search(size_t& startOffset); 80 bool atBreak() const; 81 82 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION 83 84 private: 85 bool isBadMatch(const UChar*, size_t length) const; 86 bool isWordStartMatch(size_t start, size_t length) const; 87 88 String m_target; 89 FindOptions m_options; 90 91 Vector<UChar> m_buffer; 92 size_t m_overlap; 93 size_t m_prefixLength; 94 bool m_atBreak; 95 bool m_needsMoreContext; 96 97 bool m_targetRequiresKanaWorkaround; 98 Vector<UChar> m_normalizedTarget; 99 mutable Vector<UChar> m_normalizedMatch; 100 101 #else 102 103 private: 104 void append(UChar, bool isCharacterStart); 105 size_t length() const; 106 107 String m_target; 108 FindOptions m_options; 109 110 Vector<UChar> m_buffer; 111 Vector<bool> m_isCharacterStartBuffer; 112 bool m_isBufferFull; 113 size_t m_cursor; 114 115 #endif 116 }; 117 118 // -------- 119 120 static const unsigned bitsInWord = sizeof(unsigned) * 8; 121 static const unsigned bitInWordMask = bitsInWord - 1; 122 123 BitStack::BitStack() 124 : m_size(0) 125 { 126 } 127 128 BitStack::~BitStack() 129 { 130 } 131 132 void BitStack::push(bool bit) 133 { 134 unsigned index = m_size / bitsInWord; 135 unsigned shift = m_size & bitInWordMask; 136 if (!shift && index == m_words.size()) { 137 m_words.grow(index + 1); 138 m_words[index] = 0; 139 } 140 unsigned& word = m_words[index]; 141 unsigned mask = 1U << shift; 142 if (bit) 143 word |= mask; 144 else 145 word &= ~mask; 146 ++m_size; 147 } 148 149 void BitStack::pop() 150 { 151 if (m_size) 152 --m_size; 153 } 154 155 bool BitStack::top() const 156 { 157 if (!m_size) 158 return false; 159 unsigned shift = (m_size - 1) & bitInWordMask; 160 return m_words.last() & (1U << shift); 161 } 162 163 unsigned BitStack::size() const 164 { 165 return m_size; 166 } 167 168 // -------- 169 170 #if !ASSERT_DISABLED 171 172 static unsigned depthCrossingShadowBoundaries(Node* node) 173 { 174 unsigned depth = 0; 175 for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode()) 176 ++depth; 177 return depth; 178 } 179 180 #endif 181 182 // This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees. 183 static Node* nextInPreOrderCrossingShadowBoundaries(Node* rangeEndContainer, int rangeEndOffset) 184 { 185 if (!rangeEndContainer) 186 return 0; 187 if (rangeEndOffset >= 0 && !rangeEndContainer->offsetInCharacters()) { 188 if (Node* next = rangeEndContainer->childNode(rangeEndOffset)) 189 return next; 190 } 191 for (Node* node = rangeEndContainer; node; node = node->parentOrHostNode()) { 192 if (Node* next = node->nextSibling()) 193 return next; 194 } 195 return 0; 196 } 197 198 // -------- 199 200 static inline bool fullyClipsContents(Node* node) 201 { 202 RenderObject* renderer = node->renderer(); 203 if (!renderer || !renderer->isBox() || !renderer->hasOverflowClip()) 204 return false; 205 return toRenderBox(renderer)->size().isEmpty(); 206 } 207 208 static inline bool ignoresContainerClip(Node* node) 209 { 210 RenderObject* renderer = node->renderer(); 211 if (!renderer || renderer->isText()) 212 return false; 213 EPosition position = renderer->style()->position(); 214 return position == AbsolutePosition || position == FixedPosition; 215 } 216 217 static void pushFullyClippedState(BitStack& stack, Node* node) 218 { 219 ASSERT(stack.size() == depthCrossingShadowBoundaries(node)); 220 221 // Push true if this node full clips its contents, or if a parent already has fully 222 // clipped and this is not a node that ignores its container's clip. 223 stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node))); 224 } 225 226 static void setUpFullyClippedStack(BitStack& stack, Node* node) 227 { 228 // Put the nodes in a vector so we can iterate in reverse order. 229 Vector<Node*, 100> ancestry; 230 for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode()) 231 ancestry.append(parent); 232 233 // Call pushFullyClippedState on each node starting with the earliest ancestor. 234 size_t size = ancestry.size(); 235 for (size_t i = 0; i < size; ++i) 236 pushFullyClippedState(stack, ancestry[size - i - 1]); 237 pushFullyClippedState(stack, node); 238 239 ASSERT(stack.size() == 1 + depthCrossingShadowBoundaries(node)); 240 } 241 242 #if OS(ANDROID) 243 static bool checkFormControlElement(Node* startNode) 244 { 245 Node* node = startNode; 246 while (node) { 247 if (node->isElementNode() && static_cast<Element*>(node)->isFormControlElement()) 248 return true; 249 node = node->parentNode(); 250 } 251 return false; 252 } 253 #endif 254 255 256 // -------- 257 258 TextIterator::TextIterator() 259 : m_startContainer(0) 260 , m_startOffset(0) 261 , m_endContainer(0) 262 , m_endOffset(0) 263 , m_positionNode(0) 264 , m_textCharacters(0) 265 , m_textLength(0) 266 , m_remainingTextBox(0) 267 , m_firstLetterText(0) 268 , m_lastCharacter(0) 269 , m_emitsCharactersBetweenAllVisiblePositions(false) 270 , m_entersTextControls(false) 271 , m_emitsTextWithoutTranscoding(false) 272 , m_handledFirstLetter(false) 273 , m_ignoresStyleVisibility(false) 274 , m_emitsObjectReplacementCharacters(false) 275 #if OS(ANDROID) 276 , m_stopsOnFormControls(false) 277 , m_shouldStop(false) 278 #endif 279 { 280 } 281 282 TextIterator::TextIterator(const Range* r, TextIteratorBehavior behavior) 283 : m_startContainer(0) 284 , m_startOffset(0) 285 , m_endContainer(0) 286 , m_endOffset(0) 287 , m_positionNode(0) 288 , m_textCharacters(0) 289 , m_textLength(0) 290 , m_remainingTextBox(0) 291 , m_firstLetterText(0) 292 , m_emitsCharactersBetweenAllVisiblePositions(behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions) 293 , m_entersTextControls(behavior & TextIteratorEntersTextControls) 294 , m_emitsTextWithoutTranscoding(behavior & TextIteratorEmitsTextsWithoutTranscoding) 295 , m_handledFirstLetter(false) 296 , m_ignoresStyleVisibility(behavior & TextIteratorIgnoresStyleVisibility) 297 , m_emitsObjectReplacementCharacters(behavior & TextIteratorEmitsObjectReplacementCharacters) 298 #if OS(ANDROID) 299 , m_stopsOnFormControls(behavior & TextIteratorStopsOnFormControls) 300 , m_shouldStop(false) 301 , m_needsAnotherNewline(false) 302 #endif 303 { 304 if (!r) 305 return; 306 307 // get and validate the range endpoints 308 Node* startContainer = r->startContainer(); 309 if (!startContainer) 310 return; 311 int startOffset = r->startOffset(); 312 Node* endContainer = r->endContainer(); 313 int endOffset = r->endOffset(); 314 315 // Callers should be handing us well-formed ranges. If we discover that this isn't 316 // the case, we could consider changing this assertion to an early return. 317 ASSERT(r->boundaryPointsValid()); 318 319 // remember range - this does not change 320 m_startContainer = startContainer; 321 m_startOffset = startOffset; 322 m_endContainer = endContainer; 323 m_endOffset = endOffset; 324 325 // set up the current node for processing 326 m_node = r->firstNode(); 327 if (!m_node) 328 return; 329 setUpFullyClippedStack(m_fullyClippedStack, m_node); 330 m_offset = m_node == m_startContainer ? m_startOffset : 0; 331 m_handledNode = false; 332 m_handledChildren = false; 333 334 // calculate first out of bounds node 335 m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(endContainer, endOffset); 336 337 // initialize node processing state 338 m_needsAnotherNewline = false; 339 m_textBox = 0; 340 341 // initialize record of previous node processing 342 m_hasEmitted = false; 343 m_lastTextNode = 0; 344 m_lastTextNodeEndedWithCollapsedSpace = false; 345 m_lastCharacter = 0; 346 347 #ifndef NDEBUG 348 // need this just because of the assert in advance() 349 m_positionNode = m_node; 350 #endif 351 352 // identify the first run 353 advance(); 354 } 355 356 TextIterator::~TextIterator() 357 { 358 } 359 360 bool TextIterator::atEnd() const 361 { 362 #if OS(ANDROID) 363 return !m_positionNode || m_shouldStop; 364 #else 365 return !m_positionNode; 366 #endif 367 } 368 369 void TextIterator::advance() 370 { 371 #if OS(ANDROID) 372 if (m_shouldStop) 373 return; 374 #endif 375 // reset the run information 376 m_positionNode = 0; 377 m_textLength = 0; 378 379 // handle remembered node that needed a newline after the text node's newline 380 if (m_needsAnotherNewline) { 381 // Emit the extra newline, and position it *inside* m_node, after m_node's 382 // contents, in case it's a block, in the same way that we position the first 383 // newline. The range for the emitted newline should start where the line 384 // break begins. 385 // FIXME: It would be cleaner if we emitted two newlines during the last 386 // iteration, instead of using m_needsAnotherNewline. 387 Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node; 388 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1); 389 m_needsAnotherNewline = false; 390 return; 391 } 392 393 if (!m_textBox && m_remainingTextBox) { 394 m_textBox = m_remainingTextBox; 395 m_remainingTextBox = 0; 396 m_firstLetterText = 0; 397 m_offset = 0; 398 } 399 // handle remembered text box 400 if (m_textBox) { 401 handleTextBox(); 402 if (m_positionNode) 403 return; 404 } 405 406 while (m_node && m_node != m_pastEndNode) { 407 #if OS(ANDROID) 408 if (!m_shouldStop && m_stopsOnFormControls && checkFormControlElement(m_node)) 409 m_shouldStop = true; 410 #endif 411 // if the range ends at offset 0 of an element, represent the 412 // position, but not the content, of that element e.g. if the 413 // node is a blockflow element, emit a newline that 414 // precedes the element 415 if (m_node == m_endContainer && m_endOffset == 0) { 416 representNodeOffsetZero(); 417 m_node = 0; 418 return; 419 } 420 421 RenderObject* renderer = m_node->renderer(); 422 if (!renderer) { 423 m_handledNode = true; 424 m_handledChildren = true; 425 } else { 426 // handle current node according to its type 427 if (!m_handledNode) { 428 if (renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) // FIXME: What about CDATA_SECTION_NODE? 429 m_handledNode = handleTextNode(); 430 else if (renderer && (renderer->isImage() || renderer->isWidget() || 431 (renderer->node() && renderer->node()->isElementNode() && 432 static_cast<Element*>(renderer->node())->isFormControlElement()))) 433 m_handledNode = handleReplacedElement(); 434 else 435 m_handledNode = handleNonTextNode(); 436 if (m_positionNode) 437 return; 438 } 439 } 440 441 // find a new current node to handle in depth-first manner, 442 // calling exitNode() as we come back thru a parent node 443 Node* next = m_handledChildren ? 0 : m_node->firstChild(); 444 m_offset = 0; 445 if (!next) { 446 next = m_node->nextSibling(); 447 if (!next) { 448 bool pastEnd = m_node->traverseNextNode() == m_pastEndNode; 449 Node* parentNode = m_node->parentOrHostNode(); 450 while (!next && parentNode) { 451 if ((pastEnd && parentNode == m_endContainer) || m_endContainer->isDescendantOf(parentNode)) 452 return; 453 bool haveRenderer = m_node->renderer(); 454 m_node = parentNode; 455 m_fullyClippedStack.pop(); 456 parentNode = m_node->parentOrHostNode(); 457 if (haveRenderer) 458 exitNode(); 459 if (m_positionNode) { 460 m_handledNode = true; 461 m_handledChildren = true; 462 return; 463 } 464 next = m_node->nextSibling(); 465 } 466 } 467 m_fullyClippedStack.pop(); 468 } 469 470 // set the new current node 471 m_node = next; 472 if (m_node) 473 pushFullyClippedState(m_fullyClippedStack, m_node); 474 m_handledNode = false; 475 m_handledChildren = false; 476 m_handledFirstLetter = false; 477 m_firstLetterText = 0; 478 479 // how would this ever be? 480 if (m_positionNode) 481 return; 482 } 483 } 484 485 bool TextIterator::handleTextNode() 486 { 487 if (m_fullyClippedStack.top() && !m_ignoresStyleVisibility) 488 return false; 489 490 RenderText* renderer = toRenderText(m_node->renderer()); 491 492 m_lastTextNode = m_node; 493 String str = renderer->text(); 494 495 // handle pre-formatted text 496 if (!renderer->style()->collapseWhiteSpace()) { 497 int runStart = m_offset; 498 if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) { 499 emitCharacter(' ', m_node, 0, runStart, runStart); 500 return false; 501 } 502 if (!m_handledFirstLetter && renderer->isTextFragment()) { 503 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer)); 504 if (m_firstLetterText) { 505 String firstLetter = m_firstLetterText->text(); 506 emitText(m_node, m_firstLetterText, m_offset, m_offset + firstLetter.length()); 507 m_firstLetterText = 0; 508 m_textBox = 0; 509 return false; 510 } 511 } 512 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) 513 return false; 514 int strLength = str.length(); 515 int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX; 516 int runEnd = min(strLength, end); 517 518 if (runStart >= runEnd) 519 return true; 520 521 emitText(m_node, runStart, runEnd); 522 return true; 523 } 524 525 if (!renderer->firstTextBox() && str.length() > 0) { 526 if (!m_handledFirstLetter && renderer->isTextFragment()) { 527 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer)); 528 if (m_firstLetterText) { 529 handleTextBox(); 530 return false; 531 } 532 } 533 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) 534 return false; 535 m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space 536 return true; 537 } 538 539 // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text) 540 if (renderer->containsReversedText()) { 541 m_sortedTextBoxes.clear(); 542 for (InlineTextBox* textBox = renderer->firstTextBox(); textBox; textBox = textBox->nextTextBox()) { 543 m_sortedTextBoxes.append(textBox); 544 } 545 std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart); 546 m_sortedTextBoxesPosition = 0; 547 } 548 549 m_textBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox(); 550 if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset) 551 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer)); 552 handleTextBox(); 553 return true; 554 } 555 556 void TextIterator::handleTextBox() 557 { 558 RenderText* renderer = m_firstLetterText ? m_firstLetterText : toRenderText(m_node->renderer()); 559 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) { 560 m_textBox = 0; 561 return; 562 } 563 String str = renderer->text(); 564 unsigned start = m_offset; 565 unsigned end = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX; 566 while (m_textBox) { 567 unsigned textBoxStart = m_textBox->start(); 568 unsigned runStart = max(textBoxStart, start); 569 570 // Check for collapsed space at the start of this run. 571 InlineTextBox* firstTextBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox(); 572 bool needSpace = m_lastTextNodeEndedWithCollapsedSpace 573 || (m_textBox == firstTextBox && textBoxStart == runStart && runStart > 0); 574 if (needSpace && !isCollapsibleWhitespace(m_lastCharacter) && m_lastCharacter) { 575 if (m_lastTextNode == m_node && runStart > 0 && str[runStart - 1] == ' ') { 576 unsigned spaceRunStart = runStart - 1; 577 while (spaceRunStart > 0 && str[spaceRunStart - 1] == ' ') 578 --spaceRunStart; 579 emitText(m_node, renderer, spaceRunStart, spaceRunStart + 1); 580 } else 581 emitCharacter(' ', m_node, 0, runStart, runStart); 582 return; 583 } 584 unsigned textBoxEnd = textBoxStart + m_textBox->len(); 585 unsigned runEnd = min(textBoxEnd, end); 586 587 // Determine what the next text box will be, but don't advance yet 588 InlineTextBox* nextTextBox = 0; 589 if (renderer->containsReversedText()) { 590 if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size()) 591 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1]; 592 } else 593 nextTextBox = m_textBox->nextTextBox(); 594 595 if (runStart < runEnd) { 596 // Handle either a single newline character (which becomes a space), 597 // or a run of characters that does not include a newline. 598 // This effectively translates newlines to spaces without copying the text. 599 if (str[runStart] == '\n') { 600 emitCharacter(' ', m_node, 0, runStart, runStart + 1); 601 m_offset = runStart + 1; 602 } else { 603 size_t subrunEnd = str.find('\n', runStart); 604 if (subrunEnd == notFound || subrunEnd > runEnd) 605 subrunEnd = runEnd; 606 607 m_offset = subrunEnd; 608 emitText(m_node, renderer, runStart, subrunEnd); 609 } 610 611 // If we are doing a subrun that doesn't go to the end of the text box, 612 // come back again to finish handling this text box; don't advance to the next one. 613 if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd) 614 return; 615 616 // Advance and return 617 unsigned nextRunStart = nextTextBox ? nextTextBox->start() : str.length(); 618 if (nextRunStart > runEnd) 619 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end 620 m_textBox = nextTextBox; 621 if (renderer->containsReversedText()) 622 ++m_sortedTextBoxesPosition; 623 return; 624 } 625 // Advance and continue 626 m_textBox = nextTextBox; 627 if (renderer->containsReversedText()) 628 ++m_sortedTextBoxesPosition; 629 } 630 if (!m_textBox && m_remainingTextBox) { 631 m_textBox = m_remainingTextBox; 632 m_remainingTextBox = 0; 633 m_firstLetterText = 0; 634 m_offset = 0; 635 handleTextBox(); 636 } 637 } 638 639 void TextIterator::handleTextNodeFirstLetter(RenderTextFragment* renderer) 640 { 641 if (renderer->firstLetter()) { 642 RenderObject* r = renderer->firstLetter(); 643 if (r->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) 644 return; 645 for (RenderObject *currChild = r->firstChild(); currChild; currChild->nextSibling()) { 646 if (currChild->isText()) { 647 RenderText* firstLetter = toRenderText(currChild); 648 m_handledFirstLetter = true; 649 m_remainingTextBox = m_textBox; 650 m_textBox = firstLetter->firstTextBox(); 651 m_firstLetterText = firstLetter; 652 return; 653 } 654 } 655 } 656 m_handledFirstLetter = true; 657 } 658 659 bool TextIterator::handleReplacedElement() 660 { 661 if (m_fullyClippedStack.top()) 662 return false; 663 664 RenderObject* renderer = m_node->renderer(); 665 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) 666 return false; 667 668 if (m_lastTextNodeEndedWithCollapsedSpace) { 669 emitCharacter(' ', m_lastTextNode->parentNode(), m_lastTextNode, 1, 1); 670 return false; 671 } 672 673 if (m_entersTextControls && renderer->isTextControl()) { 674 if (HTMLElement* innerTextElement = toRenderTextControl(renderer)->innerTextElement()) { 675 m_node = innerTextElement->shadowTreeRootNode(); 676 pushFullyClippedState(m_fullyClippedStack, m_node); 677 m_offset = 0; 678 return false; 679 } 680 } 681 682 m_hasEmitted = true; 683 684 if (m_emitsObjectReplacementCharacters && renderer && renderer->isReplaced()) { 685 emitCharacter(objectReplacementCharacter, m_node->parentNode(), m_node, 0, 1); 686 return true; 687 } 688 689 if (m_emitsCharactersBetweenAllVisiblePositions) { 690 // We want replaced elements to behave like punctuation for boundary 691 // finding, and to simply take up space for the selection preservation 692 // code in moveParagraphs, so we use a comma. 693 emitCharacter(',', m_node->parentNode(), m_node, 0, 1); 694 return true; 695 } 696 697 m_positionNode = m_node->parentNode(); 698 m_positionOffsetBaseNode = m_node; 699 m_positionStartOffset = 0; 700 m_positionEndOffset = 1; 701 702 m_textCharacters = 0; 703 m_textLength = 0; 704 705 m_lastCharacter = 0; 706 707 return true; 708 } 709 710 bool TextIterator::hasVisibleTextNode(RenderText* renderer) 711 { 712 if (renderer->style()->visibility() == VISIBLE) 713 return true; 714 if (renderer->isTextFragment()) { 715 RenderTextFragment* fragment = static_cast<RenderTextFragment*>(renderer); 716 if (fragment->firstLetter() && fragment->firstLetter()->style()->visibility() == VISIBLE) 717 return true; 718 } 719 return false; 720 } 721 722 static bool shouldEmitTabBeforeNode(Node* node) 723 { 724 RenderObject* r = node->renderer(); 725 726 // Table cells are delimited by tabs. 727 if (!r || !isTableCell(node)) 728 return false; 729 730 // Want a tab before every cell other than the first one 731 RenderTableCell* rc = toRenderTableCell(r); 732 RenderTable* t = rc->table(); 733 return t && (t->cellBefore(rc) || t->cellAbove(rc)); 734 } 735 736 static bool shouldEmitNewlineForNode(Node* node) 737 { 738 // br elements are represented by a single newline. 739 RenderObject* r = node->renderer(); 740 if (!r) 741 return node->hasTagName(brTag); 742 743 return r->isBR(); 744 } 745 746 static bool shouldEmitNewlinesBeforeAndAfterNode(Node* node) 747 { 748 // Block flow (versus inline flow) is represented by having 749 // a newline both before and after the element. 750 RenderObject* r = node->renderer(); 751 if (!r) { 752 return (node->hasTagName(blockquoteTag) 753 || node->hasTagName(ddTag) 754 || node->hasTagName(divTag) 755 || node->hasTagName(dlTag) 756 || node->hasTagName(dtTag) 757 || node->hasTagName(h1Tag) 758 || node->hasTagName(h2Tag) 759 || node->hasTagName(h3Tag) 760 || node->hasTagName(h4Tag) 761 || node->hasTagName(h5Tag) 762 || node->hasTagName(h6Tag) 763 || node->hasTagName(hrTag) 764 || node->hasTagName(liTag) 765 || node->hasTagName(listingTag) 766 || node->hasTagName(olTag) 767 || node->hasTagName(pTag) 768 || node->hasTagName(preTag) 769 || node->hasTagName(trTag) 770 || node->hasTagName(ulTag)); 771 } 772 773 // Need to make an exception for table cells, because they are blocks, but we 774 // want them tab-delimited rather than having newlines before and after. 775 if (isTableCell(node)) 776 return false; 777 778 // Need to make an exception for table row elements, because they are neither 779 // "inline" or "RenderBlock", but we want newlines for them. 780 if (r->isTableRow()) { 781 RenderTable* t = toRenderTableRow(r)->table(); 782 if (t && !t->isInline()) 783 return true; 784 } 785 786 return !r->isInline() && r->isRenderBlock() && !r->isFloatingOrPositioned() && !r->isBody(); 787 } 788 789 static bool shouldEmitNewlineAfterNode(Node* node) 790 { 791 // FIXME: It should be better but slower to create a VisiblePosition here. 792 if (!shouldEmitNewlinesBeforeAndAfterNode(node)) 793 return false; 794 // Check if this is the very last renderer in the document. 795 // If so, then we should not emit a newline. 796 while ((node = node->traverseNextSibling())) 797 if (node->renderer()) 798 return true; 799 return false; 800 } 801 802 static bool shouldEmitNewlineBeforeNode(Node* node) 803 { 804 return shouldEmitNewlinesBeforeAndAfterNode(node); 805 } 806 807 static bool shouldEmitExtraNewlineForNode(Node* node) 808 { 809 // When there is a significant collapsed bottom margin, emit an extra 810 // newline for a more realistic result. We end up getting the right 811 // result even without margin collapsing. For example: <div><p>text</p></div> 812 // will work right even if both the <div> and the <p> have bottom margins. 813 RenderObject* r = node->renderer(); 814 if (!r || !r->isBox()) 815 return false; 816 817 // NOTE: We only do this for a select set of nodes, and fwiw WinIE appears 818 // not to do this at all 819 if (node->hasTagName(h1Tag) 820 || node->hasTagName(h2Tag) 821 || node->hasTagName(h3Tag) 822 || node->hasTagName(h4Tag) 823 || node->hasTagName(h5Tag) 824 || node->hasTagName(h6Tag) 825 || node->hasTagName(pTag)) { 826 RenderStyle* style = r->style(); 827 if (style) { 828 int bottomMargin = toRenderBox(r)->collapsedMarginAfter(); 829 int fontSize = style->fontDescription().computedPixelSize(); 830 if (bottomMargin * 2 >= fontSize) 831 return true; 832 } 833 } 834 835 return false; 836 } 837 838 static int collapsedSpaceLength(RenderText* renderer, int textEnd) 839 { 840 const UChar* characters = renderer->text()->characters(); 841 int length = renderer->text()->length(); 842 for (int i = textEnd; i < length; ++i) { 843 if (!renderer->style()->isCollapsibleWhiteSpace(characters[i])) 844 return i - textEnd; 845 } 846 847 return length - textEnd; 848 } 849 850 static int maxOffsetIncludingCollapsedSpaces(Node* node) 851 { 852 int offset = caretMaxOffset(node); 853 854 if (node->renderer() && node->renderer()->isText()) 855 offset += collapsedSpaceLength(toRenderText(node->renderer()), offset); 856 857 return offset; 858 } 859 860 // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic). 861 bool TextIterator::shouldRepresentNodeOffsetZero() 862 { 863 if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isTable()) 864 return true; 865 866 // Leave element positioned flush with start of a paragraph 867 // (e.g. do not insert tab before a table cell at the start of a paragraph) 868 if (m_lastCharacter == '\n') 869 return false; 870 871 // Otherwise, show the position if we have emitted any characters 872 if (m_hasEmitted) 873 return true; 874 875 // We've not emitted anything yet. Generally, there is no need for any positioning then. 876 // The only exception is when the element is visually not in the same line as 877 // the start of the range (e.g. the range starts at the end of the previous paragraph). 878 // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we 879 // make quicker checks to possibly avoid that. Another check that we could make is 880 // is whether the inline vs block flow changed since the previous visible element. 881 // I think we're already in a special enough case that that won't be needed, tho. 882 883 // No character needed if this is the first node in the range. 884 if (m_node == m_startContainer) 885 return false; 886 887 // If we are outside the start container's subtree, assume we need to emit. 888 // FIXME: m_startContainer could be an inline block 889 if (!m_node->isDescendantOf(m_startContainer)) 890 return true; 891 892 // If we started as m_startContainer offset 0 and the current node is a descendant of 893 // the start container, we already had enough context to correctly decide whether to 894 // emit after a preceding block. We chose not to emit (m_hasEmitted is false), 895 // so don't second guess that now. 896 // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably 897 // immaterial since we likely would have already emitted something by now. 898 if (m_startOffset == 0) 899 return false; 900 901 // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning. 902 // Additionally, if the range we are iterating over contains huge sections of unrendered content, 903 // we would create VisiblePositions on every call to this function without this check. 904 if (!m_node->renderer() || m_node->renderer()->style()->visibility() != VISIBLE) 905 return false; 906 907 // The startPos.isNotNull() check is needed because the start could be before the body, 908 // and in that case we'll get null. We don't want to put in newlines at the start in that case. 909 // The currPos.isNotNull() check is needed because positions in non-HTML content 910 // (like SVG) do not have visible positions, and we don't want to emit for them either. 911 VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM); 912 VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM); 913 return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos); 914 } 915 916 bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node* node) 917 { 918 return node->renderer() && node->renderer()->isTable() && (node->renderer()->isInline() || m_emitsCharactersBetweenAllVisiblePositions); 919 } 920 921 void TextIterator::representNodeOffsetZero() 922 { 923 // Emit a character to show the positioning of m_node. 924 925 // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can 926 // create VisiblePositions, which is expensive. So, we perform the inexpensive checks 927 // on m_node to see if it necessitates emitting a character first and will early return 928 // before encountering shouldRepresentNodeOffsetZero()s worse case behavior. 929 if (shouldEmitTabBeforeNode(m_node)) { 930 if (shouldRepresentNodeOffsetZero()) 931 emitCharacter('\t', m_node->parentNode(), m_node, 0, 0); 932 } else if (shouldEmitNewlineBeforeNode(m_node)) { 933 if (shouldRepresentNodeOffsetZero()) 934 emitCharacter('\n', m_node->parentNode(), m_node, 0, 0); 935 } else if (shouldEmitSpaceBeforeAndAfterNode(m_node)) { 936 if (shouldRepresentNodeOffsetZero()) 937 emitCharacter(' ', m_node->parentNode(), m_node, 0, 0); 938 } 939 } 940 941 bool TextIterator::handleNonTextNode() 942 { 943 if (shouldEmitNewlineForNode(m_node)) 944 emitCharacter('\n', m_node->parentNode(), m_node, 0, 1); 945 else if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isHR()) 946 emitCharacter(' ', m_node->parentNode(), m_node, 0, 1); 947 else 948 representNodeOffsetZero(); 949 950 return true; 951 } 952 953 void TextIterator::exitNode() 954 { 955 // prevent emitting a newline when exiting a collapsed block at beginning of the range 956 // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could 957 // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and 958 // therefore look like a blank line. 959 if (!m_hasEmitted) 960 return; 961 962 // Emit with a position *inside* m_node, after m_node's contents, in 963 // case it is a block, because the run should start where the 964 // emitted character is positioned visually. 965 Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node; 966 // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making 967 // the logic in _web_attributedStringFromRange match. We'll get that for free when we switch to use 968 // TextIterator in _web_attributedStringFromRange. 969 // See <rdar://problem/5428427> for an example of how this mismatch will cause problems. 970 if (m_lastTextNode && shouldEmitNewlineAfterNode(m_node)) { 971 // use extra newline to represent margin bottom, as needed 972 bool addNewline = shouldEmitExtraNewlineForNode(m_node); 973 974 // FIXME: We need to emit a '\n' as we leave an empty block(s) that 975 // contain a VisiblePosition when doing selection preservation. 976 if (m_lastCharacter != '\n') { 977 // insert a newline with a position following this block's contents. 978 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1); 979 // remember whether to later add a newline for the current node 980 ASSERT(!m_needsAnotherNewline); 981 m_needsAnotherNewline = addNewline; 982 } else if (addNewline) 983 // insert a newline with a position following this block's contents. 984 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1); 985 } 986 987 // If nothing was emitted, see if we need to emit a space. 988 if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(m_node)) 989 emitCharacter(' ', baseNode->parentNode(), baseNode, 1, 1); 990 } 991 992 void TextIterator::emitCharacter(UChar c, Node* textNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset) 993 { 994 m_hasEmitted = true; 995 996 // remember information with which to construct the TextIterator::range() 997 // NOTE: textNode is often not a text node, so the range will specify child nodes of positionNode 998 m_positionNode = textNode; 999 m_positionOffsetBaseNode = offsetBaseNode; 1000 m_positionStartOffset = textStartOffset; 1001 m_positionEndOffset = textEndOffset; 1002 1003 // remember information with which to construct the TextIterator::characters() and length() 1004 m_singleCharacterBuffer = c; 1005 m_textCharacters = &m_singleCharacterBuffer; 1006 m_textLength = 1; 1007 1008 // remember some iteration state 1009 m_lastTextNodeEndedWithCollapsedSpace = false; 1010 m_lastCharacter = c; 1011 } 1012 1013 void TextIterator::emitText(Node* textNode, RenderObject* renderObject, int textStartOffset, int textEndOffset) 1014 { 1015 RenderText* renderer = toRenderText(renderObject); 1016 m_text = m_emitsTextWithoutTranscoding ? renderer->textWithoutTranscoding() : renderer->text(); 1017 ASSERT(m_text.characters()); 1018 1019 m_positionNode = textNode; 1020 m_positionOffsetBaseNode = 0; 1021 m_positionStartOffset = textStartOffset; 1022 m_positionEndOffset = textEndOffset; 1023 m_textCharacters = m_text.characters() + textStartOffset; 1024 m_textLength = textEndOffset - textStartOffset; 1025 m_lastCharacter = m_text[textEndOffset - 1]; 1026 1027 m_lastTextNodeEndedWithCollapsedSpace = false; 1028 m_hasEmitted = true; 1029 } 1030 1031 void TextIterator::emitText(Node* textNode, int textStartOffset, int textEndOffset) 1032 { 1033 emitText(textNode, m_node->renderer(), textStartOffset, textEndOffset); 1034 } 1035 1036 PassRefPtr<Range> TextIterator::range() const 1037 { 1038 // use the current run information, if we have it 1039 if (m_positionNode) { 1040 if (m_positionOffsetBaseNode) { 1041 int index = m_positionOffsetBaseNode->nodeIndex(); 1042 m_positionStartOffset += index; 1043 m_positionEndOffset += index; 1044 m_positionOffsetBaseNode = 0; 1045 } 1046 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset); 1047 } 1048 1049 // otherwise, return the end of the overall range we were given 1050 if (m_endContainer) 1051 return Range::create(m_endContainer->document(), m_endContainer, m_endOffset, m_endContainer, m_endOffset); 1052 1053 return 0; 1054 } 1055 1056 Node* TextIterator::node() const 1057 { 1058 RefPtr<Range> textRange = range(); 1059 if (!textRange) 1060 return 0; 1061 1062 Node* node = textRange->startContainer(); 1063 if (!node) 1064 return 0; 1065 if (node->offsetInCharacters()) 1066 return node; 1067 1068 return node->childNode(textRange->startOffset()); 1069 } 1070 1071 // -------- 1072 1073 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator() 1074 : m_behavior(TextIteratorDefaultBehavior) 1075 , m_node(0) 1076 , m_positionNode(0) 1077 #if OS(ANDROID) 1078 , m_stopsOnFormControls(false) 1079 , m_shouldStop(false) 1080 #endif 1081 { 1082 } 1083 1084 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range* r, TextIteratorBehavior behavior) 1085 : m_behavior(behavior) 1086 , m_node(0) 1087 , m_positionNode(0) 1088 #if OS(ANDROID) 1089 , m_stopsOnFormControls(behavior & TextIteratorStopsOnFormControls) 1090 , m_shouldStop(false) 1091 #endif 1092 { 1093 #if OS(ANDROID) 1094 ASSERT(m_behavior == TextIteratorDefaultBehavior || m_behavior == TextIteratorStopsOnFormControls); 1095 #else 1096 ASSERT(m_behavior == TextIteratorDefaultBehavior); 1097 #endif 1098 1099 if (!r) 1100 return; 1101 1102 Node* startNode = r->startContainer(); 1103 if (!startNode) 1104 return; 1105 Node* endNode = r->endContainer(); 1106 int startOffset = r->startOffset(); 1107 int endOffset = r->endOffset(); 1108 1109 if (!startNode->offsetInCharacters()) { 1110 if (startOffset >= 0 && startOffset < static_cast<int>(startNode->childNodeCount())) { 1111 startNode = startNode->childNode(startOffset); 1112 startOffset = 0; 1113 } 1114 } 1115 if (!endNode->offsetInCharacters()) { 1116 if (endOffset > 0 && endOffset <= static_cast<int>(endNode->childNodeCount())) { 1117 endNode = endNode->childNode(endOffset - 1); 1118 endOffset = lastOffsetInNode(endNode); 1119 } 1120 } 1121 1122 m_node = endNode; 1123 setUpFullyClippedStack(m_fullyClippedStack, m_node); 1124 m_offset = endOffset; 1125 m_handledNode = false; 1126 m_handledChildren = endOffset == 0; 1127 1128 m_startNode = startNode; 1129 m_startOffset = startOffset; 1130 m_endNode = endNode; 1131 m_endOffset = endOffset; 1132 1133 #ifndef NDEBUG 1134 // Need this just because of the assert. 1135 m_positionNode = endNode; 1136 #endif 1137 1138 m_lastTextNode = 0; 1139 m_lastCharacter = '\n'; 1140 1141 m_havePassedStartNode = false; 1142 1143 advance(); 1144 } 1145 1146 bool SimplifiedBackwardsTextIterator::atEnd() const 1147 { 1148 #if OS(ANDROID) 1149 return !m_positionNode || m_shouldStop; 1150 #else 1151 return !m_positionNode; 1152 #endif 1153 } 1154 1155 void SimplifiedBackwardsTextIterator::advance() 1156 { 1157 ASSERT(m_positionNode); 1158 1159 #if OS(ANDROID) 1160 if (m_shouldStop) 1161 return; 1162 1163 // Prevent changing the iterator position if a form control element was found and advance should stop on it. 1164 if (m_stopsOnFormControls && checkFormControlElement(m_node)) { 1165 m_shouldStop = true; 1166 return; 1167 } 1168 #endif 1169 1170 m_positionNode = 0; 1171 m_textLength = 0; 1172 1173 while (m_node && !m_havePassedStartNode) { 1174 // Don't handle node if we start iterating at [node, 0]. 1175 if (!m_handledNode && !(m_node == m_endNode && m_endOffset == 0)) { 1176 RenderObject* renderer = m_node->renderer(); 1177 if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) { 1178 // FIXME: What about CDATA_SECTION_NODE? 1179 if (renderer->style()->visibility() == VISIBLE && m_offset > 0) 1180 m_handledNode = handleTextNode(); 1181 } else if (renderer && (renderer->isImage() || renderer->isWidget())) { 1182 if (renderer->style()->visibility() == VISIBLE && m_offset > 0) 1183 m_handledNode = handleReplacedElement(); 1184 } else 1185 m_handledNode = handleNonTextNode(); 1186 if (m_positionNode) 1187 return; 1188 } 1189 1190 if (!m_handledChildren && m_node->hasChildNodes()) { 1191 m_node = m_node->lastChild(); 1192 pushFullyClippedState(m_fullyClippedStack, m_node); 1193 } else { 1194 // Exit empty containers as we pass over them or containers 1195 // where [container, 0] is where we started iterating. 1196 if (!m_handledNode 1197 && canHaveChildrenForEditing(m_node) 1198 && m_node->parentNode() 1199 && (!m_node->lastChild() || (m_node == m_endNode && !m_endOffset))) { 1200 exitNode(); 1201 if (m_positionNode) { 1202 m_handledNode = true; 1203 m_handledChildren = true; 1204 return; 1205 } 1206 } 1207 1208 // Exit all other containers. 1209 while (!m_node->previousSibling()) { 1210 if (!advanceRespectingRange(m_node->parentOrHostNode())) 1211 break; 1212 m_fullyClippedStack.pop(); 1213 exitNode(); 1214 if (m_positionNode) { 1215 m_handledNode = true; 1216 m_handledChildren = true; 1217 return; 1218 } 1219 } 1220 1221 m_fullyClippedStack.pop(); 1222 if (advanceRespectingRange(m_node->previousSibling())) 1223 pushFullyClippedState(m_fullyClippedStack, m_node); 1224 else 1225 m_node = 0; 1226 } 1227 1228 // For the purpose of word boundary detection, 1229 // we should iterate all visible text and trailing (collapsed) whitespaces. 1230 m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(m_node) : 0; 1231 m_handledNode = false; 1232 m_handledChildren = false; 1233 1234 if (m_positionNode) 1235 return; 1236 } 1237 } 1238 1239 bool SimplifiedBackwardsTextIterator::handleTextNode() 1240 { 1241 m_lastTextNode = m_node; 1242 1243 RenderText* renderer = toRenderText(m_node->renderer()); 1244 String str = renderer->text(); 1245 1246 if (!renderer->firstTextBox() && str.length() > 0) 1247 return true; 1248 1249 m_positionEndOffset = m_offset; 1250 1251 m_offset = (m_node == m_startNode) ? m_startOffset : 0; 1252 m_positionNode = m_node; 1253 m_positionStartOffset = m_offset; 1254 m_textLength = m_positionEndOffset - m_positionStartOffset; 1255 m_textCharacters = str.characters() + m_positionStartOffset; 1256 1257 m_lastCharacter = str[m_positionEndOffset - 1]; 1258 1259 return true; 1260 } 1261 1262 bool SimplifiedBackwardsTextIterator::handleReplacedElement() 1263 { 1264 unsigned index = m_node->nodeIndex(); 1265 // We want replaced elements to behave like punctuation for boundary 1266 // finding, and to simply take up space for the selection preservation 1267 // code in moveParagraphs, so we use a comma. Unconditionally emit 1268 // here because this iterator is only used for boundary finding. 1269 emitCharacter(',', m_node->parentNode(), index, index + 1); 1270 return true; 1271 } 1272 1273 bool SimplifiedBackwardsTextIterator::handleNonTextNode() 1274 { 1275 // We can use a linefeed in place of a tab because this simple iterator is only used to 1276 // find boundaries, not actual content. A linefeed breaks words, sentences, and paragraphs. 1277 if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineAfterNode(m_node) || shouldEmitTabBeforeNode(m_node)) { 1278 unsigned index = m_node->nodeIndex(); 1279 // The start of this emitted range is wrong. Ensuring correctness would require 1280 // VisiblePositions and so would be slow. previousBoundary expects this. 1281 emitCharacter('\n', m_node->parentNode(), index + 1, index + 1); 1282 } 1283 return true; 1284 } 1285 1286 void SimplifiedBackwardsTextIterator::exitNode() 1287 { 1288 if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineBeforeNode(m_node) || shouldEmitTabBeforeNode(m_node)) { 1289 // The start of this emitted range is wrong. Ensuring correctness would require 1290 // VisiblePositions and so would be slow. previousBoundary expects this. 1291 emitCharacter('\n', m_node, 0, 0); 1292 } 1293 } 1294 1295 void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node* node, int startOffset, int endOffset) 1296 { 1297 m_singleCharacterBuffer = c; 1298 m_positionNode = node; 1299 m_positionStartOffset = startOffset; 1300 m_positionEndOffset = endOffset; 1301 m_textCharacters = &m_singleCharacterBuffer; 1302 m_textLength = 1; 1303 m_lastCharacter = c; 1304 } 1305 1306 bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next) 1307 { 1308 if (!next) 1309 return false; 1310 m_havePassedStartNode |= m_node == m_startNode; 1311 if (m_havePassedStartNode) 1312 return false; 1313 m_node = next; 1314 return true; 1315 } 1316 1317 PassRefPtr<Range> SimplifiedBackwardsTextIterator::range() const 1318 { 1319 if (m_positionNode) 1320 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset); 1321 1322 return Range::create(m_startNode->document(), m_startNode, m_startOffset, m_startNode, m_startOffset); 1323 } 1324 1325 // -------- 1326 1327 CharacterIterator::CharacterIterator() 1328 : m_offset(0) 1329 , m_runOffset(0) 1330 , m_atBreak(true) 1331 { 1332 } 1333 1334 CharacterIterator::CharacterIterator(const Range* r, TextIteratorBehavior behavior) 1335 : m_offset(0) 1336 , m_runOffset(0) 1337 , m_atBreak(true) 1338 , m_textIterator(r, behavior) 1339 { 1340 while (!atEnd() && m_textIterator.length() == 0) 1341 m_textIterator.advance(); 1342 } 1343 1344 PassRefPtr<Range> CharacterIterator::range() const 1345 { 1346 RefPtr<Range> r = m_textIterator.range(); 1347 if (!m_textIterator.atEnd()) { 1348 if (m_textIterator.length() <= 1) { 1349 ASSERT(m_runOffset == 0); 1350 } else { 1351 Node* n = r->startContainer(); 1352 ASSERT(n == r->endContainer()); 1353 int offset = r->startOffset() + m_runOffset; 1354 ExceptionCode ec = 0; 1355 r->setStart(n, offset, ec); 1356 r->setEnd(n, offset + 1, ec); 1357 ASSERT(!ec); 1358 } 1359 } 1360 return r.release(); 1361 } 1362 1363 void CharacterIterator::advance(int count) 1364 { 1365 if (count <= 0) { 1366 ASSERT(count == 0); 1367 return; 1368 } 1369 1370 m_atBreak = false; 1371 1372 // easy if there is enough left in the current m_textIterator run 1373 int remaining = m_textIterator.length() - m_runOffset; 1374 if (count < remaining) { 1375 m_runOffset += count; 1376 m_offset += count; 1377 return; 1378 } 1379 1380 // exhaust the current m_textIterator run 1381 count -= remaining; 1382 m_offset += remaining; 1383 1384 // move to a subsequent m_textIterator run 1385 for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) { 1386 int runLength = m_textIterator.length(); 1387 if (runLength == 0) 1388 m_atBreak = true; 1389 else { 1390 // see whether this is m_textIterator to use 1391 if (count < runLength) { 1392 m_runOffset = count; 1393 m_offset += count; 1394 return; 1395 } 1396 1397 // exhaust this m_textIterator run 1398 count -= runLength; 1399 m_offset += runLength; 1400 } 1401 } 1402 1403 // ran to the end of the m_textIterator... no more runs left 1404 m_atBreak = true; 1405 m_runOffset = 0; 1406 } 1407 1408 String CharacterIterator::string(int numChars) 1409 { 1410 Vector<UChar> result; 1411 result.reserveInitialCapacity(numChars); 1412 while (numChars > 0 && !atEnd()) { 1413 int runSize = min(numChars, length()); 1414 result.append(characters(), runSize); 1415 numChars -= runSize; 1416 advance(runSize); 1417 } 1418 return String::adopt(result); 1419 } 1420 1421 static PassRefPtr<Range> characterSubrange(CharacterIterator& it, int offset, int length) 1422 { 1423 it.advance(offset); 1424 RefPtr<Range> start = it.range(); 1425 1426 if (length > 1) 1427 it.advance(length - 1); 1428 RefPtr<Range> end = it.range(); 1429 1430 return Range::create(start->startContainer()->document(), 1431 start->startContainer(), start->startOffset(), 1432 end->endContainer(), end->endOffset()); 1433 } 1434 1435 BackwardsCharacterIterator::BackwardsCharacterIterator() 1436 : m_offset(0) 1437 , m_runOffset(0) 1438 , m_atBreak(true) 1439 { 1440 } 1441 1442 BackwardsCharacterIterator::BackwardsCharacterIterator(const Range* range, TextIteratorBehavior behavior) 1443 : m_offset(0) 1444 , m_runOffset(0) 1445 , m_atBreak(true) 1446 , m_textIterator(range, behavior) 1447 { 1448 while (!atEnd() && !m_textIterator.length()) 1449 m_textIterator.advance(); 1450 } 1451 1452 PassRefPtr<Range> BackwardsCharacterIterator::range() const 1453 { 1454 RefPtr<Range> r = m_textIterator.range(); 1455 if (!m_textIterator.atEnd()) { 1456 if (m_textIterator.length() <= 1) 1457 ASSERT(m_runOffset == 0); 1458 else { 1459 Node* n = r->startContainer(); 1460 ASSERT(n == r->endContainer()); 1461 int offset = r->endOffset() - m_runOffset; 1462 ExceptionCode ec = 0; 1463 r->setStart(n, offset - 1, ec); 1464 r->setEnd(n, offset, ec); 1465 ASSERT(!ec); 1466 } 1467 } 1468 return r.release(); 1469 } 1470 1471 void BackwardsCharacterIterator::advance(int count) 1472 { 1473 if (count <= 0) { 1474 ASSERT(!count); 1475 return; 1476 } 1477 1478 m_atBreak = false; 1479 1480 int remaining = m_textIterator.length() - m_runOffset; 1481 if (count < remaining) { 1482 m_runOffset += count; 1483 m_offset += count; 1484 return; 1485 } 1486 1487 count -= remaining; 1488 m_offset += remaining; 1489 1490 for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) { 1491 int runLength = m_textIterator.length(); 1492 if (runLength == 0) 1493 m_atBreak = true; 1494 else { 1495 if (count < runLength) { 1496 m_runOffset = count; 1497 m_offset += count; 1498 return; 1499 } 1500 1501 count -= runLength; 1502 m_offset += runLength; 1503 } 1504 } 1505 1506 m_atBreak = true; 1507 m_runOffset = 0; 1508 } 1509 1510 // -------- 1511 1512 WordAwareIterator::WordAwareIterator() 1513 : m_previousText(0) 1514 , m_didLookAhead(false) 1515 { 1516 } 1517 1518 WordAwareIterator::WordAwareIterator(const Range* r) 1519 : m_previousText(0) 1520 , m_didLookAhead(true) // so we consider the first chunk from the text iterator 1521 , m_textIterator(r) 1522 { 1523 advance(); // get in position over the first chunk of text 1524 } 1525 1526 WordAwareIterator::~WordAwareIterator() 1527 { 1528 } 1529 1530 // We're always in one of these modes: 1531 // - The current chunk in the text iterator is our current chunk 1532 // (typically its a piece of whitespace, or text that ended with whitespace) 1533 // - The previous chunk in the text iterator is our current chunk 1534 // (we looked ahead to the next chunk and found a word boundary) 1535 // - We built up our own chunk of text from many chunks from the text iterator 1536 1537 // FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries. 1538 1539 void WordAwareIterator::advance() 1540 { 1541 m_previousText = 0; 1542 m_buffer.clear(); // toss any old buffer we built up 1543 1544 // If last time we did a look-ahead, start with that looked-ahead chunk now 1545 if (!m_didLookAhead) { 1546 ASSERT(!m_textIterator.atEnd()); 1547 m_textIterator.advance(); 1548 } 1549 m_didLookAhead = false; 1550 1551 // Go to next non-empty chunk 1552 while (!m_textIterator.atEnd() && m_textIterator.length() == 0) 1553 m_textIterator.advance(); 1554 m_range = m_textIterator.range(); 1555 1556 if (m_textIterator.atEnd()) 1557 return; 1558 1559 while (1) { 1560 // If this chunk ends in whitespace we can just use it as our chunk. 1561 if (isSpaceOrNewline(m_textIterator.characters()[m_textIterator.length() - 1])) 1562 return; 1563 1564 // If this is the first chunk that failed, save it in previousText before look ahead 1565 if (m_buffer.isEmpty()) { 1566 m_previousText = m_textIterator.characters(); 1567 m_previousLength = m_textIterator.length(); 1568 } 1569 1570 // Look ahead to next chunk. If it is whitespace or a break, we can use the previous stuff 1571 m_textIterator.advance(); 1572 if (m_textIterator.atEnd() || m_textIterator.length() == 0 || isSpaceOrNewline(m_textIterator.characters()[0])) { 1573 m_didLookAhead = true; 1574 return; 1575 } 1576 1577 if (m_buffer.isEmpty()) { 1578 // Start gobbling chunks until we get to a suitable stopping point 1579 m_buffer.append(m_previousText, m_previousLength); 1580 m_previousText = 0; 1581 } 1582 m_buffer.append(m_textIterator.characters(), m_textIterator.length()); 1583 int exception = 0; 1584 m_range->setEnd(m_textIterator.range()->endContainer(), m_textIterator.range()->endOffset(), exception); 1585 } 1586 } 1587 1588 int WordAwareIterator::length() const 1589 { 1590 if (!m_buffer.isEmpty()) 1591 return m_buffer.size(); 1592 if (m_previousText) 1593 return m_previousLength; 1594 return m_textIterator.length(); 1595 } 1596 1597 const UChar* WordAwareIterator::characters() const 1598 { 1599 if (!m_buffer.isEmpty()) 1600 return m_buffer.data(); 1601 if (m_previousText) 1602 return m_previousText; 1603 return m_textIterator.characters(); 1604 } 1605 1606 // -------- 1607 1608 static inline UChar foldQuoteMarkOrSoftHyphen(UChar c) 1609 { 1610 switch (c) { 1611 case hebrewPunctuationGershayim: 1612 case leftDoubleQuotationMark: 1613 case rightDoubleQuotationMark: 1614 return '"'; 1615 case hebrewPunctuationGeresh: 1616 case leftSingleQuotationMark: 1617 case rightSingleQuotationMark: 1618 return '\''; 1619 case softHyphen: 1620 // Replace soft hyphen with an ignorable character so that their presence or absence will 1621 // not affect string comparison. 1622 return 0; 1623 default: 1624 return c; 1625 } 1626 } 1627 1628 static inline void foldQuoteMarksAndSoftHyphens(String& s) 1629 { 1630 s.replace(hebrewPunctuationGeresh, '\''); 1631 s.replace(hebrewPunctuationGershayim, '"'); 1632 s.replace(leftDoubleQuotationMark, '"'); 1633 s.replace(leftSingleQuotationMark, '\''); 1634 s.replace(rightDoubleQuotationMark, '"'); 1635 s.replace(rightSingleQuotationMark, '\''); 1636 // Replace soft hyphen with an ignorable character so that their presence or absence will 1637 // not affect string comparison. 1638 s.replace(softHyphen, 0); 1639 } 1640 1641 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION 1642 1643 static inline void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length) 1644 { 1645 for (size_t i = 0; i < length; ++i) 1646 data[i] = foldQuoteMarkOrSoftHyphen(data[i]); 1647 } 1648 1649 static const size_t minimumSearchBufferSize = 8192; 1650 1651 #ifndef NDEBUG 1652 static bool searcherInUse; 1653 #endif 1654 1655 static UStringSearch* createSearcher() 1656 { 1657 // Provide a non-empty pattern and non-empty text so usearch_open will not fail, 1658 // but it doesn't matter exactly what it is, since we don't perform any searches 1659 // without setting both the pattern and the text. 1660 UErrorCode status = U_ZERO_ERROR; 1661 UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, currentSearchLocaleID(), 0, &status); 1662 ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING); 1663 return searcher; 1664 } 1665 1666 static UStringSearch* searcher() 1667 { 1668 static UStringSearch* searcher = createSearcher(); 1669 return searcher; 1670 } 1671 1672 static inline void lockSearcher() 1673 { 1674 #ifndef NDEBUG 1675 ASSERT(!searcherInUse); 1676 searcherInUse = true; 1677 #endif 1678 } 1679 1680 static inline void unlockSearcher() 1681 { 1682 #ifndef NDEBUG 1683 ASSERT(searcherInUse); 1684 searcherInUse = false; 1685 #endif 1686 } 1687 1688 // ICU's search ignores the distinction between small kana letters and ones 1689 // that are not small, and also characters that differ only in the voicing 1690 // marks when considering only primary collation strength diffrences. 1691 // This is not helpful for end users, since these differences make words 1692 // distinct, so for our purposes we need these to be considered. 1693 // The Unicode folks do not think the collation algorithm should be 1694 // changed. To work around this, we would like to tailor the ICU searcher, 1695 // but we can't get that to work yet. So instead, we check for cases where 1696 // these differences occur, and skip those matches. 1697 1698 // We refer to the above technique as the "kana workaround". The next few 1699 // functions are helper functinos for the kana workaround. 1700 1701 static inline bool isKanaLetter(UChar character) 1702 { 1703 // Hiragana letters. 1704 if (character >= 0x3041 && character <= 0x3096) 1705 return true; 1706 1707 // Katakana letters. 1708 if (character >= 0x30A1 && character <= 0x30FA) 1709 return true; 1710 if (character >= 0x31F0 && character <= 0x31FF) 1711 return true; 1712 1713 // Halfwidth katakana letters. 1714 if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70) 1715 return true; 1716 1717 return false; 1718 } 1719 1720 static inline bool isSmallKanaLetter(UChar character) 1721 { 1722 ASSERT(isKanaLetter(character)); 1723 1724 switch (character) { 1725 case 0x3041: // HIRAGANA LETTER SMALL A 1726 case 0x3043: // HIRAGANA LETTER SMALL I 1727 case 0x3045: // HIRAGANA LETTER SMALL U 1728 case 0x3047: // HIRAGANA LETTER SMALL E 1729 case 0x3049: // HIRAGANA LETTER SMALL O 1730 case 0x3063: // HIRAGANA LETTER SMALL TU 1731 case 0x3083: // HIRAGANA LETTER SMALL YA 1732 case 0x3085: // HIRAGANA LETTER SMALL YU 1733 case 0x3087: // HIRAGANA LETTER SMALL YO 1734 case 0x308E: // HIRAGANA LETTER SMALL WA 1735 case 0x3095: // HIRAGANA LETTER SMALL KA 1736 case 0x3096: // HIRAGANA LETTER SMALL KE 1737 case 0x30A1: // KATAKANA LETTER SMALL A 1738 case 0x30A3: // KATAKANA LETTER SMALL I 1739 case 0x30A5: // KATAKANA LETTER SMALL U 1740 case 0x30A7: // KATAKANA LETTER SMALL E 1741 case 0x30A9: // KATAKANA LETTER SMALL O 1742 case 0x30C3: // KATAKANA LETTER SMALL TU 1743 case 0x30E3: // KATAKANA LETTER SMALL YA 1744 case 0x30E5: // KATAKANA LETTER SMALL YU 1745 case 0x30E7: // KATAKANA LETTER SMALL YO 1746 case 0x30EE: // KATAKANA LETTER SMALL WA 1747 case 0x30F5: // KATAKANA LETTER SMALL KA 1748 case 0x30F6: // KATAKANA LETTER SMALL KE 1749 case 0x31F0: // KATAKANA LETTER SMALL KU 1750 case 0x31F1: // KATAKANA LETTER SMALL SI 1751 case 0x31F2: // KATAKANA LETTER SMALL SU 1752 case 0x31F3: // KATAKANA LETTER SMALL TO 1753 case 0x31F4: // KATAKANA LETTER SMALL NU 1754 case 0x31F5: // KATAKANA LETTER SMALL HA 1755 case 0x31F6: // KATAKANA LETTER SMALL HI 1756 case 0x31F7: // KATAKANA LETTER SMALL HU 1757 case 0x31F8: // KATAKANA LETTER SMALL HE 1758 case 0x31F9: // KATAKANA LETTER SMALL HO 1759 case 0x31FA: // KATAKANA LETTER SMALL MU 1760 case 0x31FB: // KATAKANA LETTER SMALL RA 1761 case 0x31FC: // KATAKANA LETTER SMALL RI 1762 case 0x31FD: // KATAKANA LETTER SMALL RU 1763 case 0x31FE: // KATAKANA LETTER SMALL RE 1764 case 0x31FF: // KATAKANA LETTER SMALL RO 1765 case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A 1766 case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I 1767 case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U 1768 case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E 1769 case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O 1770 case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA 1771 case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU 1772 case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO 1773 case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU 1774 return true; 1775 } 1776 return false; 1777 } 1778 1779 enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark }; 1780 1781 static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character) 1782 { 1783 ASSERT(isKanaLetter(character)); 1784 1785 switch (character) { 1786 case 0x304C: // HIRAGANA LETTER GA 1787 case 0x304E: // HIRAGANA LETTER GI 1788 case 0x3050: // HIRAGANA LETTER GU 1789 case 0x3052: // HIRAGANA LETTER GE 1790 case 0x3054: // HIRAGANA LETTER GO 1791 case 0x3056: // HIRAGANA LETTER ZA 1792 case 0x3058: // HIRAGANA LETTER ZI 1793 case 0x305A: // HIRAGANA LETTER ZU 1794 case 0x305C: // HIRAGANA LETTER ZE 1795 case 0x305E: // HIRAGANA LETTER ZO 1796 case 0x3060: // HIRAGANA LETTER DA 1797 case 0x3062: // HIRAGANA LETTER DI 1798 case 0x3065: // HIRAGANA LETTER DU 1799 case 0x3067: // HIRAGANA LETTER DE 1800 case 0x3069: // HIRAGANA LETTER DO 1801 case 0x3070: // HIRAGANA LETTER BA 1802 case 0x3073: // HIRAGANA LETTER BI 1803 case 0x3076: // HIRAGANA LETTER BU 1804 case 0x3079: // HIRAGANA LETTER BE 1805 case 0x307C: // HIRAGANA LETTER BO 1806 case 0x3094: // HIRAGANA LETTER VU 1807 case 0x30AC: // KATAKANA LETTER GA 1808 case 0x30AE: // KATAKANA LETTER GI 1809 case 0x30B0: // KATAKANA LETTER GU 1810 case 0x30B2: // KATAKANA LETTER GE 1811 case 0x30B4: // KATAKANA LETTER GO 1812 case 0x30B6: // KATAKANA LETTER ZA 1813 case 0x30B8: // KATAKANA LETTER ZI 1814 case 0x30BA: // KATAKANA LETTER ZU 1815 case 0x30BC: // KATAKANA LETTER ZE 1816 case 0x30BE: // KATAKANA LETTER ZO 1817 case 0x30C0: // KATAKANA LETTER DA 1818 case 0x30C2: // KATAKANA LETTER DI 1819 case 0x30C5: // KATAKANA LETTER DU 1820 case 0x30C7: // KATAKANA LETTER DE 1821 case 0x30C9: // KATAKANA LETTER DO 1822 case 0x30D0: // KATAKANA LETTER BA 1823 case 0x30D3: // KATAKANA LETTER BI 1824 case 0x30D6: // KATAKANA LETTER BU 1825 case 0x30D9: // KATAKANA LETTER BE 1826 case 0x30DC: // KATAKANA LETTER BO 1827 case 0x30F4: // KATAKANA LETTER VU 1828 case 0x30F7: // KATAKANA LETTER VA 1829 case 0x30F8: // KATAKANA LETTER VI 1830 case 0x30F9: // KATAKANA LETTER VE 1831 case 0x30FA: // KATAKANA LETTER VO 1832 return VoicedSoundMark; 1833 case 0x3071: // HIRAGANA LETTER PA 1834 case 0x3074: // HIRAGANA LETTER PI 1835 case 0x3077: // HIRAGANA LETTER PU 1836 case 0x307A: // HIRAGANA LETTER PE 1837 case 0x307D: // HIRAGANA LETTER PO 1838 case 0x30D1: // KATAKANA LETTER PA 1839 case 0x30D4: // KATAKANA LETTER PI 1840 case 0x30D7: // KATAKANA LETTER PU 1841 case 0x30DA: // KATAKANA LETTER PE 1842 case 0x30DD: // KATAKANA LETTER PO 1843 return SemiVoicedSoundMark; 1844 } 1845 return NoVoicedSoundMark; 1846 } 1847 1848 static inline bool isCombiningVoicedSoundMark(UChar character) 1849 { 1850 switch (character) { 1851 case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK 1852 case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK 1853 return true; 1854 } 1855 return false; 1856 } 1857 1858 static inline bool containsKanaLetters(const String& pattern) 1859 { 1860 const UChar* characters = pattern.characters(); 1861 unsigned length = pattern.length(); 1862 for (unsigned i = 0; i < length; ++i) { 1863 if (isKanaLetter(characters[i])) 1864 return true; 1865 } 1866 return false; 1867 } 1868 1869 static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer) 1870 { 1871 ASSERT(length); 1872 1873 buffer.resize(length); 1874 1875 UErrorCode status = U_ZERO_ERROR; 1876 size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status); 1877 ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR); 1878 ASSERT(bufferSize); 1879 1880 buffer.resize(bufferSize); 1881 1882 if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) 1883 return; 1884 1885 status = U_ZERO_ERROR; 1886 unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status); 1887 ASSERT(status == U_STRING_NOT_TERMINATED_WARNING); 1888 } 1889 1890 static bool isNonLatin1Separator(UChar32 character) 1891 { 1892 ASSERT_ARG(character, character >= 256); 1893 1894 return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK); 1895 } 1896 1897 static inline bool isSeparator(UChar32 character) 1898 { 1899 static const bool latin1SeparatorTable[256] = { 1900 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1901 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1902 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . / 1903 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, // : ; < = > ? 1904 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // @ 1905 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, // [ \ ] ^ _ 1906 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ` 1907 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // { | } ~ 1908 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1909 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1910 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1911 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1912 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1913 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1914 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1915 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 1916 }; 1917 1918 if (character < 256) 1919 return latin1SeparatorTable[character]; 1920 1921 return isNonLatin1Separator(character); 1922 } 1923 1924 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options) 1925 : m_target(target) 1926 , m_options(options) 1927 , m_prefixLength(0) 1928 , m_atBreak(true) 1929 , m_needsMoreContext(options & AtWordStarts) 1930 , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target)) 1931 { 1932 ASSERT(!m_target.isEmpty()); 1933 1934 // FIXME: We'd like to tailor the searcher to fold quote marks for us instead 1935 // of doing it in a separate replacement pass here, but ICU doesn't offer a way 1936 // to add tailoring on top of the locale-specific tailoring as of this writing. 1937 foldQuoteMarksAndSoftHyphens(m_target); 1938 1939 size_t targetLength = m_target.length(); 1940 m_buffer.reserveInitialCapacity(max(targetLength * 8, minimumSearchBufferSize)); 1941 m_overlap = m_buffer.capacity() / 4; 1942 1943 if ((m_options & AtWordStarts) && targetLength) { 1944 UChar32 targetFirstCharacter; 1945 U16_GET(m_target.characters(), 0, 0, targetLength, targetFirstCharacter); 1946 // Characters in the separator category never really occur at the beginning of a word, 1947 // so if the target begins with such a character, we just ignore the AtWordStart option. 1948 if (isSeparator(targetFirstCharacter)) { 1949 m_options &= ~AtWordStarts; 1950 m_needsMoreContext = false; 1951 } 1952 } 1953 1954 // Grab the single global searcher. 1955 // If we ever have a reason to do more than once search buffer at once, we'll have 1956 // to move to multiple searchers. 1957 lockSearcher(); 1958 1959 UStringSearch* searcher = WebCore::searcher(); 1960 UCollator* collator = usearch_getCollator(searcher); 1961 1962 UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY; 1963 if (ucol_getStrength(collator) != strength) { 1964 ucol_setStrength(collator, strength); 1965 usearch_reset(searcher); 1966 } 1967 1968 UErrorCode status = U_ZERO_ERROR; 1969 usearch_setPattern(searcher, m_target.characters(), targetLength, &status); 1970 ASSERT(status == U_ZERO_ERROR); 1971 1972 // The kana workaround requires a normalized copy of the target string. 1973 if (m_targetRequiresKanaWorkaround) 1974 normalizeCharacters(m_target.characters(), m_target.length(), m_normalizedTarget); 1975 } 1976 1977 inline SearchBuffer::~SearchBuffer() 1978 { 1979 // Leave the static object pointing to a valid string. 1980 UErrorCode status = U_ZERO_ERROR; 1981 usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status); 1982 ASSERT(status == U_ZERO_ERROR); 1983 1984 unlockSearcher(); 1985 } 1986 1987 inline size_t SearchBuffer::append(const UChar* characters, size_t length) 1988 { 1989 ASSERT(length); 1990 1991 if (m_atBreak) { 1992 m_buffer.shrink(0); 1993 m_prefixLength = 0; 1994 m_atBreak = false; 1995 } else if (m_buffer.size() == m_buffer.capacity()) { 1996 memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar)); 1997 m_prefixLength -= min(m_prefixLength, m_buffer.size() - m_overlap); 1998 m_buffer.shrink(m_overlap); 1999 } 2000 2001 size_t oldLength = m_buffer.size(); 2002 size_t usableLength = min(m_buffer.capacity() - oldLength, length); 2003 ASSERT(usableLength); 2004 m_buffer.append(characters, usableLength); 2005 foldQuoteMarksAndSoftHyphens(m_buffer.data() + oldLength, usableLength); 2006 return usableLength; 2007 } 2008 2009 inline bool SearchBuffer::needsMoreContext() const 2010 { 2011 return m_needsMoreContext; 2012 } 2013 2014 inline void SearchBuffer::prependContext(const UChar* characters, size_t length) 2015 { 2016 ASSERT(m_needsMoreContext); 2017 ASSERT(m_prefixLength == m_buffer.size()); 2018 2019 if (!length) 2020 return; 2021 2022 m_atBreak = false; 2023 2024 size_t wordBoundaryContextStart = length; 2025 if (wordBoundaryContextStart) { 2026 U16_BACK_1(characters, 0, wordBoundaryContextStart); 2027 wordBoundaryContextStart = startOfLastWordBoundaryContext(characters, wordBoundaryContextStart); 2028 } 2029 2030 size_t usableLength = min(m_buffer.capacity() - m_prefixLength, length - wordBoundaryContextStart); 2031 m_buffer.prepend(characters + length - usableLength, usableLength); 2032 m_prefixLength += usableLength; 2033 2034 if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity()) 2035 m_needsMoreContext = false; 2036 } 2037 2038 inline bool SearchBuffer::atBreak() const 2039 { 2040 return m_atBreak; 2041 } 2042 2043 inline void SearchBuffer::reachedBreak() 2044 { 2045 m_atBreak = true; 2046 } 2047 2048 inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const 2049 { 2050 // This function implements the kana workaround. If usearch treats 2051 // it as a match, but we do not want to, then it's a "bad match". 2052 if (!m_targetRequiresKanaWorkaround) 2053 return false; 2054 2055 // Normalize into a match buffer. We reuse a single buffer rather than 2056 // creating a new one each time. 2057 normalizeCharacters(match, matchLength, m_normalizedMatch); 2058 2059 const UChar* a = m_normalizedTarget.begin(); 2060 const UChar* aEnd = m_normalizedTarget.end(); 2061 2062 const UChar* b = m_normalizedMatch.begin(); 2063 const UChar* bEnd = m_normalizedMatch.end(); 2064 2065 while (true) { 2066 // Skip runs of non-kana-letter characters. This is necessary so we can 2067 // correctly handle strings where the target and match have different-length 2068 // runs of characters that match, while still double checking the correctness 2069 // of matches of kana letters with other kana letters. 2070 while (a != aEnd && !isKanaLetter(*a)) 2071 ++a; 2072 while (b != bEnd && !isKanaLetter(*b)) 2073 ++b; 2074 2075 // If we reached the end of either the target or the match, we should have 2076 // reached the end of both; both should have the same number of kana letters. 2077 if (a == aEnd || b == bEnd) { 2078 ASSERT(a == aEnd); 2079 ASSERT(b == bEnd); 2080 return false; 2081 } 2082 2083 // Check for differences in the kana letter character itself. 2084 if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b)) 2085 return true; 2086 if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b)) 2087 return true; 2088 ++a; 2089 ++b; 2090 2091 // Check for differences in combining voiced sound marks found after the letter. 2092 while (1) { 2093 if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) { 2094 if (b != bEnd && isCombiningVoicedSoundMark(*b)) 2095 return true; 2096 break; 2097 } 2098 if (!(b != bEnd && isCombiningVoicedSoundMark(*b))) 2099 return true; 2100 if (*a != *b) 2101 return true; 2102 ++a; 2103 ++b; 2104 } 2105 } 2106 } 2107 2108 inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const 2109 { 2110 ASSERT(m_options & AtWordStarts); 2111 2112 if (!start) 2113 return true; 2114 2115 if (m_options & TreatMedialCapitalAsWordStart) { 2116 int size = m_buffer.size(); 2117 int offset = start; 2118 UChar32 firstCharacter; 2119 U16_GET(m_buffer.data(), 0, offset, size, firstCharacter); 2120 UChar32 previousCharacter; 2121 U16_PREV(m_buffer.data(), 0, offset, previousCharacter); 2122 2123 if (isSeparator(firstCharacter)) { 2124 // The start of a separator run is a word start (".org" in "webkit.org"). 2125 if (!isSeparator(previousCharacter)) 2126 return true; 2127 } else if (isASCIIUpper(firstCharacter)) { 2128 // The start of an uppercase run is a word start ("Kit" in "WebKit"). 2129 if (!isASCIIUpper(previousCharacter)) 2130 return true; 2131 // The last character of an uppercase run followed by a non-separator, non-digit 2132 // is a word start ("Request" in "XMLHTTPRequest"). 2133 offset = start; 2134 U16_FWD_1(m_buffer.data(), offset, size); 2135 UChar32 nextCharacter = 0; 2136 if (offset < size) 2137 U16_GET(m_buffer.data(), 0, offset, size, nextCharacter); 2138 if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter)) 2139 return true; 2140 } else if (isASCIIDigit(firstCharacter)) { 2141 // The start of a digit run is a word start ("2" in "WebKit2"). 2142 if (!isASCIIDigit(previousCharacter)) 2143 return true; 2144 } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) { 2145 // The start of a non-separator, non-uppercase, non-digit run is a word start, 2146 // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore"). 2147 return true; 2148 } 2149 } 2150 2151 size_t wordBreakSearchStart = start + length; 2152 while (wordBreakSearchStart > start) 2153 wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */); 2154 return wordBreakSearchStart == start; 2155 } 2156 2157 inline size_t SearchBuffer::search(size_t& start) 2158 { 2159 size_t size = m_buffer.size(); 2160 if (m_atBreak) { 2161 if (!size) 2162 return 0; 2163 } else { 2164 if (size != m_buffer.capacity()) 2165 return 0; 2166 } 2167 2168 UStringSearch* searcher = WebCore::searcher(); 2169 2170 UErrorCode status = U_ZERO_ERROR; 2171 usearch_setText(searcher, m_buffer.data(), size, &status); 2172 ASSERT(status == U_ZERO_ERROR); 2173 2174 usearch_setOffset(searcher, m_prefixLength, &status); 2175 ASSERT(status == U_ZERO_ERROR); 2176 2177 int matchStart = usearch_next(searcher, &status); 2178 ASSERT(status == U_ZERO_ERROR); 2179 2180 nextMatch: 2181 if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) { 2182 ASSERT(matchStart == USEARCH_DONE); 2183 return 0; 2184 } 2185 2186 // Matches that start in the overlap area are only tentative. 2187 // The same match may appear later, matching more characters, 2188 // possibly including a combining character that's not yet in the buffer. 2189 if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) { 2190 size_t overlap = m_overlap; 2191 if (m_options & AtWordStarts) { 2192 // Ensure that there is sufficient context before matchStart the next time around for 2193 // determining if it is at a word boundary. 2194 int wordBoundaryContextStart = matchStart; 2195 U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart); 2196 wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart); 2197 overlap = min(size - 1, max(overlap, size - wordBoundaryContextStart)); 2198 } 2199 memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar)); 2200 m_prefixLength -= min(m_prefixLength, size - overlap); 2201 m_buffer.shrink(overlap); 2202 return 0; 2203 } 2204 2205 size_t matchedLength = usearch_getMatchedLength(searcher); 2206 ASSERT(matchStart + matchedLength <= size); 2207 2208 // If this match is "bad", move on to the next match. 2209 if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) { 2210 matchStart = usearch_next(searcher, &status); 2211 ASSERT(status == U_ZERO_ERROR); 2212 goto nextMatch; 2213 } 2214 2215 size_t newSize = size - (matchStart + 1); 2216 memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar)); 2217 m_prefixLength -= min<size_t>(m_prefixLength, matchStart + 1); 2218 m_buffer.shrink(newSize); 2219 2220 start = size - matchStart; 2221 return matchedLength; 2222 } 2223 2224 #else // !ICU_UNICODE 2225 2226 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options) 2227 : m_target(options & CaseInsensitive ? target.foldCase() : target) 2228 , m_options(options) 2229 , m_buffer(m_target.length()) 2230 , m_isCharacterStartBuffer(m_target.length()) 2231 , m_isBufferFull(false) 2232 , m_cursor(0) 2233 { 2234 ASSERT(!m_target.isEmpty()); 2235 m_target.replace(noBreakSpace, ' '); 2236 foldQuoteMarksAndSoftHyphens(m_target); 2237 } 2238 2239 inline SearchBuffer::~SearchBuffer() 2240 { 2241 } 2242 2243 inline void SearchBuffer::reachedBreak() 2244 { 2245 m_cursor = 0; 2246 m_isBufferFull = false; 2247 } 2248 2249 inline bool SearchBuffer::atBreak() const 2250 { 2251 return !m_cursor && !m_isBufferFull; 2252 } 2253 2254 inline void SearchBuffer::append(UChar c, bool isStart) 2255 { 2256 m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMarkOrSoftHyphen(c); 2257 m_isCharacterStartBuffer[m_cursor] = isStart; 2258 if (++m_cursor == m_target.length()) { 2259 m_cursor = 0; 2260 m_isBufferFull = true; 2261 } 2262 } 2263 2264 inline size_t SearchBuffer::append(const UChar* characters, size_t length) 2265 { 2266 ASSERT(length); 2267 if (!(m_options & CaseInsensitive)) { 2268 append(characters[0], true); 2269 return 1; 2270 } 2271 const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough 2272 UChar foldedCharacters[maxFoldedCharacters]; 2273 bool error; 2274 int numFoldedCharacters = foldCase(foldedCharacters, maxFoldedCharacters, characters, 1, &error); 2275 ASSERT(!error); 2276 ASSERT(numFoldedCharacters); 2277 ASSERT(numFoldedCharacters <= maxFoldedCharacters); 2278 if (!error && numFoldedCharacters) { 2279 numFoldedCharacters = min(numFoldedCharacters, maxFoldedCharacters); 2280 append(foldedCharacters[0], true); 2281 for (int i = 1; i < numFoldedCharacters; ++i) 2282 append(foldedCharacters[i], false); 2283 } 2284 return 1; 2285 } 2286 2287 inline bool SearchBuffer::needsMoreContext() const 2288 { 2289 return false; 2290 } 2291 2292 void SearchBuffer::prependContext(const UChar*, size_t) 2293 { 2294 ASSERT_NOT_REACHED(); 2295 } 2296 2297 inline size_t SearchBuffer::search(size_t& start) 2298 { 2299 if (!m_isBufferFull) 2300 return 0; 2301 if (!m_isCharacterStartBuffer[m_cursor]) 2302 return 0; 2303 2304 size_t tailSpace = m_target.length() - m_cursor; 2305 if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0) 2306 return 0; 2307 if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0) 2308 return 0; 2309 2310 start = length(); 2311 2312 // Now that we've found a match once, we don't want to find it again, because those 2313 // are the SearchBuffer semantics, allowing for a buffer where you append more than one 2314 // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if 2315 // we want to get rid of that in the future we could track this with a separate boolean 2316 // or even move the characters to the start of the buffer and set m_isBufferFull to false. 2317 m_isCharacterStartBuffer[m_cursor] = false; 2318 2319 return start; 2320 } 2321 2322 // Returns the number of characters that were appended to the buffer (what we are searching in). 2323 // That's not necessarily the same length as the passed-in target string, because case folding 2324 // can make two strings match even though they're not the same length. 2325 size_t SearchBuffer::length() const 2326 { 2327 size_t bufferSize = m_target.length(); 2328 size_t length = 0; 2329 for (size_t i = 0; i < bufferSize; ++i) 2330 length += m_isCharacterStartBuffer[i]; 2331 return length; 2332 } 2333 2334 #endif // !ICU_UNICODE 2335 2336 // -------- 2337 2338 int TextIterator::rangeLength(const Range* r, bool forSelectionPreservation) 2339 { 2340 int length = 0; 2341 for (TextIterator it(r, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance()) 2342 length += it.length(); 2343 2344 return length; 2345 } 2346 2347 PassRefPtr<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount) 2348 { 2349 CharacterIterator entireRangeIterator(entireRange); 2350 return characterSubrange(entireRangeIterator, characterOffset, characterCount); 2351 } 2352 2353 PassRefPtr<Range> TextIterator::rangeFromLocationAndLength(Element* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation) 2354 { 2355 RefPtr<Range> resultRange = scope->document()->createRange(); 2356 2357 int docTextPosition = 0; 2358 int rangeEnd = rangeLocation + rangeLength; 2359 bool startRangeFound = false; 2360 2361 RefPtr<Range> textRunRange; 2362 2363 TextIterator it(rangeOfContents(scope).get(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); 2364 2365 // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>. 2366 if (rangeLocation == 0 && rangeLength == 0 && it.atEnd()) { 2367 textRunRange = it.range(); 2368 2369 ExceptionCode ec = 0; 2370 resultRange->setStart(textRunRange->startContainer(), 0, ec); 2371 ASSERT(!ec); 2372 resultRange->setEnd(textRunRange->startContainer(), 0, ec); 2373 ASSERT(!ec); 2374 2375 return resultRange.release(); 2376 } 2377 2378 for (; !it.atEnd(); it.advance()) { 2379 int len = it.length(); 2380 textRunRange = it.range(); 2381 2382 bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + len; 2383 bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + len; 2384 2385 // Fix textRunRange->endPosition(), but only if foundStart || foundEnd, because it is only 2386 // in those cases that textRunRange is used. 2387 if (foundEnd) { 2388 // FIXME: This is a workaround for the fact that the end of a run is often at the wrong 2389 // position for emitted '\n's. 2390 if (len == 1 && it.characters()[0] == '\n') { 2391 scope->document()->updateLayoutIgnorePendingStylesheets(); 2392 it.advance(); 2393 if (!it.atEnd()) { 2394 RefPtr<Range> range = it.range(); 2395 ExceptionCode ec = 0; 2396 textRunRange->setEnd(range->startContainer(), range->startOffset(), ec); 2397 ASSERT(!ec); 2398 } else { 2399 Position runStart = textRunRange->startPosition(); 2400 Position runEnd = VisiblePosition(runStart).next().deepEquivalent(); 2401 if (runEnd.isNotNull()) { 2402 ExceptionCode ec = 0; 2403 textRunRange->setEnd(runEnd.containerNode(), runEnd.computeOffsetInContainerNode(), ec); 2404 ASSERT(!ec); 2405 } 2406 } 2407 } 2408 } 2409 2410 if (foundStart) { 2411 startRangeFound = true; 2412 int exception = 0; 2413 if (textRunRange->startContainer()->isTextNode()) { 2414 int offset = rangeLocation - docTextPosition; 2415 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception); 2416 } else { 2417 if (rangeLocation == docTextPosition) 2418 resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset(), exception); 2419 else 2420 resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset(), exception); 2421 } 2422 } 2423 2424 if (foundEnd) { 2425 int exception = 0; 2426 if (textRunRange->startContainer()->isTextNode()) { 2427 int offset = rangeEnd - docTextPosition; 2428 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception); 2429 } else { 2430 if (rangeEnd == docTextPosition) 2431 resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset(), exception); 2432 else 2433 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception); 2434 } 2435 docTextPosition += len; 2436 break; 2437 } 2438 docTextPosition += len; 2439 } 2440 2441 if (!startRangeFound) 2442 return 0; 2443 2444 if (rangeLength != 0 && rangeEnd > docTextPosition) { // rangeEnd is out of bounds 2445 int exception = 0; 2446 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception); 2447 } 2448 2449 return resultRange.release(); 2450 } 2451 2452 bool TextIterator::locationAndLengthFromRange(const Range* range, size_t& location, size_t& length) 2453 { 2454 location = notFound; 2455 length = 0; 2456 2457 if (!range->startContainer()) 2458 return false; 2459 2460 Element* selectionRoot = range->ownerDocument()->frame()->selection()->rootEditableElement(); 2461 Element* scope = selectionRoot ? selectionRoot : range->ownerDocument()->documentElement(); 2462 2463 // The critical assumption is that this only gets called with ranges that 2464 // concentrate on a given area containing the selection root. This is done 2465 // because of text fields and textareas. The DOM for those is not 2466 // directly in the document DOM, so ensure that the range does not cross a 2467 // boundary of one of those. 2468 if (range->startContainer() != scope && !range->startContainer()->isDescendantOf(scope)) 2469 return false; 2470 if (range->endContainer() != scope && !range->endContainer()->isDescendantOf(scope)) 2471 return false; 2472 2473 RefPtr<Range> testRange = Range::create(scope->document(), scope, 0, range->startContainer(), range->startOffset()); 2474 ASSERT(testRange->startContainer() == scope); 2475 location = TextIterator::rangeLength(testRange.get()); 2476 2477 ExceptionCode ec; 2478 testRange->setEnd(range->endContainer(), range->endOffset(), ec); 2479 ASSERT(testRange->startContainer() == scope); 2480 length = TextIterator::rangeLength(testRange.get()) - location; 2481 return true; 2482 } 2483 2484 // -------- 2485 2486 UChar* plainTextToMallocAllocatedBuffer(const Range* r, unsigned& bufferLength, bool isDisplayString, TextIteratorBehavior defaultBehavior) 2487 { 2488 UChar* result = 0; 2489 2490 // Do this in pieces to avoid massive reallocations if there is a large amount of text. 2491 // Use system malloc for buffers since they can consume lots of memory and current TCMalloc is unable return it back to OS. 2492 static const unsigned cMaxSegmentSize = 1 << 16; 2493 bufferLength = 0; 2494 typedef pair<UChar*, unsigned> TextSegment; 2495 OwnPtr<Vector<TextSegment> > textSegments; 2496 Vector<UChar> textBuffer; 2497 textBuffer.reserveInitialCapacity(cMaxSegmentSize); 2498 TextIteratorBehavior behavior = defaultBehavior; 2499 if (!isDisplayString) 2500 behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding); 2501 2502 for (TextIterator it(r, behavior); !it.atEnd(); it.advance()) { 2503 if (textBuffer.size() && textBuffer.size() + it.length() > cMaxSegmentSize) { 2504 UChar* newSegmentBuffer = static_cast<UChar*>(malloc(textBuffer.size() * sizeof(UChar))); 2505 if (!newSegmentBuffer) 2506 goto exit; 2507 memcpy(newSegmentBuffer, textBuffer.data(), textBuffer.size() * sizeof(UChar)); 2508 if (!textSegments) 2509 textSegments = adoptPtr(new Vector<TextSegment>); 2510 textSegments->append(make_pair(newSegmentBuffer, (unsigned)textBuffer.size())); 2511 textBuffer.clear(); 2512 } 2513 textBuffer.append(it.characters(), it.length()); 2514 bufferLength += it.length(); 2515 } 2516 2517 if (!bufferLength) 2518 return 0; 2519 2520 // Since we know the size now, we can make a single buffer out of the pieces with one big alloc 2521 result = static_cast<UChar*>(malloc(bufferLength * sizeof(UChar))); 2522 if (!result) 2523 goto exit; 2524 2525 { 2526 UChar* resultPos = result; 2527 if (textSegments) { 2528 unsigned size = textSegments->size(); 2529 for (unsigned i = 0; i < size; ++i) { 2530 const TextSegment& segment = textSegments->at(i); 2531 memcpy(resultPos, segment.first, segment.second * sizeof(UChar)); 2532 resultPos += segment.second; 2533 } 2534 } 2535 memcpy(resultPos, textBuffer.data(), textBuffer.size() * sizeof(UChar)); 2536 } 2537 2538 exit: 2539 if (textSegments) { 2540 unsigned size = textSegments->size(); 2541 for (unsigned i = 0; i < size; ++i) 2542 free(textSegments->at(i).first); 2543 } 2544 2545 if (isDisplayString && r->ownerDocument()) 2546 r->ownerDocument()->displayBufferModifiedByEncoding(result, bufferLength); 2547 2548 return result; 2549 } 2550 2551 String plainText(const Range* r, TextIteratorBehavior defaultBehavior) 2552 { 2553 unsigned length; 2554 UChar* buf = plainTextToMallocAllocatedBuffer(r, length, false, defaultBehavior); 2555 if (!buf) 2556 return ""; 2557 String result(buf, length); 2558 free(buf); 2559 return result; 2560 } 2561 2562 static inline bool isAllCollapsibleWhitespace(const String& string) 2563 { 2564 const UChar* characters = string.characters(); 2565 unsigned length = string.length(); 2566 for (unsigned i = 0; i < length; ++i) { 2567 if (!isCollapsibleWhitespace(characters[i])) 2568 return false; 2569 } 2570 return true; 2571 } 2572 2573 static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward) 2574 { 2575 ExceptionCode ec = 0; 2576 RefPtr<Range> result = range->cloneRange(ec); 2577 ASSERT(!ec); 2578 result->collapse(!forward, ec); 2579 ASSERT(!ec); 2580 return result.release(); 2581 } 2582 2583 static size_t findPlainText(CharacterIterator& it, const String& target, FindOptions options, size_t& matchStart) 2584 { 2585 matchStart = 0; 2586 size_t matchLength = 0; 2587 2588 SearchBuffer buffer(target, options); 2589 2590 if (buffer.needsMoreContext()) { 2591 RefPtr<Range> startRange = it.range(); 2592 RefPtr<Range> beforeStartRange = startRange->ownerDocument()->createRange(); 2593 ExceptionCode ec = 0; 2594 beforeStartRange->setEnd(startRange->startContainer(), startRange->startOffset(), ec); 2595 for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) { 2596 buffer.prependContext(backwardsIterator.characters(), backwardsIterator.length()); 2597 if (!buffer.needsMoreContext()) 2598 break; 2599 } 2600 } 2601 2602 while (!it.atEnd()) { 2603 it.advance(buffer.append(it.characters(), it.length())); 2604 tryAgain: 2605 size_t matchStartOffset; 2606 if (size_t newMatchLength = buffer.search(matchStartOffset)) { 2607 // Note that we found a match, and where we found it. 2608 size_t lastCharacterInBufferOffset = it.characterOffset(); 2609 ASSERT(lastCharacterInBufferOffset >= matchStartOffset); 2610 matchStart = lastCharacterInBufferOffset - matchStartOffset; 2611 matchLength = newMatchLength; 2612 // If searching forward, stop on the first match. 2613 // If searching backward, don't stop, so we end up with the last match. 2614 if (!(options & Backwards)) 2615 break; 2616 goto tryAgain; 2617 } 2618 if (it.atBreak() && !buffer.atBreak()) { 2619 buffer.reachedBreak(); 2620 goto tryAgain; 2621 } 2622 } 2623 2624 return matchLength; 2625 } 2626 2627 PassRefPtr<Range> findPlainText(const Range* range, const String& target, FindOptions options) 2628 { 2629 // First, find the text. 2630 size_t matchStart; 2631 size_t matchLength; 2632 { 2633 CharacterIterator findIterator(range, TextIteratorEntersTextControls); 2634 matchLength = findPlainText(findIterator, target, options, matchStart); 2635 if (!matchLength) 2636 return collapsedToBoundary(range, !(options & Backwards)); 2637 } 2638 2639 // Then, find the document position of the start and the end of the text. 2640 CharacterIterator computeRangeIterator(range, TextIteratorEntersTextControls); 2641 return characterSubrange(computeRangeIterator, matchStart, matchLength); 2642 } 2643 2644 } 2645