1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "base/bind.h" 6 #include "base/command_line.h" 7 #include "base/compiler_specific.h" 8 #include "base/containers/hash_tables.h" 9 #include "base/file_util.h" 10 #include "base/files/file_path.h" 11 #include "base/strings/string_util.h" 12 #include "base/strings/utf_string_conversions.h" 13 #include "content/public/common/content_switches.h" 14 #include "content/public/renderer/render_view.h" 15 #include "content/public/renderer/render_view_observer.h" 16 #include "content/public/test/content_browser_test.h" 17 #include "content/public/test/content_browser_test_utils.h" 18 #include "content/public/test/test_utils.h" 19 #include "content/renderer/savable_resources.h" 20 #include "content/shell/browser/shell.h" 21 #include "net/base/filename_util.h" 22 #include "net/url_request/url_request_context.h" 23 #include "third_party/WebKit/public/platform/WebCString.h" 24 #include "third_party/WebKit/public/platform/WebData.h" 25 #include "third_party/WebKit/public/platform/WebString.h" 26 #include "third_party/WebKit/public/platform/WebURL.h" 27 #include "third_party/WebKit/public/platform/WebVector.h" 28 #include "third_party/WebKit/public/web/WebDocument.h" 29 #include "third_party/WebKit/public/web/WebElement.h" 30 #include "third_party/WebKit/public/web/WebElementCollection.h" 31 #include "third_party/WebKit/public/web/WebLocalFrame.h" 32 #include "third_party/WebKit/public/web/WebNode.h" 33 #include "third_party/WebKit/public/web/WebNodeList.h" 34 #include "third_party/WebKit/public/web/WebPageSerializer.h" 35 #include "third_party/WebKit/public/web/WebPageSerializerClient.h" 36 #include "third_party/WebKit/public/web/WebView.h" 37 38 using blink::WebCString; 39 using blink::WebData; 40 using blink::WebDocument; 41 using blink::WebElement; 42 using blink::WebElementCollection; 43 using blink::WebFrame; 44 using blink::WebLocalFrame; 45 using blink::WebNode; 46 using blink::WebNodeList; 47 using blink::WebPageSerializer; 48 using blink::WebPageSerializerClient; 49 using blink::WebString; 50 using blink::WebURL; 51 using blink::WebView; 52 using blink::WebVector; 53 54 namespace { 55 56 // The first RenderFrame is routing ID 1, and the first RenderView is 2. 57 const int kRenderViewRoutingId = 2; 58 59 } 60 61 namespace content { 62 63 // Iterate recursively over sub-frames to find one with with a given url. 64 WebFrame* FindSubFrameByURL(WebView* web_view, const GURL& url) { 65 if (!web_view->mainFrame()) 66 return NULL; 67 68 std::vector<WebFrame*> stack; 69 stack.push_back(web_view->mainFrame()); 70 71 while (!stack.empty()) { 72 WebFrame* current_frame = stack.back(); 73 stack.pop_back(); 74 if (GURL(current_frame->document().url()) == url) 75 return current_frame; 76 WebElementCollection all = current_frame->document().all(); 77 for (WebElement element = all.firstItem(); 78 !element.isNull(); element = all.nextItem()) { 79 // Check frame tag and iframe tag 80 if (!element.hasTagName("frame") && !element.hasTagName("iframe")) 81 continue; 82 WebFrame* sub_frame = WebLocalFrame::fromFrameOwnerElement(element); 83 if (sub_frame) 84 stack.push_back(sub_frame); 85 } 86 } 87 return NULL; 88 } 89 90 // Helper function that test whether the first node in the doc is a doc type 91 // node. 92 bool HasDocType(const WebDocument& doc) { 93 WebNode node = doc.firstChild(); 94 if (node.isNull()) 95 return false; 96 return node.nodeType() == WebNode::DocumentTypeNode; 97 } 98 99 // Helper function for checking whether input node is META tag. Return true 100 // means it is META element, otherwise return false. The parameter charset_info 101 // return actual charset info if the META tag has charset declaration. 102 bool IsMetaElement(const WebNode& node, std::string& charset_info) { 103 if (!node.isElementNode()) 104 return false; 105 const WebElement meta = node.toConst<WebElement>(); 106 if (!meta.hasTagName("meta")) 107 return false; 108 charset_info.erase(0, charset_info.length()); 109 // Check the META charset declaration. 110 WebString httpEquiv = meta.getAttribute("http-equiv"); 111 if (LowerCaseEqualsASCII(httpEquiv, "content-type")) { 112 std::string content = meta.getAttribute("content").utf8(); 113 int pos = content.find("charset", 0); 114 if (pos > -1) { 115 // Add a dummy charset declaration to charset_info, which indicates this 116 // META tag has charset declaration although we do not get correct value 117 // yet. 118 charset_info.append("has-charset-declaration"); 119 int remaining_length = content.length() - pos - 7; 120 if (!remaining_length) 121 return true; 122 int start_pos = pos + 7; 123 // Find "=" symbol. 124 while (remaining_length--) 125 if (content[start_pos++] == L'=') 126 break; 127 // Skip beginning space. 128 while (remaining_length) { 129 if (content[start_pos] > 0x0020) 130 break; 131 ++start_pos; 132 --remaining_length; 133 } 134 if (!remaining_length) 135 return true; 136 int end_pos = start_pos; 137 // Now we find out the start point of charset info. Search the end point. 138 while (remaining_length--) { 139 if (content[end_pos] <= 0x0020 || content[end_pos] == L';') 140 break; 141 ++end_pos; 142 } 143 // Get actual charset info. 144 charset_info = content.substr(start_pos, end_pos - start_pos); 145 return true; 146 } 147 } 148 return true; 149 } 150 151 class LoadObserver : public RenderViewObserver { 152 public: 153 LoadObserver(RenderView* render_view, const base::Closure& quit_closure) 154 : RenderViewObserver(render_view), 155 quit_closure_(quit_closure) {} 156 157 virtual void DidFinishLoad(blink::WebLocalFrame* frame) OVERRIDE { 158 if (frame == render_view()->GetWebView()->mainFrame()) 159 quit_closure_.Run(); 160 } 161 162 private: 163 base::Closure quit_closure_; 164 }; 165 166 class DomSerializerTests : public ContentBrowserTest, 167 public WebPageSerializerClient { 168 public: 169 DomSerializerTests() 170 : serialized_(false), 171 local_directory_name_(FILE_PATH_LITERAL("./dummy_files/")) {} 172 173 virtual void SetUpCommandLine(CommandLine* command_line) OVERRIDE { 174 command_line->AppendSwitch(switches::kSingleProcess); 175 #if defined(OS_WIN) 176 // Don't want to try to create a GPU process. 177 command_line->AppendSwitch(switches::kDisableGpu); 178 #endif 179 } 180 181 // DomSerializerDelegate. 182 virtual void didSerializeDataForFrame(const WebURL& frame_web_url, 183 const WebCString& data, 184 PageSerializationStatus status) { 185 186 GURL frame_url(frame_web_url); 187 // If the all frames are finished saving, check all finish status 188 if (status == WebPageSerializerClient::AllFramesAreFinished) { 189 SerializationFinishStatusMap::iterator it = 190 serialization_finish_status_.begin(); 191 for (; it != serialization_finish_status_.end(); ++it) 192 ASSERT_TRUE(it->second); 193 serialized_ = true; 194 return; 195 } 196 197 // Check finish status of current frame. 198 SerializationFinishStatusMap::iterator it = 199 serialization_finish_status_.find(frame_url.spec()); 200 // New frame, set initial status as false. 201 if (it == serialization_finish_status_.end()) 202 serialization_finish_status_[frame_url.spec()] = false; 203 204 it = serialization_finish_status_.find(frame_url.spec()); 205 ASSERT_TRUE(it != serialization_finish_status_.end()); 206 // In process frame, finish status should be false. 207 ASSERT_FALSE(it->second); 208 209 // Add data to corresponding frame's content. 210 serialized_frame_map_[frame_url.spec()] += data.data(); 211 212 // Current frame is completed saving, change the finish status. 213 if (status == WebPageSerializerClient::CurrentFrameIsFinished) 214 it->second = true; 215 } 216 217 bool HasSerializedFrame(const GURL& frame_url) { 218 return serialized_frame_map_.find(frame_url.spec()) != 219 serialized_frame_map_.end(); 220 } 221 222 const std::string& GetSerializedContentForFrame( 223 const GURL& frame_url) { 224 return serialized_frame_map_[frame_url.spec()]; 225 } 226 227 RenderView* GetRenderView() { 228 // We could have the test on the UI thread get the WebContent's routing ID, 229 // but we know this will be the first RV so skip that and just hardcode it. 230 return RenderView::FromRoutingID(kRenderViewRoutingId); 231 } 232 233 WebView* GetWebView() { 234 return GetRenderView()->GetWebView(); 235 } 236 237 WebFrame* GetMainFrame() { 238 return GetWebView()->mainFrame(); 239 } 240 241 // Load web page according to input content and relative URLs within 242 // the document. 243 void LoadContents(const std::string& contents, 244 const GURL& base_url, 245 const WebString encoding_info) { 246 scoped_refptr<MessageLoopRunner> runner = new MessageLoopRunner; 247 LoadObserver observer(GetRenderView(), runner->QuitClosure()); 248 249 // If input encoding is empty, use UTF-8 as default encoding. 250 if (encoding_info.isEmpty()) { 251 GetMainFrame()->loadHTMLString(contents, base_url); 252 } else { 253 WebData data(contents.data(), contents.length()); 254 255 // Do not use WebFrame.LoadHTMLString because it assumes that input 256 // html contents use UTF-8 encoding. 257 // TODO(darin): This should use WebFrame::loadData. 258 WebFrame* web_frame = GetMainFrame(); 259 260 ASSERT_TRUE(web_frame != NULL); 261 262 web_frame->loadData(data, "text/html", encoding_info, base_url); 263 } 264 265 runner->Run(); 266 } 267 268 // Serialize page DOM according to specific page URL. The parameter 269 // recursive_serialization indicates whether we will serialize all 270 // sub-frames. 271 void SerializeDomForURL(const GURL& page_url, 272 bool recursive_serialization) { 273 // Find corresponding WebFrame according to page_url. 274 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), page_url); 275 ASSERT_TRUE(web_frame != NULL); 276 WebVector<WebURL> links; 277 links.assign(&page_url, 1); 278 WebString file_path = 279 base::FilePath(FILE_PATH_LITERAL("c:\\dummy.htm")).AsUTF16Unsafe(); 280 WebVector<WebString> local_paths; 281 local_paths.assign(&file_path, 1); 282 // Start serializing DOM. 283 bool result = WebPageSerializer::serialize(web_frame->toWebLocalFrame(), 284 recursive_serialization, 285 static_cast<WebPageSerializerClient*>(this), 286 links, 287 local_paths, 288 local_directory_name_.AsUTF16Unsafe()); 289 ASSERT_TRUE(result); 290 ASSERT_TRUE(serialized_); 291 } 292 293 void SerializeHTMLDOMWithDocTypeOnRenderer(const GURL& file_url) { 294 // Make sure original contents have document type. 295 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url); 296 ASSERT_TRUE(web_frame != NULL); 297 WebDocument doc = web_frame->document(); 298 ASSERT_TRUE(HasDocType(doc)); 299 // Do serialization. 300 SerializeDomForURL(file_url, false); 301 // Load the serialized contents. 302 ASSERT_TRUE(HasSerializedFrame(file_url)); 303 const std::string& serialized_contents = 304 GetSerializedContentForFrame(file_url); 305 LoadContents(serialized_contents, file_url, 306 web_frame->document().encoding()); 307 // Make sure serialized contents still have document type. 308 web_frame = GetMainFrame(); 309 doc = web_frame->document(); 310 ASSERT_TRUE(HasDocType(doc)); 311 } 312 313 void SerializeHTMLDOMWithoutDocTypeOnRenderer(const GURL& file_url) { 314 // Make sure original contents do not have document type. 315 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url); 316 ASSERT_TRUE(web_frame != NULL); 317 WebDocument doc = web_frame->document(); 318 ASSERT_TRUE(!HasDocType(doc)); 319 // Do serialization. 320 SerializeDomForURL(file_url, false); 321 // Load the serialized contents. 322 ASSERT_TRUE(HasSerializedFrame(file_url)); 323 const std::string& serialized_contents = 324 GetSerializedContentForFrame(file_url); 325 LoadContents(serialized_contents, file_url, 326 web_frame->document().encoding()); 327 // Make sure serialized contents do not have document type. 328 web_frame = GetMainFrame(); 329 doc = web_frame->document(); 330 ASSERT_TRUE(!HasDocType(doc)); 331 } 332 333 void SerializeXMLDocWithBuiltInEntitiesOnRenderer( 334 const GURL& xml_file_url, const std::string& original_contents) { 335 // Do serialization. 336 SerializeDomForURL(xml_file_url, false); 337 // Compare the serialized contents with original contents. 338 ASSERT_TRUE(HasSerializedFrame(xml_file_url)); 339 const std::string& serialized_contents = 340 GetSerializedContentForFrame(xml_file_url); 341 ASSERT_EQ(original_contents, serialized_contents); 342 } 343 344 void SerializeHTMLDOMWithAddingMOTWOnRenderer( 345 const GURL& file_url, const std::string& original_contents) { 346 // Make sure original contents does not have MOTW; 347 std::string motw_declaration = 348 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8(); 349 ASSERT_FALSE(motw_declaration.empty()); 350 // The encoding of original contents is ISO-8859-1, so we convert the MOTW 351 // declaration to ASCII and search whether original contents has it or not. 352 ASSERT_TRUE(std::string::npos == original_contents.find(motw_declaration)); 353 354 // Do serialization. 355 SerializeDomForURL(file_url, false); 356 // Make sure the serialized contents have MOTW ; 357 ASSERT_TRUE(HasSerializedFrame(file_url)); 358 const std::string& serialized_contents = 359 GetSerializedContentForFrame(file_url); 360 ASSERT_FALSE(std::string::npos == 361 serialized_contents.find(motw_declaration)); 362 } 363 364 void SerializeHTMLDOMWithNoMetaCharsetInOriginalDocOnRenderer( 365 const GURL& file_url) { 366 // Make sure there is no META charset declaration in original document. 367 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url); 368 ASSERT_TRUE(web_frame != NULL); 369 WebDocument doc = web_frame->document(); 370 ASSERT_TRUE(doc.isHTMLDocument()); 371 WebElement head_element = doc.head(); 372 ASSERT_TRUE(!head_element.isNull()); 373 // Go through all children of HEAD element. 374 for (WebNode child = head_element.firstChild(); !child.isNull(); 375 child = child.nextSibling()) { 376 std::string charset_info; 377 if (IsMetaElement(child, charset_info)) 378 ASSERT_TRUE(charset_info.empty()); 379 } 380 // Do serialization. 381 SerializeDomForURL(file_url, false); 382 383 // Load the serialized contents. 384 ASSERT_TRUE(HasSerializedFrame(file_url)); 385 const std::string& serialized_contents = 386 GetSerializedContentForFrame(file_url); 387 LoadContents(serialized_contents, file_url, 388 web_frame->document().encoding()); 389 // Make sure the first child of HEAD element is META which has charset 390 // declaration in serialized contents. 391 web_frame = GetMainFrame(); 392 ASSERT_TRUE(web_frame != NULL); 393 doc = web_frame->document(); 394 ASSERT_TRUE(doc.isHTMLDocument()); 395 head_element = doc.head(); 396 ASSERT_TRUE(!head_element.isNull()); 397 WebNode meta_node = head_element.firstChild(); 398 ASSERT_TRUE(!meta_node.isNull()); 399 // Get meta charset info. 400 std::string charset_info2; 401 ASSERT_TRUE(IsMetaElement(meta_node, charset_info2)); 402 ASSERT_TRUE(!charset_info2.empty()); 403 ASSERT_EQ(charset_info2, 404 std::string(web_frame->document().encoding().utf8())); 405 406 // Make sure no more additional META tags which have charset declaration. 407 for (WebNode child = meta_node.nextSibling(); !child.isNull(); 408 child = child.nextSibling()) { 409 std::string charset_info; 410 if (IsMetaElement(child, charset_info)) 411 ASSERT_TRUE(charset_info.empty()); 412 } 413 } 414 415 void SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDocOnRenderer( 416 const GURL& file_url) { 417 // Make sure there are multiple META charset declarations in original 418 // document. 419 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url); 420 ASSERT_TRUE(web_frame != NULL); 421 WebDocument doc = web_frame->document(); 422 ASSERT_TRUE(doc.isHTMLDocument()); 423 WebElement head_ele = doc.head(); 424 ASSERT_TRUE(!head_ele.isNull()); 425 // Go through all children of HEAD element. 426 int charset_declaration_count = 0; 427 for (WebNode child = head_ele.firstChild(); !child.isNull(); 428 child = child.nextSibling()) { 429 std::string charset_info; 430 if (IsMetaElement(child, charset_info) && !charset_info.empty()) 431 charset_declaration_count++; 432 } 433 // The original doc has more than META tags which have charset declaration. 434 ASSERT_TRUE(charset_declaration_count > 1); 435 436 // Do serialization. 437 SerializeDomForURL(file_url, false); 438 439 // Load the serialized contents. 440 ASSERT_TRUE(HasSerializedFrame(file_url)); 441 const std::string& serialized_contents = 442 GetSerializedContentForFrame(file_url); 443 LoadContents(serialized_contents, file_url, 444 web_frame->document().encoding()); 445 // Make sure only first child of HEAD element is META which has charset 446 // declaration in serialized contents. 447 web_frame = GetMainFrame(); 448 ASSERT_TRUE(web_frame != NULL); 449 doc = web_frame->document(); 450 ASSERT_TRUE(doc.isHTMLDocument()); 451 head_ele = doc.head(); 452 ASSERT_TRUE(!head_ele.isNull()); 453 WebNode meta_node = head_ele.firstChild(); 454 ASSERT_TRUE(!meta_node.isNull()); 455 // Get meta charset info. 456 std::string charset_info2; 457 ASSERT_TRUE(IsMetaElement(meta_node, charset_info2)); 458 ASSERT_TRUE(!charset_info2.empty()); 459 ASSERT_EQ(charset_info2, 460 std::string(web_frame->document().encoding().utf8())); 461 462 // Make sure no more additional META tags which have charset declaration. 463 for (WebNode child = meta_node.nextSibling(); !child.isNull(); 464 child = child.nextSibling()) { 465 std::string charset_info; 466 if (IsMetaElement(child, charset_info)) 467 ASSERT_TRUE(charset_info.empty()); 468 } 469 } 470 471 void SerializeHTMLDOMWithEntitiesInTextOnRenderer() { 472 base::FilePath page_file_path = GetTestFilePath( 473 "dom_serializer", "dom_serializer/htmlentities_in_text.htm"); 474 // Get file URL. The URL is dummy URL to identify the following loading 475 // actions. The test content is in constant:original_contents. 476 GURL file_url = net::FilePathToFileURL(page_file_path); 477 ASSERT_TRUE(file_url.SchemeIsFile()); 478 // Test contents. 479 static const char* const original_contents = 480 "<html><body>&<>\"\'</body></html>"; 481 // Load the test contents. 482 LoadContents(original_contents, file_url, WebString()); 483 484 // Get BODY's text content in DOM. 485 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url); 486 ASSERT_TRUE(web_frame != NULL); 487 WebDocument doc = web_frame->document(); 488 ASSERT_TRUE(doc.isHTMLDocument()); 489 WebElement body_ele = doc.body(); 490 ASSERT_TRUE(!body_ele.isNull()); 491 WebNode text_node = body_ele.firstChild(); 492 ASSERT_TRUE(text_node.isTextNode()); 493 ASSERT_TRUE(std::string(text_node.createMarkup().utf8()) == 494 "&<>\"\'"); 495 // Do serialization. 496 SerializeDomForURL(file_url, false); 497 // Compare the serialized contents with original contents. 498 ASSERT_TRUE(HasSerializedFrame(file_url)); 499 const std::string& serialized_contents = 500 GetSerializedContentForFrame(file_url); 501 // Compare the serialized contents with original contents to make sure 502 // they are same. 503 // Because we add MOTW when serializing DOM, so before comparison, we also 504 // need to add MOTW to original_contents. 505 std::string original_str = 506 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8(); 507 original_str += original_contents; 508 // Since WebCore now inserts a new HEAD element if there is no HEAD element 509 // when creating BODY element. (Please see 510 // HTMLParser::bodyCreateErrorCheck.) We need to append the HEAD content and 511 // corresponding META content if we find WebCore-generated HEAD element. 512 if (!doc.head().isNull()) { 513 WebString encoding = web_frame->document().encoding(); 514 std::string htmlTag("<html>"); 515 std::string::size_type pos = original_str.find(htmlTag); 516 ASSERT_NE(std::string::npos, pos); 517 pos += htmlTag.length(); 518 std::string head_part("<head>"); 519 head_part += 520 WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8(); 521 head_part += "</head>"; 522 original_str.insert(pos, head_part); 523 } 524 ASSERT_EQ(original_str, serialized_contents); 525 } 526 527 void SerializeHTMLDOMWithEntitiesInAttributeValueOnRenderer() { 528 base::FilePath page_file_path = GetTestFilePath( 529 "dom_serializer", "dom_serializer/htmlentities_in_attribute_value.htm"); 530 // Get file URL. The URL is dummy URL to identify the following loading 531 // actions. The test content is in constant:original_contents. 532 GURL file_url = net::FilePathToFileURL(page_file_path); 533 ASSERT_TRUE(file_url.SchemeIsFile()); 534 // Test contents. 535 static const char* const original_contents = 536 "<html><body title=\"&<>"'\"></body></html>"; 537 // Load the test contents. 538 LoadContents(original_contents, file_url, WebString()); 539 // Get value of BODY's title attribute in DOM. 540 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url); 541 ASSERT_TRUE(web_frame != NULL); 542 WebDocument doc = web_frame->document(); 543 ASSERT_TRUE(doc.isHTMLDocument()); 544 WebElement body_ele = doc.body(); 545 ASSERT_TRUE(!body_ele.isNull()); 546 WebString value = body_ele.getAttribute("title"); 547 ASSERT_TRUE(std::string(value.utf8()) == "&<>\"\'"); 548 // Do serialization. 549 SerializeDomForURL(file_url, false); 550 // Compare the serialized contents with original contents. 551 ASSERT_TRUE(HasSerializedFrame(file_url)); 552 const std::string& serialized_contents = 553 GetSerializedContentForFrame(file_url); 554 // Compare the serialized contents with original contents to make sure 555 // they are same. 556 std::string original_str = 557 WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8(); 558 original_str += original_contents; 559 if (!doc.isNull()) { 560 WebString encoding = web_frame->document().encoding(); 561 std::string htmlTag("<html>"); 562 std::string::size_type pos = original_str.find(htmlTag); 563 ASSERT_NE(std::string::npos, pos); 564 pos += htmlTag.length(); 565 std::string head_part("<head>"); 566 head_part += 567 WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8(); 568 head_part += "</head>"; 569 original_str.insert(pos, head_part); 570 } 571 ASSERT_EQ(original_str, serialized_contents); 572 } 573 574 void SerializeHTMLDOMWithNonStandardEntitiesOnRenderer(const GURL& file_url) { 575 // Get value of BODY's title attribute in DOM. 576 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url); 577 WebDocument doc = web_frame->document(); 578 ASSERT_TRUE(doc.isHTMLDocument()); 579 WebElement body_element = doc.body(); 580 // Unescaped string for "%⊅¹'". 581 static const wchar_t parsed_value[] = { 582 '%', 0x2285, 0x00b9, '\'', 0 583 }; 584 WebString value = body_element.getAttribute("title"); 585 ASSERT_TRUE(base::UTF16ToWide(value) == parsed_value); 586 ASSERT_TRUE(base::UTF16ToWide(body_element.innerText()) == parsed_value); 587 588 // Do serialization. 589 SerializeDomForURL(file_url, false); 590 // Check the serialized string. 591 ASSERT_TRUE(HasSerializedFrame(file_url)); 592 const std::string& serialized_contents = 593 GetSerializedContentForFrame(file_url); 594 // Confirm that the serialized string has no non-standard HTML entities. 595 ASSERT_EQ(std::string::npos, serialized_contents.find("%")); 596 ASSERT_EQ(std::string::npos, serialized_contents.find("⊅")); 597 ASSERT_EQ(std::string::npos, serialized_contents.find("¹")); 598 ASSERT_EQ(std::string::npos, serialized_contents.find("'")); 599 } 600 601 void SerializeHTMLDOMWithBaseTagOnRenderer(const GURL& file_url, 602 const GURL& path_dir_url) { 603 // There are total 2 available base tags in this test file. 604 const int kTotalBaseTagCountInTestFile = 2; 605 606 // Since for this test, we assume there is no savable sub-resource links for 607 // this test file, also all links are relative URLs in this test file, so we 608 // need to check those relative URLs and make sure document has BASE tag. 609 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url); 610 ASSERT_TRUE(web_frame != NULL); 611 WebDocument doc = web_frame->document(); 612 ASSERT_TRUE(doc.isHTMLDocument()); 613 // Go through all descent nodes. 614 WebElementCollection all = doc.all(); 615 int original_base_tag_count = 0; 616 for (WebElement element = all.firstItem(); !element.isNull(); 617 element = all.nextItem()) { 618 if (element.hasTagName("base")) { 619 original_base_tag_count++; 620 } else { 621 // Get link. 622 WebString value = GetSubResourceLinkFromElement(element); 623 if (value.isNull() && element.hasTagName("a")) { 624 value = element.getAttribute("href"); 625 if (value.isEmpty()) 626 value = WebString(); 627 } 628 // Each link is relative link. 629 if (!value.isNull()) { 630 GURL link(value.utf8()); 631 ASSERT_TRUE(link.scheme().empty()); 632 } 633 } 634 } 635 ASSERT_EQ(original_base_tag_count, kTotalBaseTagCountInTestFile); 636 // Make sure in original document, the base URL is not equal with the 637 // |path_dir_url|. 638 GURL original_base_url(doc.baseURL()); 639 ASSERT_NE(original_base_url, path_dir_url); 640 641 // Do serialization. 642 SerializeDomForURL(file_url, false); 643 644 // Load the serialized contents. 645 ASSERT_TRUE(HasSerializedFrame(file_url)); 646 const std::string& serialized_contents = 647 GetSerializedContentForFrame(file_url); 648 LoadContents(serialized_contents, file_url, 649 web_frame->document().encoding()); 650 651 // Make sure all links are absolute URLs and doc there are some number of 652 // BASE tags in serialized HTML data. Each of those BASE tags have same base 653 // URL which is as same as URL of current test file. 654 web_frame = GetMainFrame(); 655 ASSERT_TRUE(web_frame != NULL); 656 doc = web_frame->document(); 657 ASSERT_TRUE(doc.isHTMLDocument()); 658 // Go through all descent nodes. 659 all = doc.all(); 660 int new_base_tag_count = 0; 661 for (WebNode node = all.firstItem(); !node.isNull(); 662 node = all.nextItem()) { 663 if (!node.isElementNode()) 664 continue; 665 WebElement element = node.to<WebElement>(); 666 if (element.hasTagName("base")) { 667 new_base_tag_count++; 668 } else { 669 // Get link. 670 WebString value = GetSubResourceLinkFromElement(element); 671 if (value.isNull() && element.hasTagName("a")) { 672 value = element.getAttribute("href"); 673 if (value.isEmpty()) 674 value = WebString(); 675 } 676 // Each link is absolute link. 677 if (!value.isNull()) { 678 GURL link(std::string(value.utf8())); 679 ASSERT_FALSE(link.scheme().empty()); 680 } 681 } 682 } 683 // We have one more added BASE tag which is generated by JavaScript. 684 ASSERT_EQ(new_base_tag_count, original_base_tag_count + 1); 685 // Make sure in new document, the base URL is equal with the |path_dir_url|. 686 GURL new_base_url(doc.baseURL()); 687 ASSERT_EQ(new_base_url, path_dir_url); 688 } 689 690 void SerializeHTMLDOMWithEmptyHeadOnRenderer() { 691 base::FilePath page_file_path = GetTestFilePath( 692 "dom_serializer", "empty_head.htm"); 693 GURL file_url = net::FilePathToFileURL(page_file_path); 694 ASSERT_TRUE(file_url.SchemeIsFile()); 695 696 // Load the test html content. 697 static const char* const empty_head_contents = 698 "<html><head></head><body>hello world</body></html>"; 699 LoadContents(empty_head_contents, file_url, WebString()); 700 701 // Make sure the head tag is empty. 702 WebFrame* web_frame = GetMainFrame(); 703 ASSERT_TRUE(web_frame != NULL); 704 WebDocument doc = web_frame->document(); 705 ASSERT_TRUE(doc.isHTMLDocument()); 706 WebElement head_element = doc.head(); 707 ASSERT_TRUE(!head_element.isNull()); 708 ASSERT_TRUE(!head_element.hasChildNodes()); 709 ASSERT_TRUE(head_element.childNodes().length() == 0); 710 711 // Do serialization. 712 SerializeDomForURL(file_url, false); 713 // Make sure the serialized contents have META ; 714 ASSERT_TRUE(HasSerializedFrame(file_url)); 715 const std::string& serialized_contents = 716 GetSerializedContentForFrame(file_url); 717 718 // Reload serialized contents and make sure there is only one META tag. 719 LoadContents(serialized_contents, file_url, 720 web_frame->document().encoding()); 721 web_frame = GetMainFrame(); 722 ASSERT_TRUE(web_frame != NULL); 723 doc = web_frame->document(); 724 ASSERT_TRUE(doc.isHTMLDocument()); 725 head_element = doc.head(); 726 ASSERT_TRUE(!head_element.isNull()); 727 ASSERT_TRUE(head_element.hasChildNodes()); 728 ASSERT_TRUE(head_element.childNodes().length() == 1); 729 WebNode meta_node = head_element.firstChild(); 730 ASSERT_TRUE(!meta_node.isNull()); 731 // Get meta charset info. 732 std::string charset_info; 733 ASSERT_TRUE(IsMetaElement(meta_node, charset_info)); 734 ASSERT_TRUE(!charset_info.empty()); 735 ASSERT_EQ(charset_info, 736 std::string(web_frame->document().encoding().utf8())); 737 738 // Check the body's first node is text node and its contents are 739 // "hello world" 740 WebElement body_element = doc.body(); 741 ASSERT_TRUE(!body_element.isNull()); 742 WebNode text_node = body_element.firstChild(); 743 ASSERT_TRUE(text_node.isTextNode()); 744 WebString text_node_contents = text_node.nodeValue(); 745 ASSERT_TRUE(std::string(text_node_contents.utf8()) == "hello world"); 746 } 747 748 void SerializeDocumentWithDownloadedIFrameOnRenderer(const GURL& file_url) { 749 // Do a recursive serialization. We pass if we don't crash. 750 SerializeDomForURL(file_url, true); 751 } 752 753 void SubResourceForElementsInNonHTMLNamespaceOnRenderer( 754 const GURL& file_url) { 755 WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url); 756 ASSERT_TRUE(web_frame != NULL); 757 WebDocument doc = web_frame->document(); 758 WebNode lastNodeInBody = doc.body().lastChild(); 759 ASSERT_EQ(WebNode::ElementNode, lastNodeInBody.nodeType()); 760 WebString uri = GetSubResourceLinkFromElement( 761 lastNodeInBody.to<WebElement>()); 762 EXPECT_TRUE(uri.isNull()); 763 } 764 765 private: 766 // Map frame_url to corresponding serialized_content. 767 typedef base::hash_map<std::string, std::string> SerializedFrameContentMap; 768 SerializedFrameContentMap serialized_frame_map_; 769 // Map frame_url to corresponding status of serialization finish. 770 typedef base::hash_map<std::string, bool> SerializationFinishStatusMap; 771 SerializationFinishStatusMap serialization_finish_status_; 772 // Flag indicates whether the process of serializing DOM is finished or not. 773 bool serialized_; 774 // The local_directory_name_ is dummy relative path of directory which 775 // contain all saved auxiliary files included all sub frames and resources. 776 const base::FilePath local_directory_name_; 777 }; 778 779 // If original contents have document type, the serialized contents also have 780 // document type. 781 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithDocType) { 782 base::FilePath page_file_path = 783 GetTestFilePath("dom_serializer", "youtube_1.htm"); 784 GURL file_url = net::FilePathToFileURL(page_file_path); 785 ASSERT_TRUE(file_url.SchemeIsFile()); 786 // Load the test file. 787 NavigateToURL(shell(), file_url); 788 789 PostTaskToInProcessRendererAndWait( 790 base::Bind(&DomSerializerTests::SerializeHTMLDOMWithDocTypeOnRenderer, 791 base::Unretained(this), file_url)); 792 } 793 794 // If original contents do not have document type, the serialized contents 795 // also do not have document type. 796 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithoutDocType) { 797 base::FilePath page_file_path = 798 GetTestFilePath("dom_serializer", "youtube_2.htm"); 799 GURL file_url = net::FilePathToFileURL(page_file_path); 800 ASSERT_TRUE(file_url.SchemeIsFile()); 801 // Load the test file. 802 NavigateToURL(shell(), file_url); 803 804 PostTaskToInProcessRendererAndWait( 805 base::Bind( 806 &DomSerializerTests::SerializeHTMLDOMWithoutDocTypeOnRenderer, 807 base::Unretained(this), file_url)); 808 } 809 810 // Serialize XML document which has all 5 built-in entities. After 811 // finishing serialization, the serialized contents should be same 812 // with original XML document. 813 // 814 // TODO(tiger (at) opera.com): Disabled in preparation of page serializer merge -- 815 // XML headers are handled differently in the merged serializer. 816 // Bug: http://crbug.com/328354 817 IN_PROC_BROWSER_TEST_F(DomSerializerTests, 818 DISABLED_SerializeXMLDocWithBuiltInEntities) { 819 base::FilePath page_file_path = 820 GetTestFilePath("dom_serializer", "note.html"); 821 base::FilePath xml_file_path = GetTestFilePath("dom_serializer", "note.xml"); 822 // Read original contents for later comparison. 823 std::string original_contents; 824 ASSERT_TRUE(base::ReadFileToString(xml_file_path, &original_contents)); 825 // Get file URL. 826 GURL file_url = net::FilePathToFileURL(page_file_path); 827 GURL xml_file_url = net::FilePathToFileURL(xml_file_path); 828 ASSERT_TRUE(file_url.SchemeIsFile()); 829 // Load the test file. 830 NavigateToURL(shell(), file_url); 831 832 PostTaskToInProcessRendererAndWait( 833 base::Bind( 834 &DomSerializerTests::SerializeXMLDocWithBuiltInEntitiesOnRenderer, 835 base::Unretained(this), xml_file_url, original_contents)); 836 } 837 838 // When serializing DOM, we add MOTW declaration before html tag. 839 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithAddingMOTW) { 840 base::FilePath page_file_path = 841 GetTestFilePath("dom_serializer", "youtube_2.htm"); 842 // Read original contents for later comparison . 843 std::string original_contents; 844 ASSERT_TRUE(base::ReadFileToString(page_file_path, &original_contents)); 845 // Get file URL. 846 GURL file_url = net::FilePathToFileURL(page_file_path); 847 ASSERT_TRUE(file_url.SchemeIsFile()); 848 849 // Load the test file. 850 NavigateToURL(shell(), file_url); 851 852 PostTaskToInProcessRendererAndWait( 853 base::Bind( 854 &DomSerializerTests::SerializeHTMLDOMWithAddingMOTWOnRenderer, 855 base::Unretained(this), file_url, original_contents)); 856 } 857 858 // When serializing DOM, we will add the META which have correct charset 859 // declaration as first child of HEAD element for resolving WebKit bug: 860 // http://bugs.webkit.org/show_bug.cgi?id=16621 even the original document 861 // does not have META charset declaration. 862 IN_PROC_BROWSER_TEST_F(DomSerializerTests, 863 SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc) { 864 base::FilePath page_file_path = 865 GetTestFilePath("dom_serializer", "youtube_1.htm"); 866 // Get file URL. 867 GURL file_url = net::FilePathToFileURL(page_file_path); 868 ASSERT_TRUE(file_url.SchemeIsFile()); 869 // Load the test file. 870 NavigateToURL(shell(), file_url); 871 872 PostTaskToInProcessRendererAndWait( 873 base::Bind( 874 &DomSerializerTests:: 875 SerializeHTMLDOMWithNoMetaCharsetInOriginalDocOnRenderer, 876 base::Unretained(this), file_url)); 877 } 878 879 // When serializing DOM, if the original document has multiple META charset 880 // declaration, we will add the META which have correct charset declaration 881 // as first child of HEAD element and remove all original META charset 882 // declarations. 883 IN_PROC_BROWSER_TEST_F(DomSerializerTests, 884 SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDoc) { 885 base::FilePath page_file_path = 886 GetTestFilePath("dom_serializer", "youtube_2.htm"); 887 // Get file URL. 888 GURL file_url = net::FilePathToFileURL(page_file_path); 889 ASSERT_TRUE(file_url.SchemeIsFile()); 890 // Load the test file. 891 NavigateToURL(shell(), file_url); 892 893 PostTaskToInProcessRendererAndWait( 894 base::Bind( 895 &DomSerializerTests:: 896 SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDocOnRenderer, 897 base::Unretained(this), file_url)); 898 } 899 900 // Test situation of html entities in text when serializing HTML DOM. 901 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithEntitiesInText) { 902 // Need to spin up the renderer and also navigate to a file url so that the 903 // renderer code doesn't attempt a fork when it sees a load to file scheme 904 // from non-file scheme. 905 NavigateToURL(shell(), GetTestUrl(".", "simple_page.html")); 906 907 PostTaskToInProcessRendererAndWait( 908 base::Bind( 909 &DomSerializerTests::SerializeHTMLDOMWithEntitiesInTextOnRenderer, 910 base::Unretained(this))); 911 } 912 913 // Test situation of html entities in attribute value when serializing 914 // HTML DOM. 915 // This test started to fail at WebKit r65388. See http://crbug.com/52279. 916 // 917 // TODO(tiger (at) opera.com): Disabled in preparation of page serializer merge -- 918 // Some attributes are handled differently in the merged serializer. 919 // Bug: http://crbug.com/328354 920 IN_PROC_BROWSER_TEST_F(DomSerializerTests, 921 DISABLED_SerializeHTMLDOMWithEntitiesInAttributeValue) { 922 // Need to spin up the renderer and also navigate to a file url so that the 923 // renderer code doesn't attempt a fork when it sees a load to file scheme 924 // from non-file scheme. 925 NavigateToURL(shell(), GetTestUrl(".", "simple_page.html")); 926 927 PostTaskToInProcessRendererAndWait( 928 base::Bind( 929 &DomSerializerTests:: 930 SerializeHTMLDOMWithEntitiesInAttributeValueOnRenderer, 931 base::Unretained(this))); 932 } 933 934 // Test situation of non-standard HTML entities when serializing HTML DOM. 935 // This test started to fail at WebKit r65351. See http://crbug.com/52279. 936 IN_PROC_BROWSER_TEST_F(DomSerializerTests, 937 SerializeHTMLDOMWithNonStandardEntities) { 938 // Make a test file URL and load it. 939 base::FilePath page_file_path = GetTestFilePath( 940 "dom_serializer", "nonstandard_htmlentities.htm"); 941 GURL file_url = net::FilePathToFileURL(page_file_path); 942 NavigateToURL(shell(), file_url); 943 944 PostTaskToInProcessRendererAndWait( 945 base::Bind( 946 &DomSerializerTests:: 947 SerializeHTMLDOMWithNonStandardEntitiesOnRenderer, 948 base::Unretained(this), file_url)); 949 } 950 951 // Test situation of BASE tag in original document when serializing HTML DOM. 952 // When serializing, we should comment the BASE tag, append a new BASE tag. 953 // rewrite all the savable URLs to relative local path, and change other URLs 954 // to absolute URLs. 955 // 956 // TODO(tiger (at) opera.com): Disabled in preparation of page serializer merge -- 957 // Base tags are handled a bit different in merged version. 958 // Bug: http://crbug.com/328354 959 IN_PROC_BROWSER_TEST_F(DomSerializerTests, 960 DISABLED_SerializeHTMLDOMWithBaseTag) { 961 base::FilePath page_file_path = GetTestFilePath( 962 "dom_serializer", "html_doc_has_base_tag.htm"); 963 964 // Get page dir URL which is base URL of this file. 965 base::FilePath dir_name = page_file_path.DirName(); 966 dir_name = dir_name.Append( 967 base::FilePath::StringType(base::FilePath::kSeparators[0], 1)); 968 GURL path_dir_url = net::FilePathToFileURL(dir_name); 969 970 // Get file URL. 971 GURL file_url = net::FilePathToFileURL(page_file_path); 972 ASSERT_TRUE(file_url.SchemeIsFile()); 973 // Load the test file. 974 NavigateToURL(shell(), file_url); 975 976 PostTaskToInProcessRendererAndWait( 977 base::Bind( 978 &DomSerializerTests::SerializeHTMLDOMWithBaseTagOnRenderer, 979 base::Unretained(this), file_url, path_dir_url)); 980 } 981 982 // Serializing page which has an empty HEAD tag. 983 IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithEmptyHead) { 984 // Need to spin up the renderer and also navigate to a file url so that the 985 // renderer code doesn't attempt a fork when it sees a load to file scheme 986 // from non-file scheme. 987 NavigateToURL(shell(), GetTestUrl(".", "simple_page.html")); 988 989 PostTaskToInProcessRendererAndWait( 990 base::Bind(&DomSerializerTests::SerializeHTMLDOMWithEmptyHeadOnRenderer, 991 base::Unretained(this))); 992 } 993 994 // Test that we don't crash when the page contains an iframe that 995 // was handled as a download (http://crbug.com/42212). 996 IN_PROC_BROWSER_TEST_F(DomSerializerTests, 997 SerializeDocumentWithDownloadedIFrame) { 998 base::FilePath page_file_path = GetTestFilePath( 999 "dom_serializer", "iframe-src-is-exe.htm"); 1000 GURL file_url = net::FilePathToFileURL(page_file_path); 1001 ASSERT_TRUE(file_url.SchemeIsFile()); 1002 // Load the test file. 1003 NavigateToURL(shell(), file_url); 1004 1005 PostTaskToInProcessRendererAndWait( 1006 base::Bind( 1007 &DomSerializerTests:: 1008 SerializeDocumentWithDownloadedIFrameOnRenderer, 1009 base::Unretained(this), file_url)); 1010 } 1011 1012 IN_PROC_BROWSER_TEST_F(DomSerializerTests, 1013 SubResourceForElementsInNonHTMLNamespace) { 1014 base::FilePath page_file_path = GetTestFilePath( 1015 "dom_serializer", "non_html_namespace.htm"); 1016 GURL file_url = net::FilePathToFileURL(page_file_path); 1017 NavigateToURL(shell(), file_url); 1018 1019 PostTaskToInProcessRendererAndWait( 1020 base::Bind( 1021 &DomSerializerTests:: 1022 SubResourceForElementsInNonHTMLNamespaceOnRenderer, 1023 base::Unretained(this), file_url)); 1024 } 1025 1026 } // namespace content 1027