Home | History | Annotate | Download | only in fpdfsdk
      1 // Copyright 2015 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include <memory>
      6 
      7 #include "core/fxcrt/fx_memory.h"
      8 #include "public/fpdf_text.h"
      9 #include "public/fpdfview.h"
     10 #include "testing/embedder_test.h"
     11 #include "testing/gtest/include/gtest/gtest.h"
     12 #include "testing/test_support.h"
     13 
     14 namespace {
     15 
     16 bool check_unsigned_shorts(const char* expected,
     17                            const unsigned short* actual,
     18                            size_t length) {
     19   if (length > strlen(expected) + 1)
     20     return false;
     21 
     22   for (size_t i = 0; i < length; ++i) {
     23     if (actual[i] != static_cast<unsigned short>(expected[i]))
     24       return false;
     25   }
     26   return true;
     27 }
     28 
     29 }  // namespace
     30 
// Fixture for the FPDFText_* / FPDFLink_* tests in this file. It adds no
// members of its own on top of EmbedderTest.
class FPDFTextEmbeddertest : public EmbedderTest {};
     32 
     33 TEST_F(FPDFTextEmbeddertest, Text) {
     34   EXPECT_TRUE(OpenDocument("hello_world.pdf"));
     35   FPDF_PAGE page = LoadPage(0);
     36   EXPECT_TRUE(page);
     37 
     38   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
     39   EXPECT_TRUE(textpage);
     40 
     41   static const char expected[] = "Hello, world!\r\nGoodbye, world!";
     42   unsigned short fixed_buffer[128];
     43   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
     44 
     45   // Check that edge cases are handled gracefully
     46   EXPECT_EQ(0, FPDFText_GetText(textpage, 0, 128, nullptr));
     47   EXPECT_EQ(0, FPDFText_GetText(textpage, -1, 128, fixed_buffer));
     48   EXPECT_EQ(0, FPDFText_GetText(textpage, 0, -1, fixed_buffer));
     49   EXPECT_EQ(1, FPDFText_GetText(textpage, 0, 0, fixed_buffer));
     50   EXPECT_EQ(0, fixed_buffer[0]);
     51 
     52   // Keep going and check the next case.
     53   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
     54   EXPECT_EQ(2, FPDFText_GetText(textpage, 0, 1, fixed_buffer));
     55   EXPECT_EQ(expected[0], fixed_buffer[0]);
     56   EXPECT_EQ(0, fixed_buffer[1]);
     57 
     58   // Check includes the terminating NUL that is provided.
     59   int num_chars = FPDFText_GetText(textpage, 0, 128, fixed_buffer);
     60   ASSERT_GE(num_chars, 0);
     61   EXPECT_EQ(sizeof(expected), static_cast<size_t>(num_chars));
     62   EXPECT_TRUE(check_unsigned_shorts(expected, fixed_buffer, sizeof(expected)));
     63 
     64   // Count does not include the terminating NUL in the string literal.
     65   EXPECT_EQ(sizeof(expected) - 1,
     66             static_cast<size_t>(FPDFText_CountChars(textpage)));
     67   for (size_t i = 0; i < sizeof(expected) - 1; ++i) {
     68     EXPECT_EQ(static_cast<unsigned int>(expected[i]),
     69               FPDFText_GetUnicode(textpage, i))
     70         << " at " << i;
     71   }
     72 
     73   // Extracting using a buffer that will be completely filled. Small buffer is
     74   // 12 elements long, since it will need 2 locations per displayed character in
     75   // the expected string, plus 2 more for the terminating character.
     76   static const char small_expected[] = "Hello";
     77   unsigned short small_buffer[12];
     78   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
     79   EXPECT_EQ(6, FPDFText_GetText(textpage, 0, 5, small_buffer));
     80   EXPECT_TRUE(check_unsigned_shorts(small_expected, small_buffer,
     81                                     sizeof(small_expected)));
     82 
     83   EXPECT_EQ(12.0, FPDFText_GetFontSize(textpage, 0));
     84   EXPECT_EQ(16.0, FPDFText_GetFontSize(textpage, 15));
     85 
     86   double left = 0.0;
     87   double right = 0.0;
     88   double bottom = 0.0;
     89   double top = 0.0;
     90   EXPECT_FALSE(FPDFText_GetCharBox(nullptr, 4, &left, &right, &bottom, &top));
     91   EXPECT_DOUBLE_EQ(0.0, left);
     92   EXPECT_DOUBLE_EQ(0.0, right);
     93   EXPECT_DOUBLE_EQ(0.0, bottom);
     94   EXPECT_DOUBLE_EQ(0.0, top);
     95   EXPECT_FALSE(FPDFText_GetCharBox(textpage, -1, &left, &right, &bottom, &top));
     96   EXPECT_DOUBLE_EQ(0.0, left);
     97   EXPECT_DOUBLE_EQ(0.0, right);
     98   EXPECT_DOUBLE_EQ(0.0, bottom);
     99   EXPECT_DOUBLE_EQ(0.0, top);
    100   EXPECT_FALSE(FPDFText_GetCharBox(textpage, 55, &left, &right, &bottom, &top));
    101   EXPECT_DOUBLE_EQ(0.0, left);
    102   EXPECT_DOUBLE_EQ(0.0, right);
    103   EXPECT_DOUBLE_EQ(0.0, bottom);
    104   EXPECT_DOUBLE_EQ(0.0, top);
    105 
    106   EXPECT_TRUE(FPDFText_GetCharBox(textpage, 4, &left, &right, &bottom, &top));
    107   EXPECT_NEAR(41.071, left, 0.001);
    108   EXPECT_NEAR(46.243, right, 0.001);
    109   EXPECT_NEAR(49.844, bottom, 0.001);
    110   EXPECT_NEAR(55.520, top, 0.001);
    111 
    112   double x = 0.0;
    113   double y = 0.0;
    114   EXPECT_TRUE(FPDFText_GetCharOrigin(textpage, 4, &x, &y));
    115   EXPECT_NEAR(40.664, x, 0.001);
    116   EXPECT_NEAR(50.000, y, 0.001);
    117 
    118   EXPECT_EQ(4, FPDFText_GetCharIndexAtPos(textpage, 42.0, 50.0, 1.0, 1.0));
    119   EXPECT_EQ(-1, FPDFText_GetCharIndexAtPos(textpage, 0.0, 0.0, 1.0, 1.0));
    120   EXPECT_EQ(-1, FPDFText_GetCharIndexAtPos(textpage, 199.0, 199.0, 1.0, 1.0));
    121 
    122   // Test out of range indicies.
    123   EXPECT_EQ(-1,
    124             FPDFText_GetCharIndexAtPos(textpage, 42.0, 10000000.0, 1.0, 1.0));
    125   EXPECT_EQ(-1, FPDFText_GetCharIndexAtPos(textpage, -1.0, 50.0, 1.0, 1.0));
    126 
    127   // Count does not include the terminating NUL in the string literal.
    128   EXPECT_EQ(2, FPDFText_CountRects(textpage, 0, sizeof(expected) - 1));
    129 
    130   left = 0.0;
    131   right = 0.0;
    132   bottom = 0.0;
    133   top = 0.0;
    134   EXPECT_TRUE(FPDFText_GetRect(textpage, 1, &left, &top, &right, &bottom));
    135   EXPECT_NEAR(20.847, left, 0.001);
    136   EXPECT_NEAR(135.167, right, 0.001);
    137   EXPECT_NEAR(96.655, bottom, 0.001);
    138   EXPECT_NEAR(116.000, top, 0.001);
    139 
    140   // Test out of range indicies set outputs to (0.0, 0.0, 0.0, 0.0).
    141   left = -1.0;
    142   right = -1.0;
    143   bottom = -1.0;
    144   top = -1.0;
    145   EXPECT_FALSE(FPDFText_GetRect(textpage, -1, &left, &top, &right, &bottom));
    146   EXPECT_EQ(0.0, left);
    147   EXPECT_EQ(0.0, right);
    148   EXPECT_EQ(0.0, bottom);
    149   EXPECT_EQ(0.0, top);
    150 
    151   left = -2.0;
    152   right = -2.0;
    153   bottom = -2.0;
    154   top = -2.0;
    155   EXPECT_FALSE(FPDFText_GetRect(textpage, 2, &left, &top, &right, &bottom));
    156   EXPECT_EQ(0.0, left);
    157   EXPECT_EQ(0.0, right);
    158   EXPECT_EQ(0.0, bottom);
    159   EXPECT_EQ(0.0, top);
    160 
    161   EXPECT_EQ(9, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0, 0, 0));
    162 
    163   // Extract starting at character 4 as above.
    164   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
    165   EXPECT_EQ(1, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0,
    166                                        fixed_buffer, 1));
    167   EXPECT_TRUE(check_unsigned_shorts(expected + 4, fixed_buffer, 1));
    168   EXPECT_EQ(0xbdbd, fixed_buffer[1]);
    169 
    170   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
    171   EXPECT_EQ(9, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0,
    172                                        fixed_buffer, 9));
    173   EXPECT_TRUE(check_unsigned_shorts(expected + 4, fixed_buffer, 9));
    174   EXPECT_EQ(0xbdbd, fixed_buffer[9]);
    175 
    176   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
    177   EXPECT_EQ(10, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0,
    178                                         fixed_buffer, 128));
    179   EXPECT_TRUE(check_unsigned_shorts(expected + 4, fixed_buffer, 9));
    180   EXPECT_EQ(0u, fixed_buffer[9]);
    181   EXPECT_EQ(0xbdbd, fixed_buffer[10]);
    182 
    183   FPDFText_ClosePage(textpage);
    184   UnloadPage(page);
    185 }
    186 
    187 TEST_F(FPDFTextEmbeddertest, TextSearch) {
    188   EXPECT_TRUE(OpenDocument("hello_world.pdf"));
    189   FPDF_PAGE page = LoadPage(0);
    190   EXPECT_TRUE(page);
    191 
    192   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
    193   EXPECT_TRUE(textpage);
    194 
    195   std::unique_ptr<unsigned short, pdfium::FreeDeleter> nope =
    196       GetFPDFWideString(L"nope");
    197   std::unique_ptr<unsigned short, pdfium::FreeDeleter> world =
    198       GetFPDFWideString(L"world");
    199   std::unique_ptr<unsigned short, pdfium::FreeDeleter> world_caps =
    200       GetFPDFWideString(L"WORLD");
    201   std::unique_ptr<unsigned short, pdfium::FreeDeleter> world_substr =
    202       GetFPDFWideString(L"orld");
    203 
    204   // No occurences of "nope" in test page.
    205   FPDF_SCHHANDLE search = FPDFText_FindStart(textpage, nope.get(), 0, 0);
    206   EXPECT_TRUE(search);
    207   EXPECT_EQ(0, FPDFText_GetSchResultIndex(search));
    208   EXPECT_EQ(0, FPDFText_GetSchCount(search));
    209 
    210   // Advancing finds nothing.
    211   EXPECT_FALSE(FPDFText_FindNext(search));
    212   EXPECT_EQ(0, FPDFText_GetSchResultIndex(search));
    213   EXPECT_EQ(0, FPDFText_GetSchCount(search));
    214 
    215   // Retreating finds nothing.
    216   EXPECT_FALSE(FPDFText_FindPrev(search));
    217   EXPECT_EQ(0, FPDFText_GetSchResultIndex(search));
    218   EXPECT_EQ(0, FPDFText_GetSchCount(search));
    219   FPDFText_FindClose(search);
    220 
    221   // Two occurences of "world" in test page.
    222   search = FPDFText_FindStart(textpage, world.get(), 0, 2);
    223   EXPECT_TRUE(search);
    224 
    225   // Remains not found until advanced.
    226   EXPECT_EQ(0, FPDFText_GetSchResultIndex(search));
    227   EXPECT_EQ(0, FPDFText_GetSchCount(search));
    228 
    229   // First occurence of "world" in this test page.
    230   EXPECT_TRUE(FPDFText_FindNext(search));
    231   EXPECT_EQ(7, FPDFText_GetSchResultIndex(search));
    232   EXPECT_EQ(5, FPDFText_GetSchCount(search));
    233 
    234   // Last occurence of "world" in this test page.
    235   EXPECT_TRUE(FPDFText_FindNext(search));
    236   EXPECT_EQ(24, FPDFText_GetSchResultIndex(search));
    237   EXPECT_EQ(5, FPDFText_GetSchCount(search));
    238 
    239   // Found position unchanged when fails to advance.
    240   EXPECT_FALSE(FPDFText_FindNext(search));
    241   EXPECT_EQ(24, FPDFText_GetSchResultIndex(search));
    242   EXPECT_EQ(5, FPDFText_GetSchCount(search));
    243 
    244   // Back to first occurence.
    245   EXPECT_TRUE(FPDFText_FindPrev(search));
    246   EXPECT_EQ(7, FPDFText_GetSchResultIndex(search));
    247   EXPECT_EQ(5, FPDFText_GetSchCount(search));
    248 
    249   // Found position unchanged when fails to retreat.
    250   EXPECT_FALSE(FPDFText_FindPrev(search));
    251   EXPECT_EQ(7, FPDFText_GetSchResultIndex(search));
    252   EXPECT_EQ(5, FPDFText_GetSchCount(search));
    253   FPDFText_FindClose(search);
    254 
    255   // Exact search unaffected by case sensitiity and whole word flags.
    256   search = FPDFText_FindStart(textpage, world.get(),
    257                               FPDF_MATCHCASE | FPDF_MATCHWHOLEWORD, 0);
    258   EXPECT_TRUE(search);
    259   EXPECT_TRUE(FPDFText_FindNext(search));
    260   EXPECT_EQ(7, FPDFText_GetSchResultIndex(search));
    261   EXPECT_EQ(5, FPDFText_GetSchCount(search));
    262   FPDFText_FindClose(search);
    263 
    264   // Default is case-insensitive, so matching agaist caps works.
    265   search = FPDFText_FindStart(textpage, world_caps.get(), 0, 0);
    266   EXPECT_TRUE(search);
    267   EXPECT_TRUE(FPDFText_FindNext(search));
    268   EXPECT_EQ(7, FPDFText_GetSchResultIndex(search));
    269   EXPECT_EQ(5, FPDFText_GetSchCount(search));
    270   FPDFText_FindClose(search);
    271 
    272   // But can be made case sensitive, in which case this fails.
    273   search = FPDFText_FindStart(textpage, world_caps.get(), FPDF_MATCHCASE, 0);
    274   EXPECT_FALSE(FPDFText_FindNext(search));
    275   EXPECT_EQ(0, FPDFText_GetSchResultIndex(search));
    276   EXPECT_EQ(0, FPDFText_GetSchCount(search));
    277   FPDFText_FindClose(search);
    278 
    279   // Default is match anywhere within word, so matching substirng works.
    280   search = FPDFText_FindStart(textpage, world_substr.get(), 0, 0);
    281   EXPECT_TRUE(FPDFText_FindNext(search));
    282   EXPECT_EQ(8, FPDFText_GetSchResultIndex(search));
    283   EXPECT_EQ(4, FPDFText_GetSchCount(search));
    284   FPDFText_FindClose(search);
    285 
    286   // But can be made to mach word boundaries, in which case this fails.
    287   search =
    288       FPDFText_FindStart(textpage, world_substr.get(), FPDF_MATCHWHOLEWORD, 0);
    289   EXPECT_FALSE(FPDFText_FindNext(search));
    290   // TODO(tsepez): investigate strange index/count values in this state.
    291   FPDFText_FindClose(search);
    292 
    293   FPDFText_ClosePage(textpage);
    294   UnloadPage(page);
    295 }
    296 
    297 // Test that the page has characters despite a bad stream length.
    298 TEST_F(FPDFTextEmbeddertest, StreamLengthPastEndOfFile) {
    299   EXPECT_TRUE(OpenDocument("bug_57.pdf"));
    300   FPDF_PAGE page = LoadPage(0);
    301   EXPECT_TRUE(page);
    302 
    303   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
    304   EXPECT_TRUE(textpage);
    305   EXPECT_EQ(13, FPDFText_CountChars(textpage));
    306 
    307   FPDFText_ClosePage(textpage);
    308   UnloadPage(page);
    309 }
    310 
    311 TEST_F(FPDFTextEmbeddertest, WebLinks) {
    312   EXPECT_TRUE(OpenDocument("weblinks.pdf"));
    313   FPDF_PAGE page = LoadPage(0);
    314   EXPECT_TRUE(page);
    315 
    316   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
    317   EXPECT_TRUE(textpage);
    318 
    319   FPDF_PAGELINK pagelink = FPDFLink_LoadWebLinks(textpage);
    320   EXPECT_TRUE(pagelink);
    321 
    322   // Page contains two HTTP-style URLs.
    323   EXPECT_EQ(2, FPDFLink_CountWebLinks(pagelink));
    324 
    325   // Only a terminating NUL required for bogus links.
    326   EXPECT_EQ(1, FPDFLink_GetURL(pagelink, 2, nullptr, 0));
    327   EXPECT_EQ(1, FPDFLink_GetURL(pagelink, 1400, nullptr, 0));
    328   EXPECT_EQ(1, FPDFLink_GetURL(pagelink, -1, nullptr, 0));
    329 
    330   // Query the number of characters required for each link (incl NUL).
    331   EXPECT_EQ(25, FPDFLink_GetURL(pagelink, 0, nullptr, 0));
    332   EXPECT_EQ(26, FPDFLink_GetURL(pagelink, 1, nullptr, 0));
    333 
    334   static const char expected_url[] = "http://example.com?q=foo";
    335   static const size_t expected_len = sizeof(expected_url);
    336   unsigned short fixed_buffer[128];
    337 
    338   // Retrieve a link with too small a buffer.  Buffer will not be
    339   // NUL-terminated, but must not be modified past indicated length,
    340   // so pre-fill with a pattern to check write bounds.
    341   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
    342   EXPECT_EQ(1, FPDFLink_GetURL(pagelink, 0, fixed_buffer, 1));
    343   EXPECT_TRUE(check_unsigned_shorts(expected_url, fixed_buffer, 1));
    344   EXPECT_EQ(0xbdbd, fixed_buffer[1]);
    345 
    346   // Check buffer that doesn't have space for a terminating NUL.
    347   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
    348   EXPECT_EQ(static_cast<int>(expected_len - 1),
    349             FPDFLink_GetURL(pagelink, 0, fixed_buffer, expected_len - 1));
    350   EXPECT_TRUE(
    351       check_unsigned_shorts(expected_url, fixed_buffer, expected_len - 1));
    352   EXPECT_EQ(0xbdbd, fixed_buffer[expected_len - 1]);
    353 
    354   // Retreive link with exactly-sized buffer.
    355   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
    356   EXPECT_EQ(static_cast<int>(expected_len),
    357             FPDFLink_GetURL(pagelink, 0, fixed_buffer, expected_len));
    358   EXPECT_TRUE(check_unsigned_shorts(expected_url, fixed_buffer, expected_len));
    359   EXPECT_EQ(0u, fixed_buffer[expected_len - 1]);
    360   EXPECT_EQ(0xbdbd, fixed_buffer[expected_len]);
    361 
    362   // Retreive link with ample-sized-buffer.
    363   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
    364   EXPECT_EQ(static_cast<int>(expected_len),
    365             FPDFLink_GetURL(pagelink, 0, fixed_buffer, 128));
    366   EXPECT_TRUE(check_unsigned_shorts(expected_url, fixed_buffer, expected_len));
    367   EXPECT_EQ(0u, fixed_buffer[expected_len - 1]);
    368   EXPECT_EQ(0xbdbd, fixed_buffer[expected_len]);
    369 
    370   // Each link rendered in a single rect in this test page.
    371   EXPECT_EQ(1, FPDFLink_CountRects(pagelink, 0));
    372   EXPECT_EQ(1, FPDFLink_CountRects(pagelink, 1));
    373 
    374   // Each link rendered in a single rect in this test page.
    375   EXPECT_EQ(0, FPDFLink_CountRects(pagelink, -1));
    376   EXPECT_EQ(0, FPDFLink_CountRects(pagelink, 2));
    377   EXPECT_EQ(0, FPDFLink_CountRects(pagelink, 10000));
    378 
    379   // Check boundary of valid link index with valid rect index.
    380   double left = 0.0;
    381   double right = 0.0;
    382   double top = 0.0;
    383   double bottom = 0.0;
    384   EXPECT_TRUE(FPDFLink_GetRect(pagelink, 0, 0, &left, &top, &right, &bottom));
    385   EXPECT_NEAR(50.791, left, 0.001);
    386   EXPECT_NEAR(187.963, right, 0.001);
    387   EXPECT_NEAR(97.624, bottom, 0.001);
    388   EXPECT_NEAR(108.736, top, 0.001);
    389 
    390   // Check that valid link with invalid rect index leaves parameters unchanged.
    391   left = -1.0;
    392   right = -1.0;
    393   top = -1.0;
    394   bottom = -1.0;
    395   EXPECT_FALSE(FPDFLink_GetRect(pagelink, 0, 1, &left, &top, &right, &bottom));
    396   EXPECT_EQ(-1.0, left);
    397   EXPECT_EQ(-1.0, right);
    398   EXPECT_EQ(-1.0, bottom);
    399   EXPECT_EQ(-1.0, top);
    400 
    401   // Check that invalid link index leaves parameters unchanged.
    402   left = -2.0;
    403   right = -2.0;
    404   top = -2.0;
    405   bottom = -2.0;
    406   EXPECT_FALSE(FPDFLink_GetRect(pagelink, -1, 0, &left, &top, &right, &bottom));
    407   EXPECT_EQ(-2.0, left);
    408   EXPECT_EQ(-2.0, right);
    409   EXPECT_EQ(-2.0, bottom);
    410   EXPECT_EQ(-2.0, top);
    411 
    412   FPDFLink_CloseWebLinks(pagelink);
    413   FPDFText_ClosePage(textpage);
    414   UnloadPage(page);
    415 }
    416 
    417 TEST_F(FPDFTextEmbeddertest, WebLinksAcrossLines) {
    418   EXPECT_TRUE(OpenDocument("weblinks_across_lines.pdf"));
    419   FPDF_PAGE page = LoadPage(0);
    420   EXPECT_TRUE(page);
    421 
    422   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
    423   EXPECT_TRUE(textpage);
    424 
    425   FPDF_PAGELINK pagelink = FPDFLink_LoadWebLinks(textpage);
    426   EXPECT_TRUE(pagelink);
    427 
    428   static const char* const kExpectedUrls[] = {
    429       "http://example.com",           // from "http://www.example.com?\r\nfoo"
    430       "http://example.com/",          // from "http://www.example.com/\r\nfoo"
    431       "http://example.com/test-foo",  // from "http://example.com/test-\r\nfoo"
    432       "http://abc.com/test-foo",      // from "http://abc.com/test-\r\n\r\nfoo"
    433       // Next two links from "http://www.example.com/\r\nhttp://www.abc.com/"
    434       "http://example.com/", "http://www.abc.com",
    435   };
    436   static const int kNumLinks = static_cast<int>(FX_ArraySize(kExpectedUrls));
    437 
    438   EXPECT_EQ(kNumLinks, FPDFLink_CountWebLinks(pagelink));
    439 
    440   unsigned short fixed_buffer[128];
    441   for (int i = 0; i < kNumLinks; i++) {
    442     const size_t expected_len = strlen(kExpectedUrls[i]) + 1;
    443     memset(fixed_buffer, 0, FX_ArraySize(fixed_buffer));
    444     EXPECT_EQ(static_cast<int>(expected_len),
    445               FPDFLink_GetURL(pagelink, i, nullptr, 0));
    446     EXPECT_EQ(
    447         static_cast<int>(expected_len),
    448         FPDFLink_GetURL(pagelink, i, fixed_buffer, FX_ArraySize(fixed_buffer)));
    449     EXPECT_TRUE(
    450         check_unsigned_shorts(kExpectedUrls[i], fixed_buffer, expected_len));
    451   }
    452 
    453   FPDFLink_CloseWebLinks(pagelink);
    454   FPDFText_ClosePage(textpage);
    455   UnloadPage(page);
    456 }
    457 
    458 TEST_F(FPDFTextEmbeddertest, WebLinksAcrossLinesBug) {
    459   EXPECT_TRUE(OpenDocument("bug_650.pdf"));
    460   FPDF_PAGE page = LoadPage(0);
    461   EXPECT_TRUE(page);
    462 
    463   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
    464   EXPECT_TRUE(textpage);
    465 
    466   FPDF_PAGELINK pagelink = FPDFLink_LoadWebLinks(textpage);
    467   EXPECT_TRUE(pagelink);
    468 
    469   EXPECT_EQ(2, FPDFLink_CountWebLinks(pagelink));
    470   unsigned short fixed_buffer[128] = {0};
    471   static const char kExpectedUrl[] =
    472       "http://tutorial45.com/learn-autocad-basics-day-166/";
    473   static const int kUrlSize = static_cast<int>(sizeof(kExpectedUrl));
    474 
    475   EXPECT_EQ(kUrlSize, FPDFLink_GetURL(pagelink, 1, nullptr, 0));
    476   EXPECT_EQ(kUrlSize, FPDFLink_GetURL(pagelink, 1, fixed_buffer,
    477                                       FX_ArraySize(fixed_buffer)));
    478   EXPECT_TRUE(check_unsigned_shorts(kExpectedUrl, fixed_buffer, kUrlSize));
    479 
    480   FPDFLink_CloseWebLinks(pagelink);
    481   FPDFText_ClosePage(textpage);
    482   UnloadPage(page);
    483 }
    484 
    485 TEST_F(FPDFTextEmbeddertest, GetFontSize) {
    486   EXPECT_TRUE(OpenDocument("hello_world.pdf"));
    487   FPDF_PAGE page = LoadPage(0);
    488   EXPECT_TRUE(page);
    489 
    490   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
    491   EXPECT_TRUE(textpage);
    492 
    493   const double kExpectedFontsSizes[] = {12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
    494                                         12, 12, 12, 1,  1,  16, 16, 16, 16, 16,
    495                                         16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
    496 
    497   int count = FPDFText_CountChars(textpage);
    498   ASSERT_EQ(FX_ArraySize(kExpectedFontsSizes), static_cast<size_t>(count));
    499   for (int i = 0; i < count; ++i)
    500     EXPECT_EQ(kExpectedFontsSizes[i], FPDFText_GetFontSize(textpage, i)) << i;
    501 
    502   FPDFText_ClosePage(textpage);
    503   UnloadPage(page);
    504 }
    505 
    506 TEST_F(FPDFTextEmbeddertest, ToUnicode) {
    507   EXPECT_TRUE(OpenDocument("bug_583.pdf"));
    508   FPDF_PAGE page = LoadPage(0);
    509   EXPECT_TRUE(page);
    510 
    511   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
    512   EXPECT_TRUE(textpage);
    513 
    514   ASSERT_EQ(1, FPDFText_CountChars(textpage));
    515   EXPECT_EQ(static_cast<unsigned int>(0), FPDFText_GetUnicode(textpage, 0));
    516 
    517   FPDFText_ClosePage(textpage);
    518   UnloadPage(page);
    519 }
    520 
    521 TEST_F(FPDFTextEmbeddertest, Bug_921) {
    522   EXPECT_TRUE(OpenDocument("bug_921.pdf"));
    523   FPDF_PAGE page = LoadPage(0);
    524   EXPECT_TRUE(page);
    525 
    526   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
    527   EXPECT_TRUE(textpage);
    528 
    529   static constexpr unsigned int kData[] = {
    530       1095, 1077, 1083, 1086, 1074, 1077, 1095, 1077, 1089, 1082, 1086, 1077,
    531       32,   1089, 1090, 1088, 1072, 1076, 1072, 1085, 1080, 1077, 46,   32};
    532   static constexpr int kStartIndex = 238;
    533 
    534   ASSERT_EQ(268, FPDFText_CountChars(textpage));
    535   for (size_t i = 0; i < FX_ArraySize(kData); ++i)
    536     EXPECT_EQ(kData[i], FPDFText_GetUnicode(textpage, kStartIndex + i));
    537 
    538   unsigned short buffer[FX_ArraySize(kData) + 1];
    539   memset(buffer, 0xbd, sizeof(buffer));
    540   int count =
    541       FPDFText_GetText(textpage, kStartIndex, FX_ArraySize(kData), buffer);
    542   ASSERT_GT(count, 0);
    543   ASSERT_EQ(FX_ArraySize(kData) + 1, static_cast<size_t>(count));
    544   for (size_t i = 0; i < FX_ArraySize(kData); ++i)
    545     EXPECT_EQ(kData[i], buffer[i]);
    546   EXPECT_EQ(0, buffer[FX_ArraySize(kData)]);
    547 
    548   FPDFText_ClosePage(textpage);
    549   UnloadPage(page);
    550 }
    551 
    552 TEST_F(FPDFTextEmbeddertest, GetTextWithHyphen) {
    553   EXPECT_TRUE(OpenDocument("bug_781804.pdf"));
    554   FPDF_PAGE page = LoadPage(0);
    555   EXPECT_TRUE(page);
    556 
    557   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
    558   EXPECT_TRUE(textpage);
    559 
    560   // Check that soft hyphens are not included
    561   // Expecting 'Veritaserum', except there is a \uFFFE where the hyphen was in
    562   // the original text. This is a weird thing that Adobe does, which we
    563   // replicate.
    564   constexpr unsigned short soft_expected[] = {
    565       0x0056, 0x0065, 0x0072, 0x0069, 0x0074, 0x0061, 0xfffe,
    566       0x0073, 0x0065, 0x0072, 0x0075, 0x006D, 0x0000};
    567   {
    568     constexpr int count = FX_ArraySize(soft_expected) - 1;
    569     unsigned short buffer[FX_ArraySize(soft_expected)];
    570     memset(buffer, 0, sizeof(buffer));
    571 
    572     EXPECT_EQ(count + 1, FPDFText_GetText(textpage, 0, count, buffer));
    573     for (int i = 0; i < count; i++)
    574       EXPECT_EQ(soft_expected[i], buffer[i]);
    575   }
    576 
    577   // Check that hard hyphens are included
    578   {
    579     // There isn't the \0 in the actual doc, but there is a \r\n, so need to
    580     // add 1 to get aligned.
    581     constexpr size_t offset = FX_ArraySize(soft_expected) + 1;
    582     // Expecting 'User-\r\ngenerated', the - is a unicode character, so cannnot
    583     // store in a char[].
    584     constexpr unsigned short hard_expected[] = {
    585         0x0055, 0x0073, 0x0065, 0x0072, 0x2010, 0x000d, 0x000a, 0x0067, 0x0065,
    586         0x006e, 0x0065, 0x0072, 0x0061, 0x0074, 0x0065, 0x0064, 0x0000};
    587     constexpr int count = FX_ArraySize(hard_expected) - 1;
    588     unsigned short buffer[FX_ArraySize(hard_expected)];
    589 
    590     EXPECT_EQ(count + 1, FPDFText_GetText(textpage, offset, count, buffer));
    591     for (int i = 0; i < count; i++)
    592       EXPECT_EQ(hard_expected[i], buffer[i]);
    593   }
    594 
    595   FPDFText_ClosePage(textpage);
    596   UnloadPage(page);
    597 }
    598 
    599 TEST_F(FPDFTextEmbeddertest, bug_782596) {
    600   // If there is a regression in this test, it will only fail under ASAN
    601   EXPECT_TRUE(OpenDocument("bug_782596.pdf"));
    602   FPDF_PAGE page = LoadPage(0);
    603   EXPECT_TRUE(page);
    604   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
    605   EXPECT_TRUE(textpage);
    606   FPDFText_ClosePage(textpage);
    607   UnloadPage(page);
    608 }
    609 
    610 TEST_F(FPDFTextEmbeddertest, ControlCharacters) {
    611   EXPECT_TRUE(OpenDocument("control_characters.pdf"));
    612   FPDF_PAGE page = LoadPage(0);
    613   EXPECT_TRUE(page);
    614 
    615   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
    616   EXPECT_TRUE(textpage);
    617 
    618   // Should not include the control characters in the output
    619   static const char expected[] = "Hello, world!\r\nGoodbye, world!";
    620   unsigned short fixed_buffer[128];
    621   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
    622   int num_chars = FPDFText_GetText(textpage, 0, 128, fixed_buffer);
    623 
    624   ASSERT_GE(num_chars, 0);
    625   EXPECT_EQ(sizeof(expected), static_cast<size_t>(num_chars));
    626   EXPECT_TRUE(check_unsigned_shorts(expected, fixed_buffer, sizeof(expected)));
    627 
    628   // Attempting to get a chunk of text after the control characters
    629   static const char expected_substring[] = "Goodbye, world!";
    630   // Offset is the length of 'Hello, world!\r\n' + 2 control characters in the
    631   // original stream
    632   static const int offset = 17;
    633   memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
    634   num_chars = FPDFText_GetText(textpage, offset, 128, fixed_buffer);
    635 
    636   ASSERT_GE(num_chars, 0);
    637   EXPECT_EQ(sizeof(expected_substring), static_cast<size_t>(num_chars));
    638   EXPECT_TRUE(check_unsigned_shorts(expected_substring, fixed_buffer,
    639                                     sizeof(expected_substring)));
    640 
    641   FPDFText_ClosePage(textpage);
    642   UnloadPage(page);
    643 }
    644