Home | History | Annotate | Download | only in fpdfsdk
      1 // Copyright 2014 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #include "public/fpdf_text.h"
      8 
      9 #include <algorithm>
     10 #include <vector>
     11 
     12 #include "core/fpdfapi/page/cpdf_page.h"
     13 #include "core/fpdfdoc/cpdf_viewerpreferences.h"
     14 #include "core/fpdftext/cpdf_linkextract.h"
     15 #include "core/fpdftext/cpdf_textpage.h"
     16 #include "core/fpdftext/cpdf_textpagefind.h"
     17 #include "fpdfsdk/fsdk_define.h"
     18 #include "third_party/base/numerics/safe_conversions.h"
     19 #include "third_party/base/stl_util.h"
     20 
     21 #ifdef PDF_ENABLE_XFA
     22 #include "fpdfsdk/fpdfxfa/cpdfxfa_context.h"
     23 #include "fpdfsdk/fpdfxfa/cpdfxfa_page.h"
     24 #endif  // PDF_ENABLE_XFA
     25 
     26 #ifdef _WIN32
     27 #include <tchar.h>
     28 #endif
     29 
     30 namespace {
     31 
     32 constexpr size_t kBytesPerCharacter = sizeof(unsigned short);
     33 
     34 CPDF_TextPage* CPDFTextPageFromFPDFTextPage(FPDF_TEXTPAGE text_page) {
     35   return static_cast<CPDF_TextPage*>(text_page);
     36 }
     37 
     38 CPDF_TextPageFind* CPDFTextPageFindFromFPDFSchHandle(FPDF_SCHHANDLE handle) {
     39   return static_cast<CPDF_TextPageFind*>(handle);
     40 }
     41 
     42 CPDF_LinkExtract* CPDFLinkExtractFromFPDFPageLink(FPDF_PAGELINK link) {
     43   return static_cast<CPDF_LinkExtract*>(link);
     44 }
     45 
     46 }  // namespace
     47 
     48 FPDF_EXPORT FPDF_TEXTPAGE FPDF_CALLCONV FPDFText_LoadPage(FPDF_PAGE page) {
     49   CPDF_Page* pPDFPage = CPDFPageFromFPDFPage(page);
     50   if (!pPDFPage)
     51     return nullptr;
     52 
     53 #ifdef PDF_ENABLE_XFA
     54   CPDFXFA_Page* pPage = (CPDFXFA_Page*)page;
     55   CPDFXFA_Context* pContext = pPage->GetContext();
     56   CPDF_ViewerPreferences viewRef(pContext->GetPDFDoc());
     57 #else  // PDF_ENABLE_XFA
     58   CPDF_ViewerPreferences viewRef(pPDFPage->m_pDocument.Get());
     59 #endif  // PDF_ENABLE_XFA
     60 
     61   CPDF_TextPage* textpage = new CPDF_TextPage(
     62       pPDFPage, viewRef.IsDirectionR2L() ? FPDFText_Direction::Right
     63                                          : FPDFText_Direction::Left);
     64   textpage->ParseTextPage();
     65   return textpage;
     66 }
     67 
     68 FPDF_EXPORT void FPDF_CALLCONV FPDFText_ClosePage(FPDF_TEXTPAGE text_page) {
     69   delete CPDFTextPageFromFPDFTextPage(text_page);
     70 }
     71 
     72 FPDF_EXPORT int FPDF_CALLCONV FPDFText_CountChars(FPDF_TEXTPAGE text_page) {
     73   if (!text_page)
     74     return -1;
     75 
     76   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
     77   return textpage->CountChars();
     78 }
     79 
     80 FPDF_EXPORT unsigned int FPDF_CALLCONV
     81 FPDFText_GetUnicode(FPDF_TEXTPAGE text_page, int index) {
     82   if (!text_page)
     83     return 0;
     84 
     85   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
     86   if (index < 0 || index >= textpage->CountChars())
     87     return 0;
     88 
     89   FPDF_CHAR_INFO charinfo;
     90   textpage->GetCharInfo(index, &charinfo);
     91   return charinfo.m_Unicode;
     92 }
     93 
     94 FPDF_EXPORT double FPDF_CALLCONV FPDFText_GetFontSize(FPDF_TEXTPAGE text_page,
     95                                                       int index) {
     96   if (!text_page)
     97     return 0;
     98   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
     99 
    100   if (index < 0 || index >= textpage->CountChars())
    101     return 0;
    102 
    103   FPDF_CHAR_INFO charinfo;
    104   textpage->GetCharInfo(index, &charinfo);
    105   return charinfo.m_FontSize;
    106 }
    107 
    108 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_GetCharBox(FPDF_TEXTPAGE text_page,
    109                                                         int index,
    110                                                         double* left,
    111                                                         double* right,
    112                                                         double* bottom,
    113                                                         double* top) {
    114   if (!text_page || index < 0)
    115     return false;
    116 
    117   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
    118   if (index >= textpage->CountChars())
    119     return false;
    120 
    121   FPDF_CHAR_INFO charinfo;
    122   textpage->GetCharInfo(index, &charinfo);
    123   *left = charinfo.m_CharBox.left;
    124   *right = charinfo.m_CharBox.right;
    125   *bottom = charinfo.m_CharBox.bottom;
    126   *top = charinfo.m_CharBox.top;
    127   return true;
    128 }
    129 
    130 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
    131 FPDFText_GetCharOrigin(FPDF_TEXTPAGE text_page,
    132                        int index,
    133                        double* x,
    134                        double* y) {
    135   if (!text_page)
    136     return false;
    137   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
    138 
    139   if (index < 0 || index >= textpage->CountChars())
    140     return false;
    141   FPDF_CHAR_INFO charinfo;
    142   textpage->GetCharInfo(index, &charinfo);
    143   *x = charinfo.m_Origin.x;
    144   *y = charinfo.m_Origin.y;
    145   return true;
    146 }
    147 
    148 // select
    149 FPDF_EXPORT int FPDF_CALLCONV
    150 FPDFText_GetCharIndexAtPos(FPDF_TEXTPAGE text_page,
    151                            double x,
    152                            double y,
    153                            double xTolerance,
    154                            double yTolerance) {
    155   if (!text_page)
    156     return -3;
    157 
    158   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
    159   return textpage->GetIndexAtPos(
    160       CFX_PointF(static_cast<float>(x), static_cast<float>(y)),
    161       CFX_SizeF(static_cast<float>(xTolerance),
    162                 static_cast<float>(yTolerance)));
    163 }
    164 
    165 FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetText(FPDF_TEXTPAGE page,
    166                                                int char_start,
    167                                                int char_count,
    168                                                unsigned short* result) {
    169   if (!page || char_start < 0 || char_count < 0 || !result)
    170     return 0;
    171 
    172   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(page);
    173   int char_available = textpage->CountChars() - char_start;
    174   if (char_available <= 0)
    175     return 0;
    176 
    177   char_count = std::min(char_count, char_available);
    178   if (char_count == 0) {
    179     // Writing out "", which has a character count of 1 due to the NUL.
    180     *result = '\0';
    181     return 1;
    182   }
    183 
    184   WideString str = textpage->GetPageText(char_start, char_count);
    185 
    186   if (str.GetLength() > static_cast<size_t>(char_count))
    187     str = str.Left(static_cast<size_t>(char_count));
    188 
    189   // UFT16LE_Encode doesn't handle surrogate pairs properly, so it is expected
    190   // the number of items to stay the same.
    191   ByteString byte_str = str.UTF16LE_Encode();
    192   size_t byte_str_len = byte_str.GetLength();
    193   int ret_count = byte_str_len / kBytesPerCharacter;
    194 
    195   ASSERT(ret_count <= char_count + 1);  // +1 to account for the NUL terminator.
    196   memcpy(result, byte_str.GetBuffer(byte_str_len), byte_str_len);
    197   return ret_count;
    198 }
    199 
    200 FPDF_EXPORT int FPDF_CALLCONV FPDFText_CountRects(FPDF_TEXTPAGE text_page,
    201                                                   int start,
    202                                                   int count) {
    203   if (!text_page)
    204     return 0;
    205 
    206   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
    207   return textpage->CountRects(start, count);
    208 }
    209 
    210 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_GetRect(FPDF_TEXTPAGE text_page,
    211                                                      int rect_index,
    212                                                      double* left,
    213                                                      double* top,
    214                                                      double* right,
    215                                                      double* bottom) {
    216   if (!text_page)
    217     return false;
    218 
    219   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
    220   CFX_FloatRect rect;
    221   bool result = textpage->GetRect(rect_index, &rect);
    222 
    223   *left = rect.left;
    224   *top = rect.top;
    225   *right = rect.right;
    226   *bottom = rect.bottom;
    227   return result;
    228 }
    229 
    230 FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetBoundedText(FPDF_TEXTPAGE text_page,
    231                                                       double left,
    232                                                       double top,
    233                                                       double right,
    234                                                       double bottom,
    235                                                       unsigned short* buffer,
    236                                                       int buflen) {
    237   if (!text_page)
    238     return 0;
    239 
    240   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
    241   CFX_FloatRect rect((float)left, (float)bottom, (float)right, (float)top);
    242   WideString str = textpage->GetTextByRect(rect);
    243 
    244   if (buflen <= 0 || !buffer)
    245     return str.GetLength();
    246 
    247   ByteString cbUTF16Str = str.UTF16LE_Encode();
    248   int len = cbUTF16Str.GetLength() / sizeof(unsigned short);
    249   int size = buflen > len ? len : buflen;
    250   memcpy(buffer, cbUTF16Str.GetBuffer(size * sizeof(unsigned short)),
    251          size * sizeof(unsigned short));
    252   cbUTF16Str.ReleaseBuffer(size * sizeof(unsigned short));
    253 
    254   return size;
    255 }
    256 
    257 // Search
    258 // -1 for end
    259 FPDF_EXPORT FPDF_SCHHANDLE FPDF_CALLCONV
    260 FPDFText_FindStart(FPDF_TEXTPAGE text_page,
    261                    FPDF_WIDESTRING findwhat,
    262                    unsigned long flags,
    263                    int start_index) {
    264   if (!text_page)
    265     return nullptr;
    266 
    267   CPDF_TextPageFind* textpageFind =
    268       new CPDF_TextPageFind(CPDFTextPageFromFPDFTextPage(text_page));
    269   size_t len = WideString::WStringLength(findwhat);
    270   textpageFind->FindFirst(
    271       WideString::FromUTF16LE(findwhat, len), flags,
    272       start_index >= 0 ? Optional<size_t>(start_index) : Optional<size_t>());
    273   return textpageFind;
    274 }
    275 
    276 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_FindNext(FPDF_SCHHANDLE handle) {
    277   if (!handle)
    278     return false;
    279 
    280   CPDF_TextPageFind* textpageFind = CPDFTextPageFindFromFPDFSchHandle(handle);
    281   return textpageFind->FindNext();
    282 }
    283 
    284 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_FindPrev(FPDF_SCHHANDLE handle) {
    285   if (!handle)
    286     return false;
    287 
    288   CPDF_TextPageFind* textpageFind = CPDFTextPageFindFromFPDFSchHandle(handle);
    289   return textpageFind->FindPrev();
    290 }
    291 
    292 FPDF_EXPORT int FPDF_CALLCONV
    293 FPDFText_GetSchResultIndex(FPDF_SCHHANDLE handle) {
    294   if (!handle)
    295     return 0;
    296 
    297   CPDF_TextPageFind* textpageFind = CPDFTextPageFindFromFPDFSchHandle(handle);
    298   return textpageFind->GetCurOrder();
    299 }
    300 
    301 FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetSchCount(FPDF_SCHHANDLE handle) {
    302   if (!handle)
    303     return 0;
    304 
    305   CPDF_TextPageFind* textpageFind = CPDFTextPageFindFromFPDFSchHandle(handle);
    306   return textpageFind->GetMatchedCount();
    307 }
    308 
    309 FPDF_EXPORT void FPDF_CALLCONV FPDFText_FindClose(FPDF_SCHHANDLE handle) {
    310   if (!handle)
    311     return;
    312 
    313   CPDF_TextPageFind* textpageFind = CPDFTextPageFindFromFPDFSchHandle(handle);
    314   delete textpageFind;
    315   handle = nullptr;
    316 }
    317 
    318 // web link
    319 FPDF_EXPORT FPDF_PAGELINK FPDF_CALLCONV
    320 FPDFLink_LoadWebLinks(FPDF_TEXTPAGE text_page) {
    321   if (!text_page)
    322     return nullptr;
    323 
    324   CPDF_LinkExtract* pageLink =
    325       new CPDF_LinkExtract(CPDFTextPageFromFPDFTextPage(text_page));
    326   pageLink->ExtractLinks();
    327   return pageLink;
    328 }
    329 
    330 FPDF_EXPORT int FPDF_CALLCONV FPDFLink_CountWebLinks(FPDF_PAGELINK link_page) {
    331   if (!link_page)
    332     return 0;
    333 
    334   CPDF_LinkExtract* pageLink = CPDFLinkExtractFromFPDFPageLink(link_page);
    335   return pdfium::base::checked_cast<int>(pageLink->CountLinks());
    336 }
    337 
    338 FPDF_EXPORT int FPDF_CALLCONV FPDFLink_GetURL(FPDF_PAGELINK link_page,
    339                                               int link_index,
    340                                               unsigned short* buffer,
    341                                               int buflen) {
    342   WideString wsUrl(L"");
    343   if (link_page && link_index >= 0) {
    344     CPDF_LinkExtract* pageLink = CPDFLinkExtractFromFPDFPageLink(link_page);
    345     wsUrl = pageLink->GetURL(link_index);
    346   }
    347   ByteString cbUTF16URL = wsUrl.UTF16LE_Encode();
    348   int required = cbUTF16URL.GetLength() / sizeof(unsigned short);
    349   if (!buffer || buflen <= 0)
    350     return required;
    351 
    352   int size = std::min(required, buflen);
    353   if (size > 0) {
    354     int buf_size = size * sizeof(unsigned short);
    355     memcpy(buffer, cbUTF16URL.GetBuffer(buf_size), buf_size);
    356   }
    357   return size;
    358 }
    359 
    360 FPDF_EXPORT int FPDF_CALLCONV FPDFLink_CountRects(FPDF_PAGELINK link_page,
    361                                                   int link_index) {
    362   if (!link_page || link_index < 0)
    363     return 0;
    364 
    365   CPDF_LinkExtract* pageLink = CPDFLinkExtractFromFPDFPageLink(link_page);
    366   return pdfium::CollectionSize<int>(pageLink->GetRects(link_index));
    367 }
    368 
    369 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFLink_GetRect(FPDF_PAGELINK link_page,
    370                                                      int link_index,
    371                                                      int rect_index,
    372                                                      double* left,
    373                                                      double* top,
    374                                                      double* right,
    375                                                      double* bottom) {
    376   if (!link_page || link_index < 0 || rect_index < 0)
    377     return false;
    378 
    379   CPDF_LinkExtract* pageLink = CPDFLinkExtractFromFPDFPageLink(link_page);
    380   std::vector<CFX_FloatRect> rectArray = pageLink->GetRects(link_index);
    381   if (rect_index >= pdfium::CollectionSize<int>(rectArray))
    382     return false;
    383 
    384   *left = rectArray[rect_index].left;
    385   *right = rectArray[rect_index].right;
    386   *top = rectArray[rect_index].top;
    387   *bottom = rectArray[rect_index].bottom;
    388   return true;
    389 }
    390 
    391 FPDF_EXPORT void FPDF_CALLCONV FPDFLink_CloseWebLinks(FPDF_PAGELINK link_page) {
    392   delete CPDFLinkExtractFromFPDFPageLink(link_page);
    393 }
    394