Home | History | Annotate | Download | only in public
      1 // Copyright 2014 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #ifndef PUBLIC_FPDF_TEXT_H_
      8 #define PUBLIC_FPDF_TEXT_H_
      9 
     10 #include "fpdfview.h"
     11 
     12 // Exported Functions
     13 #ifdef __cplusplus
     14 extern "C" {
     15 #endif
     16 
     17 // Function: FPDFText_LoadPage
     18 //          Prepare information about all characters in a page.
     19 // Parameters:
     20 //          page    -   Handle to the page. Returned by the FPDF_LoadPage
     21 //                      function (in the FPDFVIEW module).
     22 // Return value:
     23 //          A handle to the text page information structure.
     24 //          NULL if something goes wrong.
     25 // Comments:
     26 //          The application must call FPDFText_ClosePage to release the text
     27 //          page information.
     28 //
     29 DLLEXPORT FPDF_TEXTPAGE STDCALL FPDFText_LoadPage(FPDF_PAGE page);
     30 
     31 // Function: FPDFText_ClosePage
     32 //          Release all resources allocated for a text page information
     33 //          structure.
     34 // Parameters:
     35 //          text_page   -   Handle to a text page information structure.
     36 //                          Returned by FPDFText_LoadPage function.
     37 // Return Value:
     38 //          None.
     39 //
     40 DLLEXPORT void STDCALL FPDFText_ClosePage(FPDF_TEXTPAGE text_page);
     41 
     42 // Function: FPDFText_CountChars
     43 //          Get the number of characters in a page.
     44 // Parameters:
     45 //          text_page   -   Handle to a text page information structure.
     46 //                          Returned by FPDFText_LoadPage function.
     47 // Return value:
     48 //          Number of characters in the page. Return -1 for error.
     49 //          Generated characters, such as additional space characters and new
     50 //          line characters, are also counted.
     51 // Comments:
     52 //          The characters in a page form a "stream", and each character in
     53 //          that stream has an index.
     54 //          Many of the FPDFTEXT functions take such an index as a parameter.
     55 //          Indexes are zero-based: the first character in the page has an
     56 //          index value of zero.
     57 //
     58 DLLEXPORT int STDCALL FPDFText_CountChars(FPDF_TEXTPAGE text_page);
     59 
     60 // Function: FPDFText_GetUnicode
     61 //          Get the Unicode value of a character in a page.
     62 // Parameters:
     63 //          text_page   -   Handle to a text page information structure.
     64 //                          Returned by FPDFText_LoadPage function.
     65 //          index       -   Zero-based index of the character.
     66 // Return value:
     67 //          The Unicode value of the specified character.
     68 //          The return value will be zero if the character is not encoded in
     69 //          Unicode and the Foxit engine cannot convert it to a Unicode
     70 //          value.
     71 //
     72 DLLEXPORT unsigned int STDCALL FPDFText_GetUnicode(FPDF_TEXTPAGE text_page,
     73                                                    int index);
     74 
     75 // Function: FPDFText_GetFontSize
     76 //          Get the font size of a particular character.
     77 // Parameters:
     78 //          text_page   -   Handle to a text page information structure.
     79 //                          Returned by FPDFText_LoadPage function.
     80 //          index       -   Zero-based index of the character.
     81 // Return value:
     82 //          The font size of the specified character, measured in points
     83 //          (about 1/72 inch). This is the typographic size of the font, the
     84 //          so-called "em size".
     85 //
     86 DLLEXPORT double STDCALL FPDFText_GetFontSize(FPDF_TEXTPAGE text_page,
     87                                               int index);
     88 
     89 // Function: FPDFText_GetCharBox
     90 //          Get the bounding box of a particular character.
     91 // Parameters:
     92 //          text_page   -   Handle to a text page information structure.
     93 //                          Returned by FPDFText_LoadPage function.
     94 //          index       -   Zero-based index of the character.
     95 //          left        -   Pointer to a double receiving the left position
     96 //                          of the character box.
     97 //          right       -   Pointer to a double receiving the right position
     98 //                          of the character box.
     99 //          bottom      -   Pointer to a double receiving the bottom position
    100 //                          of the character box.
    101 //          top         -   Pointer to a double receiving the top position
    102 //                          of the character box.
    103 // Return Value:
    104 //          None.
    105 // Comments:
    106 //          All positions are measured in PDF "user space".
    107 //
    108 DLLEXPORT void STDCALL FPDFText_GetCharBox(FPDF_TEXTPAGE text_page,
    109                                            int index,
    110                                            double* left,
    111                                            double* right,
    112                                            double* bottom,
    113                                            double* top);
    114 
    115 // Function: FPDFText_GetCharIndexAtPos
    116 //          Get the index of a character at or near a certain position on the
    117 //          page.
    118 // Parameters:
    119 //          text_page   -   Handle to a text page information structure.
    120 //                          Returned by FPDFText_LoadPage function.
    121 //          x           -   X position in PDF "user space".
    122 //          y           -   Y position in PDF "user space".
    123 //          xTolerance  -   An x-axis tolerance value for character hit
    124 //                          detection, in points.
    125 //          yTolerance  -   A y-axis tolerance value for character hit
    126 //                          detection, in points.
    127 // Return Value:
    128 //          The zero-based index of the character at or near the point (x, y).
    129 //          If there is no character at or near the point, the return value
    130 //          will be -1.
    131 //          If an error occurs, -3 will be returned.
    132 //
    133 DLLEXPORT int STDCALL FPDFText_GetCharIndexAtPos(FPDF_TEXTPAGE text_page,
    134                                                  double x,
    135                                                  double y,
    136                                                  double xTolerance,
    137                                                  double yTolerance);
    138 
    139 // Function: FPDFText_GetText
    140 //          Extract a Unicode text string from the page.
    141 // Parameters:
    142 //          text_page   -   Handle to a text page information structure.
    143 //                          Returned by FPDFText_LoadPage function.
    144 //          start_index -   Index of the starting character.
    145 //          count       -   Number of characters to be extracted.
    146 //          result      -   A buffer (allocated by the application) receiving
    147 //                          the extracted Unicode characters.
    148 //                          The buffer must be large enough to hold the
    149 //                          number of characters plus a terminator.
    150 // Return Value:
    151 //          Number of characters written into the result buffer, including the
    152 //          trailing terminator.
    153 // Comments:
    154 //          This function ignores characters without Unicode information.
    155 //
    156 DLLEXPORT int STDCALL FPDFText_GetText(FPDF_TEXTPAGE text_page,
    157                                        int start_index,
    158                                        int count,
    159                                        unsigned short* result);
    160 
    161 // Function: FPDFText_CountRects
    162 //          Count the number of rectangular areas occupied by a segment of text.
    163 // Parameters:
    164 //          text_page   -   Handle to a text page information structure.
    165 //                          Returned by FPDFText_LoadPage function.
    166 //          start_index -   Index of the starting character.
    167 //          count       -   Number of characters.
    168 // Return value:
    169 //          Number of rectangles. Zero for error.
    170 // Comments:
    171 //          This function, together with FPDFText_GetRect, can be used by
    172 //          applications to detect the position of a text segment on the page,
    173 //          so that the proper areas can be highlighted, for example.
    174 //
    175 //          FPDFTEXT will automatically merge small character boxes into a
    176 //          bigger one if those characters are on the same line and use the
    177 //          same font settings.
    178 //
    179 DLLEXPORT int STDCALL FPDFText_CountRects(FPDF_TEXTPAGE text_page,
    180                                           int start_index,
    181                                           int count);
    182 
    183 // Function: FPDFText_GetRect
    184 //          Get a rectangular area from the result generated by
    185 //          FPDFText_CountRects.
    186 // Parameters:
    187 //          text_page   -   Handle to a text page information structure.
    188 //                          Returned by FPDFText_LoadPage function.
    189 //          rect_index  -   Zero-based index for the rectangle.
    190 //          left        -   Pointer to a double value receiving the rectangle
    191 //                          left boundary.
    192 //          top         -   Pointer to a double value receiving the rectangle
    193 //                          top boundary.
    194 //          right       -   Pointer to a double value receiving the rectangle
    195 //                          right boundary.
    196 //          bottom      -   Pointer to a double value receiving the rectangle
    197 //                          bottom boundary.
    198 // Return Value:
    199 //          None.
    200 //
    201 DLLEXPORT void STDCALL FPDFText_GetRect(FPDF_TEXTPAGE text_page,
    202                                         int rect_index,
    203                                         double* left,
    204                                         double* top,
    205                                         double* right,
    206                                         double* bottom);
    207 
    208 // Function: FPDFText_GetBoundedText
    209 //          Extract Unicode text within a rectangular boundary on the page.
    210 // Parameters:
    211 //          text_page   -   Handle to a text page information structure.
    212 //                          Returned by FPDFText_LoadPage function.
    213 //          left        -   Left boundary.
    214 //          top         -   Top boundary.
    215 //          right       -   Right boundary.
    216 //          bottom      -   Bottom boundary.
    217 //          buffer      -   A Unicode buffer.
    218 //          buflen      -   Number of characters (not bytes) for the buffer,
    219 //                          excluding an additional terminator.
    220 // Return Value:
    221 //          If buffer is NULL or buflen is zero, return the number of
    222 //          characters (not bytes) of text present within the rectangle,
    223 //          excluding a terminating NUL.
    224 //          Generally you should pass a buffer at least one character larger
    225 //          than this if you want a terminating NUL, which will be provided if
    226 //          space is available.
    227 //          Otherwise, return the number of characters copied into the
    228 //          buffer, including the terminating NUL when space for it is
    229 //          available.
    230 // Comments:
    231 //          If the buffer is too small, as much text as will fit is copied into
    232 //          it.
    233 //
    234 DLLEXPORT int STDCALL FPDFText_GetBoundedText(FPDF_TEXTPAGE text_page,
    235                                               double left,
    236                                               double top,
    237                                               double right,
    238                                               double bottom,
    239                                               unsigned short* buffer,
    240                                               int buflen);
    241 
    242 // Option flags used by the FPDFText_FindStart function.
    243 #define FPDF_MATCHCASE \
    244   0x00000001  // Match case. If not set, the search does not match case.
    245 #define FPDF_MATCHWHOLEWORD \
    246   0x00000002  // If not set, the search does not match the whole word.
    247 
    248 // Function: FPDFText_FindStart
    249 //          Start a search.
    250 // Parameters:
    251 //          text_page   -   Handle to a text page information structure.
    252 //                          Returned by FPDFText_LoadPage function.
    253 //          findwhat    -   A Unicode match pattern.
    254 //          flags       -   Option flags.
    255 //          start_index -   Start from this character. -1 for end of the page.
    256 // Return Value:
    257 //          A handle for the search context. FPDFText_FindClose must be called
    258 //          to release this handle.
    259 //
    260 DLLEXPORT FPDF_SCHHANDLE STDCALL FPDFText_FindStart(FPDF_TEXTPAGE text_page,
    261                                                     FPDF_WIDESTRING findwhat,
    262                                                     unsigned long flags,
    263                                                     int start_index);
    264 
    265 // Function: FPDFText_FindNext
    266 //          Search in the direction from page start to end.
    267 // Parameters:
    268 //          handle      -   A search context handle returned by
    269 //                          FPDFText_FindStart.
    270 // Return Value:
    271 //          Whether a match is found.
    272 //
    273 DLLEXPORT FPDF_BOOL STDCALL FPDFText_FindNext(FPDF_SCHHANDLE handle);
    274 
    275 // Function: FPDFText_FindPrev
    276 //          Search in the direction from page end to start.
    277 // Parameters:
    278 //          handle      -   A search context handle returned by
    279 //                          FPDFText_FindStart.
    280 // Return Value:
    281 //          Whether a match is found.
    282 //
    283 DLLEXPORT FPDF_BOOL STDCALL FPDFText_FindPrev(FPDF_SCHHANDLE handle);
    284 
    285 // Function: FPDFText_GetSchResultIndex
    286 //          Get the starting character index of the search result.
    287 // Parameters:
    288 //          handle      -   A search context handle returned by
    289 //                          FPDFText_FindStart.
    290 // Return Value:
    291 //          Index of the starting character.
    292 //
    293 DLLEXPORT int STDCALL FPDFText_GetSchResultIndex(FPDF_SCHHANDLE handle);
    294 
    295 // Function: FPDFText_GetSchCount
    296 //          Get the number of matched characters in the search result.
    297 // Parameters:
    298 //          handle      -   A search context handle returned by
    299 //                          FPDFText_FindStart.
    300 // Return Value:
    301 //          Number of matched characters.
    302 //
    303 DLLEXPORT int STDCALL FPDFText_GetSchCount(FPDF_SCHHANDLE handle);
    304 
    305 // Function: FPDFText_FindClose
    306 //          Release a search context.
    307 // Parameters:
    308 //          handle      -   A search context handle returned by
    309 //                          FPDFText_FindStart.
    310 // Return Value:
    311 //          None.
    312 //
    313 DLLEXPORT void STDCALL FPDFText_FindClose(FPDF_SCHHANDLE handle);
    314 
    315 // Function: FPDFLink_LoadWebLinks
    316 //          Prepare information about weblinks in a page.
    317 // Parameters:
    318 //          text_page   -   Handle to a text page information structure.
    319 //                          Returned by FPDFText_LoadPage function.
    320 // Return Value:
    321 //          A handle to the page's links information structure.
    322 //          NULL if something goes wrong.
    323 // Comments:
    324 //          Weblinks are links that are implicitly embedded in PDF pages.
    325 //          PDF also has a type of annotation called "link"; FPDFTEXT does not
    326 //          deal with that kind of link.
    327 //
    328 //          The FPDFTEXT weblink feature is useful for automatically detecting
    329 //          links in the page contents.
    330 //
    331 //          For example, things like "http://www.foxitsoftware.com" will be
    332 //          detected, so applications can allow the user to click on those
    333 //          characters to activate the link, even if the PDF does not come
    334 //          with link annotations.
    335 //
    336 //          FPDFLink_CloseWebLinks must be called to release resources.
    337 //
    338 DLLEXPORT FPDF_PAGELINK STDCALL FPDFLink_LoadWebLinks(FPDF_TEXTPAGE text_page);
    339 
    340 // Function: FPDFLink_CountWebLinks
    341 //          Count the number of detected web links.
    342 // Parameters:
    343 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
    344 // Return Value:
    345 //          Number of detected web links.
    346 //
    347 DLLEXPORT int STDCALL FPDFLink_CountWebLinks(FPDF_PAGELINK link_page);
    348 
    349 // Function: FPDFLink_GetURL
    350 //          Fetch the URL information for a detected web link.
    351 // Parameters:
    352 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
    353 //          link_index  -   Zero-based index for the link.
    354 //          buffer      -   A Unicode buffer.
    355 //          buflen      -   Number of characters (not bytes) for the buffer,
    356 //                          including an additional terminator.
    357 // Return Value:
    358 //          If buffer is NULL or buflen is zero, return the number of characters
    359 //          needed (not bytes; the additional terminator is also counted).
    360 //          Otherwise, return the number of characters copied into the buffer.
    361 //
    362 DLLEXPORT int STDCALL FPDFLink_GetURL(FPDF_PAGELINK link_page,
    363                                       int link_index,
    364                                       unsigned short* buffer,
    365                                       int buflen);
    366 
    367 // Function: FPDFLink_CountRects
    368 //          Count the number of rectangular areas for the link.
    369 // Parameters:
    370 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
    371 //          link_index  -   Zero-based index for the link.
    372 // Return Value:
    373 //          Number of rectangular areas for the link.
    374 //
    375 DLLEXPORT int STDCALL FPDFLink_CountRects(FPDF_PAGELINK link_page,
    376                                           int link_index);
    377 
    378 // Function: FPDFLink_GetRect
    379 //          Fetch the boundaries of a rectangle for a link.
    380 // Parameters:
    381 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
    382 //          link_index  -   Zero-based index for the link.
    383 //          rect_index  -   Zero-based index for a rectangle.
    384 //          left        -   Pointer to a double value receiving the rectangle
    385 //                          left boundary.
    386 //          top         -   Pointer to a double value receiving the rectangle
    387 //                          top boundary.
    388 //          right       -   Pointer to a double value receiving the rectangle
    389 //                          right boundary.
    390 //          bottom      -   Pointer to a double value receiving the rectangle
    391 //                          bottom boundary.
    392 // Return Value:
    393 //          None.
    394 //
    395 DLLEXPORT void STDCALL FPDFLink_GetRect(FPDF_PAGELINK link_page,
    396                                         int link_index,
    397                                         int rect_index,
    398                                         double* left,
    399                                         double* top,
    400                                         double* right,
    401                                         double* bottom);
    402 
    403 // Function: FPDFLink_CloseWebLinks
    404 //          Release resources used by the weblink feature.
    405 // Parameters:
    406 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
    407 // Return Value:
    408 //          None.
    409 //
    410 DLLEXPORT void STDCALL FPDFLink_CloseWebLinks(FPDF_PAGELINK link_page);
    411 
    412 #ifdef __cplusplus
    413 }
    414 #endif
    415 
    416 #endif  // PUBLIC_FPDF_TEXT_H_
    417