Home | History | Annotate | Download | only in public
      1 // Copyright 2014 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #ifndef PUBLIC_FPDF_TEXT_H_
      8 #define PUBLIC_FPDF_TEXT_H_
      9 
     10 #include "fpdfview.h"
     11 
     12 // Exported Functions
     13 #ifdef __cplusplus
     14 extern "C" {
     15 #endif
     16 
     17 // Function: FPDFText_LoadPage
     18 //          Prepare information about all characters in a page.
     19 // Parameters:
     20 //          page    -   Handle to the page, as returned by the FPDF_LoadPage function (in the FPDFVIEW module).
     21 // Return value:
     22 //          A handle to the text page information structure,
     23 //          or NULL if something goes wrong.
     24 // Comments:
     25 //          The application must call FPDFText_ClosePage to release the text page information.
     26 //
     27 DLLEXPORT FPDF_TEXTPAGE STDCALL FPDFText_LoadPage(FPDF_PAGE page);
     28 
     29 // Function: FPDFText_ClosePage
     30 //          Release all resources allocated for a text page information structure.
     31 // Parameters:
     32 //          text_page   -   Handle to a text page information structure, as returned by FPDFText_LoadPage.
     33 // Return value:
     34 //          None.
     35 //
     36 DLLEXPORT void STDCALL FPDFText_ClosePage(FPDF_TEXTPAGE text_page);
     37 
     38 // Function: FPDFText_CountChars
     39 //          Get the number of characters in a page.
     40 // Parameters:
     41 //          text_page   -   Handle to a text page information structure, as returned by FPDFText_LoadPage.
     42 // Return value:
     43 //          Number of characters in the page, or -1 on error.
     44 //          Generated characters, such as additional space and new line characters, are also counted.
     45 // Comments:
     46 //          Characters in a page form a "stream"; within the stream, each character has an index.
     47 //          These indexes are used as parameters by many of the FPDFText functions. The first character
     48 //          in the page has an index value of zero.
     49 //
     50 DLLEXPORT int STDCALL FPDFText_CountChars(FPDF_TEXTPAGE text_page);
     51 
     52 // Function: FPDFText_GetUnicode
     53 //          Get the Unicode value of a character in a page.
     54 // Parameters:
     55 //          text_page   -   Handle to a text page information structure, as returned by FPDFText_LoadPage.
     56 //          index       -   Zero-based index of the character.
     57 // Return value:
     58 //          The Unicode value of the specified character.
     59 //          If the character is not encoded in Unicode and the Foxit engine cannot convert it to Unicode,
     60 //          the return value will be zero.
     61 //
     62 DLLEXPORT unsigned int STDCALL FPDFText_GetUnicode(FPDF_TEXTPAGE text_page, int index);
     63 
     64 // Function: FPDFText_GetFontSize
     65 //          Get the font size of a particular character.
     66 // Parameters:
     67 //          text_page   -   Handle to a text page information structure, as returned by FPDFText_LoadPage.
     68 //          index       -   Zero-based index of the character.
     69 // Return value:
     70 //          The font size of the specified character, measured in points (about 1/72 inch).
     71 //          This is the typographic size of the font (the so-called "em size").
     72 //
     73 DLLEXPORT double STDCALL FPDFText_GetFontSize(FPDF_TEXTPAGE text_page, int index);
     74 
     75 // Function: FPDFText_GetCharBox
     76 //          Get the bounding box of a particular character.
     77 // Parameters:
     78 //          text_page   -   Handle to a text page information structure, as returned by FPDFText_LoadPage.
     79 //          index       -   Zero-based index of the character.
     80 //          left        -   Pointer to a double receiving the left position of the character box.
     81 //          right       -   Pointer to a double receiving the right position of the character box.
     82 //          bottom      -   Pointer to a double receiving the bottom position of the character box.
     83 //          top         -   Pointer to a double receiving the top position of the character box.
     84 // Return value:
     85 //          None.
     86 // Comments:
     87 //          All positions are measured in PDF "user space".
     88 //
     89 DLLEXPORT void STDCALL FPDFText_GetCharBox(FPDF_TEXTPAGE text_page, int index, double* left,
     90                                                     double* right, double* bottom, double* top);
     91 
     92 // Function: FPDFText_GetCharIndexAtPos
     93 //          Get the index of a character at or near a given position on the page.
     94 // Parameters:
     95 //          text_page   -   Handle to a text page information structure, as returned by FPDFText_LoadPage.
     96 //          x           -   X position in PDF "user space".
     97 //          y           -   Y position in PDF "user space".
     98 //          xTolerance  -   An x-axis tolerance value for character hit detection, in points.
     99 //          yTolerance  -   A y-axis tolerance value for character hit detection, in points.
    100 // Return value:
    101 //          The zero-based index of the character at, or near, the point (x,y).
    102 //          If there is no character at or near the point, the return value will be -1.
    103 //          If an error occurs, -3 will be returned.
    104 //
    105 DLLEXPORT int STDCALL FPDFText_GetCharIndexAtPos(FPDF_TEXTPAGE text_page,
    106                                                  double x, double y, double xTolerance, double yTolerance);
    107 
    108 // Function: FPDFText_GetText
    109 //          Extract a Unicode text string from the page.
    110 // Parameters:
    111 //          text_page   -   Handle to a text page information structure, as returned by FPDFText_LoadPage.
    112 //          start_index -   Index of the first character to extract.
    113 //          count       -   Number of characters to be extracted.
    114 //          result      -   A buffer (allocated by the application) receiving the extracted Unicode characters.
    115 //                          The buffer must be large enough to hold that number of characters plus a terminator.
    116 // Return value:
    117 //          Number of characters written into the result buffer, including the trailing terminator.
    118 // Comments:
    119 //          This function ignores characters without Unicode information.
    120 //
    121 DLLEXPORT int STDCALL FPDFText_GetText(FPDF_TEXTPAGE text_page, int start_index, int count, unsigned short* result);
    122 
    123 // Function: FPDFText_CountRects
    124 //          Count the number of rectangular areas occupied by a segment of text.
    125 // Parameters:
    126 //          text_page   -   Handle to a text page information structure, as returned by FPDFText_LoadPage.
    127 //          start_index -   Index of the first character of the segment.
    128 //          count       -   Number of characters.
    129 // Return value:
    130 //          Number of rectangles, or zero on error.
    131 // Comments:
    132 //          This function, along with FPDFText_GetRect, can be used by applications to detect the position
    133 //          of a text segment on the page, so that the proper areas can be highlighted, for example.
    134 //          FPDFTEXT will automatically merge small character boxes into a bigger one if those characters
    135 //          are on the same line and use the same font settings.
    136 //
    137 DLLEXPORT int STDCALL FPDFText_CountRects(FPDF_TEXTPAGE text_page, int start_index, int count);
    138 
    139 // Function: FPDFText_GetRect
    140 //          Get one rectangular area from the result generated by FPDFText_CountRects.
    141 // Parameters:
    142 //          text_page   -   Handle to a text page information structure, as returned by FPDFText_LoadPage.
    143 //          rect_index  -   Zero-based index of the rectangle.
    144 //          left        -   Pointer to a double receiving the left boundary of the rectangle.
    145 //          top         -   Pointer to a double receiving the top boundary of the rectangle.
    146 //          right       -   Pointer to a double receiving the right boundary of the rectangle.
    147 //          bottom      -   Pointer to a double receiving the bottom boundary of the rectangle.
    148 // Return value:
    149 //          None.
    150 //
    151 DLLEXPORT void STDCALL FPDFText_GetRect(FPDF_TEXTPAGE text_page, int rect_index, double* left, double* top,
    152                                             double* right, double* bottom);
    153 
    154 // Function: FPDFText_GetBoundedText
    155 //          Extract Unicode text within a rectangular boundary on the page.
    156 // Parameters:
    157 //          text_page   -   Handle to a text page information structure, as returned by FPDFText_LoadPage.
    158 //          left        -   Left boundary.
    159 //          top         -   Top boundary.
    160 //          right       -   Right boundary.
    161 //          bottom      -   Bottom boundary.
    162 //          buffer      -   A Unicode buffer.
    163 //          buflen      -   Number of characters (not bytes) for the buffer, excluding an additional terminator.
    164 // Return value:
    165 //          If buffer is NULL or buflen is zero, return the number of characters (not bytes) of text present
    166 //          within the rectangle, excluding a terminating NUL.  Generally you should pass a buffer at least one
    167 //          larger than this if you want a terminating NUL, which will be provided if space is available.
    168 //          Otherwise, return the number of characters copied into the buffer, including the terminating NUL
    169 //          when space for it is available.
    170 // Comments:
    171 //          If the buffer is too small, as much text as will fit is copied into it.
    172 //
    173 DLLEXPORT int STDCALL FPDFText_GetBoundedText(FPDF_TEXTPAGE text_page,double left, double top,
    174                                               double right, double bottom,unsigned short* buffer,int buflen);
    175 
    176 
    177 // Flags used by the FPDFText_FindStart function.
    178 #define FPDF_MATCHCASE      0x00000001      // If not set, the search does not match case by default.
    179 #define FPDF_MATCHWHOLEWORD 0x00000002      // If not set, the search does not match whole words by default.
    180 
    181 // Function: FPDFText_FindStart
    182 //          Start a search.
    183 // Parameters:
    184 //          text_page   -   Handle to a text page information structure, as returned by FPDFText_LoadPage.
    185 //          findwhat    -   A Unicode match pattern.
    186 //          flags       -   Option flags (see the FPDF_MATCHCASE and FPDF_MATCHWHOLEWORD flags).
    187 //          start_index -   Index of the character to start from; -1 for the end of the page.
    188 // Return value:
    189 //          A handle for the search context. FPDFText_FindClose must be called to release this handle.
    190 //
    191 DLLEXPORT FPDF_SCHHANDLE STDCALL FPDFText_FindStart(FPDF_TEXTPAGE text_page, FPDF_WIDESTRING findwhat,
    192                                                     unsigned long flags, int start_index);
    193 
    194 // Function: FPDFText_FindNext
    195 //          Search forward, in the direction from the start of the page to its end.
    196 // Parameters:
    197 //          handle      -   A search context handle, as returned by FPDFText_FindStart.
    198 // Return value:
    199 //          Whether a match was found.
    200 //
    201 DLLEXPORT FPDF_BOOL STDCALL FPDFText_FindNext(FPDF_SCHHANDLE handle);
    202 
    203 // Function: FPDFText_FindPrev
    204 //          Search backward, in the direction from the end of the page to its start.
    205 // Parameters:
    206 //          handle      -   A search context handle, as returned by FPDFText_FindStart.
    207 // Return value:
    208 //          Whether a match was found.
    209 //
    210 DLLEXPORT FPDF_BOOL STDCALL FPDFText_FindPrev(FPDF_SCHHANDLE handle);
    211 
    212 // Function: FPDFText_GetSchResultIndex
    213 //          Get the starting character index of the search result.
    214 // Parameters:
    215 //          handle      -   A search context handle, as returned by FPDFText_FindStart.
    216 // Return value:
    217 //          Index of the first character of the search result.
    218 //
    219 DLLEXPORT int STDCALL FPDFText_GetSchResultIndex(FPDF_SCHHANDLE handle);
    220 
    221 // Function: FPDFText_GetSchCount
    222 //          Get the number of matched characters in the search result.
    223 // Parameters:
    224 //          handle      -   A search context handle, as returned by FPDFText_FindStart.
    225 // Return value:
    226 //          Number of matched characters.
    227 //
    228 DLLEXPORT int STDCALL FPDFText_GetSchCount(FPDF_SCHHANDLE handle);
    229 
    230 // Function: FPDFText_FindClose
    231 //          Release a search context.
    232 // Parameters:
    233 //          handle      -   A search context handle, as returned by FPDFText_FindStart.
    234 // Return value:
    235 //          None.
    236 //
    237 DLLEXPORT void STDCALL FPDFText_FindClose(FPDF_SCHHANDLE handle);
    238 
    239 // Function: FPDFLink_LoadWebLinks
    240 //          Prepare information about weblinks in a page.
    241 // Parameters:
    242 //          text_page   -   Handle to a text page information structure, as returned by FPDFText_LoadPage.
    243 // Return value:
    244 //          A handle to the page's links information structure,
    245 //          or NULL if something goes wrong.
    246 // Comments:
    247 //          Weblinks are links implicitly embedded in the text of PDF pages. PDF also has a type of
    248 //          annotation called "link"; FPDFTEXT doesn't deal with that kind of link.
    249 //          The FPDFTEXT weblink feature is useful for automatically detecting links in the page
    250 //          contents. For example, things like "http://www.foxitsoftware.com" will be detected,
    251 //          so applications can allow the user to click on those characters to activate the link,
    252 //          even if the PDF doesn't come with link annotations.
    253 //
    254 //          FPDFLink_CloseWebLinks must be called to release resources.
    255 //
    256 DLLEXPORT FPDF_PAGELINK STDCALL FPDFLink_LoadWebLinks(FPDF_TEXTPAGE text_page);
    257 
    258 // Function: FPDFLink_CountWebLinks
    259 //          Count the number of detected web links.
    260 // Parameters:
    261 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
    262 // Return value:
    263 //          Number of detected web links.
    264 //
    265 DLLEXPORT int STDCALL FPDFLink_CountWebLinks(FPDF_PAGELINK link_page);
    266 
    267 // Function: FPDFLink_GetURL
    268 //          Fetch the URL information for a detected web link.
    269 // Parameters:
    270 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
    271 //          link_index  -   Zero-based index of the link.
    272 //          buffer      -   A Unicode buffer.
    273 //          buflen      -   Number of characters (not bytes) for the buffer, including an additional terminator.
    274 // Return value:
    275 //          If buffer is NULL or buflen is zero, return the number of characters (not bytes) needed, including the
    276 //          additional terminator. Otherwise, return the number of characters copied into the buffer.
    277 //
    278 DLLEXPORT int STDCALL FPDFLink_GetURL(FPDF_PAGELINK link_page, int link_index, unsigned short* buffer,int buflen);
    279 
    280 // Function: FPDFLink_CountRects
    281 //          Count the number of rectangular areas for the link.
    282 // Parameters:
    283 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
    284 //          link_index  -   Zero-based index of the link.
    285 // Return value:
    286 //          Number of rectangular areas for the link.
    287 //
    288 DLLEXPORT int STDCALL FPDFLink_CountRects(FPDF_PAGELINK link_page, int link_index);
    289 
    290 // Function: FPDFLink_GetRect
    291 //          Fetch the boundaries of one rectangle for a link.
    292 // Parameters:
    293 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
    294 //          link_index  -   Zero-based index of the link.
    295 //          rect_index  -   Zero-based index of the rectangle.
    296 //          left        -   Pointer to a double receiving the left boundary of the rectangle.
    297 //          top         -   Pointer to a double receiving the top boundary of the rectangle.
    298 //          right       -   Pointer to a double receiving the right boundary of the rectangle.
    299 //          bottom      -   Pointer to a double receiving the bottom boundary of the rectangle.
    300 // Return value:
    301 //          None.
    302 //
    303 DLLEXPORT void STDCALL FPDFLink_GetRect(FPDF_PAGELINK link_page, int link_index, int rect_index,
    304                                         double* left, double* top,double* right, double* bottom);
    305 
    306 // Function: FPDFLink_CloseWebLinks
    307 //          Release the resources used by the weblink feature.
    308 // Parameters:
    309 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
    310 // Return value:
    311 //          None.
    312 //
    313 DLLEXPORT void STDCALL FPDFLink_CloseWebLinks(FPDF_PAGELINK link_page);
    314 
    315 
    316 #ifdef __cplusplus
    317 }
    318 #endif
    319 
    320 #endif  // PUBLIC_FPDF_TEXT_H_
    321