1 // Copyright 2014 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #ifndef _FPDFTEXT_H_ 8 #define _FPDFTEXT_H_ 9 10 #include "fpdfview.h" 11 12 // Exported Functions 13 #ifdef __cplusplus 14 extern "C" { 15 #endif 16 17 // Function: FPDFText_LoadPage 18 // Prepare information about all characters in a page. 19 // Parameters: 20 // page - Handle to the page. Returned by FPDF_LoadPage function (in FPDFVIEW module). 21 // Return value: 22 // A handle to the text page information structure. 23 // NULL if something goes wrong. 24 // Comments: 25 // Application must call FPDFText_ClosePage to release the text page information. 26 // If you don't purchase Text Module , this function will return NULL. 27 // 28 DLLEXPORT FPDF_TEXTPAGE STDCALL FPDFText_LoadPage(FPDF_PAGE page); 29 30 // Function: FPDFText_ClosePage 31 // Release all resources allocated for a text page information structure. 32 // Parameters: 33 // text_page - Handle to a text page information structure. Returned by FPDFText_LoadPage function. 34 // Return Value: 35 // None. 36 // 37 DLLEXPORT void STDCALL FPDFText_ClosePage(FPDF_TEXTPAGE text_page); 38 39 // Function: FPDFText_CountChars 40 // Get number of characters in a page. 41 // Parameters: 42 // text_page - Handle to a text page information structure. Returned by FPDFText_LoadPage function. 43 // Return value: 44 // Number of characters in the page. Return -1 for error. 45 // Generated characters, like additional space characters, new line characters, are also counted. 46 // Comments: 47 // Characters in a page form a "stream", inside the stream, each character has an index. 48 // We will use the index parameters in many of FPDFTEXT functions. The first character in the page 49 // has an index value of zero. 50 // 51 DLLEXPORT int STDCALL FPDFText_CountChars(FPDF_TEXTPAGE text_page); 52 53 // Function: FPDFText_GetUnicode 54 // Get Unicode of a character in a page. 55 // Parameters: 56 // text_page - Handle to a text page information structure. Returned by FPDFText_LoadPage function. 57 // index - Zero-based index of the character. 58 // Return value: 59 // The Unicode of the particular character. 60 // If a character is not encoded in Unicode and Foxit engine can't convert to Unicode, 61 // the return value will be zero. 62 // 63 DLLEXPORT unsigned int STDCALL FPDFText_GetUnicode(FPDF_TEXTPAGE text_page, int index); 64 65 // Function: FPDFText_GetFontSize 66 // Get the font size of a particular character. 67 // Parameters: 68 // text_page - Handle to a text page information structure. Returned by FPDFText_LoadPage function. 69 // index - Zero-based index of the character. 70 // Return value: 71 // The font size of the particular character, measured in points (about 1/72 inch). 72 // This is the typographic size of the font (so called "em size"). 73 // 74 DLLEXPORT double STDCALL FPDFText_GetFontSize(FPDF_TEXTPAGE text_page, int index); 75 76 // Function: FPDFText_GetCharBox 77 // Get bounding box of a particular character. 78 // Parameters: 79 // text_page - Handle to a text page information structure. Returned by FPDFText_LoadPage function. 80 // index - Zero-based index of the character. 81 // left - Pointer to a double number receiving left position of the character box. 82 // right - Pointer to a double number receiving right position of the character box. 83 // bottom - Pointer to a double number receiving bottom position of the character box. 84 // top - Pointer to a double number receiving top position of the character box. 85 // Return Value: 86 // None. 87 // Comments: 88 // All positions are measured in PDF "user space". 89 // 90 DLLEXPORT void STDCALL FPDFText_GetCharBox(FPDF_TEXTPAGE text_page, int index, double* left, 91 double* right, double* bottom, double* top); 92 93 // Function: FPDFText_GetCharIndexAtPos 94 // Get the index of a character at or nearby a certain position on the page. 95 // Parameters: 96 // text_page - Handle to a text page information structure. Returned by FPDFText_LoadPage function. 97 // x - X position in PDF "user space". 98 // y - Y position in PDF "user space". 99 // xTolerance - An x-axis tolerance value for character hit detection, in point unit. 100 // yTolerance - A y-axis tolerance value for character hit detection, in point unit. 101 // Return Value: 102 // The zero-based index of the character at, or nearby the point (x,y). 103 // If there is no character at or nearby the point, return value will be -1. 104 // If an error occurs, -3 will be returned. 105 // 106 DLLEXPORT int STDCALL FPDFText_GetCharIndexAtPos(FPDF_TEXTPAGE text_page, 107 double x, double y, double xTorelance, double yTolerance); 108 109 // Function: FPDFText_GetText 110 // Extract unicode text string from the page. 111 // Parameters: 112 // text_page - Handle to a text page information structure. Returned by FPDFText_LoadPage function. 113 // start_index - Index for the start characters. 114 // count - Number of characters to be extracted. 115 // result - A buffer (allocated by application) receiving the extracted unicodes. 116 // The size of the buffer must be able to hold the number of characters plus a terminator. 117 // Return Value: 118 // Number of characters written into the result buffer, including the trailing terminator. 119 // Comments: 120 // This function ignores characters without unicode information. 121 // 122 DLLEXPORT int STDCALL FPDFText_GetText(FPDF_TEXTPAGE text_page, int start_index, int count, unsigned short* result); 123 124 // Function: FPDFText_CountRects 125 // Count number of rectangular areas occupied by a segment of texts. 126 // Parameters: 127 // text_page - Handle to a text page information structure. Returned by FPDFText_LoadPage function. 128 // start_index - Index for the start characters. 129 // count - Number of characters. 130 // Return value: 131 // Number of rectangles. Zero for error. 132 // Comments: 133 // This function, along with FPDFText_GetRect can be used by applications to detect the position 134 // on the page for a text segment, so proper areas can be highlighted or something. 135 // FPDFTEXT will automatically merge small character boxes into bigger one if those characters 136 // are on the same line and use same font settings. 137 // 138 DLLEXPORT int STDCALL FPDFText_CountRects(FPDF_TEXTPAGE text_page, int start_index, int count); 139 140 // Function: FPDFText_GetRect 141 // Get a rectangular area from the result generated by FPDFText_CountRects. 142 // Parameters: 143 // text_page - Handle to a text page information structure. Returned by FPDFText_LoadPage function. 144 // rect_index - Zero-based index for the rectangle. 145 // left - Pointer to a double value receiving the rectangle left boundary. 146 // top - Pointer to a double value receiving the rectangle top boundary. 147 // right - Pointer to a double value receiving the rectangle right boundary. 148 // bottom - Pointer to a double value receiving the rectangle bottom boundary. 149 // Return Value: 150 // None. 151 // 152 DLLEXPORT void STDCALL FPDFText_GetRect(FPDF_TEXTPAGE text_page, int rect_index, double* left, double* top, 153 double* right, double* bottom); 154 155 // Function: FPDFText_GetBoundedText 156 // Extract unicode text within a rectangular boundary on the page. 157 // Parameters: 158 // text_page - Handle to a text page information structure. Returned by FPDFText_LoadPage function. 159 // left - Left boundary. 160 // top - Top boundary. 161 // right - Right boundary. 162 // bottom - Bottom boundary. 163 // buffer - A unicode buffer. 164 // buflen - Number of characters (not bytes) for the buffer, excluding an additional terminator. 165 // Return Value: 166 // If buffer is NULL or buflen is zero, return number of characters (not bytes) needed, 167 // otherwise, return number of characters copied into the buffer. 168 // 169 DLLEXPORT int STDCALL FPDFText_GetBoundedText(FPDF_TEXTPAGE text_page,double left, double top, 170 double right, double bottom,unsigned short* buffer,int buflen); 171 172 173 // Flags used by FPDFText_FindStart function. 174 #define FPDF_MATCHCASE 0x00000001 //If not set, it will not match case by default. 175 #define FPDF_MATCHWHOLEWORD 0x00000002 //If not set, it will not match the whole word by default. 176 177 // Function: FPDFText_FindStart 178 // Start a search. 179 // Parameters: 180 // text_page - Handle to a text page information structure. Returned by FPDFText_LoadPage function. 181 // findwhat - A unicode match pattern. 182 // flags - Option flags. 183 // start_index - Start from this character. -1 for end of the page. 184 // Return Value: 185 // A handle for the search context. FPDFText_FindClose must be called to release this handle. 186 // 187 DLLEXPORT FPDF_SCHHANDLE STDCALL FPDFText_FindStart(FPDF_TEXTPAGE text_page, FPDF_WIDESTRING findwhat, 188 unsigned long flags, int start_index); 189 190 // Function: FPDFText_FindNext 191 // Search in the direction from page start to end. 192 // Parameters: 193 // handle - A search context handle returned by FPDFText_FindStart. 194 // Return Value: 195 // Whether a match is found. 196 // 197 DLLEXPORT FPDF_BOOL STDCALL FPDFText_FindNext(FPDF_SCHHANDLE handle); 198 199 // Function: FPDFText_FindPrev 200 // Search in the direction from page end to start. 201 // Parameters: 202 // handle - A search context handle returned by FPDFText_FindStart. 203 // Return Value: 204 // Whether a match is found. 205 // 206 DLLEXPORT FPDF_BOOL STDCALL FPDFText_FindPrev(FPDF_SCHHANDLE handle); 207 208 // Function: FPDFText_GetSchResultIndex 209 // Get the starting character index of the search result. 210 // Parameters: 211 // handle - A search context handle returned by FPDFText_FindStart. 212 // Return Value: 213 // Index for the starting character. 214 // 215 DLLEXPORT int STDCALL FPDFText_GetSchResultIndex(FPDF_SCHHANDLE handle); 216 217 // Function: FPDFText_GetSchCount 218 // Get the number of matched characters in the search result. 219 // Parameters: 220 // handle - A search context handle returned by FPDFText_FindStart. 221 // Return Value: 222 // Number of matched characters. 223 // 224 DLLEXPORT int STDCALL FPDFText_GetSchCount(FPDF_SCHHANDLE handle); 225 226 // Function: FPDFText_FindClose 227 // Release a search context. 228 // Parameters: 229 // handle - A search context handle returned by FPDFText_FindStart. 230 // Return Value: 231 // None. 232 // 233 DLLEXPORT void STDCALL FPDFText_FindClose(FPDF_SCHHANDLE handle); 234 235 // Function: FPDFLink_LoadWebLinks 236 // Prepare information about weblinks in a page. 237 // Parameters: 238 // text_page - Handle to a text page information structure. Returned by FPDFText_LoadPage function. 239 // Return Value: 240 // A handle to the page's links information structure. 241 // NULL if something goes wrong. 242 // Comments: 243 // Weblinks are those links implicitly embedded in PDF pages. PDF also has a type of 244 // annotation called "link", FPDFTEXT doesn't deal with that kind of link. 245 // FPDFTEXT weblink feature is useful for automatically detecting links in the page 246 // contents. For example, things like "http://www.foxitsoftware.com" will be detected, 247 // so applications can allow user to click on those characters to activate the link, 248 // even the PDF doesn't come with link annotations. 249 // 250 // FPDFLink_CloseWebLinks must be called to release resources. 251 // 252 DLLEXPORT FPDF_PAGELINK STDCALL FPDFLink_LoadWebLinks(FPDF_TEXTPAGE text_page); 253 254 // Function: FPDFLink_CountWebLinks 255 // Count number of detected web links. 256 // Parameters: 257 // link_page - Handle returned by FPDFLink_LoadWebLinks. 258 // Return Value: 259 // Number of detected web links. 260 // 261 DLLEXPORT int STDCALL FPDFLink_CountWebLinks(FPDF_PAGELINK link_page); 262 263 // Function: FPDFLink_GetURL 264 // Fetch the URL information for a detected web link. 265 // Parameters: 266 // link_page - Handle returned by FPDFLink_LoadWebLinks. 267 // link_index - Zero-based index for the link. 268 // buffer - A unicode buffer. 269 // buflen - Number of characters (not bytes) for the buffer, including an additional terminator. 270 // Return Value: 271 // If buffer is NULL or buflen is zero, return number of characters (not bytes and an additional terminator is also counted) needed, 272 // otherwise, return number of characters copied into the buffer. 273 // 274 DLLEXPORT int STDCALL FPDFLink_GetURL(FPDF_PAGELINK link_page, int link_index, unsigned short* buffer,int buflen); 275 276 // Function: FPDFLink_CountRects 277 // Count number of rectangular areas for the link. 278 // Parameters: 279 // link_page - Handle returned by FPDFLink_LoadWebLinks. 280 // link_index - Zero-based index for the link. 281 // Return Value: 282 // Number of rectangular areas for the link. 283 // 284 DLLEXPORT int STDCALL FPDFLink_CountRects(FPDF_PAGELINK link_page, int link_index); 285 286 // Function: FPDFLink_GetRect 287 // Fetch the boundaries of a rectangle for a link. 288 // Parameters: 289 // link_page - Handle returned by FPDFLink_LoadWebLinks. 290 // link_index - Zero-based index for the link. 291 // rect_index - Zero-based index for a rectangle. 292 // left - Pointer to a double value receiving the rectangle left boundary. 293 // top - Pointer to a double value receiving the rectangle top boundary. 294 // right - Pointer to a double value receiving the rectangle right boundary. 295 // bottom - Pointer to a double value receiving the rectangle bottom boundary. 296 // Return Value: 297 // None. 298 // 299 DLLEXPORT void STDCALL FPDFLink_GetRect(FPDF_PAGELINK link_page, int link_index, int rect_index, 300 double* left, double* top,double* right, double* bottom); 301 302 // Function: FPDFLink_CloseWebLinks 303 // Release resources used by weblink feature. 304 // Parameters: 305 // link_page - Handle returned by FPDFLink_LoadWebLinks. 306 // Return Value: 307 // None. 308 // 309 DLLEXPORT void STDCALL FPDFLink_CloseWebLinks(FPDF_PAGELINK link_page); 310 311 312 #ifdef __cplusplus 313 }; 314 #endif 315 316 #endif//_FPDFTEXT_H_ 317