1 * Summary: interface for an HTML 4.0 non-verifying parser 2 * Description: this module implements an HTML 4.0 non-verifying parser 3 * with API compatible with the XML parser ones. It should 4 * be able to parse "real world" HTML, even if severely 5 * broken from a specification point of view. 6 * 7 * Copy: See Copyright for the status of this software. 8 * 9 * Author: Patrick Monnerat <pm (a] datasphere.ch>, DATASPHERE S.A. 10 11 /if not defined(HTML_PARSER_H__) 12 /define HTML_PARSER_H__ 13 14 /include "libxmlrpg/xmlversion" 15 16 /if defined(LIBXML_HTML_ENABLED) 17 18 /include "libxmlrpg/xmlTypesC" 19 /include "libxmlrpg/parser" 20 21 * Most of the back-end structures from XML and HTML are shared. 22 23 d htmlParserCtxtPtr... 24 d s based(######typedef######) 25 d like(xmlParserCtxtPtr) 26 27 d htmlParserCtxt ds based(htmlParserCtxtPtr) 28 d likeds(xmlParserCtxt) 29 30 d htmlParserNodeInfoPtr... 31 d s based(######typedef######) 32 d like(xmlParserNodeInfoPtr) 33 34 d htmlParserNodeInfo... 35 d ds based(htmlParserNodeInfoPtr) 36 d likeds(xmlParserNodeInfo) 37 38 d htmlSAXHandlerPtr... 39 d s based(######typedef######) 40 d like(xmlSAXHandlerPtr) 41 42 d htmlSAXHandler ds based(htmlSAXHandlerPtr) 43 d likeds(xmlSAXHandler) 44 45 d htmlParserInputPtr... 46 d s based(######typedef######) 47 d like(xmlParserInputPtr) 48 49 d htmlParserInput... 50 d ds based(htmlParserInputPtr) 51 d likeds(xmlParserInput) 52 53 d htmlDocPtr s based(######typedef######) 54 d like(xmlDocPtr) 55 56 d htmlNodePtr s based(######typedef######) 57 d like(xmlNodePtr) 58 59 * Internal description of an HTML element, representing HTML 4.01 60 * and XHTML 1.0 (which share the same structure). 61 62 d htmlElemDescPtr... 63 d s * based(######typedef######) 64 65 d htmlElemDesc ds based(htmlElemDescPtr) 66 d align qualified 67 d name * const char * 68 d startTag like(xmlCchar) Start tag implied ? 69 d endTag like(xmlCchar) End tag implied ? 70 d saveEndTag like(xmlCchar) Save end tag ? 71 d empty like(xmlCchar) Empty element ? 72 d depr like(xmlCchar) Deprecated element ? 73 d dtd like(xmlCchar) Loose DTD/Frameset 74 d isinline like(xmlCchar) Block 0/inline elem? 75 d desc * const char * 76 * 77 * New fields encapsulating HTML structure 78 * 79 * Bugs: 80 * This is a very limited representation. It fails to tell us when 81 * an element *requires* subelements (we only have whether they're 82 * allowed or not), and it doesn't tell us where CDATA and PCDATA 83 * are allowed. Some element relationships are not fully represented: 84 * these are flagged with the word MODIFIER 85 * 86 d subelts * const char * * 87 d defaultsubelt * const char * 88 d attrs_opt * const char * * 89 d attrs_depr * const char * * 90 d attrs_req * const char * * 91 92 * Internal description of an HTML entity. 93 94 d htmlEntityDescPtr... 95 d s * based(######typedef######) 96 97 d htmlEntityDesc... 98 d ds based(htmlEntityDescPtr) 99 d align qualified 100 d value like(xmlCuint) 101 d name * const char * 102 d desc * const char * 103 104 * There is only few public functions. 105 106 d htmlTagLookup pr extproc('htmlTagLookup') 107 d like(htmlElemDescPtr) const 108 d tag * value options(*string) const xmlChar * 109 110 d htmlEntityLookup... 111 d pr extproc('htmlEntityLookup') 112 d like(htmlEntityDescPtr) const 113 d name * value options(*string) const xmlChar * 114 115 d htmlEntityValueLookup... 116 d pr extproc('htmlEntityValueLookup') 117 d like(htmlEntityDescPtr) const 118 d value value like(xmlCuint) 119 120 d htmlIsAutoClosed... 121 d pr extproc('htmlIsAutoClosed') 122 d like(xmlCint) 123 d doc value like(htmlDocPtr) 124 d elem value like(htmlNodePtr) 125 126 d htmlAutoCloseTag... 127 d pr extproc('htmlAutoCloseTag') 128 d like(xmlCint) 129 d doc value like(htmlDocPtr) 130 d name * value options(*string) const xmlChar * 131 d elem value like(htmlNodePtr) 132 133 d htmlParseEntityRef... 134 d pr extproc('htmlParseEntityRef') 135 d like(htmlEntityDescPtr) const 136 d ctxt value like(htmlParserCtxtPtr) 137 d str * const xmlChar *(*) 138 139 d htmlParseCharRef... 140 d pr extproc('htmlParseCharRef') 141 d like(xmlCint) 142 d ctxt value like(htmlParserCtxtPtr) 143 144 d htmlParseElement... 145 d pr extproc('htmlParseElement') 146 d ctxt value like(htmlParserCtxtPtr) 147 148 d htmlNewParserCtxt... 149 d pr extproc('htmlNewParserCtxt') 150 d like(htmlParserCtxtPtr) 151 152 d htmlCreateMemoryParserCtxt... 153 d pr extproc('htmlCreateMemoryParserCtxt') 154 d like(htmlParserCtxtPtr) 155 d buffer * value options(*string) const char * 156 d size value like(xmlCint) 157 158 d htmlParseDocument... 159 d pr extproc('htmlParseDocument') 160 d like(xmlCint) 161 d ctxt value like(htmlParserCtxtPtr) 162 163 d htmlSAXParseDoc... 164 d pr extproc('htmlSAXParseDoc') 165 d like(htmlDocPtr) 166 d cur * value options(*string) xmlChar * 167 d encoding * value options(*string) const char * 168 d sax value like(htmlSAXHandlerPtr) 169 d userData * value void * 170 171 d htmlParseDoc pr extproc('htmlParseDoc') 172 d like(htmlDocPtr) 173 d cur * value options(*string) xmlChar * 174 d encoding * value options(*string) const char * 175 176 d htmlSAXParseFile... 177 d pr extproc('htmlSAXParseFile') 178 d like(htmlDocPtr) 179 d filename * value options(*string) const char * 180 d encoding * value options(*string) const char * 181 d sax value like(htmlSAXHandlerPtr) 182 d userData * value void * 183 184 d htmlParseFile pr extproc('htmlParseFile') 185 d like(htmlDocPtr) 186 d filename * value options(*string) const char * 187 d encoding * value options(*string) const char * 188 189 d UTF8ToHtml pr extproc('UTF8ToHtml') 190 d like(xmlCint) 191 d out 65535 options(*varsize) unsigned char [] 192 d outlen like(xmlCint) 193 d in * value options(*string) const unsigned char* 194 d inlen like(xmlCint) 195 196 d htmlEncodeEntities... 197 d pr extproc('htmlEncodeEntities') 198 d like(xmlCint) 199 d out 65535 options(*varsize) unsigned char [] 200 d outlen like(xmlCint) 201 d in * value options(*string) const unsigned char* 202 d inlen like(xmlCint) 203 d quoteChar value like(xmlCint) 204 205 d htmlIsScriptAttribute... 206 d pr extproc('htmlIsScriptAttribute') 207 d like(xmlCint) 208 d name * value options(*string) const xmlChar * 209 210 d htmlHandleOmittedElem... 211 d pr extproc('htmlHandleOmittedElem') 212 d like(xmlCint) 213 d val value like(xmlCint) 214 215 /if defined(LIBXML_PUSH_ENABLED) 216 217 * Interfaces for the Push mode. 218 219 d htmlCreatePushParserCtxt... 220 d pr extproc('htmlCreatePushParserCtxt') 221 d like(htmlParserCtxtPtr) 222 d sax value like(htmlSAXHandlerPtr) 223 d user_data * value void * 224 d chunk * value options(*string) const char * 225 d size value like(xmlCint) 226 d filename * value options(*string) const char * 227 d enc value like(xmlCharEncoding) 228 229 d htmlParseChunk pr extproc('htmlParseChunk') 230 d like(xmlCint) 231 d ctxt value like(htmlParserCtxtPtr) 232 d chunk * value options(*string) const char * 233 d size value like(xmlCint) 234 d terminate value like(xmlCint) 235 /endif LIBXML_PUSH_ENABLED 236 237 d htmlFreeParserCtxt... 238 d pr extproc('htmlFreeParserCtxt') 239 d ctxt value like(htmlParserCtxtPtr) 240 241 * New set of simpler/more flexible APIs 242 243 * xmlParserOption: 244 * 245 * This is the set of XML parser options that can be passed down 246 * to the xmlReadDoc() and similar calls. 247 248 d htmlParserOption... 249 d s based(######typedef######) 250 d like(xmlCenum) 251 d HTML_PARSE_RECOVER... Relaxed parsing 252 d c X'00000001' 253 d HTML_PARSE_NODEFDTD... No default doctype 254 d c X'00000004' 255 d HTML_PARSE_NOERROR... No error reports 256 d c X'00000020' 257 d HTML_PARSE_NOWARNING... No warning reports 258 d c X'00000040' 259 d HTML_PARSE_PEDANTIC... Pedantic err reports 260 d c X'00000080' 261 d HTML_PARSE_NOBLANKS... Remove blank nodes 262 d c X'00000100' 263 d HTML_PARSE_NONET... Forbid net access 264 d c X'00000800' 265 d HTML_PARSE_NOIMPLIED... No implied html/body 266 d c X'00002000' 267 d HTML_PARSE_COMPACT... compact small txtnod 268 d c X'00010000' 269 d HTML_PARSE_IGNORE_ENC... Ignore encoding hint 270 d c X'00200000' 271 272 d htmlCtxtReset pr extproc('htmlCtxtReset') 273 d ctxt value like(htmlParserCtxtPtr) 274 275 d htmlCtxtUseOptions... 276 d pr extproc('htmlCtxtUseOptions') 277 d like(xmlCint) 278 d ctxt value like(htmlParserCtxtPtr) 279 d options value like(xmlCint) 280 281 d htmlReadDoc pr extproc('htmlReadDoc') 282 d like(htmlDocPtr) 283 d cur * value options(*string) const xmlChar * 284 d URL * value options(*string) const char * 285 d encoding * value options(*string) const char * 286 d options value like(xmlCint) 287 288 d htmlReadFile pr extproc('htmlReadFile') 289 d like(htmlDocPtr) 290 d URL * value options(*string) const char * 291 d encoding * value options(*string) const char * 292 d options value like(xmlCint) 293 294 d htmlReadMemory pr extproc('htmlReadMemory') 295 d like(htmlDocPtr) 296 d buffer * value options(*string) const char * 297 d size value like(xmlCint) 298 d URL * value options(*string) const char * 299 d encoding * value options(*string) const char * 300 d options value like(xmlCint) 301 302 d htmlReadFd pr extproc('htmlReadFd') 303 d like(htmlDocPtr) 304 d fd value like(xmlCint) 305 d URL * value options(*string) const char * 306 d encoding * value options(*string) const char * 307 d options value like(xmlCint) 308 309 d htmlReadIO pr extproc('htmlReadIO') 310 d like(htmlDocPtr) 311 d ioread value like(xmlInputReadCallback) 312 d ioclose value like(xmlInputCloseCallback) 313 d ioctx * value void * 314 d URL * value options(*string) const char * 315 d encoding * value options(*string) const char * 316 d options value like(xmlCint) 317 318 d htmlCtxtReadDoc... 319 d pr extproc('htmlCtxtReadDoc') 320 d like(htmlDocPtr) 321 d ctxt value like(xmlParserCtxtPtr) 322 d cur * value options(*string) const xmlChar * 323 d URL * value options(*string) const char * 324 d encoding * value options(*string) const char * 325 d options value like(xmlCint) 326 327 d htmlCtxtReadFile... 328 d pr extproc('htmlCtxtReadFile') 329 d like(htmlDocPtr) 330 d ctxt value like(xmlParserCtxtPtr) 331 d filename * value options(*string) const char * 332 d encoding * value options(*string) const char * 333 d options value like(xmlCint) 334 335 d htmlCtxtReadMemory... 336 d pr extproc('htmlCtxtReadMemory') 337 d like(htmlDocPtr) 338 d ctxt value like(xmlParserCtxtPtr) 339 d buffer * value options(*string) const char * 340 d size value like(xmlCint) 341 d URL * value options(*string) const char * 342 d encoding * value options(*string) const char * 343 d options value like(xmlCint) 344 345 d htmlCtxtReadFd pr extproc('htmlCtxtReadFd') 346 d like(htmlDocPtr) 347 d ctxt value like(xmlParserCtxtPtr) 348 d fd value like(xmlCint) 349 d URL * value options(*string) const char * 350 d encoding * value options(*string) const char * 351 d options value like(xmlCint) 352 353 d htmlCtxtReadIO pr extproc('htmlCtxtReadIO') 354 d like(htmlDocPtr) 355 d ctxt value like(xmlParserCtxtPtr) 356 d ioread value like(xmlInputReadCallback) 357 d ioclose value like(xmlInputCloseCallback) 358 d ioctx * value void * 359 d URL * value options(*string) const char * 360 d encoding * value options(*string) const char * 361 d options value like(xmlCint) 362 363 * Further knowledge of HTML structure 364 365 d htmlStatus s based(######typedef######) 366 d like(xmlCenum) 367 d HTML_NA c X'0000' No check at all 368 d HTML_INVALID c X'0001' 369 d HTML_DEPRECATED... 370 d c X'0002' 371 d HTML_VALID c X'0004' 372 d HTML_REQUIRED c X'000C' HTML_VALID ored-in 373 374 * Using htmlElemDesc rather than name here, to emphasise the fact 375 * that otherwise there's a lookup overhead 376 377 d htmlAttrAllowed... 378 d pr extproc('htmlAttrAllowed') 379 d like(htmlStatus) 380 d #param1 value like(htmlElemDescPtr) const 381 d #param2 * value options(*string) const xmlChar * 382 d #param3 value like(xmlCint) 383 384 d htmlElementAllowedHere... 385 d pr extproc('htmlElementAllowedHere') 386 d like(xmlCint) 387 d #param1 value like(htmlElemDescPtr) const 388 d #param2 * value options(*string) const xmlChar * 389 390 d htmlElementStatusHere... 391 d pr extproc('htmlElementStatusHere') 392 d like(htmlStatus) 393 d #param1 value like(htmlElemDescPtr) const 394 d #param2 value like(htmlElemDescPtr) const 395 396 d htmlNodeStatus pr extproc('htmlNodeStatus') 397 d like(htmlStatus) 398 d #param1 value like(htmlNodePtr) 399 d #param2 value like(xmlCint) 400 401 * C macros implemented as procedures for ILE/RPG support. 402 403 d htmlDefaultSubelement... 404 d pr * extproc('__htmlDefaultSubelement') const char * 405 d elt * value const htmlElemDesc * 406 407 d htmlElementAllowedHereDesc... 408 d pr extproc( 409 d '__htmlElementAllowedHereDesc') 410 d like(xmlCint) 411 d parent * value const htmlElemDesc * 412 d elt * value const htmlElemDesc * 413 414 d htmlRequiredAttrs... 415 d pr * extproc('__htmlRequiredAttrs') const char * * 416 d elt * value const htmlElemDesc * 417 418 /endif LIBXML_HTML_ENABLED 419 /endif HTML_PARSER_H__ 420