Home | History | Annotate | Download | only in libxmlrpg
      1       * Summary: interface for an HTML 4.0 non-verifying parser
      2       * Description: this module implements an HTML 4.0 non-verifying parser
      3       *              with API compatible with the XML parser ones. It should
      4       *              be able to parse "real world" HTML, even if severely
      5       *              broken from a specification point of view.
      6       *
      7       * Copy: See Copyright for the status of this software.
      8       *
      9       * Author: Patrick Monnerat <pm (a] datasphere.ch>, DATASPHERE S.A.
     10 
     11       /if not defined(HTML_PARSER_H__)
     12       /define HTML_PARSER_H__
     13 
     14       /include "libxmlrpg/xmlversion"
     15 
     16       /if defined(LIBXML_HTML_ENABLED)
     17 
     18       /include "libxmlrpg/xmlTypesC"
     19       /include "libxmlrpg/parser"
     20 
     21       * Most of the back-end structures from XML and HTML are shared.
     22 
              * HTML parser context pointer: declared like(xmlParserCtxtPtr), so
              * it has the same attributes as the XML parser context pointer.
     23      d htmlParserCtxtPtr...
     24      d                 s                   based(######typedef######)
     25      d                                     like(xmlParserCtxtPtr)
     26 
              * HTML parser context structure: same subfields as xmlParserCtxt
              * (likeds), addressed through htmlParserCtxtPtr.
     27      d htmlParserCtxt  ds                  based(htmlParserCtxtPtr)
     28      d                                     likeds(xmlParserCtxt)
     29 
              * Node-info pointer for the HTML parser: same attributes as
              * xmlParserNodeInfoPtr (like).
     30      d htmlParserNodeInfoPtr...
     31      d                 s                   based(######typedef######)
     32      d                                     like(xmlParserNodeInfoPtr)
     33 
              * Node-info structure for the HTML parser: same subfields as
              * xmlParserNodeInfo (likeds), addressed through
              * htmlParserNodeInfoPtr.
     34      d htmlParserNodeInfo...
     35      d                 ds                  based(htmlParserNodeInfoPtr)
     36      d                                     likeds(xmlParserNodeInfo)
     37 
              * SAX handler pointer for the HTML parser: same attributes as
              * xmlSAXHandlerPtr (like).
     38      d htmlSAXHandlerPtr...
     39      d                 s                   based(######typedef######)
     40      d                                     like(xmlSAXHandlerPtr)
     41 
              * SAX handler structure for the HTML parser: same subfields as
              * xmlSAXHandler (likeds), addressed through htmlSAXHandlerPtr.
     42      d htmlSAXHandler  ds                  based(htmlSAXHandlerPtr)
     43      d                                     likeds(xmlSAXHandler)
     44 
              * Parser input pointer for the HTML parser: same attributes as
              * xmlParserInputPtr (like).
     45      d htmlParserInputPtr...
     46      d                 s                   based(######typedef######)
     47      d                                     like(xmlParserInputPtr)
     48 
              * Parser input structure for the HTML parser: same subfields as
              * xmlParserInput (likeds), addressed through htmlParserInputPtr.
     49      d htmlParserInput...
     50      d                 ds                  based(htmlParserInputPtr)
     51      d                                     likeds(xmlParserInput)
     52 
              * HTML document pointer: same attributes as xmlDocPtr (like).
     53      d htmlDocPtr      s                   based(######typedef######)
     54      d                                     like(xmlDocPtr)
     55 
              * HTML node pointer: same attributes as xmlNodePtr (like).
     56      d htmlNodePtr     s                   based(######typedef######)
     57      d                                     like(xmlNodePtr)
     58 
     59       * Internal description of an HTML element, representing HTML 4.01
     60       * and XHTML 1.0 (which share the same structure).
     61 
              * Pointer (data type *) used as the basing pointer of the
              * htmlElemDesc structure and as the type of element descriptors
              * in the prototypes of this member.
     62      d htmlElemDescPtr...
     63      d                 s               *   based(######typedef######)
     64 
              * Element descriptor layout, addressed through htmlElemDescPtr.
              * "align" aligns subfields on their natural boundaries and
              * "qualified" requires references such as htmlElemDesc.name.
              * The right-hand comments give the C type or meaning of a field.
     65      d htmlElemDesc    ds                  based(htmlElemDescPtr)
     66      d                                     align qualified
     67      d  name                           *                                        const char *
     68      d  startTag                           like(xmlCchar)                       Start tag implied ?
     69      d  endTag                             like(xmlCchar)                       End tag implied ?
     70      d  saveEndTag                         like(xmlCchar)                       Save end tag ?
     71      d  empty                              like(xmlCchar)                       Empty element ?
     72      d  depr                               like(xmlCchar)                       Deprecated element ?
     73      d  dtd                                like(xmlCchar)                       Loose DTD/Frameset
     74      d  isinline                           like(xmlCchar)                       Block 0/inline elem?
     75      d  desc                           *                                        const char *
     76       *
     77       * New fields encapsulating HTML structure
     78       *
     79       * Bugs:
     80       *      This is a very limited representation.  It fails to tell us when
     81       *      an element *requires* subelements (we only have whether they're
     82       *      allowed or not), and it doesn't tell us where CDATA and PCDATA
     83       *      are allowed.  Some element relationships are not fully represented:
     84       *      these are flagged with the word MODIFIER
     85       *
     86      d  subelts                        *                                        const char * *
     87      d  defaultsubelt                  *                                        const char *
     88      d  attrs_opt                      *                                        const char * *
     89      d  attrs_depr                     *                                        const char * *
     90      d  attrs_req                      *                                        const char * *
     91 
     92       * Internal description of an HTML entity.
     93 
              * Pointer (data type *) used as the basing pointer of the
              * htmlEntityDesc structure and as the return type of the entity
              * lookup prototypes in this member.
     94      d htmlEntityDescPtr...
     95      d                 s               *   based(######typedef######)
     96 
              * Entity descriptor layout, addressed through htmlEntityDescPtr:
              * an unsigned value (like xmlCuint) followed by two pointers that
              * the right-hand comments identify as C "const char *".
              * "qualified" requires references such as htmlEntityDesc.value.
     97      d htmlEntityDesc...
     98      d                 ds                  based(htmlEntityDescPtr)
     99      d                                     align qualified
    100      d  value                              like(xmlCuint)
    101      d  name                           *                                        const char *
    102      d  desc                           *                                        const char *
    103 
    104       * There are only a few public functions.
    105 
              * Prototype bound to the external procedure 'htmlTagLookup'.
              * tag: pointer passed by value; options(*string) also lets the
              *      caller supply a character expression.
              * Returns an htmlElemDescPtr.
    106      d htmlTagLookup   pr                  extproc('htmlTagLookup')
    107      d                                     like(htmlElemDescPtr)                const
    108      d  tag                            *   value options(*string)               const xmlChar *
    109 
              * Prototype bound to the external procedure 'htmlEntityLookup'.
              * name: pointer passed by value; options(*string) also lets the
              *       caller supply a character expression.
              * Returns an htmlEntityDescPtr.
    110      d htmlEntityLookup...
    111      d                 pr                  extproc('htmlEntityLookup')
    112      d                                     like(htmlEntityDescPtr)              const
    113      d  name                           *   value options(*string)               const xmlChar *
    114 
              * Prototype bound to the external procedure
              * 'htmlEntityValueLookup'.
              * value: unsigned value (like xmlCuint) passed by value.
              * Returns an htmlEntityDescPtr.
    115      d htmlEntityValueLookup...
    116      d                 pr                  extproc('htmlEntityValueLookup')
    117      d                                     like(htmlEntityDescPtr)              const
    118      d  value                              value like(xmlCuint)
    119 
              * Prototype bound to the external procedure 'htmlIsAutoClosed'.
              * doc, elem: document and node pointers, both passed by value.
              * Returns an xmlCint.
    120      d htmlIsAutoClosed...
    121      d                 pr                  extproc('htmlIsAutoClosed')
    122      d                                     like(xmlCint)
    123      d  doc                                value like(htmlDocPtr)
    124      d  elem                               value like(htmlNodePtr)
    125 
              * Prototype bound to the external procedure 'htmlAutoCloseTag'.
              * doc, elem: document and node pointers passed by value.
              * name: pointer passed by value; options(*string) also lets the
              *       caller supply a character expression.
              * Returns an xmlCint.
    126      d htmlAutoCloseTag...
    127      d                 pr                  extproc('htmlAutoCloseTag')
    128      d                                     like(xmlCint)
    129      d  doc                                value like(htmlDocPtr)
    130      d  name                           *   value options(*string)               const xmlChar *
    131      d  elem                               value like(htmlNodePtr)
    132 
              * Prototype bound to the external procedure 'htmlParseEntityRef'.
              * ctxt: parser context pointer passed by value.
              * str:  pointer passed by reference (no VALUE keyword), i.e. the
              *       C side receives the address of the caller's pointer, as
              *       the right-hand comment indicates.
              * Returns an htmlEntityDescPtr.
    133      d htmlParseEntityRef...
    134      d                 pr                  extproc('htmlParseEntityRef')
    135      d                                     like(htmlEntityDescPtr)              const
    136      d  ctxt                               value like(htmlParserCtxtPtr)
    137      d  str                            *                                        const xmlChar *(*)
    138 
              * Prototype bound to the external procedure 'htmlParseCharRef'.
              * ctxt: parser context pointer passed by value.
              * Returns an xmlCint.
    139      d htmlParseCharRef...
    140      d                 pr                  extproc('htmlParseCharRef')
    141      d                                     like(xmlCint)
    142      d  ctxt                               value like(htmlParserCtxtPtr)
    143 
              * Prototype bound to the external procedure 'htmlParseElement'.
              * ctxt: parser context pointer passed by value.
              * No return value is declared.
    144      d htmlParseElement...
    145      d                 pr                  extproc('htmlParseElement')
    146      d  ctxt                               value like(htmlParserCtxtPtr)
    147 
              * Prototype bound to the external procedure 'htmlNewParserCtxt'.
              * Takes no parameters and returns an htmlParserCtxtPtr.
    148      d htmlNewParserCtxt...
    149      d                 pr                  extproc('htmlNewParserCtxt')
    150      d                                     like(htmlParserCtxtPtr)
    151 
              * Prototype bound to the external procedure
              * 'htmlCreateMemoryParserCtxt'.
              * buffer: pointer passed by value (options(*string) allowed).
              * size:   xmlCint passed by value.
              * Returns an htmlParserCtxtPtr.
    152      d htmlCreateMemoryParserCtxt...
    153      d                 pr                  extproc('htmlCreateMemoryParserCtxt')
    154      d                                     like(htmlParserCtxtPtr)
    155      d  buffer                         *   value options(*string)               const char *
    156      d  size                               value like(xmlCint)
    157 
              * Prototype bound to the external procedure 'htmlParseDocument'.
              * ctxt: parser context pointer passed by value.
              * Returns an xmlCint.
    158      d htmlParseDocument...
    159      d                 pr                  extproc('htmlParseDocument')
    160      d                                     like(xmlCint)
    161      d  ctxt                               value like(htmlParserCtxtPtr)
    162 
              * Prototype bound to the external procedure 'htmlSAXParseDoc'.
              * cur, encoding: pointers passed by value (options(*string)
              *                lets the caller supply character expressions).
              * sax:      SAX handler pointer passed by value.
              * userData: untyped pointer passed by value (C "void *").
              * Returns an htmlDocPtr.
    163      d htmlSAXParseDoc...
    164      d                 pr                  extproc('htmlSAXParseDoc')
    165      d                                     like(htmlDocPtr)
    166      d  cur                            *   value options(*string)               xmlChar *
    167      d  encoding                       *   value options(*string)               const char *
    168      d  sax                                value like(htmlSAXHandlerPtr)
    169      d  userData                       *   value                                void *
    170 
              * Prototype bound to the external procedure 'htmlParseDoc'.
              * cur, encoding: pointers passed by value (options(*string)
              *                lets the caller supply character expressions).
              * Returns an htmlDocPtr.
    171      d htmlParseDoc    pr                  extproc('htmlParseDoc')
    172      d                                     like(htmlDocPtr)
    173      d  cur                            *   value options(*string)               xmlChar *
    174      d  encoding                       *   value options(*string)               const char *
    175 
              * Prototype bound to the external procedure 'htmlSAXParseFile'.
              * filename, encoding: pointers passed by value (options(*string)
              *                     lets the caller supply character
              *                     expressions).
              * sax:      SAX handler pointer passed by value.
              * userData: untyped pointer passed by value (C "void *").
              * Returns an htmlDocPtr.
    176      d htmlSAXParseFile...
    177      d                 pr                  extproc('htmlSAXParseFile')
    178      d                                     like(htmlDocPtr)
    179      d  filename                       *   value options(*string)               const char *
    180      d  encoding                       *   value options(*string)               const char *
    181      d  sax                                value like(htmlSAXHandlerPtr)
    182      d  userData                       *   value                                void *
    183 
              * Prototype bound to the external procedure 'htmlParseFile'.
              * filename, encoding: pointers passed by value (options(*string)
              *                     lets the caller supply character
              *                     expressions).
              * Returns an htmlDocPtr.
    184      d htmlParseFile   pr                  extproc('htmlParseFile')
    185      d                                     like(htmlDocPtr)
    186      d  filename                       *   value options(*string)               const char *
    187      d  encoding                       *   value options(*string)               const char *
    188 
              * Prototype bound to the external procedure 'UTF8ToHtml'.
              * out:    character buffer passed by reference; declared 65535
              *         long, but options(*varsize) accepts shorter fields.
              * outlen: xmlCint passed by reference (no VALUE keyword).
              * in:     pointer passed by value (options(*string) allowed).
              * inlen:  xmlCint passed by reference (no VALUE keyword).
              * Returns an xmlCint.
              * NOTE(review): outlen/inlen presumably map to C "int *"
              * in/out lengths - confirm against the C header.
    189      d UTF8ToHtml      pr                  extproc('UTF8ToHtml')
    190      d                                     like(xmlCint)
    191      d  out                       65535    options(*varsize)                    unsigned char []
    192      d  outlen                             like(xmlCint)
    193      d  in                             *   value options(*string)               const unsigned char*
    194      d  inlen                              like(xmlCint)
    195 
              * Prototype bound to the external procedure 'htmlEncodeEntities'.
              * out:       character buffer passed by reference; declared
              *            65535 long, options(*varsize) accepts shorter ones.
              * outlen:    xmlCint passed by reference (no VALUE keyword).
              * in:        pointer passed by value (options(*string) allowed).
              * inlen:     xmlCint passed by reference (no VALUE keyword).
              * quoteChar: xmlCint passed by value.
              * Returns an xmlCint.
    196      d htmlEncodeEntities...
    197      d                 pr                  extproc('htmlEncodeEntities')
    198      d                                     like(xmlCint)
    199      d  out                       65535    options(*varsize)                    unsigned char []
    200      d  outlen                             like(xmlCint)
    201      d  in                             *   value options(*string)               const unsigned char*
    202      d  inlen                              like(xmlCint)
    203      d  quoteChar                          value like(xmlCint)
    204 
              * Prototype bound to the external procedure
              * 'htmlIsScriptAttribute'.
              * name: pointer passed by value (options(*string) allowed).
              * Returns an xmlCint.
    205      d htmlIsScriptAttribute...
    206      d                 pr                  extproc('htmlIsScriptAttribute')
    207      d                                     like(xmlCint)
    208      d  name                           *   value options(*string)               const xmlChar *
    209 
              * Prototype bound to the external procedure
              * 'htmlHandleOmittedElem'.
              * val: xmlCint passed by value.
              * Returns an xmlCint.
    210      d htmlHandleOmittedElem...
    211      d                 pr                  extproc('htmlHandleOmittedElem')
    212      d                                     like(xmlCint)
    213      d  val                                value like(xmlCint)
    214 
    215       /if defined(LIBXML_PUSH_ENABLED)
    216 
    217       * Interfaces for the Push mode.
    218 
              * Prototype bound to the external procedure
              * 'htmlCreatePushParserCtxt'; only compiled when
              * LIBXML_PUSH_ENABLED is defined.
              * sax:       SAX handler pointer passed by value.
              * user_data: untyped pointer passed by value (C "void *").
              * chunk, filename: pointers passed by value (options(*string)
              *            lets the caller supply character expressions).
              * size:      xmlCint passed by value.
              * enc:       xmlCharEncoding passed by value.
              * Returns an htmlParserCtxtPtr.
    219      d htmlCreatePushParserCtxt...
    220      d                 pr                  extproc('htmlCreatePushParserCtxt')
    221      d                                     like(htmlParserCtxtPtr)
    222      d  sax                                value like(htmlSAXHandlerPtr)
    223      d  user_data                      *   value                                void *
    224      d  chunk                          *   value options(*string)               const char *
    225      d  size                               value like(xmlCint)
    226      d  filename                       *   value options(*string)               const char *
    227      d  enc                                value like(xmlCharEncoding)
    228 
              * Prototype bound to the external procedure 'htmlParseChunk';
              * only compiled when LIBXML_PUSH_ENABLED is defined.
              * ctxt:      parser context pointer passed by value.
              * chunk:     pointer passed by value (options(*string) allowed).
              * size, terminate: xmlCint values passed by value.
              * Returns an xmlCint.
    229      d htmlParseChunk  pr                  extproc('htmlParseChunk')
    230      d                                     like(xmlCint)
    231      d  ctxt                               value like(htmlParserCtxtPtr)
    232      d  chunk                          *   value options(*string)               const char *
    233      d  size                               value like(xmlCint)
    234      d  terminate                          value like(xmlCint)
    235       /endif                                                                    LIBXML_PUSH_ENABLED
    236 
              * Prototype bound to the external procedure 'htmlFreeParserCtxt'.
              * ctxt: parser context pointer passed by value.
              * No return value is declared.
    237      d htmlFreeParserCtxt...
    238      d                 pr                  extproc('htmlFreeParserCtxt')
    239      d  ctxt                               value like(htmlParserCtxtPtr)
    240 
    241       * New set of simpler/more flexible APIs
    242 
    243       * htmlParserOption:
    244       *
    245       * This is the set of HTML parser options that can be passed down
    246       * to the htmlReadDoc() and similar calls.
    247 
              * Option type (like xmlCenum) followed by its named constants.
              * Every constant below has exactly one bit set and no two share
              * a bit, so they are presumably meant to be combined into the
              * "options" parameter of the prototypes in this member.
    248      d htmlParserOption...
    249      d                 s                   based(######typedef######)
    250      d                                     like(xmlCenum)
    251      d  HTML_PARSE_RECOVER...                                                   Relaxed parsing
    252      d                 c                   X'00000001'
    253      d  HTML_PARSE_NODEFDTD...                                                  No default doctype
    254      d                 c                   X'00000004'
    255      d  HTML_PARSE_NOERROR...                                                   No error reports
    256      d                 c                   X'00000020'
    257      d  HTML_PARSE_NOWARNING...                                                 No warning reports
    258      d                 c                   X'00000040'
    259      d  HTML_PARSE_PEDANTIC...                                                  Pedantic err reports
    260      d                 c                   X'00000080'
    261      d  HTML_PARSE_NOBLANKS...                                                  Remove blank nodes
    262      d                 c                   X'00000100'
    263      d  HTML_PARSE_NONET...                                                     Forbid net access
    264      d                 c                   X'00000800'
    265      d  HTML_PARSE_NOIMPLIED...                                                 No implied html/body
    266      d                 c                   X'00002000'
    267      d  HTML_PARSE_COMPACT...                                                   compact small txtnod
    268      d                 c                   X'00010000'
    269      d  HTML_PARSE_IGNORE_ENC...                                                Ignore encoding hint
    270      d                 c                   X'00200000'
    271 
              * Prototype bound to the external procedure 'htmlCtxtReset'.
              * ctxt: parser context pointer passed by value.
              * No return value is declared.
    272      d htmlCtxtReset   pr                  extproc('htmlCtxtReset')
    273      d ctxt                                value like(htmlParserCtxtPtr)
    274 
              * Prototype bound to the external procedure 'htmlCtxtUseOptions'.
              * ctxt:    parser context pointer passed by value.
              * options: xmlCint passed by value.
              * Returns an xmlCint.
    275      d htmlCtxtUseOptions...
    276      d                 pr                  extproc('htmlCtxtUseOptions')
    277      d                                     like(xmlCint)
    278      d ctxt                                value like(htmlParserCtxtPtr)
    279      d options                             value like(xmlCint)
    280 
              * Prototype bound to the external procedure 'htmlReadDoc'.
              * cur, URL, encoding: pointers passed by value (options(*string)
              *          lets the caller supply character expressions).
              * options: xmlCint passed by value.
              * Returns an htmlDocPtr.
    281      d htmlReadDoc     pr                  extproc('htmlReadDoc')
    282      d                                     like(htmlDocPtr)
    283      d  cur                            *   value options(*string)               const xmlChar *
    284      d  URL                            *   value options(*string)               const char *
    285      d  encoding                       *   value options(*string)               const char *
    286      d  options                            value like(xmlCint)
    287 
              * Prototype bound to the external procedure 'htmlReadFile'.
              * URL, encoding: pointers passed by value (options(*string)
              *          lets the caller supply character expressions).
              * options: xmlCint passed by value.
              * Returns an htmlDocPtr.
    288      d htmlReadFile    pr                  extproc('htmlReadFile')
    289      d                                     like(htmlDocPtr)
    290      d  URL                            *   value options(*string)               const char *
    291      d  encoding                       *   value options(*string)               const char *
    292      d  options                            value like(xmlCint)
    293 
              * Prototype bound to the external procedure 'htmlReadMemory'.
              * buffer, URL, encoding: pointers passed by value
              *          (options(*string) lets the caller supply character
              *          expressions).
              * size, options: xmlCint values passed by value.
              * Returns an htmlDocPtr.
    294      d htmlReadMemory  pr                  extproc('htmlReadMemory')
    295      d                                     like(htmlDocPtr)
    296      d  buffer                         *   value options(*string)               const char *
    297      d  size                               value like(xmlCint)
    298      d  URL                            *   value options(*string)               const char *
    299      d  encoding                       *   value options(*string)               const char *
    300      d  options                            value like(xmlCint)
    301 
              * Prototype bound to the external procedure 'htmlReadFd'.
              * fd, options: xmlCint values passed by value.
              * URL, encoding: pointers passed by value (options(*string)
              *          lets the caller supply character expressions).
              * Returns an htmlDocPtr.
    302      d htmlReadFd      pr                  extproc('htmlReadFd')
    303      d                                     like(htmlDocPtr)
    304      d  fd                                 value like(xmlCint)
    305      d  URL                            *   value options(*string)               const char *
    306      d  encoding                       *   value options(*string)               const char *
    307      d  options                            value like(xmlCint)
    308 
              * Prototype bound to the external procedure 'htmlReadIO'.
              * ioread, ioclose: callbacks typed like xmlInputReadCallback and
              *          xmlInputCloseCallback, passed by value.
              * ioctx:   untyped pointer passed by value (C "void *").
              * URL, encoding: pointers passed by value (options(*string)
              *          lets the caller supply character expressions).
              * options: xmlCint passed by value.
              * Returns an htmlDocPtr.
    309      d htmlReadIO      pr                  extproc('htmlReadIO')
    310      d                                     like(htmlDocPtr)
    311      d  ioread                             value like(xmlInputReadCallback)
    312      d  ioclose                            value like(xmlInputCloseCallback)
    313      d  ioctx                          *   value                                void *
    314      d  URL                            *   value options(*string)               const char *
    315      d  encoding                       *   value options(*string)               const char *
    316      d  options                            value like(xmlCint)
    317 
              * Prototype bound to the external procedure 'htmlCtxtReadDoc'.
              * ctxt:    context pointer passed by value; typed like
              *          xmlParserCtxtPtr, the type htmlParserCtxtPtr is itself
              *          declared like.
              * cur, URL, encoding: pointers passed by value (options(*string)
              *          lets the caller supply character expressions).
              * options: xmlCint passed by value.
              * Returns an htmlDocPtr.
    318      d htmlCtxtReadDoc...
    319      d                 pr                  extproc('htmlCtxtReadDoc')
    320      d                                     like(htmlDocPtr)
    321      d  ctxt                               value like(xmlParserCtxtPtr)
    322      d  cur                            *   value options(*string)               const xmlChar *
    323      d  URL                            *   value options(*string)               const char *
    324      d  encoding                       *   value options(*string)               const char *
    325      d  options                            value like(xmlCint)
    326 
              * Prototype bound to the external procedure 'htmlCtxtReadFile'.
              * ctxt:    context pointer passed by value; typed like
              *          xmlParserCtxtPtr, the type htmlParserCtxtPtr is itself
              *          declared like.
              * filename, encoding: pointers passed by value (options(*string)
              *          lets the caller supply character expressions).
              * options: xmlCint passed by value.
              * Returns an htmlDocPtr.
    327      d htmlCtxtReadFile...
    328      d                 pr                  extproc('htmlCtxtReadFile')
    329      d                                     like(htmlDocPtr)
    330      d  ctxt                               value like(xmlParserCtxtPtr)
    331      d  filename                       *   value options(*string)               const char *
    332      d  encoding                       *   value options(*string)               const char *
    333      d  options                            value like(xmlCint)
    334 
              * Prototype bound to the external procedure 'htmlCtxtReadMemory'.
              * ctxt:    context pointer passed by value; typed like
              *          xmlParserCtxtPtr, the type htmlParserCtxtPtr is itself
              *          declared like.
              * buffer, URL, encoding: pointers passed by value
              *          (options(*string) lets the caller supply character
              *          expressions).
              * size, options: xmlCint values passed by value.
              * Returns an htmlDocPtr.
    335      d htmlCtxtReadMemory...
    336      d                 pr                  extproc('htmlCtxtReadMemory')
    337      d                                     like(htmlDocPtr)
    338      d  ctxt                               value like(xmlParserCtxtPtr)
    339      d  buffer                         *   value options(*string)               const char *
    340      d  size                               value like(xmlCint)
    341      d  URL                            *   value options(*string)               const char *
    342      d  encoding                       *   value options(*string)               const char *
    343      d  options                            value like(xmlCint)
    344 
              * Prototype bound to the external procedure 'htmlCtxtReadFd'.
              * ctxt:    context pointer passed by value; typed like
              *          xmlParserCtxtPtr, the type htmlParserCtxtPtr is itself
              *          declared like.
              * fd, options: xmlCint values passed by value.
              * URL, encoding: pointers passed by value (options(*string)
              *          lets the caller supply character expressions).
              * Returns an htmlDocPtr.
    345      d htmlCtxtReadFd  pr                  extproc('htmlCtxtReadFd')
    346      d                                     like(htmlDocPtr)
    347      d  ctxt                               value like(xmlParserCtxtPtr)
    348      d  fd                                 value like(xmlCint)
    349      d  URL                            *   value options(*string)               const char *
    350      d  encoding                       *   value options(*string)               const char *
    351      d  options                            value like(xmlCint)
    352 
              * Prototype bound to the external procedure 'htmlCtxtReadIO'.
              * ctxt:    context pointer passed by value; typed like
              *          xmlParserCtxtPtr, the type htmlParserCtxtPtr is itself
              *          declared like.
              * ioread, ioclose: callbacks typed like xmlInputReadCallback and
              *          xmlInputCloseCallback, passed by value.
              * ioctx:   untyped pointer passed by value (C "void *").
              * URL, encoding: pointers passed by value (options(*string)
              *          lets the caller supply character expressions).
              * options: xmlCint passed by value.
              * Returns an htmlDocPtr.
    353      d htmlCtxtReadIO  pr                  extproc('htmlCtxtReadIO')
    354      d                                     like(htmlDocPtr)
    355      d  ctxt                               value like(xmlParserCtxtPtr)
    356      d  ioread                             value like(xmlInputReadCallback)
    357      d  ioclose                            value like(xmlInputCloseCallback)
    358      d  ioctx                          *   value                                void *
    359      d  URL                            *   value options(*string)               const char *
    360      d  encoding                       *   value options(*string)               const char *
    361      d  options                            value like(xmlCint)
    362 
    363       * Further knowledge of HTML structure
    364 
              * Status type (like xmlCenum) followed by its named constants.
              * HTML_REQUIRED (X'000C') contains the HTML_VALID bit (X'0004')
              * plus one further bit, as its right-hand comment notes.
    365      d htmlStatus      s                   based(######typedef######)
    366      d                                     like(xmlCenum)
    367      d  HTML_NA        c                   X'0000'                              No check at all
    368      d  HTML_INVALID   c                   X'0001'
    369      d  HTML_DEPRECATED...
    370      d                 c                   X'0002'
    371      d  HTML_VALID     c                   X'0004'
    372      d  HTML_REQUIRED  c                   X'000C'                              HTML_VALID ored-in
    373 
    374       * Using htmlElemDesc rather than name here, to emphasise the fact
    375       *  that otherwise there's a lookup overhead
    376 
              * Prototype bound to the external procedure 'htmlAttrAllowed'.
              * #param1: element descriptor pointer passed by value.
              * #param2: pointer passed by value (options(*string) allowed).
              * #param3: xmlCint passed by value.
              * Returns an htmlStatus.
    377      d htmlAttrAllowed...
    378      d                 pr                  extproc('htmlAttrAllowed')
    379      d                                     like(htmlStatus)
    380      d  #param1                            value like(htmlElemDescPtr)          const
    381      d  #param2                        *   value options(*string)               const xmlChar *
    382      d  #param3                            value like(xmlCint)
    383 
              * Prototype bound to the external procedure
              * 'htmlElementAllowedHere'.
              * #param1: element descriptor pointer passed by value.
              * #param2: pointer passed by value (options(*string) allowed).
              * Returns an xmlCint.
    384      d htmlElementAllowedHere...
    385      d                 pr                  extproc('htmlElementAllowedHere')
    386      d                                     like(xmlCint)
    387      d  #param1                            value like(htmlElemDescPtr)          const
    388      d  #param2                        *   value options(*string)               const xmlChar *
    389 
              * Prototype bound to the external procedure
              * 'htmlElementStatusHere'.
              * #param1, #param2: element descriptor pointers passed by value.
              * Returns an htmlStatus.
    390      d htmlElementStatusHere...
    391      d                 pr                  extproc('htmlElementStatusHere')
    392      d                                     like(htmlStatus)
    393      d  #param1                            value like(htmlElemDescPtr)          const
    394      d  #param2                            value like(htmlElemDescPtr)          const
    395 
              * Prototype bound to the external procedure 'htmlNodeStatus'.
              * #param1: node pointer passed by value.
              * #param2: xmlCint passed by value.
              * Returns an htmlStatus.
    396      d htmlNodeStatus  pr                  extproc('htmlNodeStatus')
    397      d                                     like(htmlStatus)
    398      d  #param1                            value like(htmlNodePtr)
    399      d  #param2                            value like(xmlCint)
    400 
    401       * C macros implemented as procedures for ILE/RPG support.
    402 
              * Prototype bound to the external procedure
              * '__htmlDefaultSubelement' (note the leading underscores: the
              * RPG name and the bound procedure name differ).
              * elt: pointer passed by value (C "const htmlElemDesc *").
              * Returns a pointer (C "const char *").
    403      d htmlDefaultSubelement...
    404      d                 pr              *   extproc('__htmlDefaultSubelement')   const char *
    405      d  elt                            *   value                                const htmlElemDesc *
    406 
              * Prototype bound to the external procedure
              * '__htmlElementAllowedHereDesc' (note the leading underscores:
              * the RPG name and the bound procedure name differ).
              * parent, elt: pointers passed by value
              *              (C "const htmlElemDesc *").
              * Returns an xmlCint.
    407      d htmlElementAllowedHereDesc...
    408      d                 pr                  extproc(
    409      d                                     '__htmlElementAllowedHereDesc')
    410      d                                     like(xmlCint)
    411      d  parent                         *   value                                const htmlElemDesc *
    412      d  elt                            *   value                                const htmlElemDesc *
    413 
              * Prototype bound to the external procedure
              * '__htmlRequiredAttrs' (note the leading underscores: the RPG
              * name and the bound procedure name differ).
              * elt: pointer passed by value (C "const htmlElemDesc *").
              * Returns a pointer (C "const char * *").
    414      d htmlRequiredAttrs...
    415      d                 pr              *   extproc('__htmlRequiredAttrs')        const char * *
    416      d  elt                            *   value                                const htmlElemDesc *
    417 
    418       /endif                                                                    LIBXML_HTML_ENABLED
    419       /endif                                                                    HTML_PARSER_H__
    420