Home | History | Annotate | Download | only in libxmlrpg
      1       * Summary: interface for an HTML 4.0 non-verifying parser
      2       * Description: this module implements an HTML 4.0 non-verifying parser
      3       *              with API compatible with the XML parser ones. It should
      4       *              be able to parse "real world" HTML, even if severely
      5       *              broken from a specification point of view.
      6       *
      7       * Copy: See Copyright for the status of this software.
      8       *
      9       * Author: Patrick Monnerat <pm@datasphere.ch>, DATASPHERE S.A.
     10 
     11       /if not defined(HTML_PARSER_H__)
     12       /define HTML_PARSER_H__
     13 
     14       /include "libxmlrpg/xmlversion"
     15       /include "libxmlrpg/xmlTypesC"
     16       /include "libxmlrpg/parser"
     17 
     18       /if defined(LIBXML_HTML_ENABLED)
     19 
     20       * Most of the back-end structures from XML and HTML are shared.
     21 
     22      d htmlParserCtxtPtr...
     23      d                 s                   based(######typedef######)
              * HTML parser context pointer, typed like(xmlParserCtxtPtr).
              * Being BASED it owns no storage; presumably it only serves
              * as a LIKE template - confirm against libxmlrpg conventions.
     24      d                                     like(xmlParserCtxtPtr)
     25 
              * HTML parser context structure: same layout as xmlParserCtxt
              * (LIKEDS), based on the pointer htmlParserCtxtPtr.
     26      d htmlParserCtxt  ds                  based(htmlParserCtxtPtr)
     27      d                                     likeds(xmlParserCtxt)
     28 
     29      d htmlParserNodeInfoPtr...
     30      d                 s                   based(######typedef######)
              * Node-info pointer for the HTML parser; declared with LIKE so
              * it has the same type as xmlParserNodeInfoPtr.
     31      d                                     like(xmlParserNodeInfoPtr)
     32 
              * Node-info structure: same layout as xmlParserNodeInfo
              * (LIKEDS), based on the pointer htmlParserNodeInfoPtr.
     33      d htmlParserNodeInfo...
     34      d                 ds                  based(htmlParserNodeInfoPtr)
     35      d                                     likeds(xmlParserNodeInfo)
     36 
     37      d htmlSAXHandlerPtr...
     38      d                 s                   based(######typedef######)
              * SAX handler pointer for the HTML parser; declared with LIKE
              * so it has the same type as xmlSAXHandlerPtr.
     39      d                                     like(xmlSAXHandlerPtr)
     40 
              * SAX handler structure: same layout as xmlSAXHandler (LIKEDS),
              * based on the pointer htmlSAXHandlerPtr.
     41      d htmlSAXHandler  ds                  based(htmlSAXHandlerPtr)
     42      d                                     likeds(xmlSAXHandler)
     43 
     44      d htmlParserInputPtr...
     45      d                 s                   based(######typedef######)
              * Parser input pointer for the HTML parser; declared with LIKE
              * so it has the same type as xmlParserInputPtr.
     46      d                                     like(xmlParserInputPtr)
     47 
              * Parser input structure: same layout as xmlParserInput
              * (LIKEDS), based on the pointer htmlParserInputPtr.
     48      d htmlParserInput...
     49      d                 ds                  based(htmlParserInputPtr)
     50      d                                     likeds(xmlParserInput)
     51 
     52      d htmlDocPtr      s                   based(######typedef######)
              * HTML document pointer: declared like(xmlDocPtr), so HTML
              * documents use the same pointer type as XML documents.
     53      d                                     like(xmlDocPtr)
     54 
     55      d htmlNodePtr     s                   based(######typedef######)
              * HTML node pointer: declared like(xmlNodePtr), so HTML nodes
              * use the same pointer type as XML nodes.
     56      d                                     like(xmlNodePtr)
     57 
     58       * Internal description of an HTML element, representing HTML 4.01
     59       * and XHTML 1.0 (which share the same structure).
     60 
     61      d htmlElemDescPtr...
     62      d                 s               *   based(######typedef######)
     63 
              * Element descriptor, based on the pointer htmlElemDescPtr
              * (declared above as a plain pointer).
              * QUALIFIED: subfields are referenced as htmlElemDesc.name, etc.
              * ALIGN: subfields are aligned on their natural boundaries,
              * presumably to match the C structure layout - confirm.
              * The flag subfields are typed like(xmlCchar), which is not
              * declared in this member.
     64      d htmlElemDesc    ds                  based(htmlElemDescPtr)
     65      d                                     align qualified
     66      d  name                           *                                        const char *
     67      d  startTag                           like(xmlCchar)                       Start tag implied ?
     68      d  endTag                             like(xmlCchar)                       End tag implied ?
     69      d  saveEndTag                         like(xmlCchar)                       Save end tag ?
     70      d  empty                              like(xmlCchar)                       Empty element ?
     71      d  depr                               like(xmlCchar)                       Deprecated element ?
     72      d  dtd                                like(xmlCchar)                       Loose DTD/Frameset
     73      d  isinline                           like(xmlCchar)                       Block 0/inline elem?
     74      d  desc                           *                                        const char *
     75       *
     76       * New fields encapsulating HTML structure
     77       *
     78       * Bugs:
     79       *      This is a very limited representation.  It fails to tell us when
     80       *      an element *requires* subelements (we only have whether they're
     81       *      allowed or not), and it doesn't tell us where CDATA and PCDATA
     82       *      are allowed.  Some element relationships are not fully represented:
     83       *      these are flagged with the word MODIFIER
     84       *
              * The five subfields below are plain pointers; per their trailing
              * notes, four address arrays of C strings (const char * *) and
              * defaultsubelt addresses a single C string.
     85      d  subelts                        *                                        const char * *
     86      d  defaultsubelt                  *                                        const char *
     87      d  attrs_opt                      *                                        const char * *
     88      d  attrs_depr                     *                                        const char * *
     89      d  attrs_req                      *                                        const char * *
     90 
     91       * Internal description of an HTML entity.
     92 
     93      d htmlEntityDescPtr...
     94      d                 s               *   based(######typedef######)
     95 
              * Entity descriptor, based on the pointer htmlEntityDescPtr
              * (declared above as a plain pointer).
              * QUALIFIED: subfields are referenced as htmlEntityDesc.value,
              * etc.  ALIGN: subfields are aligned on natural boundaries.
              * value is a 4-byte unsigned integer (10u 0); name and desc are
              * pointers to C strings per their trailing notes.
     96      d htmlEntityDesc...
     97      d                 ds                  based(htmlEntityDescPtr)
     98      d                                     align qualified
     99      d  value                        10u 0                                      Unicode char value
    100      d  name                           *                                        const char *
    101      d  desc                           *                                        const char *
    102 
    103       * There are only a few public functions.
    104 
    105      d htmlTagLookup   pr                  extproc('htmlTagLookup')
    106      d                                     like(htmlElemDescPtr)                const
              * Binds to the external procedure 'htmlTagLookup'; the result
              * is typed like(htmlElemDescPtr).
              * tag: pointer passed by value.  options(*string) also accepts
              * a character expression, passed as a null-terminated string.
    107      d  tag                            *   value options(*string)               const xmlChar *
    108 
    109      d htmlEntityLookup...
    110      d                 pr                  extproc('htmlEntityLookup')
    111      d                                     like(htmlEntityDescPtr)              const
              * Binds to the external procedure 'htmlEntityLookup'; the
              * result is typed like(htmlEntityDescPtr).
              * name: pointer passed by value.  options(*string) also accepts
              * a character expression, passed as a null-terminated string.
    112      d  name                           *   value options(*string)               const xmlChar *
    113 
    114      d htmlEntityValueLookup...
    115      d                 pr                  extproc('htmlEntityValueLookup')
    116      d                                     like(htmlEntityDescPtr)              const
              * Binds to the external procedure 'htmlEntityValueLookup'; the
              * result is typed like(htmlEntityDescPtr).
              * value: 4-byte unsigned integer (10u 0) passed by value, the
              * same type as the htmlEntityDesc.value subfield.
    117      d  value                        10u 0 value
    118 
    119      d htmlIsAutoClosed...
    120      d                 pr            10i 0 extproc('htmlIsAutoClosed')
              * Binds to the external procedure 'htmlIsAutoClosed'; returns
              * a 4-byte signed integer (10i 0).
              * doc and elem are passed by value, typed like(htmlDocPtr) and
              * like(htmlNodePtr) respectively.
    121      d  doc                                value like(htmlDocPtr)
    122      d  elem                               value like(htmlNodePtr)
    123 
    124      d htmlAutoCloseTag...
    125      d                 pr            10i 0 extproc('htmlAutoCloseTag')
              * Binds to the external procedure 'htmlAutoCloseTag'; returns
              * a 4-byte signed integer (10i 0).  All parameters are passed
              * by value.
              * name: options(*string) also accepts a character expression,
              * passed as a null-terminated string.
    126      d  doc                                value like(htmlDocPtr)
    127      d  name                           *   value options(*string)               const xmlChar *
    128      d  elem                               value like(htmlNodePtr)
    129 
    130      d htmlParseEntityRef...
    131      d                 pr                  extproc('htmlParseEntityRef')
    132      d                                     like(htmlEntityDescPtr)              const
              * Binds to the external procedure 'htmlParseEntityRef'; the
              * result is typed like(htmlEntityDescPtr).
              * ctxt: parser context pointer, passed by value.
              * str: pointer passed by reference (no VALUE keyword), so the
              * callee receives the address of the caller's pointer variable.
    133      d  ctxt                               value like(htmlParserCtxtPtr)
    134      d  str                            *                                        const xmlChar *(*)
    135 
    136      d htmlParseCharRef...
    137      d                 pr            10i 0 extproc('htmlParseCharRef')
              * Binds to the external procedure 'htmlParseCharRef'; returns
              * a 4-byte signed integer (10i 0).
              * ctxt: parser context pointer, passed by value.
    138      d  ctxt                               value like(htmlParserCtxtPtr)
    139 
    140      d htmlParseElement...
    141      d                 pr                  extproc('htmlParseElement')
              * Binds to the external procedure 'htmlParseElement'.  No
              * return type is declared, so the call yields no value.
              * ctxt: parser context pointer, passed by value.
    142      d  ctxt                               value like(htmlParserCtxtPtr)
    143 
    144      d htmlNewParserCtxt...
    145      d                 pr                  extproc('htmlNewParserCtxt')
              * Binds to the external procedure 'htmlNewParserCtxt'.  Takes
              * no parameters; the result is typed like(htmlParserCtxtPtr).
    146      d                                     like(htmlParserCtxtPtr)
    147 
    148      d htmlCreateMemoryParserCtxt...
    149      d                 pr                  extproc('htmlCreateMemoryParserCtxt')
    150      d                                     like(htmlParserCtxtPtr)
              * Binds to the external procedure 'htmlCreateMemoryParserCtxt';
              * the result is typed like(htmlParserCtxtPtr).
              * buffer: pointer by value; options(*string) also accepts a
              * character expression, passed as a null-terminated string.
              * size: 4-byte signed integer by value (presumably the byte
              * length of buffer - confirm against the libxml2 API).
    151      d  buffer                         *   value options(*string)               const char *
    152      d  size                         10i 0 value
    153 
    154      d htmlParseDocument...
    155      d                 pr            10i 0 extproc('htmlParseDocument')
              * Binds to the external procedure 'htmlParseDocument'; returns
              * a 4-byte signed integer (10i 0).
              * ctxt: parser context pointer, passed by value.
    156      d  ctxt                               value like(htmlParserCtxtPtr)
    157 
    158      d htmlSAXParseDoc...
    159      d                 pr                  extproc('htmlSAXParseDoc')
    160      d                                     like(htmlDocPtr)
              * Binds to the external procedure 'htmlSAXParseDoc'; the result
              * is typed like(htmlDocPtr).  All parameters are passed by value.
              * cur, encoding: options(*string) also accepts a character
              * expression, passed as a null-terminated string.
              * sax: SAX handler pointer.  userData: untyped pointer (void *).
    161      d  cur                            *   value options(*string)               xmlChar *
    162      d  encoding                       *   value options(*string)               const char *
    163      d  sax                                value like(htmlSAXHandlerPtr)
    164      d  userData                       *   value                                void *
    165 
    166      d htmlParseDoc    pr                  extproc('htmlParseDoc')
    167      d                                     like(htmlDocPtr)
              * Binds to the external procedure 'htmlParseDoc'; the result is
              * typed like(htmlDocPtr).
              * cur, encoding: pointers by value; options(*string) also
              * accepts a character expression, passed null-terminated.
    168      d  cur                            *   value options(*string)               xmlChar *
    169      d  encoding                       *   value options(*string)               const char *
    170 
    171      d htmlSAXParseFile...
    172      d                 pr                  extproc('htmlSAXParseFile')
    173      d                                     like(htmlDocPtr)
              * Binds to the external procedure 'htmlSAXParseFile'; the
              * result is typed like(htmlDocPtr).  All parameters by value.
              * filename, encoding: options(*string) also accepts a character
              * expression, passed as a null-terminated string.
              * sax: SAX handler pointer.  userData: untyped pointer (void *).
    174      d  filename                       *   value options(*string)               const char *
    175      d  encoding                       *   value options(*string)               const char *
    176      d  sax                                value like(htmlSAXHandlerPtr)
    177      d  userData                       *   value                                void *
    178 
    179      d htmlParseFile   pr                  extproc('htmlParseFile')
    180      d                                     like(htmlDocPtr)
              * Binds to the external procedure 'htmlParseFile'; the result
              * is typed like(htmlDocPtr).
              * filename, encoding: pointers by value; options(*string) also
              * accepts a character expression, passed null-terminated.
    181      d  filename                       *   value options(*string)               const char *
    182      d  encoding                       *   value options(*string)               const char *
    183 
    184      d UTF8ToHtml      pr            10i 0 extproc('UTF8ToHtml')
              * Binds to the external procedure 'UTF8ToHtml'; returns a
              * 4-byte signed integer (10i 0).
              * out: character buffer passed by reference; options(*varsize)
              * lets the caller pass a field shorter than 65535 bytes.
              * outlen, inlen: integers passed by reference (no VALUE), so
              * the callee can update them - presumably in/out byte counts,
              * confirm against the libxml2 API.
              * in: pointer by value; options(*string) also accepts a
              * character expression, passed as a null-terminated string.
    185      d  out                       65535    options(*varsize)                    unsigned char []
    186      d  outlen                       10i 0
    187      d  in                             *   value options(*string)               const unsigned char*
    188      d  inlen                        10i 0
    189 
    190      d htmlEncodeEntities...
    191      d                 pr            10i 0 extproc('htmlEncodeEntities')
              * Binds to the external procedure 'htmlEncodeEntities'; returns
              * a 4-byte signed integer (10i 0).
              * out: character buffer passed by reference; options(*varsize)
              * lets the caller pass a field shorter than 65535 bytes.
              * outlen, inlen: integers passed by reference (no VALUE), so
              * the callee can update them - presumably in/out byte counts,
              * confirm against the libxml2 API.
              * in: pointer by value; options(*string) also accepts a
              * character expression, passed as a null-terminated string.
              * quoteChar: 4-byte signed integer passed by value.
    192      d  out                       65535    options(*varsize)                    unsigned char []
    193      d  outlen                       10i 0
    194      d  in                             *   value options(*string)               const unsigned char*
    195      d  inlen                        10i 0
    196      d  quoteChar                    10i 0 value
    197 
    198      d htmlIsScriptAttribute...
    199      d                 pr            10i 0 extproc('htmlIsScriptAttribute')
              * Binds to the external procedure 'htmlIsScriptAttribute';
              * returns a 4-byte signed integer (10i 0).
              * name: pointer by value; options(*string) also accepts a
              * character expression, passed as a null-terminated string.
    200      d  name                           *   value options(*string)               const xmlChar *
    201 
    202      d htmlHandleOmittedElem...
    203      d                 pr            10i 0 extproc('htmlHandleOmittedElem')
              * Binds to the external procedure 'htmlHandleOmittedElem';
              * takes and returns a 4-byte signed integer (10i 0).
              * val is passed by value.
    204      d  val                          10i 0 value
    205 
    206       /if defined(LIBXML_PUSH_ENABLED)
    207 
    208       * Interfaces for the Push mode.
    209 
    210      d htmlCreatePushParserCtxt...
    211      d                 pr                  extproc('htmlCreatePushParserCtxt')
    212      d                                     like(htmlParserCtxtPtr)
              * Binds to the external procedure 'htmlCreatePushParserCtxt';
              * the result is typed like(htmlParserCtxtPtr).  Only compiled
              * when LIBXML_PUSH_ENABLED is defined.  All parameters are
              * passed by value.
              * chunk, filename: options(*string) also accepts a character
              * expression, passed as a null-terminated string.
              * size: 4-byte signed integer.  user_data: untyped pointer.
              * enc: typed like(xmlCharEncoding), declared outside this
              * member.
    213      d  sax                                value like(htmlSAXHandlerPtr)
    214      d  user_data                      *   value                                void *
    215      d  chunk                          *   value options(*string)               const char *
    216      d  size                         10i 0 value
    217      d  filename                       *   value options(*string)               const char *
    218      d  enc                                value like(xmlCharEncoding)
    219 
    220      d htmlParseChunk  pr            10i 0 extproc('htmlParseChunk')
              * Binds to the external procedure 'htmlParseChunk'; returns a
              * 4-byte signed integer (10i 0).  Only compiled when
              * LIBXML_PUSH_ENABLED is defined.  All parameters by value.
              * chunk: options(*string) also accepts a character expression,
              * passed as a null-terminated string.
              * size, terminate: 4-byte signed integers.
    221      d  ctxt                               value like(htmlParserCtxtPtr)
    222      d  chunk                          *   value options(*string)               const char *
    223      d  size                         10i 0 value
    224      d  terminate                    10i 0 value
    225       /endif                                                                    LIBXML_PUSH_ENABLED
    226 
    227      d htmlFreeParserCtxt...
    228      d                 pr                  extproc('htmlFreeParserCtxt')
              * Binds to the external procedure 'htmlFreeParserCtxt'.  No
              * return type is declared, so the call yields no value.
              * ctxt: parser context pointer, passed by value.
    229      d  ctxt                               value like(htmlParserCtxtPtr)
    230 
    231       * New set of simpler/more flexible APIs
    232 
    233       * htmlParserOption:
    234       *
    235       * This is the set of HTML parser options that can be passed down
    236       * to the htmlReadDoc() and similar calls.
    237 
    238      d htmlParserOption...
    239      d                 s             10i 0 based(######typedef######)           enum
              * Enumeration emulated as a 4-byte integer type template plus
              * named constants.  Each constant below has exactly one bit
              * set, so distinct options can be combined without overlap
              * (presumably into the "options" parameters of the
              * htmlRead* / htmlCtxt* prototypes - confirm with callers).
    240      d  HTML_PARSE_RECOVER...                                                   Relaxed parsing
    241      d                 c                   X'00000001'
    242      d  HTML_PARSE_NODEFDTD...                                                  No default doctype
    243      d                 c                   X'00000004'
    244      d  HTML_PARSE_NOERROR...                                                   No error reports
    245      d                 c                   X'00000020'
    246      d  HTML_PARSE_NOWARNING...                                                 No warning reports
    247      d                 c                   X'00000040'
    248      d  HTML_PARSE_PEDANTIC...                                                  Pedantic err reports
    249      d                 c                   X'00000080'
    250      d  HTML_PARSE_NOBLANKS...                                                  Remove blank nodes
    251      d                 c                   X'00000100'
    252      d  HTML_PARSE_NONET...                                                     Forbid net access
    253      d                 c                   X'00000800'
    254      d  HTML_PARSE_NOIMPLIED...                                                 No implied html/body
    255      d                 c                   X'00002000'
    256      d  HTML_PARSE_COMPACT...                                                   compact small txtnod
    257      d                 c                   X'00010000'
    258      d  HTML_PARSE_IGNORE_ENC...                                                Ignore encoding hint
    259      d                 c                   X'00200000'
    260 
    261      d htmlCtxtReset   pr                  extproc('htmlCtxtReset')
              * Binds to the external procedure 'htmlCtxtReset'.  No return
              * type is declared, so the call yields no value.
              * ctxt: parser context pointer, passed by value.
    262      d ctxt                                value like(htmlParserCtxtPtr)
    263 
    264      d htmlCtxtUseOptions...
    265      d                 pr            10i 0 extproc('htmlCtxtUseOptions')
              * Binds to the external procedure 'htmlCtxtUseOptions'; returns
              * a 4-byte signed integer (10i 0).
              * ctxt: parser context pointer, passed by value.
              * options: 4-byte signed integer by value (presumably a
              * combination of HTML_PARSE_* constants - confirm).
    266      d ctxt                                value like(htmlParserCtxtPtr)
    267      d options                       10i 0 value
    268 
    269      d htmlReadDoc     pr                  extproc('htmlReadDoc')
    270      d                                     like(htmlDocPtr)
              * Binds to the external procedure 'htmlReadDoc'; the result is
              * typed like(htmlDocPtr).  All parameters are passed by value.
              * cur, URL, encoding: options(*string) also accepts a character
              * expression, passed as a null-terminated string.
              * options: 4-byte signed integer (presumably HTML_PARSE_*
              * constants combined - confirm).
    271      d  cur                            *   value options(*string)               const xmlChar *
    272      d  URL                            *   value options(*string)               const char *
    273      d  encoding                       *   value options(*string)               const char *
    274      d  options                      10i 0 value
    275 
    276      d htmlReadFile    pr                  extproc('htmlReadFile')
    277      d                                     like(htmlDocPtr)
              * Binds to the external procedure 'htmlReadFile'; the result is
              * typed like(htmlDocPtr).  All parameters are passed by value.
              * URL, encoding: options(*string) also accepts a character
              * expression, passed as a null-terminated string.
              * options: 4-byte signed integer (presumably HTML_PARSE_*
              * constants combined - confirm).
    278      d  URL                            *   value options(*string)               const char *
    279      d  encoding                       *   value options(*string)               const char *
    280      d  options                      10i 0 value
    281 
    282      d htmlReadMemory  pr                  extproc('htmlReadMemory')
    283      d                                     like(htmlDocPtr)
              * Binds to the external procedure 'htmlReadMemory'; the result
              * is typed like(htmlDocPtr).  All parameters by value.
              * buffer, URL, encoding: options(*string) also accepts a
              * character expression, passed as a null-terminated string.
              * size: 4-byte signed integer (presumably the byte length of
              * buffer - confirm).  options: 4-byte signed integer.
    284      d  buffer                         *   value options(*string)               const char *
    285      d  size                         10i 0 value
    286      d  URL                            *   value options(*string)               const char *
    287      d  encoding                       *   value options(*string)               const char *
    288      d  options                      10i 0 value
    289 
    290      d htmlReadFd      pr                  extproc('htmlReadFd')
    291      d                                     like(htmlDocPtr)
              * Binds to the external procedure 'htmlReadFd'; the result is
              * typed like(htmlDocPtr).  All parameters are passed by value.
              * fd: 4-byte signed integer (presumably an open file
              * descriptor - confirm).
              * URL, encoding: options(*string) also accepts a character
              * expression, passed as a null-terminated string.
              * options: 4-byte signed integer.
    292      d  fd                           10i 0 value
    293      d  URL                            *   value options(*string)               const char *
    294      d  encoding                       *   value options(*string)               const char *
    295      d  options                      10i 0 value
    296 
    297      d htmlReadIO      pr                  extproc('htmlReadIO')
    298      d                                     like(htmlDocPtr)
              * Binds to the external procedure 'htmlReadIO'; the result is
              * typed like(htmlDocPtr).  All parameters are passed by value.
              * ioread, ioclose: typed like(xmlInputReadCallback) and
              * like(xmlInputCloseCallback), declared outside this member.
              * ioctx: untyped pointer (void *).
              * URL, encoding: options(*string) also accepts a character
              * expression, passed as a null-terminated string.
              * options: 4-byte signed integer.
    299      d  ioread                             value like(xmlInputReadCallback)
    300      d  ioclose                            value like(xmlInputCloseCallback)
    301      d  ioctx                          *   value                                void *
    302      d  URL                            *   value options(*string)               const char *
    303      d  encoding                       *   value options(*string)               const char *
    304      d  options                      10i 0 value
    305 
    306      d htmlCtxtReadDoc...
    307      d                 pr                  extproc('htmlCtxtReadDoc')
    308      d                                     like(htmlDocPtr)
              * Binds to the external procedure 'htmlCtxtReadDoc'; the result
              * is typed like(htmlDocPtr).  All parameters by value.
              * ctxt: typed like(xmlParserCtxtPtr); htmlParserCtxtPtr in this
              * member is declared like the same type.
              * cur, URL, encoding: options(*string) also accepts a character
              * expression, passed as a null-terminated string.
              * options: 4-byte signed integer.
    309      d  ctxt                               value like(xmlParserCtxtPtr)
    310      d  cur                            *   value options(*string)               const xmlChar *
    311      d  URL                            *   value options(*string)               const char *
    312      d  encoding                       *   value options(*string)               const char *
    313      d  options                      10i 0 value
    314 
    315      d htmlCtxtReadFile...
    316      d                 pr                  extproc('htmlCtxtReadFile')
    317      d                                     like(htmlDocPtr)
              * Binds to the external procedure 'htmlCtxtReadFile'; the
              * result is typed like(htmlDocPtr).  All parameters by value.
              * ctxt: typed like(xmlParserCtxtPtr); htmlParserCtxtPtr in this
              * member is declared like the same type.
              * filename, encoding: options(*string) also accepts a character
              * expression, passed as a null-terminated string.
              * options: 4-byte signed integer.
    318      d  ctxt                               value like(xmlParserCtxtPtr)
    319      d  filename                       *   value options(*string)               const char *
    320      d  encoding                       *   value options(*string)               const char *
    321      d  options                      10i 0 value
    322 
    323      d htmlCtxtReadMemory...
    324      d                 pr                  extproc('htmlCtxtReadMemory')
    325      d                                     like(htmlDocPtr)
              * Binds to the external procedure 'htmlCtxtReadMemory'; the
              * result is typed like(htmlDocPtr).  All parameters by value.
              * ctxt: typed like(xmlParserCtxtPtr); htmlParserCtxtPtr in this
              * member is declared like the same type.
              * buffer, URL, encoding: options(*string) also accepts a
              * character expression, passed as a null-terminated string.
              * size, options: 4-byte signed integers.
    326      d  ctxt                               value like(xmlParserCtxtPtr)
    327      d  buffer                         *   value options(*string)               const char *
    328      d  size                         10i 0 value
    329      d  URL                            *   value options(*string)               const char *
    330      d  encoding                       *   value options(*string)               const char *
    331      d  options                      10i 0 value
    332 
    333      d htmlCtxtReadFd  pr                  extproc('htmlCtxtReadFd')
    334      d                                     like(htmlDocPtr)
              * Binds to the external procedure 'htmlCtxtReadFd'; the result
              * is typed like(htmlDocPtr).  All parameters by value.
              * ctxt: typed like(xmlParserCtxtPtr); htmlParserCtxtPtr in this
              * member is declared like the same type.
              * fd, options: 4-byte signed integers.
              * URL, encoding: options(*string) also accepts a character
              * expression, passed as a null-terminated string.
    335      d  ctxt                               value like(xmlParserCtxtPtr)
    336      d  fd                           10i 0 value
    337      d  URL                            *   value options(*string)               const char *
    338      d  encoding                       *   value options(*string)               const char *
    339      d  options                      10i 0 value
    340 
    341      d htmlCtxtReadIO  pr                  extproc('htmlCtxtReadIO')
    342      d                                     like(htmlDocPtr)
              * Binds to the external procedure 'htmlCtxtReadIO'; the result
              * is typed like(htmlDocPtr).  All parameters by value.
              * ctxt: typed like(xmlParserCtxtPtr); htmlParserCtxtPtr in this
              * member is declared like the same type.
              * ioread, ioclose: typed like(xmlInputReadCallback) and
              * like(xmlInputCloseCallback), declared outside this member.
              * ioctx: untyped pointer (void *).
              * URL, encoding: options(*string) also accepts a character
              * expression, passed as a null-terminated string.
              * options: 4-byte signed integer.
    343      d  ctxt                               value like(xmlParserCtxtPtr)
    344      d  ioread                             value like(xmlInputReadCallback)
    345      d  ioclose                            value like(xmlInputCloseCallback)
    346      d  ioctx                          *   value                                void *
    347      d  URL                            *   value options(*string)               const char *
    348      d  encoding                       *   value options(*string)               const char *
    349      d  options                      10i 0 value
    350 
    351       * Further knowledge of HTML structure
    352 
    353      d htmlStatus      s             10i 0 based(######typedef######)           enum
              * Enumeration emulated as a 4-byte integer type template plus
              * named constants.  HTML_INVALID, HTML_DEPRECATED and
              * HTML_VALID are single-bit values; HTML_REQUIRED (X'000C') is
              * X'0008' combined with the HTML_VALID bit (X'0004').
    354      d  HTML_NA        c                   X'0000'                              No check at all
    355      d  HTML_INVALID   c                   X'0001'
    356      d  HTML_DEPRECATED...
    357      d                 c                   X'0002'
    358      d  HTML_VALID     c                   X'0004'
    359      d  HTML_REQUIRED  c                   X'000C'                              HTML_VALID ored-in
    360 
    361       * Using htmlElemDesc rather than name here, to emphasise the fact
    362       *  that otherwise there's a lookup overhead
    363 
    364      d htmlAttrAllowed...
    365      d                 pr                  extproc('htmlAttrAllowed')
    366      d                                     like(htmlStatus)
              * Binds to the external procedure 'htmlAttrAllowed'; the result
              * is typed like(htmlStatus).  The parameters carry placeholder
              * names and are all passed by value.
              * #param1: element descriptor pointer.
              * #param2: pointer; options(*string) also accepts a character
              * expression, passed as a null-terminated string.
              * #param3: 4-byte signed integer.
    367      d  #param1                            value like(htmlElemDescPtr)          const
    368      d  #param2                        *   value options(*string)               const xmlChar *
    369      d  #param3                      10i 0 value
    370 
    371      d htmlElementAllowedHere...
    372      d                 pr            10i 0 extproc('htmlElementAllowedHere')
              * Binds to the external procedure 'htmlElementAllowedHere';
              * returns a 4-byte signed integer (10i 0).  The parameters
              * carry placeholder names and are passed by value.
              * #param1: element descriptor pointer.
              * #param2: pointer; options(*string) also accepts a character
              * expression, passed as a null-terminated string.
    373      d  #param1                            value like(htmlElemDescPtr)          const
    374      d  #param2                        *   value options(*string)               const xmlChar *
    375 
    376      d htmlElementStatusHere...
    377      d                 pr                  extproc('htmlElementStatusHere')
    378      d                                     like(htmlStatus)
              * Binds to the external procedure 'htmlElementStatusHere'; the
              * result is typed like(htmlStatus).
              * #param1, #param2: element descriptor pointers, both passed
              * by value (placeholder names).
    379      d  #param1                            value like(htmlElemDescPtr)          const
    380      d  #param2                            value like(htmlElemDescPtr)          const
    381 
    382      d htmlNodeStatus  pr                  extproc('htmlNodeStatus')
    383      d                                     like(htmlStatus)
              * Binds to the external procedure 'htmlNodeStatus'; the result
              * is typed like(htmlStatus).  Parameters carry placeholder
              * names and are passed by value.
              * #param1: node pointer.  #param2: 4-byte signed integer.
    384      d  #param1                            value like(htmlNodePtr)
    385      d  #param2                      10i 0 value
    386 
    387       * C macros implemented as procedures for ILE/RPG support.
    388 
    389      d htmlDefaultSubelement...
    390      d                 pr              *   extproc('__htmlDefaultSubelement')   const char *
              * Binds to the external procedure '__htmlDefaultSubelement'
              * (note the '__' prefix on the bound name); returns a pointer.
              * elt: plain pointer passed by value; per the trailing note it
              * addresses an htmlElemDesc.
    391      d  elt                            *   value                                const htmlElemDesc *
    392 
    393      d htmlElementAllowedHereDesc...
    394      d                 pr            10i 0 extproc(
    395      d                                     '__htmlElementAllowedHereDesc')
              * Binds to the external procedure
              * '__htmlElementAllowedHereDesc' (note the '__' prefix on the
              * bound name); returns a 4-byte signed integer (10i 0).
              * parent, elt: plain pointers passed by value; per the trailing
              * notes each addresses an htmlElemDesc.
    396      d  parent                         *   value                                const htmlElemDesc *
    397      d  elt                            *   value                                const htmlElemDesc *
    398 
    399      d htmlRequiredAttrs...
    400      d                 pr              *   extproc('__htmlRequiredAttrs')        const char * *
              * Binds to the external procedure '__htmlRequiredAttrs' (note
              * the '__' prefix on the bound name); returns a pointer, which
              * the trailing note describes as const char * *.
              * elt: plain pointer passed by value; per the trailing note it
              * addresses an htmlElemDesc.
    401      d  elt                            *   value                                const htmlElemDesc *
    402 
    403       /endif                                                                    LIBXML_HTML_ENABLED
    404       /endif                                                                    HTML_PARSER_H__
    405