Home | History | Annotate | Download | only in examples
      1 /***************************************************************************
      2  *                                  _   _ ____  _
      3  *  Project                     ___| | | |  _ \| |
      4  *                             / __| | | | |_) | |
      5  *                            | (__| |_| |  _ <| |___
      6  *                             \___|\___/|_| \_\_____|
      7  *
      8  * Copyright (C) 1998 - 2015, Daniel Stenberg, <daniel (at) haxx.se>, et al.
      9  *
     10  * This software is licensed as described in the file COPYING, which
     11  * you should have received as part of this distribution. The terms
     12  * are also available at https://curl.haxx.se/docs/copyright.html.
     13  *
     14  * You may opt to use, copy, modify, merge, publish, distribute and/or sell
     15  * copies of the Software, and permit persons to whom the Software is
     16  * furnished to do so, under the terms of the COPYING file.
     17  *
     18  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
     19  * KIND, either express or implied.
     20  *
     21  ***************************************************************************/
     22 /* <DESC>
     23  * Get a web page, extract the title with libxml.
     24  * </DESC>
     25  */
     26 // Written by Lars Nilsson
     27 //
     28 // GNU C++ compile command line suggestion (edit paths accordingly):
     29 //
     30 // g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cpp \
     31 // -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
     32 
     33 #include <stdio.h>
     34 #include <string.h>
     35 #include <stdlib.h>
     36 #include <string>
     37 #include <curl/curl.h>
     38 #include <libxml/HTMLparser.h>
     39 
     40 //
     41 //  Case-insensitive string comparison
     42 //
     43 
     44 #ifdef _MSC_VER
     45 #define COMPARE(a, b) (!_stricmp((a), (b)))
     46 #else
     47 #define COMPARE(a, b) (!strcasecmp((a), (b)))
     48 #endif
     49 
     50 //
     51 //  libxml callback context structure
     52 //
     53 
     54 struct Context
     55 {
     56   Context(): addTitle(false) { }
     57 
     58   bool addTitle;
     59   std::string title;
     60 };
     61 
     62 //
     63 //  libcurl variables for error strings and returned data
     64 
     65 static char errorBuffer[CURL_ERROR_SIZE];
     66 static std::string buffer;
     67 
     68 //
     69 //  libcurl write callback function
     70 //
     71 
     72 static int writer(char *data, size_t size, size_t nmemb,
     73                   std::string *writerData)
     74 {
     75   if (writerData == NULL)
     76     return 0;
     77 
     78   writerData->append(data, size*nmemb);
     79 
     80   return size * nmemb;
     81 }
     82 
     83 //
     84 //  libcurl connection initialization
     85 //
     86 
     87 static bool init(CURL *&conn, char *url)
     88 {
     89   CURLcode code;
     90 
     91   conn = curl_easy_init();
     92 
     93   if (conn == NULL)
     94   {
     95     fprintf(stderr, "Failed to create CURL connection\n");
     96 
     97     exit(EXIT_FAILURE);
     98   }
     99 
    100   code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
    101   if (code != CURLE_OK)
    102   {
    103     fprintf(stderr, "Failed to set error buffer [%d]\n", code);
    104 
    105     return false;
    106   }
    107 
    108   code = curl_easy_setopt(conn, CURLOPT_URL, url);
    109   if (code != CURLE_OK)
    110   {
    111     fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
    112 
    113     return false;
    114   }
    115 
    116   code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
    117   if (code != CURLE_OK)
    118   {
    119     fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
    120 
    121     return false;
    122   }
    123 
    124   code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
    125   if (code != CURLE_OK)
    126   {
    127     fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
    128 
    129     return false;
    130   }
    131 
    132   code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
    133   if (code != CURLE_OK)
    134   {
    135     fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
    136 
    137     return false;
    138   }
    139 
    140   return true;
    141 }
    142 
    143 //
    144 //  libxml start element callback function
    145 //
    146 
    147 static void StartElement(void *voidContext,
    148                          const xmlChar *name,
    149                          const xmlChar **attributes)
    150 {
    151   Context *context = (Context *)voidContext;
    152 
    153   if (COMPARE((char *)name, "TITLE"))
    154   {
    155     context->title = "";
    156     context->addTitle = true;
    157   }
    158   (void) attributes;
    159 }
    160 
    161 //
    162 //  libxml end element callback function
    163 //
    164 
    165 static void EndElement(void *voidContext,
    166                        const xmlChar *name)
    167 {
    168   Context *context = (Context *)voidContext;
    169 
    170   if (COMPARE((char *)name, "TITLE"))
    171     context->addTitle = false;
    172 }
    173 
    174 //
    175 //  Text handling helper function
    176 //
    177 
    178 static void handleCharacters(Context *context,
    179                              const xmlChar *chars,
    180                              int length)
    181 {
    182   if (context->addTitle)
    183     context->title.append((char *)chars, length);
    184 }
    185 
    186 //
    187 //  libxml PCDATA callback function
    188 //
    189 
    190 static void Characters(void *voidContext,
    191                        const xmlChar *chars,
    192                        int length)
    193 {
    194   Context *context = (Context *)voidContext;
    195 
    196   handleCharacters(context, chars, length);
    197 }
    198 
    199 //
    200 //  libxml CDATA callback function
    201 //
    202 
    203 static void cdata(void *voidContext,
    204                   const xmlChar *chars,
    205                   int length)
    206 {
    207   Context *context = (Context *)voidContext;
    208 
    209   handleCharacters(context, chars, length);
    210 }
    211 
    212 //
    213 //  libxml SAX callback structure
    214 //
    215 
    216 static htmlSAXHandler saxHandler =
    217 {
    218   NULL,
    219   NULL,
    220   NULL,
    221   NULL,
    222   NULL,
    223   NULL,
    224   NULL,
    225   NULL,
    226   NULL,
    227   NULL,
    228   NULL,
    229   NULL,
    230   NULL,
    231   NULL,
    232   StartElement,
    233   EndElement,
    234   NULL,
    235   Characters,
    236   NULL,
    237   NULL,
    238   NULL,
    239   NULL,
    240   NULL,
    241   NULL,
    242   NULL,
    243   cdata,
    244   NULL
    245 };
    246 
    247 //
    248 //  Parse given (assumed to be) HTML text and return the title
    249 //
    250 
    251 static void parseHtml(const std::string &html,
    252                       std::string &title)
    253 {
    254   htmlParserCtxtPtr ctxt;
    255   Context context;
    256 
    257   ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
    258                                   XML_CHAR_ENCODING_NONE);
    259 
    260   htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
    261   htmlParseChunk(ctxt, "", 0, 1);
    262 
    263   htmlFreeParserCtxt(ctxt);
    264 
    265   title = context.title;
    266 }
    267 
    268 int main(int argc, char *argv[])
    269 {
    270   CURL *conn = NULL;
    271   CURLcode code;
    272   std::string title;
    273 
    274   // Ensure one argument is given
    275 
    276   if (argc != 2)
    277   {
    278     fprintf(stderr, "Usage: %s <url>\n", argv[0]);
    279 
    280     exit(EXIT_FAILURE);
    281   }
    282 
    283   curl_global_init(CURL_GLOBAL_DEFAULT);
    284 
    285   // Initialize CURL connection
    286 
    287   if (!init(conn, argv[1]))
    288   {
    289     fprintf(stderr, "Connection initializion failed\n");
    290 
    291     exit(EXIT_FAILURE);
    292   }
    293 
    294   // Retrieve content for the URL
    295 
    296   code = curl_easy_perform(conn);
    297   curl_easy_cleanup(conn);
    298 
    299   if (code != CURLE_OK)
    300   {
    301     fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
    302 
    303     exit(EXIT_FAILURE);
    304   }
    305 
    306   // Parse the (assumed) HTML code
    307 
    308   parseHtml(buffer, title);
    309 
    310   // Display the extracted title
    311 
    312   printf("Title: %s\n", title.c_str());
    313 
    314   return EXIT_SUCCESS;
    315 }
    316