Home | History | Annotate | Download | only in examples
      1 /***************************************************************************
      2  *                                  _   _ ____  _
      3  *  Project                     ___| | | |  _ \| |
      4  *                             / __| | | | |_) | |
      5  *                            | (__| |_| |  _ <| |___
      6  *                             \___|\___/|_| \_\_____|
      7  *
      8  * Copyright (C) 1998 - 2017, Daniel Stenberg, <daniel (at) haxx.se>, et al.
      9  *
     10  * This software is licensed as described in the file COPYING, which
     11  * you should have received as part of this distribution. The terms
     12  * are also available at https://curl.haxx.se/docs/copyright.html.
     13  *
     14  * You may opt to use, copy, modify, merge, publish, distribute and/or sell
     15  * copies of the Software, and permit persons to whom the Software is
     16  * furnished to do so, under the terms of the COPYING file.
     17  *
     18  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
     19  * KIND, either express or implied.
     20  *
     21  ***************************************************************************/
     22 /* <DESC>
     23  * Get a web page, extract the title with libxml.
     24  * </DESC>
     25 
     26  Written by Lars Nilsson
     27 
     28  GNU C++ compile command line suggestion (edit paths accordingly):
     29 
     30  g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cpp \
     31  -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
     32 */
     #include <limits.h>
     #include <stdio.h>
     #include <stdlib.h>
     #include <string.h>
     #include <string>
     #include <curl/curl.h>
     #include <libxml/HTMLparser.h>
     39 
     //
     //  Case-insensitive C string equality test
     //
     //  COMPARE(a, b) evaluates to true when the two NUL-terminated strings are
     //  equal ignoring letter case. The platform's comparison routine is picked
     //  at compile time.
     //

     #if defined(_MSC_VER)
     #  define COMPARE(a, b) (_stricmp((a), (b)) == 0)
     #else
     #  define COMPARE(a, b) (strcasecmp((a), (b)) == 0)
     #endif
     49 
     //
     //  libxml callback context structure
     //
     //  One instance is handed to the SAX callbacks as their user data and
     //  carries the title extraction state between callback invocations.
     //

     struct Context
     {
       // True while the parser is between a TITLE start tag and its end tag.
       bool addTitle;

       // Text collected for the title so far.
       std::string title;

       // Starts outside any title element with an empty title.
       Context()
         : addTitle(false),
           title()
       {
       }
     };
     61 
     62 //
     63 //  libcurl variables for error strings and returned data
     64 
     65 static char errorBuffer[CURL_ERROR_SIZE];
     66 static std::string buffer;
     67 
     //
     //  libcurl write callback function
     //
     //  Appends the size * nmemb bytes starting at data to *writerData.
     //
     //  Returns the number of bytes stored, or 0 when writerData is NULL. The
     //  return type is size_t because that is the type libcurl's write callback
     //  prototype uses; libcurl compares the returned count with the amount it
     //  delivered, so a narrower return type could misreport the count.
     //

     static size_t writer(char *data, size_t size, size_t nmemb,
                          std::string *writerData)
     {
       if(writerData == NULL)
         return 0;

       const size_t bytes = size * nmemb;

       writerData->append(data, bytes);

       return bytes;
     }
     82 
     83 //
     84 //  libcurl connection initialization
     85 //
     86 
     87 static bool init(CURL *&conn, char *url)
     88 {
     89   CURLcode code;
     90 
     91   conn = curl_easy_init();
     92 
     93   if(conn == NULL) {
     94     fprintf(stderr, "Failed to create CURL connection\n");
     95     exit(EXIT_FAILURE);
     96   }
     97 
     98   code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
     99   if(code != CURLE_OK) {
    100     fprintf(stderr, "Failed to set error buffer [%d]\n", code);
    101     return false;
    102   }
    103 
    104   code = curl_easy_setopt(conn, CURLOPT_URL, url);
    105   if(code != CURLE_OK) {
    106     fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
    107     return false;
    108   }
    109 
    110   code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
    111   if(code != CURLE_OK) {
    112     fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
    113     return false;
    114   }
    115 
    116   code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
    117   if(code != CURLE_OK) {
    118     fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
    119     return false;
    120   }
    121 
    122   code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
    123   if(code != CURLE_OK) {
    124     fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
    125     return false;
    126   }
    127 
    128   return true;
    129 }
    130 
    131 //
    132 //  libxml start element callback function
    133 //
    134 
    135 static void StartElement(void *voidContext,
    136                          const xmlChar *name,
    137                          const xmlChar **attributes)
    138 {
    139   Context *context = (Context *)voidContext;
    140 
    141   if(COMPARE((char *)name, "TITLE")) {
    142     context->title = "";
    143     context->addTitle = true;
    144   }
    145   (void) attributes;
    146 }
    147 
    148 //
    149 //  libxml end element callback function
    150 //
    151 
    152 static void EndElement(void *voidContext,
    153                        const xmlChar *name)
    154 {
    155   Context *context = (Context *)voidContext;
    156 
    157   if(COMPARE((char *)name, "TITLE"))
    158     context->addTitle = false;
    159 }
    160 
    161 //
    162 //  Text handling helper function
    163 //
    164 
    165 static void handleCharacters(Context *context,
    166                              const xmlChar *chars,
    167                              int length)
    168 {
    169   if(context->addTitle)
    170     context->title.append((char *)chars, length);
    171 }
    172 
    173 //
    174 //  libxml PCDATA callback function
    175 //
    176 
    177 static void Characters(void *voidContext,
    178                        const xmlChar *chars,
    179                        int length)
    180 {
    181   Context *context = (Context *)voidContext;
    182 
    183   handleCharacters(context, chars, length);
    184 }
    185 
    186 //
    187 //  libxml CDATA callback function
    188 //
    189 
    190 static void cdata(void *voidContext,
    191                   const xmlChar *chars,
    192                   int length)
    193 {
    194   Context *context = (Context *)voidContext;
    195 
    196   handleCharacters(context, chars, length);
    197 }
    198 
    199 //
    200 //  libxml SAX callback structure
    201 //
    202 
    203 static htmlSAXHandler saxHandler =
    204 {
    205   NULL,
    206   NULL,
    207   NULL,
    208   NULL,
    209   NULL,
    210   NULL,
    211   NULL,
    212   NULL,
    213   NULL,
    214   NULL,
    215   NULL,
    216   NULL,
    217   NULL,
    218   NULL,
    219   StartElement,
    220   EndElement,
    221   NULL,
    222   Characters,
    223   NULL,
    224   NULL,
    225   NULL,
    226   NULL,
    227   NULL,
    228   NULL,
    229   NULL,
    230   cdata,
    231   NULL
    232 };
    233 
    234 //
    235 //  Parse given (assumed to be) HTML text and return the title
    236 //
    237 
    238 static void parseHtml(const std::string &html,
    239                       std::string &title)
    240 {
    241   htmlParserCtxtPtr ctxt;
    242   Context context;
    243 
    244   ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
    245                                   XML_CHAR_ENCODING_NONE);
    246 
    247   htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
    248   htmlParseChunk(ctxt, "", 0, 1);
    249 
    250   htmlFreeParserCtxt(ctxt);
    251 
    252   title = context.title;
    253 }
    254 
    255 int main(int argc, char *argv[])
    256 {
    257   CURL *conn = NULL;
    258   CURLcode code;
    259   std::string title;
    260 
    261   // Ensure one argument is given
    262 
    263   if(argc != 2) {
    264     fprintf(stderr, "Usage: %s <url>\n", argv[0]);
    265     exit(EXIT_FAILURE);
    266   }
    267 
    268   curl_global_init(CURL_GLOBAL_DEFAULT);
    269 
    270   // Initialize CURL connection
    271 
    272   if(!init(conn, argv[1])) {
    273     fprintf(stderr, "Connection initializion failed\n");
    274     exit(EXIT_FAILURE);
    275   }
    276 
    277   // Retrieve content for the URL
    278 
    279   code = curl_easy_perform(conn);
    280   curl_easy_cleanup(conn);
    281 
    282   if(code != CURLE_OK) {
    283     fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
    284     exit(EXIT_FAILURE);
    285   }
    286 
    287   // Parse the (assumed) HTML code
    288   parseHtml(buffer, title);
    289 
    290   // Display the extracted title
    291   printf("Title: %s\n", title.c_str());
    292 
    293   return EXIT_SUCCESS;
    294 }
    295