1 /*************************************************************************** 2 * _ _ ____ _ 3 * Project ___| | | | _ \| | 4 * / __| | | | |_) | | 5 * | (__| |_| | _ <| |___ 6 * \___|\___/|_| \_\_____| 7 * 8 * Copyright (C) 1998 - 2017, Daniel Stenberg, <daniel (at) haxx.se>, et al. 9 * 10 * This software is licensed as described in the file COPYING, which 11 * you should have received as part of this distribution. The terms 12 * are also available at https://curl.haxx.se/docs/copyright.html. 13 * 14 * You may opt to use, copy, modify, merge, publish, distribute and/or sell 15 * copies of the Software, and permit persons to whom the Software is 16 * furnished to do so, under the terms of the COPYING file. 17 * 18 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY 19 * KIND, either express or implied. 20 * 21 ***************************************************************************/ 22 /* <DESC> 23 * Get a web page, extract the title with libxml. 24 * </DESC> 25 26 Written by Lars Nilsson 27 28 GNU C++ compile command line suggestion (edit paths accordingly): 29 30 g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cpp \ 31 -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2 32 */ 33 #include <stdio.h> 34 #include <string.h> 35 #include <stdlib.h> 36 #include <string> 37 #include <curl/curl.h> 38 #include <libxml/HTMLparser.h> 39 40 // 41 // Case-insensitive string comparison 42 // 43 44 #ifdef _MSC_VER 45 #define COMPARE(a, b) (!_stricmp((a), (b))) 46 #else 47 #define COMPARE(a, b) (!strcasecmp((a), (b))) 48 #endif 49 50 // 51 // libxml callback context structure 52 // 53 54 struct Context 55 { 56 Context(): addTitle(false) { } 57 58 bool addTitle; 59 std::string title; 60 }; 61 62 // 63 // libcurl variables for error strings and returned data 64 65 static char errorBuffer[CURL_ERROR_SIZE]; 66 static std::string buffer; 67 68 // 69 // libcurl write callback function 70 // 71 72 static int writer(char *data, size_t size, size_t nmemb, 73 std::string *writerData) 74 { 75 if(writerData == NULL) 76 return 0; 77 78 writerData->append(data, size*nmemb); 79 80 return size * nmemb; 81 } 82 83 // 84 // libcurl connection initialization 85 // 86 87 static bool init(CURL *&conn, char *url) 88 { 89 CURLcode code; 90 91 conn = curl_easy_init(); 92 93 if(conn == NULL) { 94 fprintf(stderr, "Failed to create CURL connection\n"); 95 exit(EXIT_FAILURE); 96 } 97 98 code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer); 99 if(code != CURLE_OK) { 100 fprintf(stderr, "Failed to set error buffer [%d]\n", code); 101 return false; 102 } 103 104 code = curl_easy_setopt(conn, CURLOPT_URL, url); 105 if(code != CURLE_OK) { 106 fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer); 107 return false; 108 } 109 110 code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L); 111 if(code != CURLE_OK) { 112 fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer); 113 return false; 114 } 115 116 code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer); 117 if(code != CURLE_OK) { 118 fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer); 119 return false; 120 } 121 122 code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer); 123 if(code != CURLE_OK) { 124 fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer); 125 return false; 126 } 127 128 return true; 129 } 130 131 // 132 // libxml start element callback function 133 // 134 135 static void StartElement(void *voidContext, 136 const xmlChar *name, 137 const xmlChar **attributes) 138 { 139 Context *context = (Context *)voidContext; 140 141 if(COMPARE((char *)name, "TITLE")) { 142 context->title = ""; 143 context->addTitle = true; 144 } 145 (void) attributes; 146 } 147 148 // 149 // libxml end element callback function 150 // 151 152 static void EndElement(void *voidContext, 153 const xmlChar *name) 154 { 155 Context *context = (Context *)voidContext; 156 157 if(COMPARE((char *)name, "TITLE")) 158 context->addTitle = false; 159 } 160 161 // 162 // Text handling helper function 163 // 164 165 static void handleCharacters(Context *context, 166 const xmlChar *chars, 167 int length) 168 { 169 if(context->addTitle) 170 context->title.append((char *)chars, length); 171 } 172 173 // 174 // libxml PCDATA callback function 175 // 176 177 static void Characters(void *voidContext, 178 const xmlChar *chars, 179 int length) 180 { 181 Context *context = (Context *)voidContext; 182 183 handleCharacters(context, chars, length); 184 } 185 186 // 187 // libxml CDATA callback function 188 // 189 190 static void cdata(void *voidContext, 191 const xmlChar *chars, 192 int length) 193 { 194 Context *context = (Context *)voidContext; 195 196 handleCharacters(context, chars, length); 197 } 198 199 // 200 // libxml SAX callback structure 201 // 202 203 static htmlSAXHandler saxHandler = 204 { 205 NULL, 206 NULL, 207 NULL, 208 NULL, 209 NULL, 210 NULL, 211 NULL, 212 NULL, 213 NULL, 214 NULL, 215 NULL, 216 NULL, 217 NULL, 218 NULL, 219 StartElement, 220 EndElement, 221 NULL, 222 Characters, 223 NULL, 224 NULL, 225 NULL, 226 NULL, 227 NULL, 228 NULL, 229 NULL, 230 cdata, 231 NULL 232 }; 233 234 // 235 // Parse given (assumed to be) HTML text and return the title 236 // 237 238 static void parseHtml(const std::string &html, 239 std::string &title) 240 { 241 htmlParserCtxtPtr ctxt; 242 Context context; 243 244 ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "", 245 XML_CHAR_ENCODING_NONE); 246 247 htmlParseChunk(ctxt, html.c_str(), html.size(), 0); 248 htmlParseChunk(ctxt, "", 0, 1); 249 250 htmlFreeParserCtxt(ctxt); 251 252 title = context.title; 253 } 254 255 int main(int argc, char *argv[]) 256 { 257 CURL *conn = NULL; 258 CURLcode code; 259 std::string title; 260 261 // Ensure one argument is given 262 263 if(argc != 2) { 264 fprintf(stderr, "Usage: %s <url>\n", argv[0]); 265 exit(EXIT_FAILURE); 266 } 267 268 curl_global_init(CURL_GLOBAL_DEFAULT); 269 270 // Initialize CURL connection 271 272 if(!init(conn, argv[1])) { 273 fprintf(stderr, "Connection initializion failed\n"); 274 exit(EXIT_FAILURE); 275 } 276 277 // Retrieve content for the URL 278 279 code = curl_easy_perform(conn); 280 curl_easy_cleanup(conn); 281 282 if(code != CURLE_OK) { 283 fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer); 284 exit(EXIT_FAILURE); 285 } 286 287 // Parse the (assumed) HTML code 288 parseHtml(buffer, title); 289 290 // Display the extracted title 291 printf("Title: %s\n", title.c_str()); 292 293 return EXIT_SUCCESS; 294 } 295