1 /*************************************************************************** 2 * _ _ ____ _ 3 * Project ___| | | | _ \| | 4 * / __| | | | |_) | | 5 * | (__| |_| | _ <| |___ 6 * \___|\___/|_| \_\_____| 7 * 8 * Copyright (C) 1998 - 2015, Daniel Stenberg, <daniel (at) haxx.se>, et al. 9 * 10 * This software is licensed as described in the file COPYING, which 11 * you should have received as part of this distribution. The terms 12 * are also available at https://curl.haxx.se/docs/copyright.html. 13 * 14 * You may opt to use, copy, modify, merge, publish, distribute and/or sell 15 * copies of the Software, and permit persons to whom the Software is 16 * furnished to do so, under the terms of the COPYING file. 17 * 18 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY 19 * KIND, either express or implied. 20 * 21 ***************************************************************************/ 22 /* <DESC> 23 * Get a web page, extract the title with libxml. 24 * </DESC> 25 */ 26 // Written by Lars Nilsson 27 // 28 // GNU C++ compile command line suggestion (edit paths accordingly): 29 // 30 // g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cpp \ 31 // -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2 32 33 #include <stdio.h> 34 #include <string.h> 35 #include <stdlib.h> 36 #include <string> 37 #include <curl/curl.h> 38 #include <libxml/HTMLparser.h> 39 40 // 41 // Case-insensitive string comparison 42 // 43 44 #ifdef _MSC_VER 45 #define COMPARE(a, b) (!_stricmp((a), (b))) 46 #else 47 #define COMPARE(a, b) (!strcasecmp((a), (b))) 48 #endif 49 50 // 51 // libxml callback context structure 52 // 53 54 struct Context 55 { 56 Context(): addTitle(false) { } 57 58 bool addTitle; 59 std::string title; 60 }; 61 62 // 63 // libcurl variables for error strings and returned data 64 65 static char errorBuffer[CURL_ERROR_SIZE]; 66 static std::string buffer; 67 68 // 69 // libcurl write callback function 70 // 71 72 static int writer(char *data, size_t size, size_t nmemb, 73 std::string *writerData) 74 { 75 if (writerData == NULL) 76 return 0; 77 78 writerData->append(data, size*nmemb); 79 80 return size * nmemb; 81 } 82 83 // 84 // libcurl connection initialization 85 // 86 87 static bool init(CURL *&conn, char *url) 88 { 89 CURLcode code; 90 91 conn = curl_easy_init(); 92 93 if (conn == NULL) 94 { 95 fprintf(stderr, "Failed to create CURL connection\n"); 96 97 exit(EXIT_FAILURE); 98 } 99 100 code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer); 101 if (code != CURLE_OK) 102 { 103 fprintf(stderr, "Failed to set error buffer [%d]\n", code); 104 105 return false; 106 } 107 108 code = curl_easy_setopt(conn, CURLOPT_URL, url); 109 if (code != CURLE_OK) 110 { 111 fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer); 112 113 return false; 114 } 115 116 code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L); 117 if (code != CURLE_OK) 118 { 119 fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer); 120 121 return false; 122 } 123 124 code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer); 125 if (code != CURLE_OK) 126 { 127 fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer); 128 129 return false; 130 } 131 132 code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer); 133 if (code != CURLE_OK) 134 { 135 fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer); 136 137 return false; 138 } 139 140 return true; 141 } 142 143 // 144 // libxml start element callback function 145 // 146 147 static void StartElement(void *voidContext, 148 const xmlChar *name, 149 const xmlChar **attributes) 150 { 151 Context *context = (Context *)voidContext; 152 153 if (COMPARE((char *)name, "TITLE")) 154 { 155 context->title = ""; 156 context->addTitle = true; 157 } 158 (void) attributes; 159 } 160 161 // 162 // libxml end element callback function 163 // 164 165 static void EndElement(void *voidContext, 166 const xmlChar *name) 167 { 168 Context *context = (Context *)voidContext; 169 170 if (COMPARE((char *)name, "TITLE")) 171 context->addTitle = false; 172 } 173 174 // 175 // Text handling helper function 176 // 177 178 static void handleCharacters(Context *context, 179 const xmlChar *chars, 180 int length) 181 { 182 if (context->addTitle) 183 context->title.append((char *)chars, length); 184 } 185 186 // 187 // libxml PCDATA callback function 188 // 189 190 static void Characters(void *voidContext, 191 const xmlChar *chars, 192 int length) 193 { 194 Context *context = (Context *)voidContext; 195 196 handleCharacters(context, chars, length); 197 } 198 199 // 200 // libxml CDATA callback function 201 // 202 203 static void cdata(void *voidContext, 204 const xmlChar *chars, 205 int length) 206 { 207 Context *context = (Context *)voidContext; 208 209 handleCharacters(context, chars, length); 210 } 211 212 // 213 // libxml SAX callback structure 214 // 215 216 static htmlSAXHandler saxHandler = 217 { 218 NULL, 219 NULL, 220 NULL, 221 NULL, 222 NULL, 223 NULL, 224 NULL, 225 NULL, 226 NULL, 227 NULL, 228 NULL, 229 NULL, 230 NULL, 231 NULL, 232 StartElement, 233 EndElement, 234 NULL, 235 Characters, 236 NULL, 237 NULL, 238 NULL, 239 NULL, 240 NULL, 241 NULL, 242 NULL, 243 cdata, 244 NULL 245 }; 246 247 // 248 // Parse given (assumed to be) HTML text and return the title 249 // 250 251 static void parseHtml(const std::string &html, 252 std::string &title) 253 { 254 htmlParserCtxtPtr ctxt; 255 Context context; 256 257 ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "", 258 XML_CHAR_ENCODING_NONE); 259 260 htmlParseChunk(ctxt, html.c_str(), html.size(), 0); 261 htmlParseChunk(ctxt, "", 0, 1); 262 263 htmlFreeParserCtxt(ctxt); 264 265 title = context.title; 266 } 267 268 int main(int argc, char *argv[]) 269 { 270 CURL *conn = NULL; 271 CURLcode code; 272 std::string title; 273 274 // Ensure one argument is given 275 276 if (argc != 2) 277 { 278 fprintf(stderr, "Usage: %s <url>\n", argv[0]); 279 280 exit(EXIT_FAILURE); 281 } 282 283 curl_global_init(CURL_GLOBAL_DEFAULT); 284 285 // Initialize CURL connection 286 287 if (!init(conn, argv[1])) 288 { 289 fprintf(stderr, "Connection initializion failed\n"); 290 291 exit(EXIT_FAILURE); 292 } 293 294 // Retrieve content for the URL 295 296 code = curl_easy_perform(conn); 297 curl_easy_cleanup(conn); 298 299 if (code != CURLE_OK) 300 { 301 fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer); 302 303 exit(EXIT_FAILURE); 304 } 305 306 // Parse the (assumed) HTML code 307 308 parseHtml(buffer, title); 309 310 // Display the extracted title 311 312 printf("Title: %s\n", title.c_str()); 313 314 return EXIT_SUCCESS; 315 } 316