Home | History | Annotate | Download | only in examples
      /***************************************************************************
       *                                  _   _ ____  _
       *  Project                     ___| | | |  _ \| |
       *                             / __| | | | |_) | |
       *                            | (__| |_| |  _ <| |___
       *                             \___|\___/|_| \_\_____|
       *
       * Web crawler based on curl and libxml2.
       * Copyright (C) 2018 Jeroen Ooms <jeroenooms (at) gmail.com>
       * License: MIT
       *
       * To compile:
       *   gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
       *
       */
      /* <DESC>
       * Web crawler based on curl and libxml2 to stress-test curl with
       * hundreds of concurrent connections to various servers.
       * </DESC>
       */
     21 
     /* Parameters */

     /* upper bound on simultaneously open connections for the multi handle */
     int max_con = 200;

     /* no further links are followed once complete + pending reaches this */
     int max_total = 20000;

     /* no further links are followed while this many transfers are pending */
     int max_requests = 500;

     /* cap on the number of links queued from a single page */
     int max_link_per_page = 5;

     /* non-zero: resolve each href against the page URL before filtering */
     int follow_relative_links = 0;

     /* first URL to fetch */
     char *start_page = "https://www.reuters.com";
     29 
     #include <limits.h>
     #include <math.h>
     #include <signal.h>
     #include <stdio.h>
     #include <stdlib.h>
     #include <string.h>

     #include <curl/curl.h>
     #include <libxml/HTMLparser.h>
     #include <libxml/uri.h>
     #include <libxml/xpath.h>
     38 
     /*
      * Set to 1 by sighandler() and polled by the main loop to stop crawling.
      * volatile sig_atomic_t is the object type the C standard allows a signal
      * handler to write safely.
      */
     volatile sig_atomic_t pending_interrupt = 0;

     /* Signal handler: only records that an interrupt was requested. */
     void sighandler(int dummy)
     {
       (void) dummy; /* the signal number is not needed */
       pending_interrupt = 1;
     }
     44 
     /* resizable buffer: heap storage plus the number of bytes stored in it */
     typedef struct {
       char *buf;   /* storage, resized with realloc() by grow_buffer() */
       size_t size; /* number of bytes currently held in buf */
     } memory;

     /*
      * Write callback that appends sz * nmemb bytes from contents to the memory
      * buffer passed as ctx.
      *
      * Returns the number of bytes appended. Returns 0 without touching the
      * buffer when there is nothing to append or when the buffer cannot be grown.
      */
     size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
     {
       size_t realsize = sz * nmemb;
       memory *mem = (memory *) ctx;

       /* A zero-byte delivery on an empty buffer would turn into
        * realloc(buf, 0), which may free buf and return NULL, leaving mem->buf
        * dangling for the later free(). Nothing needs to be stored anyway. */
       if(realsize == 0)
         return 0;

       char *ptr = realloc(mem->buf, mem->size + realsize);
       if(!ptr) {
         /* out of memory; mem->buf is still valid and owned by the caller */
         fprintf(stderr, "not enough memory (realloc returned NULL)\n");
         return 0;
       }
       memcpy(ptr + mem->size, contents, realsize);
       mem->buf = ptr;
       mem->size += realsize;
       return realsize;
     }
     66 
     67 CURL *make_handle(char *url)
     68 {
     69   CURL *handle = curl_easy_init();
     70 
     71   /* Important: use HTTP2 over HTTPS */
     72   curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
     73   curl_easy_setopt(handle, CURLOPT_URL, url);
     74 
     75   /* buffer body */
     76   memory *mem = malloc(sizeof(memory));
     77   mem->size = 0;
     78   mem->buf = malloc(1);
     79   curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
     80   curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
     81   curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);
     82 
     83   /* For completeness */
     84   curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "");
     85   curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
     86   curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
     87   curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
     88   curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L);
     89   curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
     90   curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
     91   curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
     92   curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
     93   curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
     94   curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
     95   curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
     96   return handle;
     97 }
     98 
     99 /* HREF finder implemented in libxml2 but could be any HTML parser */
    100 size_t follow_links(CURLM *multi_handle, memory *mem, char *url)
    101 {
    102   int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \
    103              HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
    104   htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts);
    105   if(!doc)
    106     return 0;
    107   xmlChar *xpath = (xmlChar*) "//a/@href";
    108   xmlXPathContextPtr context = xmlXPathNewContext(doc);
    109   xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
    110   xmlXPathFreeContext(context);
    111   if(!result)
    112     return 0;
    113   xmlNodeSetPtr nodeset = result->nodesetval;
    114   if(xmlXPathNodeSetIsEmpty(nodeset)) {
    115     xmlXPathFreeObject(result);
    116     return 0;
    117   }
    118   size_t count = 0;
    119   for(int i = 0; i < nodeset->nodeNr; i++) {
    120     double r = rand();
    121     int x = r * nodeset->nodeNr / RAND_MAX;
    122     const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
    123     xmlChar *href = xmlNodeListGetString(doc, node, 1);
    124     if(follow_relative_links) {
    125       xmlChar *orig = href;
    126       href = xmlBuildURI(href, (xmlChar *) url);
    127       xmlFree(orig);
    128     }
    129     char *link = (char *) href;
    130     if(!link || strlen(link) < 20)
    131       continue;
    132     if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) {
    133       curl_multi_add_handle(multi_handle, make_handle(link));
    134       if(count++ == max_link_per_page)
    135         break;
    136     }
    137     xmlFree(link);
    138   }
    139   xmlXPathFreeObject(result);
    140   return count;
    141 }
    142 
    /*
     * Reports whether a Content-Type value denotes an HTML document.
     *
     * Returns 1 when ctype is non-NULL and contains "text/html", 0 otherwise.
     * A bare "text/html" without any parameter such as a charset counts as HTML.
     */
    int is_html(char *ctype)
    {
      if(!ctype)
        return 0;
      return strstr(ctype, "text/html") != NULL;
    }
    147 
    148 int main(void)
    149 {
    150   signal(SIGINT, sighandler);
    151   LIBXML_TEST_VERSION;
    152   curl_global_init(CURL_GLOBAL_DEFAULT);
    153   CURLM *multi_handle = curl_multi_init();
    154   curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con);
    155   curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);
    156 
    157   /* enables http/2 if available */
    158 #ifdef CURLPIPE_MULTIPLEX
    159   curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
    160 #endif
    161 
    162   /* sets html start page */
    163   curl_multi_add_handle(multi_handle, make_handle(start_page));
    164 
    165   int msgs_left;
    166   int pending = 0;
    167   int complete = 0;
    168   int still_running = 1;
    169   while(still_running && !pending_interrupt) {
    170     int numfds;
    171     curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
    172     curl_multi_perform(multi_handle, &still_running);
    173 
    174     /* See how the transfers went */
    175     CURLMsg *m = NULL;
    176     while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
    177       if(m->msg == CURLMSG_DONE) {
    178         CURL *handle = m->easy_handle;
    179         char *url;
    180         memory *mem;
    181         curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
    182         curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
    183         if(m->data.result == CURLE_OK) {
    184           long res_status;
    185           curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
    186           if(res_status == 200) {
    187             char *ctype;
    188             curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
    189             printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url);
    190             if(is_html(ctype) && mem->size > 100) {
    191               if(pending < max_requests && (complete + pending) < max_total) {
    192                 pending += follow_links(multi_handle, mem, url);
    193                 still_running = 1;
    194               }
    195             }
    196           }
    197           else {
    198             printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
    199           }
    200         }
    201         else {
    202           printf("[%d] Connection failure: %s\n", complete, url);
    203         }
    204         curl_multi_remove_handle(multi_handle, handle);
    205         curl_easy_cleanup(handle);
    206         free(mem->buf);
    207         free(mem);
    208         complete++;
    209         pending--;
    210       }
    211     }
    212   }
    213   curl_multi_cleanup(multi_handle);
    214   curl_global_cleanup();
    215   return 0;
    216 }
    217