1 /*************************************************************************** 2 * _ _ ____ _ 3 * Project ___| | | | _ \| | 4 * / __| | | | |_) | | 5 * | (__| |_| | _ <| |___ 6 * \___|\___/|_| \_\_____| 7 * 8 * Web crawler based on curl and libxml2. 9 * Copyright (C) 2018 Jeroen Ooms <jeroenooms (at) gmail.com> 10 * License: MIT 11 * 12 * To compile: 13 * gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl) 14 * 15 */ 16 /* <DESC> 17 * Web crawler based on curl and libxml2 to stress-test curl with 18 * hundreds of concurrent connections to various servers. 19 * </DESC> 20 */ 21 22 /* Parameters */ 23 int max_con = 200; 24 int max_total = 20000; 25 int max_requests = 500; 26 int max_link_per_page = 5; 27 int follow_relative_links = 0; 28 char *start_page = "https://www.reuters.com"; 29 30 #include <libxml/HTMLparser.h> 31 #include <libxml/xpath.h> 32 #include <libxml/uri.h> 33 #include <curl/curl.h> 34 #include <stdlib.h> 35 #include <string.h> 36 #include <math.h> 37 #include <signal.h> 38 39 int pending_interrupt = 0; 40 void sighandler(int dummy) 41 { 42 pending_interrupt = 1; 43 } 44 45 /* resizable buffer */ 46 typedef struct { 47 char *buf; 48 size_t size; 49 } memory; 50 51 size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx) 52 { 53 size_t realsize = sz * nmemb; 54 memory *mem = (memory*) ctx; 55 char *ptr = realloc(mem->buf, mem->size + realsize); 56 if(!ptr) { 57 /* out of memory */ 58 printf("not enough memory (realloc returned NULL)\n"); 59 return 0; 60 } 61 mem->buf = ptr; 62 memcpy(&(mem->buf[mem->size]), contents, realsize); 63 mem->size += realsize; 64 return realsize; 65 } 66 67 CURL *make_handle(char *url) 68 { 69 CURL *handle = curl_easy_init(); 70 71 /* Important: use HTTP2 over HTTPS */ 72 curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS); 73 curl_easy_setopt(handle, CURLOPT_URL, url); 74 75 /* buffer body */ 76 memory *mem = malloc(sizeof(memory)); 77 mem->size = 0; 78 mem->buf = malloc(1); 79 curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer); 80 curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem); 81 curl_easy_setopt(handle, CURLOPT_PRIVATE, mem); 82 83 /* For completeness */ 84 curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, ""); 85 curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L); 86 curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L); 87 curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L); 88 curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L); 89 curl_easy_setopt(handle, CURLOPT_COOKIEFILE, ""); 90 curl_easy_setopt(handle, CURLOPT_FILETIME, 1L); 91 curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler"); 92 curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY); 93 curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L); 94 curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY); 95 curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L); 96 return handle; 97 } 98 99 /* HREF finder implemented in libxml2 but could be any HTML parser */ 100 size_t follow_links(CURLM *multi_handle, memory *mem, char *url) 101 { 102 int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \ 103 HTML_PARSE_NOWARNING | HTML_PARSE_NONET; 104 htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts); 105 if(!doc) 106 return 0; 107 xmlChar *xpath = (xmlChar*) "//a/@href"; 108 xmlXPathContextPtr context = xmlXPathNewContext(doc); 109 xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context); 110 xmlXPathFreeContext(context); 111 if(!result) 112 return 0; 113 xmlNodeSetPtr nodeset = result->nodesetval; 114 if(xmlXPathNodeSetIsEmpty(nodeset)) { 115 xmlXPathFreeObject(result); 116 return 0; 117 } 118 size_t count = 0; 119 for(int i = 0; i < nodeset->nodeNr; i++) { 120 double r = rand(); 121 int x = r * nodeset->nodeNr / RAND_MAX; 122 const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode; 123 xmlChar *href = xmlNodeListGetString(doc, node, 1); 124 if(follow_relative_links) { 125 xmlChar *orig = href; 126 href = xmlBuildURI(href, (xmlChar *) url); 127 xmlFree(orig); 128 } 129 char *link = (char *) href; 130 if(!link || strlen(link) < 20) 131 continue; 132 if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) { 133 curl_multi_add_handle(multi_handle, make_handle(link)); 134 if(count++ == max_link_per_page) 135 break; 136 } 137 xmlFree(link); 138 } 139 xmlXPathFreeObject(result); 140 return count; 141 } 142 143 int is_html(char *ctype) 144 { 145 return ctype != NULL && strlen(ctype) > 10 && strstr(ctype, "text/html"); 146 } 147 148 int main(void) 149 { 150 signal(SIGINT, sighandler); 151 LIBXML_TEST_VERSION; 152 curl_global_init(CURL_GLOBAL_DEFAULT); 153 CURLM *multi_handle = curl_multi_init(); 154 curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con); 155 curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L); 156 157 /* enables http/2 if available */ 158 #ifdef CURLPIPE_MULTIPLEX 159 curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX); 160 #endif 161 162 /* sets html start page */ 163 curl_multi_add_handle(multi_handle, make_handle(start_page)); 164 165 int msgs_left; 166 int pending = 0; 167 int complete = 0; 168 int still_running = 1; 169 while(still_running && !pending_interrupt) { 170 int numfds; 171 curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds); 172 curl_multi_perform(multi_handle, &still_running); 173 174 /* See how the transfers went */ 175 CURLMsg *m = NULL; 176 while((m = curl_multi_info_read(multi_handle, &msgs_left))) { 177 if(m->msg == CURLMSG_DONE) { 178 CURL *handle = m->easy_handle; 179 char *url; 180 memory *mem; 181 curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem); 182 curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url); 183 if(m->data.result == CURLE_OK) { 184 long res_status; 185 curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status); 186 if(res_status == 200) { 187 char *ctype; 188 curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype); 189 printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url); 190 if(is_html(ctype) && mem->size > 100) { 191 if(pending < max_requests && (complete + pending) < max_total) { 192 pending += follow_links(multi_handle, mem, url); 193 still_running = 1; 194 } 195 } 196 } 197 else { 198 printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url); 199 } 200 } 201 else { 202 printf("[%d] Connection failure: %s\n", complete, url); 203 } 204 curl_multi_remove_handle(multi_handle, handle); 205 curl_easy_cleanup(handle); 206 free(mem->buf); 207 free(mem); 208 complete++; 209 pending--; 210 } 211 } 212 } 213 curl_multi_cleanup(multi_handle); 214 curl_global_cleanup(); 215 return 0; 216 } 217