1 /*************************************************************************** 2 * _ _ ____ _ 3 * Project ___| | | | _ \| | 4 * / __| | | | |_) | | 5 * | (__| |_| | _ <| |___ 6 * \___|\___/|_| \_\_____| 7 * 8 * Copyright (C) 1998 - 2011, Daniel Stenberg, <daniel (at) haxx.se>, et al. 9 * 10 * This software is licensed as described in the file COPYING, which 11 * you should have received as part of this distribution. The terms 12 * are also available at http://curl.haxx.se/docs/copyright.html. 13 * 14 * You may opt to use, copy, modify, merge, publish, distribute and/or sell 15 * copies of the Software, and permit persons to whom the Software is 16 * furnished to do so, under the terms of the COPYING file. 17 * 18 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY 19 * KIND, either express or implied. 20 * 21 ***************************************************************************/ 22 /* Download a document and use libtidy to parse the HTML. 23 * Written by Jeff Pohlmeyer 24 * 25 * LibTidy => http://tidy.sourceforge.net 26 * 27 * gcc -Wall -I/usr/local/include tidycurl.c -lcurl -ltidy -o tidycurl 28 * 29 */ 30 31 #include <stdio.h> 32 #include <tidy/tidy.h> 33 #include <tidy/buffio.h> 34 #include <curl/curl.h> 35 36 /* curl write callback, to fill tidy's input buffer... */ 37 uint write_cb(char *in, uint size, uint nmemb, TidyBuffer *out) 38 { 39 uint r; 40 r = size * nmemb; 41 tidyBufAppend( out, in, r ); 42 return(r); 43 } 44 45 /* Traverse the document tree */ 46 void dumpNode(TidyDoc doc, TidyNode tnod, int indent ) 47 { 48 TidyNode child; 49 for ( child = tidyGetChild(tnod); child; child = tidyGetNext(child) ) 50 { 51 ctmbstr name = tidyNodeGetName( child ); 52 if ( name ) 53 { 54 /* if it has a name, then it's an HTML tag ... */ 55 TidyAttr attr; 56 printf( "%*.*s%s ", indent, indent, "<", name); 57 /* walk the attribute list */ 58 for ( attr=tidyAttrFirst(child); attr; attr=tidyAttrNext(attr) ) { 59 printf(tidyAttrName(attr)); 60 tidyAttrValue(attr)?printf("=\"%s\" ", 61 tidyAttrValue(attr)):printf(" "); 62 } 63 printf( ">\n"); 64 } 65 else { 66 /* if it doesn't have a name, then it's probably text, cdata, etc... */ 67 TidyBuffer buf; 68 tidyBufInit(&buf); 69 tidyNodeGetText(doc, child, &buf); 70 printf("%*.*s\n", indent, indent, buf.bp?(char *)buf.bp:""); 71 tidyBufFree(&buf); 72 } 73 dumpNode( doc, child, indent + 4 ); /* recursive */ 74 } 75 } 76 77 78 int main(int argc, char **argv ) 79 { 80 CURL *curl; 81 char curl_errbuf[CURL_ERROR_SIZE]; 82 TidyDoc tdoc; 83 TidyBuffer docbuf = {0}; 84 TidyBuffer tidy_errbuf = {0}; 85 int err; 86 if ( argc == 2) { 87 curl = curl_easy_init(); 88 curl_easy_setopt(curl, CURLOPT_URL, argv[1]); 89 curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_errbuf); 90 curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); 91 curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L); 92 curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb); 93 94 tdoc = tidyCreate(); 95 tidyOptSetBool(tdoc, TidyForceOutput, yes); /* try harder */ 96 tidyOptSetInt(tdoc, TidyWrapLen, 4096); 97 tidySetErrorBuffer( tdoc, &tidy_errbuf ); 98 tidyBufInit(&docbuf); 99 100 curl_easy_setopt(curl, CURLOPT_WRITEDATA, &docbuf); 101 err=curl_easy_perform(curl); 102 if ( !err ) { 103 err = tidyParseBuffer(tdoc, &docbuf); /* parse the input */ 104 if ( err >= 0 ) { 105 err = tidyCleanAndRepair(tdoc); /* fix any problems */ 106 if ( err >= 0 ) { 107 err = tidyRunDiagnostics(tdoc); /* load tidy error buffer */ 108 if ( err >= 0 ) { 109 dumpNode( tdoc, tidyGetRoot(tdoc), 0 ); /* walk the tree */ 110 fprintf(stderr, "%s\n", tidy_errbuf.bp); /* show errors */ 111 } 112 } 113 } 114 } 115 else 116 fprintf(stderr, "%s\n", curl_errbuf); 117 118 /* clean-up */ 119 curl_easy_cleanup(curl); 120 tidyBufFree(&docbuf); 121 tidyBufFree(&tidy_errbuf); 122 tidyRelease(tdoc); 123 return(err); 124 125 } 126 else 127 printf( "usage: %s <url>\n", argv[0] ); 128 129 return(0); 130 } 131