Home | History | Annotate | Download | only in lib
      1 /***************************************************************************
      2  *                                  _   _ ____  _
      3  *  Project                     ___| | | |  _ \| |
      4  *                             / __| | | | |_) | |
      5  *                            | (__| |_| |  _ <| |___
      6  *                             \___|\___/|_| \_\_____|
      7  *
      8  * Copyright (C) 1998 - 2019, Daniel Stenberg, <daniel (at) haxx.se>, et al.
      9  *
     10  * This software is licensed as described in the file COPYING, which
     11  * you should have received as part of this distribution. The terms
     12  * are also available at https://curl.haxx.se/docs/copyright.html.
     13  *
     14  * You may opt to use, copy, modify, merge, publish, distribute and/or sell
     15  * copies of the Software, and permit persons to whom the Software is
     16  * furnished to do so, under the terms of the COPYING file.
     17  *
     18  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
     19  * KIND, either express or implied.
     20  *
     21  ***************************************************************************/
     22 
     23 #include "curl_setup.h"
     24 
     25 #include "urldata.h"
     26 #include "urlapi-int.h"
     27 #include "strcase.h"
     28 #include "dotdot.h"
     29 #include "url.h"
     30 #include "escape.h"
     31 #include "curl_ctype.h"
     32 
     33 /* The last 3 #include files should be in this order */
     34 #include "curl_printf.h"
     35 #include "curl_memory.h"
     36 #include "memdebug.h"
     37 
     38   /* MSDOS/Windows style drive prefix, eg c: in c:foo */
     39 #define STARTS_WITH_DRIVE_PREFIX(str) \
     40   ((('a' <= str[0] && str[0] <= 'z') || \
     41     ('A' <= str[0] && str[0] <= 'Z')) && \
     42    (str[1] == ':'))
     43 
     44   /* MSDOS/Windows style drive prefix, optionally with
     45    * a '|' instead of ':', followed by a slash or NUL */
     46 #define STARTS_WITH_URL_DRIVE_PREFIX(str) \
     47   ((('a' <= (str)[0] && (str)[0] <= 'z') || \
     48     ('A' <= (str)[0] && (str)[0] <= 'Z')) && \
     49    ((str)[1] == ':' || (str)[1] == '|') && \
     50    ((str)[2] == '/' || (str)[2] == '\\' || (str)[2] == 0))
     51 
     52 /* Internal representation of CURLU. Point to URL-encoded strings. */
     53 struct Curl_URL {
     54   char *scheme;
     55   char *user;
     56   char *password;
     57   char *options; /* IMAP only? */
     58   char *host;
     59   char *port;
     60   char *path;
     61   char *query;
     62   char *fragment;
     63 
     64   char *scratch; /* temporary scratch area */
     65   long portnum; /* the numerical version */
     66 };
     67 
     68 #define DEFAULT_SCHEME "https"
     69 
     70 static void free_urlhandle(struct Curl_URL *u)
     71 {
     72   free(u->scheme);
     73   free(u->user);
     74   free(u->password);
     75   free(u->options);
     76   free(u->host);
     77   free(u->port);
     78   free(u->path);
     79   free(u->query);
     80   free(u->fragment);
     81   free(u->scratch);
     82 }
     83 
     84 /* move the full contents of one handle onto another and
     85    free the original */
     86 static void mv_urlhandle(struct Curl_URL *from,
     87                          struct Curl_URL *to)
     88 {
     89   free_urlhandle(to);
     90   *to = *from;
     91   free(from);
     92 }
     93 
     94 /*
     95  * Find the separator at the end of the host name, or the '?' in cases like
     96  * http://www.url.com?id=2380
     97  */
     98 static const char *find_host_sep(const char *url)
     99 {
    100   const char *sep;
    101   const char *query;
    102 
    103   /* Find the start of the hostname */
    104   sep = strstr(url, "//");
    105   if(!sep)
    106     sep = url;
    107   else
    108     sep += 2;
    109 
    110   query = strchr(sep, '?');
    111   sep = strchr(sep, '/');
    112 
    113   if(!sep)
    114     sep = url + strlen(url);
    115 
    116   if(!query)
    117     query = url + strlen(url);
    118 
    119   return sep < query ? sep : query;
    120 }
    121 
    122 /*
    123  * Decide in an encoding-independent manner whether a character in an
    124  * URL must be escaped. The same criterion must be used in strlen_url()
    125  * and strcpy_url().
    126  */
    127 static bool urlchar_needs_escaping(int c)
    128 {
    129     return !(ISCNTRL(c) || ISSPACE(c) || ISGRAPH(c));
    130 }
    131 
    132 /*
    133  * strlen_url() returns the length of the given URL if the spaces within the
    134  * URL were properly URL encoded.
    135  * URL encoding should be skipped for host names, otherwise IDN resolution
    136  * will fail.
    137  */
    138 static size_t strlen_url(const char *url, bool relative)
    139 {
    140   const unsigned char *ptr;
    141   size_t newlen = 0;
    142   bool left = TRUE; /* left side of the ? */
    143   const unsigned char *host_sep = (const unsigned char *) url;
    144 
    145   if(!relative)
    146     host_sep = (const unsigned char *) find_host_sep(url);
    147 
    148   for(ptr = (unsigned char *)url; *ptr; ptr++) {
    149 
    150     if(ptr < host_sep) {
    151       ++newlen;
    152       continue;
    153     }
    154 
    155     switch(*ptr) {
    156     case '?':
    157       left = FALSE;
    158       /* FALLTHROUGH */
    159     default:
    160       if(urlchar_needs_escaping(*ptr))
    161         newlen += 2;
    162       newlen++;
    163       break;
    164     case ' ':
    165       if(left)
    166         newlen += 3;
    167       else
    168         newlen++;
    169       break;
    170     }
    171   }
    172   return newlen;
    173 }
    174 
    175 /* strcpy_url() copies a url to a output buffer and URL-encodes the spaces in
    176  * the source URL accordingly.
    177  * URL encoding should be skipped for host names, otherwise IDN resolution
    178  * will fail.
    179  */
    180 static void strcpy_url(char *output, const char *url, bool relative)
    181 {
    182   /* we must add this with whitespace-replacing */
    183   bool left = TRUE;
    184   const unsigned char *iptr;
    185   char *optr = output;
    186   const unsigned char *host_sep = (const unsigned char *) url;
    187 
    188   if(!relative)
    189     host_sep = (const unsigned char *) find_host_sep(url);
    190 
    191   for(iptr = (unsigned char *)url;    /* read from here */
    192       *iptr;         /* until zero byte */
    193       iptr++) {
    194 
    195     if(iptr < host_sep) {
    196       *optr++ = *iptr;
    197       continue;
    198     }
    199 
    200     switch(*iptr) {
    201     case '?':
    202       left = FALSE;
    203       /* FALLTHROUGH */
    204     default:
    205       if(urlchar_needs_escaping(*iptr)) {
    206         msnprintf(optr, 4, "%%%02x", *iptr);
    207         optr += 3;
    208       }
    209       else
    210         *optr++=*iptr;
    211       break;
    212     case ' ':
    213       if(left) {
    214         *optr++='%'; /* add a '%' */
    215         *optr++='2'; /* add a '2' */
    216         *optr++='0'; /* add a '0' */
    217       }
    218       else
    219         *optr++='+'; /* add a '+' here */
    220       break;
    221     }
    222   }
    223   *optr = 0; /* zero terminate output buffer */
    224 
    225 }
    226 
    227 /*
    228  * Returns true if the given URL is absolute (as opposed to relative) within
    229  * the buffer size. Returns the scheme in the buffer if TRUE and 'buf' is
    230  * non-NULL.
    231  */
    232 bool Curl_is_absolute_url(const char *url, char *buf, size_t buflen)
    233 {
    234   size_t i;
    235 #ifdef WIN32
    236   if(STARTS_WITH_DRIVE_PREFIX(url))
    237     return FALSE;
    238 #endif
    239   for(i = 0; i < buflen && url[i]; ++i) {
    240     char s = url[i];
    241     if((s == ':') && (url[i + 1] == '/')) {
    242       if(buf)
    243         buf[i] = 0;
    244       return TRUE;
    245     }
    246     /* RFC 3986 3.1 explains:
    247       scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
    248     */
    249     else if(ISALNUM(s) || (s == '+') || (s == '-') || (s == '.') ) {
    250       if(buf)
    251         buf[i] = (char)TOLOWER(s);
    252     }
    253     else
    254       break;
    255   }
    256   return FALSE;
    257 }
    258 
    259 /*
    260  * Concatenate a relative URL to a base URL making it absolute.
    261  * URL-encodes any spaces.
    262  * The returned pointer must be freed by the caller unless NULL
    263  * (returns NULL on out of memory).
    264  */
    265 static char *concat_url(const char *base, const char *relurl)
    266 {
    267   /***
    268    TRY to append this new path to the old URL
    269    to the right of the host part. Oh crap, this is doomed to cause
    270    problems in the future...
    271   */
    272   char *newest;
    273   char *protsep;
    274   char *pathsep;
    275   size_t newlen;
    276   bool host_changed = FALSE;
    277 
    278   const char *useurl = relurl;
    279   size_t urllen;
    280 
    281   /* we must make our own copy of the URL to play with, as it may
    282      point to read-only data */
    283   char *url_clone = strdup(base);
    284 
    285   if(!url_clone)
    286     return NULL; /* skip out of this NOW */
    287 
    288   /* protsep points to the start of the host name */
    289   protsep = strstr(url_clone, "//");
    290   if(!protsep)
    291     protsep = url_clone;
    292   else
    293     protsep += 2; /* pass the slashes */
    294 
    295   if('/' != relurl[0]) {
    296     int level = 0;
    297 
    298     /* First we need to find out if there's a ?-letter in the URL,
    299        and cut it and the right-side of that off */
    300     pathsep = strchr(protsep, '?');
    301     if(pathsep)
    302       *pathsep = 0;
    303 
    304     /* we have a relative path to append to the last slash if there's one
    305        available, or if the new URL is just a query string (starts with a
    306        '?')  we append the new one at the end of the entire currently worked
    307        out URL */
    308     if(useurl[0] != '?') {
    309       pathsep = strrchr(protsep, '/');
    310       if(pathsep)
    311         *pathsep = 0;
    312     }
    313 
    314     /* Check if there's any slash after the host name, and if so, remember
    315        that position instead */
    316     pathsep = strchr(protsep, '/');
    317     if(pathsep)
    318       protsep = pathsep + 1;
    319     else
    320       protsep = NULL;
    321 
    322     /* now deal with one "./" or any amount of "../" in the newurl
    323        and act accordingly */
    324 
    325     if((useurl[0] == '.') && (useurl[1] == '/'))
    326       useurl += 2; /* just skip the "./" */
    327 
    328     while((useurl[0] == '.') &&
    329           (useurl[1] == '.') &&
    330           (useurl[2] == '/')) {
    331       level++;
    332       useurl += 3; /* pass the "../" */
    333     }
    334 
    335     if(protsep) {
    336       while(level--) {
    337         /* cut off one more level from the right of the original URL */
    338         pathsep = strrchr(protsep, '/');
    339         if(pathsep)
    340           *pathsep = 0;
    341         else {
    342           *protsep = 0;
    343           break;
    344         }
    345       }
    346     }
    347   }
    348   else {
    349     /* We got a new absolute path for this server */
    350 
    351     if((relurl[0] == '/') && (relurl[1] == '/')) {
    352       /* the new URL starts with //, just keep the protocol part from the
    353          original one */
    354       *protsep = 0;
    355       useurl = &relurl[2]; /* we keep the slashes from the original, so we
    356                               skip the new ones */
    357       host_changed = TRUE;
    358     }
    359     else {
    360       /* cut off the original URL from the first slash, or deal with URLs
    361          without slash */
    362       pathsep = strchr(protsep, '/');
    363       if(pathsep) {
    364         /* When people use badly formatted URLs, such as
    365            "http://www.url.com?dir=/home/daniel" we must not use the first
    366            slash, if there's a ?-letter before it! */
    367         char *sep = strchr(protsep, '?');
    368         if(sep && (sep < pathsep))
    369           pathsep = sep;
    370         *pathsep = 0;
    371       }
    372       else {
    373         /* There was no slash. Now, since we might be operating on a badly
    374            formatted URL, such as "http://www.url.com?id=2380" which doesn't
    375            use a slash separator as it is supposed to, we need to check for a
    376            ?-letter as well! */
    377         pathsep = strchr(protsep, '?');
    378         if(pathsep)
    379           *pathsep = 0;
    380       }
    381     }
    382   }
    383 
    384   /* If the new part contains a space, this is a mighty stupid redirect
    385      but we still make an effort to do "right". To the left of a '?'
    386      letter we replace each space with %20 while it is replaced with '+'
    387      on the right side of the '?' letter.
    388   */
    389   newlen = strlen_url(useurl, !host_changed);
    390 
    391   urllen = strlen(url_clone);
    392 
    393   newest = malloc(urllen + 1 + /* possible slash */
    394                   newlen + 1 /* zero byte */);
    395 
    396   if(!newest) {
    397     free(url_clone); /* don't leak this */
    398     return NULL;
    399   }
    400 
    401   /* copy over the root url part */
    402   memcpy(newest, url_clone, urllen);
    403 
    404   /* check if we need to append a slash */
    405   if(('/' == useurl[0]) || (protsep && !*protsep) || ('?' == useurl[0]))
    406     ;
    407   else
    408     newest[urllen++]='/';
    409 
    410   /* then append the new piece on the right side */
    411   strcpy_url(&newest[urllen], useurl, !host_changed);
    412 
    413   free(url_clone);
    414 
    415   return newest;
    416 }
    417 
    418 /*
    419  * parse_hostname_login()
    420  *
    421  * Parse the login details (user name, password and options) from the URL and
    422  * strip them out of the host name
    423  *
    424  */
    425 static CURLUcode parse_hostname_login(struct Curl_URL *u,
    426                                       const struct Curl_handler *h,
    427                                       char **hostname,
    428                                       unsigned int flags)
    429 {
    430   CURLUcode result = CURLUE_OK;
    431   CURLcode ccode;
    432   char *userp = NULL;
    433   char *passwdp = NULL;
    434   char *optionsp = NULL;
    435 
    436   /* At this point, we're hoping all the other special cases have
    437    * been taken care of, so conn->host.name is at most
    438    *    [user[:password][;options]]@]hostname
    439    *
    440    * We need somewhere to put the embedded details, so do that first.
    441    */
    442 
    443   char *ptr = strchr(*hostname, '@');
    444   char *login = *hostname;
    445 
    446   if(!ptr)
    447     goto out;
    448 
    449   /* We will now try to extract the
    450    * possible login information in a string like:
    451    * ftp://user:password@ftp.my.site:8021/README */
    452   *hostname = ++ptr;
    453 
    454   /* We could use the login information in the URL so extract it. Only parse
    455      options if the handler says we should. Note that 'h' might be NULL! */
    456   ccode = Curl_parse_login_details(login, ptr - login - 1,
    457                                    &userp, &passwdp,
    458                                    (h && (h->flags & PROTOPT_URLOPTIONS)) ?
    459                                    &optionsp:NULL);
    460   if(ccode) {
    461     result = CURLUE_MALFORMED_INPUT;
    462     goto out;
    463   }
    464 
    465   if(userp) {
    466     if(flags & CURLU_DISALLOW_USER) {
    467       /* Option DISALLOW_USER is set and url contains username. */
    468       result = CURLUE_USER_NOT_ALLOWED;
    469       goto out;
    470     }
    471 
    472     u->user = userp;
    473   }
    474 
    475   if(passwdp)
    476     u->password = passwdp;
    477 
    478   if(optionsp)
    479     u->options = optionsp;
    480 
    481   return CURLUE_OK;
    482   out:
    483 
    484   free(userp);
    485   free(passwdp);
    486   free(optionsp);
    487 
    488   return result;
    489 }
    490 
    491 UNITTEST CURLUcode Curl_parse_port(struct Curl_URL *u, char *hostname)
    492 {
    493   char *portptr = NULL;
    494   char endbracket;
    495   int len;
    496 
    497   /*
    498    * Find the end of an IPv6 address, either on the ']' ending bracket or
    499    * a percent-encoded zone index.
    500    */
    501   if(1 == sscanf(hostname, "[%*45[0123456789abcdefABCDEF:.]%c%n",
    502                  &endbracket, &len)) {
    503     if(']' == endbracket)
    504       portptr = &hostname[len];
    505     else if('%' == endbracket) {
    506       int zonelen = len;
    507       if(1 == sscanf(hostname + zonelen, "25%*[^]]%c%n", &endbracket, &len)) {
    508         if(']' != endbracket)
    509           return CURLUE_MALFORMED_INPUT;
    510         portptr = &hostname[--zonelen + len + 1];
    511       }
    512       else
    513         return CURLUE_MALFORMED_INPUT;
    514     }
    515     else
    516       return CURLUE_MALFORMED_INPUT;
    517 
    518     /* this is a RFC2732-style specified IP-address */
    519     if(portptr && *portptr) {
    520       if(*portptr != ':')
    521         return CURLUE_MALFORMED_INPUT;
    522     }
    523     else
    524       portptr = NULL;
    525   }
    526   else
    527     portptr = strchr(hostname, ':');
    528 
    529   if(portptr) {
    530     char *rest;
    531     long port;
    532     char portbuf[7];
    533 
    534     /* Browser behavior adaptation. If there's a colon with no digits after,
    535        just cut off the name there which makes us ignore the colon and just
    536        use the default port. Firefox, Chrome and Safari all do that. */
    537     if(!portptr[1]) {
    538       *portptr = '\0';
    539       return CURLUE_OK;
    540     }
    541 
    542     if(!ISDIGIT(portptr[1]))
    543       return CURLUE_BAD_PORT_NUMBER;
    544 
    545     port = strtol(portptr + 1, &rest, 10);  /* Port number must be decimal */
    546 
    547     if((port <= 0) || (port > 0xffff))
    548       /* Single unix standard says port numbers are 16 bits long, but we don't
    549          treat port zero as OK. */
    550       return CURLUE_BAD_PORT_NUMBER;
    551 
    552     if(rest[0])
    553       return CURLUE_BAD_PORT_NUMBER;
    554 
    555     *portptr++ = '\0'; /* cut off the name there */
    556     *rest = 0;
    557     /* generate a new port number string to get rid of leading zeroes etc */
    558     msnprintf(portbuf, sizeof(portbuf), "%ld", port);
    559     u->portnum = port;
    560     u->port = strdup(portbuf);
    561     if(!u->port)
    562       return CURLUE_OUT_OF_MEMORY;
    563   }
    564 
    565   return CURLUE_OK;
    566 }
    567 
    568 /* scan for byte values < 31 or 127 */
    569 static CURLUcode junkscan(char *part)
    570 {
    571   if(part) {
    572     static const char badbytes[]={
    573       /* */ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    574       0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    575       0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    576       0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    577       0x7f,
    578       0x00 /* zero terminate */
    579     };
    580     size_t n = strlen(part);
    581     size_t nfine = strcspn(part, badbytes);
    582     if(nfine != n)
    583       /* since we don't know which part is scanned, return a generic error
    584          code */
    585       return CURLUE_MALFORMED_INPUT;
    586   }
    587   return CURLUE_OK;
    588 }
    589 
    590 static CURLUcode hostname_check(char *hostname, unsigned int flags)
    591 {
    592   const char *l = NULL; /* accepted characters */
    593   size_t len;
    594   size_t hlen = strlen(hostname);
    595   (void)flags;
    596 
    597   if(hostname[0] == '[') {
    598     hostname++;
    599     l = "0123456789abcdefABCDEF::.%";
    600     hlen -= 2;
    601   }
    602 
    603   if(l) {
    604     /* only valid letters are ok */
    605     len = strspn(hostname, l);
    606     if(hlen != len)
    607       /* hostname with bad content */
    608       return CURLUE_MALFORMED_INPUT;
    609   }
    610   else {
    611     /* letters from the second string is not ok */
    612     len = strcspn(hostname, " ");
    613     if(hlen != len)
    614       /* hostname with bad content */
    615       return CURLUE_MALFORMED_INPUT;
    616   }
    617   return CURLUE_OK;
    618 }
    619 
    620 #define HOSTNAME_END(x) (((x) == '/') || ((x) == '?') || ((x) == '#'))
    621 
    622 static CURLUcode seturl(const char *url, CURLU *u, unsigned int flags)
    623 {
    624   char *path;
    625   bool path_alloced = FALSE;
    626   char *hostname;
    627   char *query = NULL;
    628   char *fragment = NULL;
    629   CURLUcode result;
    630   bool url_has_scheme = FALSE;
    631   char schemebuf[MAX_SCHEME_LEN];
    632   char *schemep = NULL;
    633   size_t schemelen = 0;
    634   size_t urllen;
    635   const struct Curl_handler *h = NULL;
    636 
    637   if(!url)
    638     return CURLUE_MALFORMED_INPUT;
    639 
    640   /*************************************************************
    641    * Parse the URL.
    642    ************************************************************/
    643   /* allocate scratch area */
    644   urllen = strlen(url);
    645   path = u->scratch = malloc(urllen * 2 + 2);
    646   if(!path)
    647     return CURLUE_OUT_OF_MEMORY;
    648 
    649   hostname = &path[urllen + 1];
    650   hostname[0] = 0;
    651 
    652   if(Curl_is_absolute_url(url, schemebuf, sizeof(schemebuf))) {
    653     url_has_scheme = TRUE;
    654     schemelen = strlen(schemebuf);
    655   }
    656 
    657   /* handle the file: scheme */
    658   if(url_has_scheme && strcasecompare(schemebuf, "file")) {
    659     /* path has been allocated large enough to hold this */
    660     strcpy(path, &url[5]);
    661 
    662     hostname = NULL; /* no host for file: URLs */
    663     u->scheme = strdup("file");
    664     if(!u->scheme)
    665       return CURLUE_OUT_OF_MEMORY;
    666 
    667     /* Extra handling URLs with an authority component (i.e. that start with
    668      * "file://")
    669      *
    670      * We allow omitted hostname (e.g. file:/<path>) -- valid according to
    671      * RFC 8089, but not the (current) WHAT-WG URL spec.
    672      */
    673     if(path[0] == '/' && path[1] == '/') {
    674       /* swallow the two slashes */
    675       char *ptr = &path[2];
    676 
    677       /*
    678        * According to RFC 8089, a file: URL can be reliably dereferenced if:
    679        *
    680        *  o it has no/blank hostname, or
    681        *
    682        *  o the hostname matches "localhost" (case-insensitively), or
    683        *
    684        *  o the hostname is a FQDN that resolves to this machine.
    685        *
    686        * For brevity, we only consider URLs with empty, "localhost", or
    687        * "127.0.0.1" hostnames as local.
    688        *
    689        * Additionally, there is an exception for URLs with a Windows drive
    690        * letter in the authority (which was accidentally omitted from RFC 8089
    691        * Appendix E, but believe me, it was meant to be there. --MK)
    692        */
    693       if(ptr[0] != '/' && !STARTS_WITH_URL_DRIVE_PREFIX(ptr)) {
    694         /* the URL includes a host name, it must match "localhost" or
    695            "127.0.0.1" to be valid */
    696         if(!checkprefix("localhost/", ptr) &&
    697            !checkprefix("127.0.0.1/", ptr)) {
    698           /* Invalid file://hostname/, expected localhost or 127.0.0.1 or
    699              none */
    700           return CURLUE_MALFORMED_INPUT;
    701         }
    702         ptr += 9; /* now points to the slash after the host */
    703       }
    704 
    705       path = ptr;
    706     }
    707 
    708 #if !defined(MSDOS) && !defined(WIN32) && !defined(__CYGWIN__)
    709     /* Don't allow Windows drive letters when not in Windows.
    710      * This catches both "file:/c:" and "file:c:" */
    711     if(('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) ||
    712        STARTS_WITH_URL_DRIVE_PREFIX(path)) {
    713       /* File drive letters are only accepted in MSDOS/Windows */
    714       return CURLUE_MALFORMED_INPUT;
    715     }
    716 #else
    717     /* If the path starts with a slash and a drive letter, ditch the slash */
    718     if('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) {
    719       /* This cannot be done with strcpy, as the memory chunks overlap! */
    720       memmove(path, &path[1], strlen(&path[1]) + 1);
    721     }
    722 #endif
    723 
    724   }
    725   else {
    726     /* clear path */
    727     const char *p;
    728     const char *hostp;
    729     size_t len;
    730     path[0] = 0;
    731 
    732     if(url_has_scheme) {
    733       int i = 0;
    734       p = &url[schemelen + 1];
    735       while(p && (*p == '/') && (i < 4)) {
    736         p++;
    737         i++;
    738       }
    739       if((i < 1) || (i>3))
    740         /* less than one or more than three slashes */
    741         return CURLUE_MALFORMED_INPUT;
    742 
    743       schemep = schemebuf;
    744       if(!Curl_builtin_scheme(schemep) &&
    745          !(flags & CURLU_NON_SUPPORT_SCHEME))
    746         return CURLUE_UNSUPPORTED_SCHEME;
    747 
    748       if(junkscan(schemep))
    749         return CURLUE_MALFORMED_INPUT;
    750     }
    751     else {
    752       /* no scheme! */
    753 
    754       if(!(flags & (CURLU_DEFAULT_SCHEME|CURLU_GUESS_SCHEME)))
    755         return CURLUE_MALFORMED_INPUT;
    756       if(flags & CURLU_DEFAULT_SCHEME)
    757         schemep = (char *) DEFAULT_SCHEME;
    758 
    759       /*
    760        * The URL was badly formatted, let's try without scheme specified.
    761        */
    762       p = url;
    763     }
    764     hostp = p; /* host name starts here */
    765 
    766     while(*p && !HOSTNAME_END(*p)) /* find end of host name */
    767       p++;
    768 
    769     len = p - hostp;
    770     if(!len)
    771       return CURLUE_MALFORMED_INPUT;
    772 
    773     memcpy(hostname, hostp, len);
    774     hostname[len] = 0;
    775 
    776     if((flags & CURLU_GUESS_SCHEME) && !schemep) {
    777       /* legacy curl-style guess based on host name */
    778       if(checkprefix("ftp.", hostname))
    779         schemep = (char *)"ftp";
    780       else if(checkprefix("dict.", hostname))
    781         schemep = (char *)"dict";
    782       else if(checkprefix("ldap.", hostname))
    783         schemep = (char *)"ldap";
    784       else if(checkprefix("imap.", hostname))
    785         schemep = (char *)"imap";
    786       else if(checkprefix("smtp.", hostname))
    787         schemep = (char *)"smtp";
    788       else if(checkprefix("pop3.", hostname))
    789         schemep = (char *)"pop3";
    790       else
    791         schemep = (char *)"http";
    792     }
    793 
    794     len = strlen(p);
    795     memcpy(path, p, len);
    796     path[len] = 0;
    797 
    798     u->scheme = strdup(schemep);
    799     if(!u->scheme)
    800       return CURLUE_OUT_OF_MEMORY;
    801   }
    802 
    803   /* if this is a known scheme, get some details */
    804   h = Curl_builtin_scheme(u->scheme);
    805 
    806   if(junkscan(path))
    807     return CURLUE_MALFORMED_INPUT;
    808 
    809   query = strchr(path, '?');
    810   if(query)
    811     *query++ = 0;
    812 
    813   fragment = strchr(query?query:path, '#');
    814   if(fragment)
    815     *fragment++ = 0;
    816 
    817   if(!path[0])
    818     /* if there's no path set, unset */
    819     path = NULL;
    820   else if(!(flags & CURLU_PATH_AS_IS)) {
    821     /* sanitise paths and remove ../ and ./ sequences according to RFC3986 */
    822     char *newp = Curl_dedotdotify(path);
    823     if(!newp)
    824       return CURLUE_OUT_OF_MEMORY;
    825 
    826     if(strcmp(newp, path)) {
    827       /* if we got a new version */
    828       path = newp;
    829       path_alloced = TRUE;
    830     }
    831     else
    832       free(newp);
    833   }
    834   if(path) {
    835     u->path = path_alloced?path:strdup(path);
    836     if(!u->path)
    837       return CURLUE_OUT_OF_MEMORY;
    838   }
    839 
    840   if(hostname) {
    841     /*
    842      * Parse the login details and strip them out of the host name.
    843      */
    844     if(junkscan(hostname))
    845       return CURLUE_MALFORMED_INPUT;
    846 
    847     result = parse_hostname_login(u, h, &hostname, flags);
    848     if(result)
    849       return result;
    850 
    851     result = Curl_parse_port(u, hostname);
    852     if(result)
    853       return result;
    854 
    855     result = hostname_check(hostname, flags);
    856     if(result)
    857       return result;
    858 
    859     u->host = strdup(hostname);
    860     if(!u->host)
    861       return CURLUE_OUT_OF_MEMORY;
    862   }
    863 
    864   if(query) {
    865     u->query = strdup(query);
    866     if(!u->query)
    867       return CURLUE_OUT_OF_MEMORY;
    868   }
    869   if(fragment && fragment[0]) {
    870     u->fragment = strdup(fragment);
    871     if(!u->fragment)
    872       return CURLUE_OUT_OF_MEMORY;
    873   }
    874 
    875   free(u->scratch);
    876   u->scratch = NULL;
    877 
    878   return CURLUE_OK;
    879 }
    880 
    881 /*
    882  * Parse the URL and set the relevant members of the Curl_URL struct.
    883  */
    884 static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
    885 {
    886   CURLUcode result = seturl(url, u, flags);
    887   if(result) {
    888     free_urlhandle(u);
    889     memset(u, 0, sizeof(struct Curl_URL));
    890   }
    891   return result;
    892 }
    893 
    894 /*
    895  */
    896 CURLU *curl_url(void)
    897 {
    898   return calloc(sizeof(struct Curl_URL), 1);
    899 }
    900 
    901 void curl_url_cleanup(CURLU *u)
    902 {
    903   if(u) {
    904     free_urlhandle(u);
    905     free(u);
    906   }
    907 }
    908 
    909 #define DUP(dest, src, name)         \
    910   if(src->name) {                    \
    911     dest->name = strdup(src->name);  \
    912     if(!dest->name)                  \
    913       goto fail;                     \
    914   }
    915 
    916 CURLU *curl_url_dup(CURLU *in)
    917 {
    918   struct Curl_URL *u = calloc(sizeof(struct Curl_URL), 1);
    919   if(u) {
    920     DUP(u, in, scheme);
    921     DUP(u, in, user);
    922     DUP(u, in, password);
    923     DUP(u, in, options);
    924     DUP(u, in, host);
    925     DUP(u, in, port);
    926     DUP(u, in, path);
    927     DUP(u, in, query);
    928     DUP(u, in, fragment);
    929     u->portnum = in->portnum;
    930   }
    931   return u;
    932   fail:
    933   curl_url_cleanup(u);
    934   return NULL;
    935 }
    936 
    937 CURLUcode curl_url_get(CURLU *u, CURLUPart what,
    938                        char **part, unsigned int flags)
    939 {
    940   char *ptr;
    941   CURLUcode ifmissing = CURLUE_UNKNOWN_PART;
    942   char portbuf[7];
    943   bool urldecode = (flags & CURLU_URLDECODE)?1:0;
    944   bool plusdecode = FALSE;
    945   (void)flags;
    946   if(!u)
    947     return CURLUE_BAD_HANDLE;
    948   if(!part)
    949     return CURLUE_BAD_PARTPOINTER;
    950   *part = NULL;
    951 
    952   switch(what) {
    953   case CURLUPART_SCHEME:
    954     ptr = u->scheme;
    955     ifmissing = CURLUE_NO_SCHEME;
    956     urldecode = FALSE; /* never for schemes */
    957     break;
    958   case CURLUPART_USER:
    959     ptr = u->user;
    960     ifmissing = CURLUE_NO_USER;
    961     break;
    962   case CURLUPART_PASSWORD:
    963     ptr = u->password;
    964     ifmissing = CURLUE_NO_PASSWORD;
    965     break;
    966   case CURLUPART_OPTIONS:
    967     ptr = u->options;
    968     ifmissing = CURLUE_NO_OPTIONS;
    969     break;
    970   case CURLUPART_HOST:
    971     ptr = u->host;
    972     ifmissing = CURLUE_NO_HOST;
    973     break;
    974   case CURLUPART_PORT:
    975     ptr = u->port;
    976     ifmissing = CURLUE_NO_PORT;
    977     urldecode = FALSE; /* never for port */
    978     if(!ptr && (flags & CURLU_DEFAULT_PORT) && u->scheme) {
    979       /* there's no stored port number, but asked to deliver
    980          a default one for the scheme */
    981       const struct Curl_handler *h =
    982         Curl_builtin_scheme(u->scheme);
    983       if(h) {
    984         msnprintf(portbuf, sizeof(portbuf), "%ld", h->defport);
    985         ptr = portbuf;
    986       }
    987     }
    988     else if(ptr && u->scheme) {
    989       /* there is a stored port number, but ask to inhibit if
    990          it matches the default one for the scheme */
    991       const struct Curl_handler *h =
    992         Curl_builtin_scheme(u->scheme);
    993       if(h && (h->defport == u->portnum) &&
    994          (flags & CURLU_NO_DEFAULT_PORT))
    995         ptr = NULL;
    996     }
    997     break;
    998   case CURLUPART_PATH:
    999     ptr = u->path;
   1000     if(!ptr) {
   1001       ptr = u->path = strdup("/");
   1002       if(!u->path)
   1003         return CURLUE_OUT_OF_MEMORY;
   1004     }
   1005     break;
   1006   case CURLUPART_QUERY:
   1007     ptr = u->query;
   1008     ifmissing = CURLUE_NO_QUERY;
   1009     plusdecode = urldecode;
   1010     break;
   1011   case CURLUPART_FRAGMENT:
   1012     ptr = u->fragment;
   1013     ifmissing = CURLUE_NO_FRAGMENT;
   1014     break;
   1015   case CURLUPART_URL: {
   1016     char *url;
   1017     char *scheme;
   1018     char *options = u->options;
   1019     char *port = u->port;
   1020     if(u->scheme && strcasecompare("file", u->scheme)) {
   1021       url = aprintf("file://%s%s%s",
   1022                     u->path,
   1023                     u->fragment? "#": "",
   1024                     u->fragment? u->fragment : "");
   1025     }
   1026     else if(!u->host)
   1027       return CURLUE_NO_HOST;
   1028     else {
   1029       const struct Curl_handler *h = NULL;
   1030       if(u->scheme)
   1031         scheme = u->scheme;
   1032       else if(flags & CURLU_DEFAULT_SCHEME)
   1033         scheme = (char *) DEFAULT_SCHEME;
   1034       else
   1035         return CURLUE_NO_SCHEME;
   1036 
   1037       if(scheme) {
   1038         h = Curl_builtin_scheme(scheme);
   1039         if(!port && (flags & CURLU_DEFAULT_PORT)) {
   1040           /* there's no stored port number, but asked to deliver
   1041              a default one for the scheme */
   1042           if(h) {
   1043             msnprintf(portbuf, sizeof(portbuf), "%ld", h->defport);
   1044             port = portbuf;
   1045           }
   1046         }
   1047         else if(port) {
   1048           /* there is a stored port number, but asked to inhibit if it matches
   1049              the default one for the scheme */
   1050           if(h && (h->defport == u->portnum) &&
   1051              (flags & CURLU_NO_DEFAULT_PORT))
   1052             port = NULL;
   1053         }
   1054       }
   1055       if(h && !(h->flags & PROTOPT_URLOPTIONS))
   1056         options = NULL;
   1057 
   1058       url = aprintf("%s://%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
   1059                     scheme,
   1060                     u->user ? u->user : "",
   1061                     u->password ? ":": "",
   1062                     u->password ? u->password : "",
   1063                     options ? ";" : "",
   1064                     options ? options : "",
   1065                     (u->user || u->password || options) ? "@": "",
   1066                     u->host,
   1067                     port ? ":": "",
   1068                     port ? port : "",
   1069                     (u->path && (u->path[0] != '/')) ? "/": "",
   1070                     u->path ? u->path : "/",
   1071                     (u->query && u->query[0]) ? "?": "",
   1072                     (u->query && u->query[0]) ? u->query : "",
   1073                     u->fragment? "#": "",
   1074                     u->fragment? u->fragment : "");
   1075     }
   1076     if(!url)
   1077       return CURLUE_OUT_OF_MEMORY;
   1078     *part = url;
   1079     return CURLUE_OK;
   1080   }
   1081   default:
   1082     ptr = NULL;
   1083     break;
   1084   }
   1085   if(ptr) {
   1086     *part = strdup(ptr);
   1087     if(!*part)
   1088       return CURLUE_OUT_OF_MEMORY;
   1089     if(plusdecode) {
   1090       /* convert + to space */
   1091       char *plus;
   1092       for(plus = *part; *plus; ++plus) {
   1093         if(*plus == '+')
   1094           *plus = ' ';
   1095       }
   1096     }
   1097     if(urldecode) {
   1098       char *decoded;
   1099       size_t dlen;
   1100       CURLcode res = Curl_urldecode(NULL, *part, 0, &decoded, &dlen, TRUE);
   1101       free(*part);
   1102       if(res) {
   1103         *part = NULL;
   1104         return CURLUE_URLDECODE;
   1105       }
   1106       *part = decoded;
   1107     }
   1108     return CURLUE_OK;
   1109   }
   1110   else
   1111     return ifmissing;
   1112 }
   1113 
   1114 CURLUcode curl_url_set(CURLU *u, CURLUPart what,
   1115                        const char *part, unsigned int flags)
   1116 {
   1117   char **storep = NULL;
   1118   long port = 0;
   1119   bool urlencode = (flags & CURLU_URLENCODE)? 1 : 0;
   1120   bool plusencode = FALSE;
   1121   bool urlskipslash = FALSE;
   1122   bool appendquery = FALSE;
   1123   bool equalsencode = FALSE;
   1124 
   1125   if(!u)
   1126     return CURLUE_BAD_HANDLE;
   1127   if(!part) {
   1128     /* setting a part to NULL clears it */
   1129     switch(what) {
   1130     case CURLUPART_URL:
   1131       break;
   1132     case CURLUPART_SCHEME:
   1133       storep = &u->scheme;
   1134       break;
   1135     case CURLUPART_USER:
   1136       storep = &u->user;
   1137       break;
   1138     case CURLUPART_PASSWORD:
   1139       storep = &u->password;
   1140       break;
   1141     case CURLUPART_OPTIONS:
   1142       storep = &u->options;
   1143       break;
   1144     case CURLUPART_HOST:
   1145       storep = &u->host;
   1146       break;
   1147     case CURLUPART_PORT:
   1148       storep = &u->port;
   1149       break;
   1150     case CURLUPART_PATH:
   1151       storep = &u->path;
   1152       break;
   1153     case CURLUPART_QUERY:
   1154       storep = &u->query;
   1155       break;
   1156     case CURLUPART_FRAGMENT:
   1157       storep = &u->fragment;
   1158       break;
   1159     default:
   1160       return CURLUE_UNKNOWN_PART;
   1161     }
   1162     if(storep && *storep) {
   1163       free(*storep);
   1164       *storep = NULL;
   1165     }
   1166     return CURLUE_OK;
   1167   }
   1168 
   1169   switch(what) {
   1170   case CURLUPART_SCHEME:
   1171     if(!(flags & CURLU_NON_SUPPORT_SCHEME) &&
   1172        /* verify that it is a fine scheme */
   1173        !Curl_builtin_scheme(part))
   1174       return CURLUE_UNSUPPORTED_SCHEME;
   1175     storep = &u->scheme;
   1176     urlencode = FALSE; /* never */
   1177     break;
   1178   case CURLUPART_USER:
   1179     storep = &u->user;
   1180     break;
   1181   case CURLUPART_PASSWORD:
   1182     storep = &u->password;
   1183     break;
   1184   case CURLUPART_OPTIONS:
   1185     storep = &u->options;
   1186     break;
   1187   case CURLUPART_HOST:
   1188     storep = &u->host;
   1189     break;
   1190   case CURLUPART_PORT:
   1191     urlencode = FALSE; /* never */
   1192     port = strtol(part, NULL, 10);  /* Port number must be decimal */
   1193     if((port <= 0) || (port > 0xffff))
   1194       return CURLUE_BAD_PORT_NUMBER;
   1195     storep = &u->port;
   1196     break;
   1197   case CURLUPART_PATH:
   1198     urlskipslash = TRUE;
   1199     storep = &u->path;
   1200     break;
   1201   case CURLUPART_QUERY:
   1202     plusencode = urlencode;
   1203     appendquery = (flags & CURLU_APPENDQUERY)?1:0;
   1204     equalsencode = appendquery;
   1205     storep = &u->query;
   1206     break;
   1207   case CURLUPART_FRAGMENT:
   1208     storep = &u->fragment;
   1209     break;
   1210   case CURLUPART_URL: {
   1211     /*
   1212      * Allow a new URL to replace the existing (if any) contents.
   1213      *
   1214      * If the existing contents is enough for a URL, allow a relative URL to
   1215      * replace it.
   1216      */
   1217     CURLUcode result;
   1218     char *oldurl;
   1219     char *redired_url;
   1220     CURLU *handle2;
   1221 
   1222     if(Curl_is_absolute_url(part, NULL, MAX_SCHEME_LEN)) {
   1223       handle2 = curl_url();
   1224       if(!handle2)
   1225         return CURLUE_OUT_OF_MEMORY;
   1226       result = parseurl(part, handle2, flags);
   1227       if(!result)
   1228         mv_urlhandle(handle2, u);
   1229       else
   1230         curl_url_cleanup(handle2);
   1231       return result;
   1232     }
   1233     /* extract the full "old" URL to do the redirect on */
   1234     result = curl_url_get(u, CURLUPART_URL, &oldurl, flags);
   1235     if(result) {
   1236       /* couldn't get the old URL, just use the new! */
   1237       handle2 = curl_url();
   1238       if(!handle2)
   1239         return CURLUE_OUT_OF_MEMORY;
   1240       result = parseurl(part, handle2, flags);
   1241       if(!result)
   1242         mv_urlhandle(handle2, u);
   1243       else
   1244         curl_url_cleanup(handle2);
   1245       return result;
   1246     }
   1247 
   1248     /* apply the relative part to create a new URL */
   1249     redired_url = concat_url(oldurl, part);
   1250     free(oldurl);
   1251     if(!redired_url)
   1252       return CURLUE_OUT_OF_MEMORY;
   1253 
   1254     /* now parse the new URL */
   1255     handle2 = curl_url();
   1256     if(!handle2) {
   1257       free(redired_url);
   1258       return CURLUE_OUT_OF_MEMORY;
   1259     }
   1260     result = parseurl(redired_url, handle2, flags);
   1261     free(redired_url);
   1262     if(!result)
   1263       mv_urlhandle(handle2, u);
   1264     else
   1265       curl_url_cleanup(handle2);
   1266     return result;
   1267   }
   1268   default:
   1269     return CURLUE_UNKNOWN_PART;
   1270   }
   1271   if(storep) {
   1272     const char *newp = part;
   1273     size_t nalloc = strlen(part);
   1274 
   1275     if(urlencode) {
   1276       const char *i;
   1277       char *o;
   1278       bool free_part = FALSE;
   1279       char *enc = malloc(nalloc * 3 + 1); /* for worst case! */
   1280       if(!enc)
   1281         return CURLUE_OUT_OF_MEMORY;
   1282       if(plusencode) {
   1283         /* space to plus */
   1284         i = part;
   1285         for(o = enc; *i; ++o, ++i)
   1286           *o = (*i == ' ') ? '+' : *i;
   1287         *o = 0; /* zero terminate */
   1288         part = strdup(enc);
   1289         if(!part) {
   1290           free(enc);
   1291           return CURLUE_OUT_OF_MEMORY;
   1292         }
   1293         free_part = TRUE;
   1294       }
   1295       for(i = part, o = enc; *i; i++) {
   1296         if(Curl_isunreserved(*i) ||
   1297            ((*i == '/') && urlskipslash) ||
   1298            ((*i == '=') && equalsencode) ||
   1299            ((*i == '+') && plusencode)) {
   1300           if((*i == '=') && equalsencode)
   1301             /* only skip the first equals sign */
   1302             equalsencode = FALSE;
   1303           *o = *i;
   1304           o++;
   1305         }
   1306         else {
   1307           msnprintf(o, 4, "%%%02x", *i);
   1308           o += 3;
   1309         }
   1310       }
   1311       *o = 0; /* zero terminate */
   1312       newp = enc;
   1313       if(free_part)
   1314         free((char *)part);
   1315     }
   1316     else {
   1317       char *p;
   1318       newp = strdup(part);
   1319       if(!newp)
   1320         return CURLUE_OUT_OF_MEMORY;
   1321       p = (char *)newp;
   1322       while(*p) {
   1323         /* make sure percent encoded are lower case */
   1324         if((*p == '%') && ISXDIGIT(p[1]) && ISXDIGIT(p[2]) &&
   1325            (ISUPPER(p[1]) || ISUPPER(p[2]))) {
   1326           p[1] = (char)TOLOWER(p[1]);
   1327           p[2] = (char)TOLOWER(p[2]);
   1328           p += 3;
   1329         }
   1330         else
   1331           p++;
   1332       }
   1333     }
   1334 
   1335     if(appendquery) {
   1336       /* Append the string onto the old query. Add a '&' separator if none is
   1337          present at the end of the exsting query already */
   1338       size_t querylen = u->query ? strlen(u->query) : 0;
   1339       bool addamperand = querylen && (u->query[querylen -1] != '&');
   1340       if(querylen) {
   1341         size_t newplen = strlen(newp);
   1342         char *p = malloc(querylen + addamperand + newplen + 1);
   1343         if(!p) {
   1344           free((char *)newp);
   1345           return CURLUE_OUT_OF_MEMORY;
   1346         }
   1347         strcpy(p, u->query); /* original query */
   1348         if(addamperand)
   1349           p[querylen] = '&'; /* ampersand */
   1350         strcpy(&p[querylen + addamperand], newp); /* new suffix */
   1351         free((char *)newp);
   1352         free(*storep);
   1353         *storep = p;
   1354         return CURLUE_OK;
   1355       }
   1356     }
   1357 
   1358     free(*storep);
   1359     *storep = (char *)newp;
   1360   }
   1361   /* set after the string, to make it not assigned if the allocation above
   1362      fails */
   1363   if(port)
   1364     u->portnum = port;
   1365   return CURLUE_OK;
   1366 }
   1367