1 /* 2 * Copyright 2001-2004 Brandon Long 3 * All Rights Reserved. 4 * 5 * ClearSilver Templating System 6 * 7 * This code is made available under the terms of the ClearSilver License. 8 * http://www.clearsilver.net/license.hdf 9 * 10 */ 11 12 #include "cs_config.h" 13 14 #include <stdlib.h> 15 #include <string.h> 16 #include <sys/types.h> 17 #include <regex.h> 18 #include <ctype.h> 19 #include "util/neo_misc.h" 20 #include "util/neo_err.h" 21 #include "util/neo_str.h" 22 #include "html.h" 23 #include "cgi.h" 24 25 static int has_space_formatting(const char *src, int slen) 26 { 27 int spaces = 0; 28 int returns = 0; 29 int ascii_art = 0; 30 int x = 0; 31 32 for (x = 0; x < slen; x++) 33 { 34 if (src[x] == '\t') return 1; 35 if (src[x] == ' ') 36 { 37 spaces++; 38 if (x && (src[x-1] == '.')) 39 spaces--; 40 } 41 else if (src[x] == '\n') 42 { 43 spaces = 0; 44 returns++; 45 } 46 else if (strchr ("/\\<>:[]!@#$%^&*()|", src[x])) 47 { 48 ascii_art++; 49 if (ascii_art > 3) return 2; 50 } 51 else if (src[x] != '\r') 52 { 53 if (returns > 2) return 1; 54 if (spaces > 2) return 1; 55 returns = 0; 56 spaces = 0; 57 ascii_art = 0; 58 } 59 } 60 61 return 0; 62 } 63 64 /* 65 static int has_long_lines (char *s, int l) 66 { 67 char *ptr; 68 int x = 0; 69 70 while (x < l) 71 { 72 ptr = strchr (s + x, '\n'); 73 if (ptr == NULL) 74 { 75 if (l - x > 75) return 1; 76 return 0; 77 } 78 if (ptr - (s + x) > 75) return 1; 79 x = ptr - s + 1; 80 } 81 return 0; 82 } 83 */ 84 85 /* The first step is to actually find all of the URLs and email 86 * addresses using our handy regular expressions. We then mark these, 87 * and then go through convert non-special areas with straight 88 * text->html escapes, and convert special parts as special parts 89 */ 90 struct _parts { 91 int begin; 92 int end; 93 int type; 94 }; 95 96 #define SC_TYPE_TEXT 1 97 #define SC_TYPE_URL 2 98 #define SC_TYPE_EMAIL 3 99 100 static char *EmailRe = "[^][@:;<>\\\"()[:space:][:cntrl:]]+@[-+a-zA-Z0-9]+\\.[-+a-zA-Z0-9\\.]+[-+a-zA-Z0-9]"; 101 static char *URLRe = "((http|https|ftp|mailto):(//)?[^[:space:]>\"\t]*|www\\.[-a-z0-9\\.]+)[^[:space:];\t\">]*"; 102 103 static NEOERR *split_and_convert (const char *src, int slen, 104 STRING *out, HTML_CONVERT_OPTS *opts) 105 { 106 NEOERR *err = STATUS_OK; 107 static int compiled = 0; 108 static regex_t email_re, url_re; 109 regmatch_t email_match, url_match; 110 int errcode; 111 char *ptr, *esc; 112 char errbuf[256]; 113 struct _parts *parts; 114 int part_count; 115 int part; 116 int x, i; 117 int spaces = 0; 118 119 if (!compiled) 120 { 121 if ((errcode = regcomp (&email_re, EmailRe, REG_ICASE | REG_EXTENDED))) 122 { 123 regerror (errcode, &email_re, errbuf, sizeof(errbuf)); 124 return nerr_raise (NERR_PARSE, "Unable to compile EmailRE: %s", errbuf); 125 } 126 if ((errcode = regcomp (&url_re, URLRe, REG_ICASE | REG_EXTENDED))) 127 { 128 regerror (errcode, &url_re, errbuf, sizeof(errbuf)); 129 return nerr_raise (NERR_PARSE, "Unable to compile URLRe: %s", errbuf); 130 } 131 compiled = 1; 132 } 133 134 part_count = 20; 135 parts = (struct _parts *) malloc (sizeof(struct _parts) * part_count); 136 part = 0; 137 138 x = 0; 139 if (regexec (&email_re, src+x, 1, &email_match, 0) != 0) 140 { 141 email_match.rm_so = -1; 142 email_match.rm_eo = -1; 143 } 144 else 145 { 146 email_match.rm_so += x; 147 email_match.rm_eo += x; 148 } 149 if (regexec (&url_re, src+x, 1, &url_match, 0) != 0) 150 { 151 url_match.rm_so = -1; 152 url_match.rm_eo = -1; 153 } 154 else 155 { 156 url_match.rm_so += x; 157 url_match.rm_eo += x; 158 } 159 while ((x < slen) && !((email_match.rm_so == -1) && (url_match.rm_so == -1))) 160 { 161 if (part >= part_count) 162 { 163 part_count *= 2; 164 parts = (struct _parts *) realloc (parts, sizeof(struct _parts) * part_count); 165 } 166 if ((url_match.rm_so != -1) && ((email_match.rm_so == -1) || (url_match.rm_so <= email_match.rm_so))) 167 { 168 parts[part].begin = url_match.rm_so; 169 parts[part].end = url_match.rm_eo; 170 parts[part].type = SC_TYPE_URL; 171 x = parts[part].end + 1; 172 part++; 173 if (x < slen) 174 { 175 if (regexec (&url_re, src+x, 1, &url_match, 0) != 0) 176 { 177 url_match.rm_so = -1; 178 url_match.rm_eo = -1; 179 } 180 else 181 { 182 url_match.rm_so += x; 183 url_match.rm_eo += x; 184 } 185 if ((email_match.rm_so != -1) && (x > email_match.rm_so)) 186 { 187 if (regexec (&email_re, src+x, 1, &email_match, 0) != 0) 188 { 189 email_match.rm_so = -1; 190 email_match.rm_eo = -1; 191 } 192 else 193 { 194 email_match.rm_so += x; 195 email_match.rm_eo += x; 196 } 197 } 198 } 199 } 200 else 201 { 202 parts[part].begin = email_match.rm_so; 203 parts[part].end = email_match.rm_eo; 204 parts[part].type = SC_TYPE_EMAIL; 205 x = parts[part].end + 1; 206 part++; 207 if (x < slen) 208 { 209 if (regexec (&email_re, src+x, 1, &email_match, 0) != 0) 210 { 211 email_match.rm_so = -1; 212 email_match.rm_eo = -1; 213 } 214 else 215 { 216 email_match.rm_so += x; 217 email_match.rm_eo += x; 218 } 219 if ((url_match.rm_so != -1) && (x > url_match.rm_so)) 220 { 221 if (regexec (&url_re, src+x, 1, &url_match, 0) != 0) 222 { 223 url_match.rm_so = -1; 224 url_match.rm_eo = -1; 225 } 226 else 227 { 228 url_match.rm_so += x; 229 url_match.rm_eo += x; 230 } 231 } 232 } 233 } 234 } 235 236 i = 0; 237 x = 0; 238 while (x < slen) 239 { 240 if ((i >= part) || (x < parts[i].begin)) 241 { 242 ptr = strpbrk(src + x, "&<>\r\n "); 243 if (ptr == NULL) 244 { 245 if (spaces) 246 { 247 int sp; 248 for (sp = 0; sp < spaces - 1; sp++) 249 { 250 err = string_append (out, " "); 251 if (err != STATUS_OK) break; 252 } 253 if (err != STATUS_OK) break; 254 err = string_append_char (out, ' '); 255 } 256 spaces = 0; 257 if (i < part) 258 { 259 err = string_appendn (out, src + x, parts[i].begin - x); 260 x = parts[i].begin; 261 } 262 else 263 { 264 err = string_append (out, src + x); 265 x = slen; 266 } 267 } 268 else 269 { 270 if ((i >= part) || ((ptr - src) < parts[i].begin)) 271 { 272 if (spaces) 273 { 274 int sp; 275 for (sp = 0; sp < spaces - 1; sp++) 276 { 277 err = string_append (out, " "); 278 if (err != STATUS_OK) break; 279 } 280 if (err != STATUS_OK) break; 281 err = string_append_char (out, ' '); 282 } 283 spaces = 0; 284 err = string_appendn (out, src + x, (ptr - src) - x); 285 if (err != STATUS_OK) break; 286 x = ptr - src; 287 if (src[x] == ' ') 288 { 289 if (opts->space_convert) 290 { 291 spaces++; 292 } 293 else 294 err = string_append_char (out, ' '); 295 } 296 else 297 { 298 if (src[x] != '\n' && spaces) 299 { 300 int sp; 301 for (sp = 0; sp < spaces - 1; sp++) 302 { 303 err = string_append (out, " "); 304 if (err != STATUS_OK) break; 305 } 306 if (err != STATUS_OK) break; 307 err = string_append_char (out, ' '); 308 } 309 spaces = 0; 310 311 if (src[x] == '&') 312 err = string_append (out, "&"); 313 else if (src[x] == '<') 314 err = string_append (out, "<"); 315 else if (src[x] == '>') 316 err = string_append (out, ">"); 317 else if (src[x] == '\n') 318 if (opts->newlines_convert) 319 err = string_append (out, "<br/>\n"); 320 else if (x && src[x-1] == '\n') 321 err = string_append (out, "<p/>\n"); 322 else 323 err = string_append_char (out, '\n'); 324 else if (src[x] != '\r') 325 err = nerr_raise (NERR_ASSERT, "src[x] == '%c'", src[x]); 326 } 327 x++; 328 } 329 else 330 { 331 if (spaces) 332 { 333 int sp; 334 for (sp = 0; sp < spaces - 1; sp++) 335 { 336 err = string_append (out, " "); 337 if (err != STATUS_OK) break; 338 } 339 if (err != STATUS_OK) break; 340 err = string_append_char (out, ' '); 341 } 342 spaces = 0; 343 err = string_appendn (out, src + x, parts[i].begin - x); 344 x = parts[i].begin; 345 } 346 } 347 } 348 else 349 { 350 if (spaces) 351 { 352 int sp; 353 for (sp = 0; sp < spaces - 1; sp++) 354 { 355 err = string_append (out, " "); 356 if (err != STATUS_OK) break; 357 } 358 if (err != STATUS_OK) break; 359 err = string_append_char (out, ' '); 360 } 361 spaces = 0; 362 if (parts[i].type == SC_TYPE_URL) 363 { 364 char last_char = src[parts[i].end-1]; 365 int suffix=0; 366 if (last_char == '.' || last_char == ',') { suffix=1; } 367 err = string_append (out, " <a "); 368 if (err != STATUS_OK) break; 369 if (opts->url_class) 370 { 371 err = string_appendf (out, "class=%s ", opts->url_class); 372 if (err) break; 373 } 374 if (opts->url_target) 375 { 376 err = string_appendf (out, "target=\"%s\" ", opts->url_target); 377 if (err) break; 378 } 379 err = string_append(out, "href=\""); 380 if (err) break; 381 if (opts->bounce_url) 382 { 383 char *url, *esc_url, *new_url; 384 int url_len; 385 if (!strncasecmp(src + x, "www.", 4)) 386 { 387 url_len = 7 + parts[i].end - x - suffix; 388 url = (char *) malloc(url_len+1); 389 if (url == NULL) 390 { 391 err = nerr_raise(NERR_NOMEM, 392 "Unable to allocate memory to convert url"); 393 break; 394 } 395 strcpy(url, "http://"); 396 strncat(url, src + x, parts[i].end - x - suffix); 397 } 398 else 399 { 400 url_len = parts[i].end - x - suffix; 401 url = (char *) malloc(url_len+1); 402 if (url == NULL) 403 { 404 err = nerr_raise(NERR_NOMEM, 405 "Unable to allocate memory to convert url"); 406 break; 407 } 408 strncpy(url, src + x, parts[i].end - x - suffix); 409 url[url_len] = '\0'; 410 } 411 err = cgi_url_escape(url, &esc_url); 412 free(url); 413 if (err) { 414 free(esc_url); 415 break; 416 } 417 418 new_url = sprintf_alloc(opts->bounce_url, esc_url); 419 free(esc_url); 420 if (new_url == NULL) 421 { 422 err = nerr_raise(NERR_NOMEM, "Unable to allocate memory to convert url"); 423 break; 424 } 425 err = string_append (out, new_url); 426 free(new_url); 427 if (err) break; 428 } 429 else 430 { 431 if (!strncasecmp(src + x, "www.", 4)) 432 { 433 err = string_append (out, "http://"); 434 if (err != STATUS_OK) break; 435 } 436 err = string_appendn (out, src + x, parts[i].end - x - suffix); 437 if (err != STATUS_OK) break; 438 } 439 err = string_append (out, "\">"); 440 if (err != STATUS_OK) break; 441 if (opts->link_name) { 442 err = html_escape_alloc((opts->link_name), 443 strlen(opts->link_name), &esc); 444 } else { 445 err = html_escape_alloc((src + x), parts[i].end - x - suffix, &esc); 446 } 447 if (err != STATUS_OK) break; 448 err = string_append (out, esc); 449 free(esc); 450 if (err != STATUS_OK) break; 451 err = string_append (out, "</a>"); 452 if (suffix) { 453 err = string_appendn(out,src + parts[i].end - 1,1); 454 if (err != STATUS_OK) break; 455 } 456 } 457 else /* type == SC_TYPE_EMAIL */ 458 { 459 err = string_append (out, "<a "); 460 if (err != STATUS_OK) break; 461 if (opts->mailto_class) 462 { 463 err = string_appendf (out, "class=%s ", opts->mailto_class); 464 if (err) break; 465 } 466 err = string_append(out, "href=\"mailto:"); 467 if (err) break; 468 err = string_appendn (out, src + x, parts[i].end - x); 469 if (err != STATUS_OK) break; 470 err = string_append (out, "\">"); 471 if (err != STATUS_OK) break; 472 err = html_escape_alloc(src + x, parts[i].end - x, &esc); 473 if (err != STATUS_OK) break; 474 err = string_append (out, esc); 475 free(esc); 476 if (err != STATUS_OK) break; 477 err = string_append (out, "</a>"); 478 } 479 x = parts[i].end; 480 i++; 481 } 482 if (err != STATUS_OK) break; 483 } 484 free (parts); 485 return err; 486 } 487 488 static void strip_white_space_end (STRING *str) 489 { 490 int x = 0; 491 int ol = str->len; 492 char *ptr; 493 int i; 494 495 while (x < str->len) 496 { 497 ptr = strchr(str->buf + x, '\n'); 498 if (ptr == NULL) 499 { 500 /* just strip the white space at the end of the string */ 501 ol = strlen(str->buf); 502 while (ol && isspace(str->buf[ol-1])) 503 { 504 str->buf[ol - 1] = '\0'; 505 ol--; 506 } 507 str->len = ol; 508 return; 509 } 510 else 511 { 512 x = i = ptr - str->buf; 513 if (x) 514 { 515 x--; 516 while (x && isspace(str->buf[x]) && (str->buf[x] != '\n')) x--; 517 if (x) x++; 518 memmove (str->buf + x, ptr, ol - i + 1); 519 x++; 520 str->len -= ((i - x) + 1); 521 str->buf[str->len] = '\0'; 522 ol = str->len; 523 } 524 } 525 } 526 } 527 528 NEOERR *convert_text_html_alloc (const char *src, int slen, 529 char **out) 530 { 531 return nerr_pass(convert_text_html_alloc_options(src, slen, out, NULL)); 532 } 533 534 NEOERR *convert_text_html_alloc_options (const char *src, int slen, 535 char **out, 536 HTML_CONVERT_OPTS *opts) 537 { 538 NEOERR *err; 539 STRING out_s; 540 int formatting = 0; 541 HTML_CONVERT_OPTS my_opts; 542 543 string_init(&out_s); 544 545 if (opts == NULL) 546 { 547 opts = &my_opts; 548 opts->bounce_url = NULL; 549 opts->url_class = NULL; 550 opts->url_target = "_blank"; 551 opts->mailto_class = NULL; 552 opts->long_lines = 0; 553 opts->space_convert = 0; 554 opts->newlines_convert = 1; 555 opts->longline_width = 75; /* This hasn't been used in a while, actually */ 556 opts->check_ascii_art = 1; 557 opts->link_name = NULL; 558 } 559 560 do 561 { 562 if (opts->check_ascii_art) 563 { 564 formatting = has_space_formatting (src, slen); 565 if (formatting) opts->space_convert = 1; 566 } 567 if (formatting == 2) 568 { 569 /* Do <pre> formatting */ 570 opts->newlines_convert = 1; 571 err = string_append (&out_s, "<tt>"); 572 if (err != STATUS_OK) break; 573 err = split_and_convert(src, slen, &out_s, opts); 574 if (err != STATUS_OK) break; 575 err = string_append (&out_s, "</tt>"); 576 if (err != STATUS_OK) break; 577 /* Strip white space at end of lines */ 578 strip_white_space_end (&out_s); 579 } 580 else 581 { 582 /* int nl = has_long_lines (src, slen); */ 583 err = split_and_convert(src, slen, &out_s, opts); 584 } 585 } while (0); 586 if (err != STATUS_OK) 587 { 588 string_clear (&out_s); 589 return nerr_pass (err); 590 } 591 if (out_s.buf == NULL) 592 { 593 *out = strdup(""); 594 } 595 else 596 { 597 *out = out_s.buf; 598 } 599 return STATUS_OK; 600 } 601 602 NEOERR *html_escape_alloc (const char *src, int slen, 603 char **out) 604 { 605 return nerr_pass(neos_html_escape(src, slen, out)); 606 } 607 608 /* Replace ampersand with iso-8859-1 character code */ 609 static unsigned char _expand_amp_8859_1_char (const char *s) 610 { 611 if (s[0] == '\0') 612 return 0; 613 614 switch (s[0]) { 615 case '#': 616 if (s[1] == 'x') return strtol (s+2, NULL, 16); 617 return strtol (s+1, NULL, 10); 618 case 'a': 619 if (!strcmp(s, "agrave")) return 0xe0; /* */ 620 if (!strcmp(s, "aacute")) return 0xe1; /* */ 621 if (!strcmp(s, "acirc")) return 0xe2; /* */ 622 if (!strcmp(s, "atilde")) return 0xe3; /* */ 623 if (!strcmp(s, "auml")) return 0xe4; /* */ 624 if (!strcmp(s, "aring")) return 0xe5; /* */ 625 if (!strcmp(s, "aelig")) return 0xe6; /* */ 626 if (!strcmp(s, "amp")) return '&'; 627 return 0; 628 case 'c': 629 if (!strcmp(s, "ccedil")) return 0xe7; /* */ 630 return 0; 631 case 'e': 632 if (!strcmp(s, "egrave")) return 0xe8; /* */ 633 if (!strcmp(s, "eacute")) return 0xe9; /* */ 634 if (!strcmp(s, "ecirc")) return 0xea; /* */ 635 if (!strcmp(s, "euml")) return 0xeb; /* */ 636 if (!strcmp(s, "eth")) return 0xf0; /* */ 637 return 0; 638 case 'i': 639 if (!strcmp(s, "igrave")) return 0xec; /* */ 640 if (!strcmp(s, "iacute")) return 0xed; /* */ 641 if (!strcmp(s, "icirc")) return 0xee; /* */ 642 if (!strcmp(s, "iuml")) return 0xef; /* */ 643 return 0; 644 case 'g': 645 if (!strcmp(s, "gt")) return '>'; 646 return 0; 647 case 'l': 648 if (!strcmp(s, "lt")) return '<'; 649 return 0; 650 case 'n': 651 if (!strcmp(s, "ntilde")) return 0xf1; /* */ 652 if (!strcmp(s, "nbsp")) return ' '; 653 return 0; 654 case 'o': 655 if (!strcmp(s, "ograve")) return 0xf2; /* */ 656 if (!strcmp(s, "oacute")) return 0xf3; /* */ 657 if (!strcmp(s, "ocirc")) return 0xf4; /* */ 658 if (!strcmp(s, "otilde")) return 0xf5; /* */ 659 if (!strcmp(s, "ouml")) return 0xf6; /* */ 660 if (!strcmp(s, "oslash")) return 0xf8; /* */ 661 return 0; 662 case 'q': /* quot */ 663 if (!strcmp(s, "quot")) return '"'; 664 return 0; 665 case 's': 666 if (!strcmp(s, "szlig")) return 0xdf; /* */ 667 return 0; 668 case 't': 669 if (!strcmp(s, "thorn")) return 0xfe; /* */ 670 return 0; 671 case 'u': 672 if (!strcmp(s, "ugrave")) return 0xf9; /* */ 673 if (!strcmp(s, "uacute")) return 0xfa; /* */ 674 if (!strcmp(s, "ucirc")) return 0xfb; /* */ 675 if (!strcmp(s, "uuml")) return 0xfc; /* */ 676 return 0; 677 case 'y': 678 if (!strcmp(s, "yacute")) return 0xfd; /* */ 679 680 } 681 return 0; 682 } 683 684 char *html_expand_amp_8859_1(const char *amp, 685 char *buf) 686 { 687 unsigned char ch; 688 689 ch = _expand_amp_8859_1_char(amp); 690 if (ch == '\0') 691 { 692 if (!strcmp(amp, "copy")) return "(C)"; 693 return ""; 694 } 695 else { 696 buf[0] = (char)ch; 697 buf[1] = '\0'; 698 return buf; 699 } 700 } 701 702 NEOERR *html_strip_alloc(const char *src, int slen, 703 char **out) 704 { 705 NEOERR *err = STATUS_OK; 706 STRING out_s; 707 int x = 0; 708 int strip_match = -1; 709 int state = 0; 710 char amp[10]; 711 int amp_start = 0; 712 char buf[10]; 713 int ampl = 0; 714 715 string_init(&out_s); 716 err = string_append (&out_s, ""); 717 if (err) return nerr_pass (err); 718 719 while (x < slen) 720 { 721 switch (state) { 722 case 0: 723 /* Default */ 724 if (src[x] == '&') 725 { 726 state = 3; 727 ampl = 0; 728 amp_start = x; 729 } 730 else if (src[x] == '<') 731 { 732 state = 1; 733 } 734 else 735 { 736 if (strip_match == -1) 737 { 738 err = string_append_char(&out_s, src[x]); 739 if (err) break; 740 } 741 } 742 x++; 743 break; 744 case 1: 745 /* Starting TAG */ 746 if (src[x] == '>') 747 { 748 state = 0; 749 } 750 else if (src[x] == '/') 751 { 752 } 753 else 754 { 755 } 756 x++; 757 break; 758 case 2: 759 /* In TAG */ 760 if (src[x] == '>') 761 { 762 state = 0; 763 } 764 x++; 765 break; 766 case 3: 767 /* In AMP */ 768 if (src[x] == ';') 769 { 770 amp[ampl] = '\0'; 771 state = 0; 772 err = string_append(&out_s, html_expand_amp_8859_1(amp, buf)); 773 if (err) break; 774 } 775 else 776 { 777 if (ampl < sizeof(amp)-1) 778 amp[ampl++] = tolower(src[x]); 779 else 780 { 781 /* broken html... just back up */ 782 x = amp_start; 783 err = string_append_char(&out_s, src[x]); 784 if (err) break; 785 state = 0; 786 } 787 } 788 x++; 789 break; 790 } 791 if (err) break; 792 } 793 794 795 if (err) 796 { 797 string_clear (&out_s); 798 return nerr_pass (err); 799 } 800 *out = out_s.buf; 801 return STATUS_OK; 802 } 803