1 /* 2 * Copyright (C) 2007 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.android.util; 18 19 import java.util.ArrayList; 20 import java.util.HashMap; 21 import java.util.LinkedHashMap; 22 import java.util.regex.Matcher; 23 import java.util.regex.Pattern; 24 import java.util.Set; 25 import java.util.List; 26 27 /** 28 * 29 * Logic for parsing a text message typed by the user looking for smileys, 30 * urls, acronyms,formatting (e.g., '*'s for bold), me commands 31 * (e.g., "/me is asleep"), and punctuation. 32 * 33 * It constructs an array, which breaks the text up into its 34 * constituent pieces, which we return to the client. 35 * 36 */ 37 public abstract class AbstractMessageParser { 38 /** 39 * Interface representing the set of resources needed by a message parser 40 * 41 * @author jessan (Jessan Hutchison-Quillian) 42 */ 43 public static interface Resources { 44 45 /** Get the known set of URL schemes. */ 46 public Set<String> getSchemes(); 47 48 /** Get the possible values for the last part of a domain name. 49 * Values are expected to be reversed in the Trie. 50 */ 51 public TrieNode getDomainSuffixes(); 52 53 /** Get the smileys accepted by the parser. */ 54 public TrieNode getSmileys(); 55 56 /** Get the acronyms accepted by the parser. */ 57 public TrieNode getAcronyms(); 58 } 59 60 /** 61 * Subclasses must define the schemes, domains, smileys and acronyms 62 * that are necessary for parsing 63 */ 64 protected abstract Resources getResources(); 65 66 /** Music note that indicates user is listening to a music track. */ 67 public static final String musicNote = "\u266B "; 68 69 private String text; 70 private int nextChar; 71 private int nextClass; 72 private ArrayList<Part> parts; 73 private ArrayList<Token> tokens; 74 private HashMap<Character,Format> formatStart; 75 private boolean parseSmilies; 76 private boolean parseAcronyms; 77 private boolean parseFormatting; 78 private boolean parseUrls; 79 private boolean parseMeText; 80 private boolean parseMusic; 81 82 /** 83 * Create a message parser to parse urls, formatting, acronyms, smileys, 84 * /me text and music 85 * 86 * @param text the text to parse 87 */ 88 public AbstractMessageParser(String text) { 89 this(text, true, true, true, true, true, true); 90 } 91 92 /** 93 * Create a message parser, specifying the kinds of text to parse 94 * 95 * @param text the text to parse 96 * 97 */ 98 public AbstractMessageParser(String text, boolean parseSmilies, 99 boolean parseAcronyms, boolean parseFormatting, boolean parseUrls, 100 boolean parseMusic, boolean parseMeText) { 101 this.text = text; 102 this.nextChar = 0; 103 this.nextClass = 10; 104 this.parts = new ArrayList<Part>(); 105 this.tokens = new ArrayList<Token>(); 106 this.formatStart = new HashMap<Character,Format>(); 107 this.parseSmilies = parseSmilies; 108 this.parseAcronyms = parseAcronyms; 109 this.parseFormatting = parseFormatting; 110 this.parseUrls = parseUrls; 111 this.parseMusic = parseMusic; 112 this.parseMeText = parseMeText; 113 } 114 115 /** Returns the raw text being parsed. */ 116 public final String getRawText() { return text; } 117 118 /** Return the number of parts. */ 119 public final int getPartCount() { return parts.size(); } 120 121 /** Return the part at the given index. */ 122 public final Part getPart(int index) { return parts.get(index); } 123 124 /** Return the list of parts from the parsed text */ 125 public final List<Part> getParts() { return parts; } 126 127 /** Parses the text string into an internal representation. */ 128 public void parse() { 129 // Look for music track (of which there would be only one and it'll be the 130 // first token) 131 if (parseMusicTrack()) { 132 buildParts(null); 133 return; 134 } 135 136 // Look for me commands. 137 String meText = null; 138 if (parseMeText && text.startsWith("/me") && (text.length() > 3) && 139 Character.isWhitespace(text.charAt(3))) { 140 meText = text.substring(0, 4); 141 text = text.substring(4); 142 } 143 144 // Break the text into tokens. 145 boolean wasSmiley = false; 146 while (nextChar < text.length()) { 147 if (!isWordBreak(nextChar)) { 148 if (!wasSmiley || !isSmileyBreak(nextChar)) { 149 throw new AssertionError("last chunk did not end at word break"); 150 } 151 } 152 153 if (parseSmiley()) { 154 wasSmiley = true; 155 } else { 156 wasSmiley = false; 157 158 if (!parseAcronym() && !parseURL() && !parseFormatting()) { 159 parseText(); 160 } 161 } 162 } 163 164 // Trim the whitespace before and after media components. 165 for (int i = 0; i < tokens.size(); ++i) { 166 if (tokens.get(i).isMedia()) { 167 if ((i > 0) && (tokens.get(i - 1) instanceof Html)) { 168 ((Html)tokens.get(i - 1)).trimLeadingWhitespace(); 169 } 170 if ((i + 1 < tokens.size()) && (tokens.get(i + 1) instanceof Html)) { 171 ((Html)tokens.get(i + 1)).trimTrailingWhitespace(); 172 } 173 } 174 } 175 176 // Remove any empty html tokens. 177 for (int i = 0; i < tokens.size(); ++i) { 178 if (tokens.get(i).isHtml() && 179 (tokens.get(i).toHtml(true).length() == 0)) { 180 tokens.remove(i); 181 --i; // visit this index again 182 } 183 } 184 185 buildParts(meText); 186 } 187 188 /** 189 * Get a the appropriate Token for a given URL 190 * 191 * @param text the anchor text 192 * @param url the url 193 * 194 */ 195 public static Token tokenForUrl(String url, String text) { 196 if(url == null) { 197 return null; 198 } 199 200 //Look for video links 201 Video video = Video.matchURL(url, text); 202 if (video != null) { 203 return video; 204 } 205 206 // Look for video links. 207 YouTubeVideo ytVideo = YouTubeVideo.matchURL(url, text); 208 if (ytVideo != null) { 209 return ytVideo; 210 } 211 212 // Look for photo links. 213 Photo photo = Photo.matchURL(url, text); 214 if (photo != null) { 215 return photo; 216 } 217 218 // Look for photo links. 219 FlickrPhoto flickrPhoto = FlickrPhoto.matchURL(url, text); 220 if (flickrPhoto != null) { 221 return flickrPhoto; 222 } 223 224 //Not media, so must be a regular URL 225 return new Link(url, text); 226 } 227 228 /** 229 * Builds the parts list. 230 * 231 * @param meText any meText parsed from the message 232 */ 233 private void buildParts(String meText) { 234 for (int i = 0; i < tokens.size(); ++i) { 235 Token token = tokens.get(i); 236 if (token.isMedia() || (parts.size() == 0) || lastPart().isMedia()) { 237 parts.add(new Part()); 238 } 239 lastPart().add(token); 240 } 241 242 // The first part inherits the meText of the line. 243 if (parts.size() > 0) { 244 parts.get(0).setMeText(meText); 245 } 246 } 247 248 /** Returns the last part in the list. */ 249 private Part lastPart() { return parts.get(parts.size() - 1); } 250 251 /** 252 * Looks for a music track (\u266B is first character, everything else is 253 * track info). 254 */ 255 private boolean parseMusicTrack() { 256 257 if (parseMusic && text.startsWith(musicNote)) { 258 addToken(new MusicTrack(text.substring(musicNote.length()))); 259 nextChar = text.length(); 260 return true; 261 } 262 return false; 263 } 264 265 /** Consumes all of the text in the next word . */ 266 private void parseText() { 267 StringBuilder buf = new StringBuilder(); 268 int start = nextChar; 269 do { 270 char ch = text.charAt(nextChar++); 271 switch (ch) { 272 case '<': buf.append("<"); break; 273 case '>': buf.append(">"); break; 274 case '&': buf.append("&"); break; 275 case '"': buf.append("""); break; 276 case '\'': buf.append("'"); break; 277 case '\n': buf.append("<br>"); break; 278 default: buf.append(ch); break; 279 } 280 } while (!isWordBreak(nextChar)); 281 282 addToken(new Html(text.substring(start, nextChar), buf.toString())); 283 } 284 285 /** 286 * Looks for smileys (e.g., ":)") in the text. The set of known smileys is 287 * loaded from a file into a trie at server start. 288 */ 289 private boolean parseSmiley() { 290 if(!parseSmilies) { 291 return false; 292 } 293 TrieNode match = longestMatch(getResources().getSmileys(), this, nextChar, 294 true); 295 if (match == null) { 296 return false; 297 } else { 298 int previousCharClass = getCharClass(nextChar - 1); 299 int nextCharClass = getCharClass(nextChar + match.getText().length()); 300 if ((previousCharClass == 2 || previousCharClass == 3) 301 && (nextCharClass == 2 || nextCharClass == 3)) { 302 return false; 303 } 304 addToken(new Smiley(match.getText())); 305 nextChar += match.getText().length(); 306 return true; 307 } 308 } 309 310 /** Looks for acronyms (e.g., "lol") in the text. 311 */ 312 private boolean parseAcronym() { 313 if(!parseAcronyms) { 314 return false; 315 } 316 TrieNode match = longestMatch(getResources().getAcronyms(), this, nextChar); 317 if (match == null) { 318 return false; 319 } else { 320 addToken(new Acronym(match.getText(), match.getValue())); 321 nextChar += match.getText().length(); 322 return true; 323 } 324 } 325 326 /** Determines if this is an allowable domain character. */ 327 private boolean isDomainChar(char c) { 328 return c == '-' || Character.isLetter(c) || Character.isDigit(c); 329 } 330 331 /** Determines if the given string is a valid domain. */ 332 private boolean isValidDomain(String domain) { 333 // For hostnames, check that it ends with a known domain suffix 334 if (matches(getResources().getDomainSuffixes(), reverse(domain))) { 335 return true; 336 } 337 return false; 338 } 339 340 /** 341 * Looks for a URL in two possible forms: either a proper URL with a known 342 * scheme or a domain name optionally followed by a path, query, or query. 343 */ 344 private boolean parseURL() { 345 // Make sure this is a valid place to start a URL. 346 if (!parseUrls || !isURLBreak(nextChar)) { 347 return false; 348 } 349 350 int start = nextChar; 351 352 // Search for the first block of letters. 353 int index = start; 354 while ((index < text.length()) && isDomainChar(text.charAt(index))) { 355 index += 1; 356 } 357 358 String url = ""; 359 boolean done = false; 360 361 if (index == text.length()) { 362 return false; 363 } else if (text.charAt(index) == ':') { 364 // Make sure this is a known scheme. 365 String scheme = text.substring(nextChar, index); 366 if (!getResources().getSchemes().contains(scheme)) { 367 return false; 368 } 369 } else if (text.charAt(index) == '.') { 370 // Search for the end of the domain name. 371 while (index < text.length()) { 372 char ch = text.charAt(index); 373 if ((ch != '.') && !isDomainChar(ch)) { 374 break; 375 } else { 376 index += 1; 377 } 378 } 379 380 // Make sure the domain name has a valid suffix. Since tries look for 381 // prefix matches, we reverse all the strings to get suffix comparisons. 382 String domain = text.substring(nextChar, index); 383 if (!isValidDomain(domain)) { 384 return false; 385 } 386 387 // Search for a port. We deal with this specially because a colon can 388 // also be a punctuation character. 389 if ((index + 1 < text.length()) && (text.charAt(index) == ':')) { 390 char ch = text.charAt(index + 1); 391 if (Character.isDigit(ch)) { 392 index += 1; 393 while ((index < text.length()) && 394 Character.isDigit(text.charAt(index))) { 395 index += 1; 396 } 397 } 398 } 399 400 // The domain name should be followed by end of line, whitespace, 401 // punctuation, or a colon, slash, question, or hash character. The 402 // tricky part here is that some URL characters are also punctuation, so 403 // we need to distinguish them. Since we looked for ports above, a colon 404 // is always punctuation here. To distinguish '?' cases, we look at the 405 // character that follows it. 406 if (index == text.length()) { 407 done = true; 408 } else { 409 char ch = text.charAt(index); 410 if (ch == '?') { 411 // If the next character is whitespace or punctuation (or missing), 412 // then this question mark looks like punctuation. 413 if (index + 1 == text.length()) { 414 done = true; 415 } else { 416 char ch2 = text.charAt(index + 1); 417 if (Character.isWhitespace(ch2) || isPunctuation(ch2)) { 418 done = true; 419 } 420 } 421 } else if (isPunctuation(ch)) { 422 done = true; 423 } else if (Character.isWhitespace(ch)) { 424 done = true; 425 } else if ((ch == '/') || (ch == '#')) { 426 // In this case, the URL is not done. We will search for the end of 427 // it below. 428 } else { 429 return false; 430 } 431 } 432 433 // We will assume the user meant HTTP. (One weird case is where they 434 // type a port of 443. That could mean HTTPS, but they might also want 435 // HTTP. We'll let them specify if they don't want HTTP.) 436 url = "http://"; 437 } else { 438 return false; 439 } 440 441 // If the URL is not done, search for the end, which is just before the 442 // next whitespace character. 443 if (!done) { 444 while ((index < text.length()) && 445 !Character.isWhitespace(text.charAt(index))) { 446 index += 1; 447 } 448 } 449 450 String urlText = text.substring(start, index); 451 url += urlText; 452 453 // Figure out the appropriate token type. 454 addURLToken(url, urlText); 455 456 nextChar = index; 457 return true; 458 } 459 460 /** 461 * Adds the appropriate token for the given URL. This might be a simple 462 * link or it might be a recognized media type. 463 */ 464 private void addURLToken(String url, String text) { 465 addToken(tokenForUrl(url, text)); 466 } 467 468 /** 469 * Deal with formatting characters. 470 * 471 * Parsing is as follows: 472 * - Treat all contiguous strings of formatting characters as one block. 473 * (This method processes one block.) 474 * - Only a single instance of a particular format character within a block 475 * is used to determine whether to turn on/off that type of formatting; 476 * other instances simply print the character itself. 477 * - If the format is to be turned on, we use the _first_ instance; if it 478 * is to be turned off, we use the _last_ instance (by appending the 479 * format.) 480 * 481 * Example: 482 * **string** turns into <b>*string*</b> 483 */ 484 private boolean parseFormatting() { 485 if(!parseFormatting) { 486 return false; 487 } 488 int endChar = nextChar; 489 while ((endChar < text.length()) && isFormatChar(text.charAt(endChar))) { 490 endChar += 1; 491 } 492 493 if ((endChar == nextChar) || !isWordBreak(endChar)) { 494 return false; 495 } 496 497 // Keeps track of whether we've seen a character (in map if we've seen it) 498 // and whether we should append a closing format token (if value in 499 // map is TRUE). Linked hashmap for consistent ordering. 500 LinkedHashMap<Character, Boolean> seenCharacters = 501 new LinkedHashMap<Character, Boolean>(); 502 503 for (int index = nextChar; index < endChar; ++index) { 504 char ch = text.charAt(index); 505 Character key = Character.valueOf(ch); 506 if (seenCharacters.containsKey(key)) { 507 // Already seen this character, just append an unmatched token, which 508 // will print plaintext character 509 addToken(new Format(ch, false)); 510 } else { 511 Format start = formatStart.get(key); 512 if (start != null) { 513 // Match the start token, and ask an end token to be appended 514 start.setMatched(true); 515 formatStart.remove(key); 516 seenCharacters.put(key, Boolean.TRUE); 517 } else { 518 // Append start token 519 start = new Format(ch, true); 520 formatStart.put(key, start); 521 addToken(start); 522 seenCharacters.put(key, Boolean.FALSE); 523 } 524 } 525 } 526 527 // Append any necessary end tokens 528 for (Character key : seenCharacters.keySet()) { 529 if (seenCharacters.get(key) == Boolean.TRUE) { 530 Format end = new Format(key.charValue(), false); 531 end.setMatched(true); 532 addToken(end); 533 } 534 } 535 536 nextChar = endChar; 537 return true; 538 } 539 540 /** Determines whether the given index could be a possible word break. */ 541 private boolean isWordBreak(int index) { 542 return getCharClass(index - 1) != getCharClass(index); 543 } 544 545 /** Determines whether the given index could be a possible smiley break. */ 546 private boolean isSmileyBreak(int index) { 547 if (index > 0 && index < text.length()) { 548 if (isSmileyBreak(text.charAt(index - 1), text.charAt(index))) { 549 return true; 550 } 551 } 552 553 return false; 554 } 555 556 /** 557 * Verifies that the character before the given index is end of line, 558 * whitespace, or punctuation. 559 */ 560 private boolean isURLBreak(int index) { 561 switch (getCharClass(index - 1)) { 562 case 2: 563 case 3: 564 case 4: 565 return false; 566 567 case 0: 568 case 1: 569 default: 570 return true; 571 } 572 } 573 574 /** Returns the class for the character at the given index. */ 575 private int getCharClass(int index) { 576 if ((index < 0) || (text.length() <= index)) { 577 return 0; 578 } 579 580 char ch = text.charAt(index); 581 if (Character.isWhitespace(ch)) { 582 return 1; 583 } else if (Character.isLetter(ch)) { 584 return 2; 585 } else if (Character.isDigit(ch)) { 586 return 3; 587 } else if (isPunctuation(ch)) { 588 // For punctuation, we return a unique value every time so that they are 589 // always different from any other character. Punctuation should always 590 // be considered a possible word break. 591 return ++nextClass; 592 } else { 593 return 4; 594 } 595 } 596 597 /** 598 * Returns true if <code>c1</code> could be the last character of 599 * a smiley and <code>c2</code> could be the first character of 600 * a different smiley, if {@link #isWordBreak} would not already 601 * recognize that this is possible. 602 */ 603 private static boolean isSmileyBreak(char c1, char c2) { 604 switch (c1) { 605 /* 606 * These characters can end smileys, but don't normally end words. 607 */ 608 case '$': case '&': case '*': case '+': case '-': 609 case '/': case '<': case '=': case '>': case '@': 610 case '[': case '\\': case ']': case '^': case '|': 611 case '}': case '~': 612 switch (c2) { 613 /* 614 * These characters can begin smileys, but don't normally 615 * begin words. 616 */ 617 case '#': case '$': case '%': case '*': case '/': 618 case '<': case '=': case '>': case '@': case '[': 619 case '\\': case '^': case '~': 620 return true; 621 } 622 } 623 624 return false; 625 } 626 627 /** Determines whether the given character is punctuation. */ 628 private static boolean isPunctuation(char ch) { 629 switch (ch) { 630 case '.': case ',': case '"': case ':': case ';': 631 case '?': case '!': case '(': case ')': 632 return true; 633 634 default: 635 return false; 636 } 637 } 638 639 /** 640 * Determines whether the given character is the beginning or end of a 641 * section with special formatting. 642 */ 643 private static boolean isFormatChar(char ch) { 644 switch (ch) { 645 case '*': case '_': case '^': 646 return true; 647 648 default: 649 return false; 650 } 651 } 652 653 /** Represents a unit of parsed output. */ 654 public static abstract class Token { 655 public enum Type { 656 657 HTML ("html"), 658 FORMAT ("format"), // subtype of HTML 659 LINK ("l"), 660 SMILEY ("e"), 661 ACRONYM ("a"), 662 MUSIC ("m"), 663 GOOGLE_VIDEO ("v"), 664 YOUTUBE_VIDEO ("yt"), 665 PHOTO ("p"), 666 FLICKR ("f"); 667 668 //stringreps for HTML and FORMAT don't really matter 669 //because they don't define getInfo(), which is where it is used 670 //For the other types, code depends on their stringreps 671 private String stringRep; 672 673 Type(String stringRep) { 674 this.stringRep = stringRep; 675 } 676 677 /** {@inheritDoc} */ 678 public String toString() { 679 return this.stringRep; 680 } 681 } 682 683 protected Type type; 684 protected String text; 685 686 protected Token(Type type, String text) { 687 this.type = type; 688 this.text = text; 689 } 690 691 /** Returns the type of the token. */ 692 public Type getType() { return type; } 693 694 /** 695 * Get the relevant information about a token 696 * 697 * @return a list of strings representing the token, not null 698 * The first item is always a string representation of the type 699 */ 700 public List<String> getInfo() { 701 List<String> info = new ArrayList<String>(); 702 info.add(getType().toString()); 703 return info; 704 } 705 706 /** Returns the raw text of the token. */ 707 public String getRawText() { return text; } 708 709 public boolean isMedia() { return false; } 710 public abstract boolean isHtml(); 711 public boolean isArray() { return !isHtml(); } 712 713 public String toHtml(boolean caps) { throw new AssertionError("not html"); } 714 715 // The token can change the caps of the text after that point. 716 public boolean controlCaps() { return false; } 717 public boolean setCaps() { return false; } 718 } 719 720 /** Represents a simple string of html text. */ 721 public static class Html extends Token { 722 private String html; 723 724 public Html(String text, String html) { 725 super(Type.HTML, text); 726 this.html = html; 727 } 728 729 public boolean isHtml() { return true; } 730 public String toHtml(boolean caps) { 731 return caps ? html.toUpperCase() : html; 732 } 733 /** 734 * Not supported. Info should not be needed for this type 735 */ 736 public List<String> getInfo() { 737 throw new UnsupportedOperationException(); 738 } 739 740 public void trimLeadingWhitespace() { 741 text = trimLeadingWhitespace(text); 742 html = trimLeadingWhitespace(html); 743 } 744 745 public void trimTrailingWhitespace() { 746 text = trimTrailingWhitespace(text); 747 html = trimTrailingWhitespace(html); 748 } 749 750 private static String trimLeadingWhitespace(String text) { 751 int index = 0; 752 while ((index < text.length()) && 753 Character.isWhitespace(text.charAt(index))) { 754 ++index; 755 } 756 return text.substring(index); 757 } 758 759 public static String trimTrailingWhitespace(String text) { 760 int index = text.length(); 761 while ((index > 0) && Character.isWhitespace(text.charAt(index - 1))) { 762 --index; 763 } 764 return text.substring(0, index); 765 } 766 } 767 768 /** Represents a music track token at the beginning. */ 769 public static class MusicTrack extends Token { 770 private String track; 771 772 public MusicTrack(String track) { 773 super(Type.MUSIC, track); 774 this.track = track; 775 } 776 777 public String getTrack() { return track; } 778 779 public boolean isHtml() { return false; } 780 781 public List<String> getInfo() { 782 List<String> info = super.getInfo(); 783 info.add(getTrack()); 784 return info; 785 } 786 } 787 788 /** Represents a link that was found in the input. */ 789 public static class Link extends Token { 790 private String url; 791 792 public Link(String url, String text) { 793 super(Type.LINK, text); 794 this.url = url; 795 } 796 797 public String getURL() { return url; } 798 799 public boolean isHtml() { return false; } 800 801 public List<String> getInfo() { 802 List<String> info = super.getInfo(); 803 info.add(getURL()); 804 info.add(getRawText()); 805 return info; 806 } 807 } 808 809 /** Represents a link to a Google Video. */ 810 public static class Video extends Token { 811 /** Pattern for a video URL. */ 812 private static final Pattern URL_PATTERN = Pattern.compile( 813 "(?i)http://video\\.google\\.[a-z0-9]+(?:\\.[a-z0-9]+)?/videoplay\\?" 814 + ".*?\\bdocid=(-?\\d+).*"); 815 816 private String docid; 817 818 public Video(String docid, String text) { 819 super(Type.GOOGLE_VIDEO, text); 820 this.docid = docid; 821 } 822 823 public String getDocID() { return docid; } 824 825 public boolean isHtml() { return false; } 826 public boolean isMedia() { return true; } 827 828 /** Returns a Video object if the given url is to a video. */ 829 public static Video matchURL(String url, String text) { 830 Matcher m = URL_PATTERN.matcher(url); 831 if (m.matches()) { 832 return new Video(m.group(1), text); 833 } else { 834 return null; 835 } 836 } 837 838 public List<String> getInfo() { 839 List<String> info = super.getInfo(); 840 info.add(getRssUrl(docid)); 841 info.add(getURL(docid)); 842 return info; 843 } 844 845 /** Returns the URL for the RSS description of the given video. */ 846 public static String getRssUrl(String docid) { 847 return "http://video.google.com/videofeed" 848 + "?type=docid&output=rss&sourceid=gtalk&docid=" + docid; 849 } 850 851 /** (For testing purposes:) Returns a video URL with the given parts. */ 852 public static String getURL(String docid) { 853 return getURL(docid, null); 854 } 855 856 /** (For testing purposes:) Returns a video URL with the given parts. */ 857 public static String getURL(String docid, String extraParams) { 858 if (extraParams == null) { 859 extraParams = ""; 860 } else if (extraParams.length() > 0) { 861 extraParams += "&"; 862 } 863 return "http://video.google.com/videoplay?" + extraParams 864 + "docid=" + docid; 865 } 866 } 867 868 /** Represents a link to a YouTube video. */ 869 public static class YouTubeVideo extends Token { 870 /** Pattern for a video URL. */ 871 private static final Pattern URL_PATTERN = Pattern.compile( 872 "(?i)http://(?:[a-z0-9]+\\.)?youtube\\.[a-z0-9]+(?:\\.[a-z0-9]+)?/watch\\?" 873 + ".*\\bv=([-_a-zA-Z0-9=]+).*"); 874 875 private String docid; 876 877 public YouTubeVideo(String docid, String text) { 878 super(Type.YOUTUBE_VIDEO, text); 879 this.docid = docid; 880 } 881 882 public String getDocID() { return docid; } 883 884 public boolean isHtml() { return false; } 885 public boolean isMedia() { return true; } 886 887 /** Returns a Video object if the given url is to a video. */ 888 public static YouTubeVideo matchURL(String url, String text) { 889 Matcher m = URL_PATTERN.matcher(url); 890 if (m.matches()) { 891 return new YouTubeVideo(m.group(1), text); 892 } else { 893 return null; 894 } 895 } 896 897 public List<String> getInfo() { 898 List<String> info = super.getInfo(); 899 info.add(getRssUrl(docid)); 900 info.add(getURL(docid)); 901 return info; 902 } 903 904 /** Returns the URL for the RSS description of the given video. */ 905 public static String getRssUrl(String docid) { 906 return "http://youtube.com/watch?v=" + docid; 907 } 908 909 /** (For testing purposes:) Returns a video URL with the given parts. */ 910 public static String getURL(String docid) { 911 return getURL(docid, null); 912 } 913 914 /** (For testing purposes:) Returns a video URL with the given parts. */ 915 public static String getURL(String docid, String extraParams) { 916 if (extraParams == null) { 917 extraParams = ""; 918 } else if (extraParams.length() > 0) { 919 extraParams += "&"; 920 } 921 return "http://youtube.com/watch?" + extraParams + "v=" + docid; 922 } 923 924 /** (For testing purposes:) Returns a video URL with the given parts. 925 * @param http If true, includes http:// 926 * @param prefix If non-null/non-blank, adds to URL before youtube.com. 927 * (e.g., prefix="br." --> "br.youtube.com") 928 */ 929 public static String getPrefixedURL(boolean http, String prefix, 930 String docid, String extraParams) { 931 String protocol = ""; 932 933 if (http) { 934 protocol = "http://"; 935 } 936 937 if (prefix == null) { 938 prefix = ""; 939 } 940 941 if (extraParams == null) { 942 extraParams = ""; 943 } else if (extraParams.length() > 0) { 944 extraParams += "&"; 945 } 946 947 return protocol + prefix + "youtube.com/watch?" + extraParams + "v=" + 948 docid; 949 } 950 } 951 952 /** Represents a link to a Picasa photo or album. */ 953 public static class Photo extends Token { 954 /** Pattern for an album or photo URL. */ 955 // TODO (katyarogers) searchbrowse includes search lists and tags, 956 // it follows a different pattern than albums - would be nice to add later 957 private static final Pattern URL_PATTERN = Pattern.compile( 958 "http://picasaweb.google.com/([^/?#&]+)/+((?!searchbrowse)[^/?#&]+)(?:/|/photo)?(?:\\?[^#]*)?(?:#(.*))?"); 959 960 private String user; 961 private String album; 962 private String photo; // null for albums 963 964 public Photo(String user, String album, String photo, String text) { 965 super(Type.PHOTO, text); 966 this.user = user; 967 this.album = album; 968 this.photo = photo; 969 } 970 971 public String getUser() { return user; } 972 public String getAlbum() { return album; } 973 public String getPhoto() { return photo; } 974 975 public boolean isHtml() { return false; } 976 public boolean isMedia() { return true; } 977 978 /** Returns a Photo object if the given url is to a photo or album. */ 979 public static Photo matchURL(String url, String text) { 980 Matcher m = URL_PATTERN.matcher(url); 981 if (m.matches()) { 982 return new Photo(m.group(1), m.group(2), m.group(3), text); 983 } else { 984 return null; 985 } 986 } 987 988 public List<String> getInfo() { 989 List<String> info = super.getInfo(); 990 info.add(getRssUrl(getUser())); 991 info.add(getAlbumURL(getUser(), getAlbum())); 992 if (getPhoto() != null) { 993 info.add(getPhotoURL(getUser(), getAlbum(), getPhoto())); 994 } else { 995 info.add((String)null); 996 } 997 return info; 998 } 999 1000 /** Returns the URL for the RSS description of the user's albums. */ 1001 public static String getRssUrl(String user) { 1002 return "http://picasaweb.google.com/data/feed/api/user/" + user + 1003 "?category=album&alt=rss"; 1004 } 1005 1006 /** Returns the URL for an album. */ 1007 public static String getAlbumURL(String user, String album) { 1008 return "http://picasaweb.google.com/" + user + "/" + album; 1009 } 1010 1011 /** Returns the URL for a particular photo. */ 1012 public static String getPhotoURL(String user, String album, String photo) { 1013 return "http://picasaweb.google.com/" + user + "/" + album + "/photo#" 1014 + photo; 1015 } 1016 } 1017 1018 /** Represents a link to a Flickr photo or album. */ 1019 public static class FlickrPhoto extends Token { 1020 /** Pattern for a user album or photo URL. */ 1021 private static final Pattern URL_PATTERN = Pattern.compile( 1022 "http://(?:www.)?flickr.com/photos/([^/?#&]+)/?([^/?#&]+)?/?.*"); 1023 private static final Pattern GROUPING_PATTERN = Pattern.compile( 1024 "http://(?:www.)?flickr.com/photos/([^/?#&]+)/(tags|sets)/" + 1025 "([^/?#&]+)/?"); 1026 1027 private static final String SETS = "sets"; 1028 private static final String TAGS = "tags"; 1029 1030 private String user; 1031 private String photo; // null for user album 1032 private String grouping; // either "tags" or "sets" 1033 private String groupingId; // sets or tags identifier 1034 1035 public FlickrPhoto(String user, String photo, String grouping, 1036 String groupingId, String text) { 1037 super(Type.FLICKR, text); 1038 1039 /* System wide tags look like the URL to a Flickr user. */ 1040 if (!TAGS.equals(user)) { 1041 this.user = user; 1042 // Don't consider slide show URL a photo 1043 this.photo = (!"show".equals(photo) ? photo : null); 1044 this.grouping = grouping; 1045 this.groupingId = groupingId; 1046 } else { 1047 this.user = null; 1048 this.photo = null; 1049 this.grouping = TAGS; 1050 this.groupingId = photo; 1051 } 1052 } 1053 1054 public String getUser() { return user; } 1055 public String getPhoto() { return photo; } 1056 public String getGrouping() { return grouping; } 1057 public String getGroupingId() { return groupingId; } 1058 1059 public boolean isHtml() { return false; } 1060 public boolean isMedia() { return true; } 1061 1062 /** 1063 * Returns a FlickrPhoto object if the given url is to a photo or Flickr 1064 * user. 1065 */ 1066 public static FlickrPhoto matchURL(String url, String text) { 1067 Matcher m = GROUPING_PATTERN.matcher(url); 1068 if (m.matches()) { 1069 return new FlickrPhoto(m.group(1), null, m.group(2), m.group(3), text); 1070 } 1071 1072 m = URL_PATTERN.matcher(url); 1073 if (m.matches()) { 1074 return new FlickrPhoto(m.group(1), m.group(2), null, null, text); 1075 } else { 1076 return null; 1077 } 1078 } 1079 1080 public List<String> getInfo() { 1081 List<String> info = super.getInfo(); 1082 info.add(getUrl()); 1083 info.add(getUser() != null ? getUser() : ""); 1084 info.add(getPhoto() != null ? getPhoto() : ""); 1085 info.add(getGrouping() != null ? getGrouping() : ""); 1086 info.add(getGroupingId() != null ? getGroupingId() : ""); 1087 return info; 1088 } 1089 1090 public String getUrl() { 1091 if (SETS.equals(grouping)) { 1092 return getUserSetsURL(user, groupingId); 1093 } else if (TAGS.equals(grouping)) { 1094 if (user != null) { 1095 return getUserTagsURL(user, groupingId); 1096 } else { 1097 return getTagsURL(groupingId); 1098 } 1099 } else if (photo != null) { 1100 return getPhotoURL(user, photo); 1101 } else { 1102 return getUserURL(user); 1103 } 1104 } 1105 1106 /** Returns the URL for the RSS description. */ 1107 public static String getRssUrl(String user) { 1108 return null; 1109 } 1110 1111 /** Returns the URL for a particular tag. */ 1112 public static String getTagsURL(String tag) { 1113 return "http://flickr.com/photos/tags/" + tag; 1114 } 1115 1116 /** Returns the URL to the user's Flickr homepage. */ 1117 public static String getUserURL(String user) { 1118 return "http://flickr.com/photos/" + user; 1119 } 1120 1121 /** Returns the URL for a particular photo. */ 1122 public static String getPhotoURL(String user, String photo) { 1123 return "http://flickr.com/photos/" + user + "/" + photo; 1124 } 1125 1126 /** Returns the URL for a user tag photo set. */ 1127 public static String getUserTagsURL(String user, String tagId) { 1128 return "http://flickr.com/photos/" + user + "/tags/" + tagId; 1129 } 1130 1131 /** Returns the URL for user set. */ 1132 public static String getUserSetsURL(String user, String setId) { 1133 return "http://flickr.com/photos/" + user + "/sets/" + setId; 1134 } 1135 } 1136 1137 /** Represents a smiley that was found in the input. */ 1138 public static class Smiley extends Token { 1139 // TODO: Pass the SWF URL down to the client. 1140 1141 public Smiley(String text) { 1142 super(Type.SMILEY, text); 1143 } 1144 1145 public boolean isHtml() { return false; } 1146 1147 public List<String> getInfo() { 1148 List<String> info = super.getInfo(); 1149 info.add(getRawText()); 1150 return info; 1151 } 1152 } 1153 1154 /** Represents an acronym that was found in the input. */ 1155 public static class Acronym extends Token { 1156 private String value; 1157 // TODO: SWF 1158 1159 public Acronym(String text, String value) { 1160 super(Type.ACRONYM, text); 1161 this.value = value; 1162 } 1163 1164 public String getValue() { return value; } 1165 1166 public boolean isHtml() { return false; } 1167 1168 public List<String> getInfo() { 1169 List<String> info = super.getInfo(); 1170 info.add(getRawText()); 1171 info.add(getValue()); 1172 return info; 1173 } 1174 } 1175 1176 /** Represents a character that changes formatting. */ 1177 public static class Format extends Token { 1178 private char ch; 1179 private boolean start; 1180 private boolean matched; 1181 1182 public Format(char ch, boolean start) { 1183 super(Type.FORMAT, String.valueOf(ch)); 1184 this.ch = ch; 1185 this.start = start; 1186 } 1187 1188 public void setMatched(boolean matched) { this.matched = matched; } 1189 1190 public boolean isHtml() { return true; } 1191 1192 public String toHtml(boolean caps) { 1193 // This character only implies special formatting if it was matched. 1194 // Otherwise, it was just a plain old character. 1195 if (matched) { 1196 return start ? getFormatStart(ch) : getFormatEnd(ch); 1197 } else { 1198 // We have to make sure we escape HTML characters as usual. 1199 return (ch == '"') ? """ : String.valueOf(ch); 1200 } 1201 } 1202 1203 /** 1204 * Not supported. Info should not be needed for this type 1205 */ 1206 public List<String> getInfo() { 1207 throw new UnsupportedOperationException(); 1208 } 1209 1210 public boolean controlCaps() { return (ch == '^'); } 1211 public boolean setCaps() { return start; } 1212 1213 private String getFormatStart(char ch) { 1214 switch (ch) { 1215 case '*': return "<b>"; 1216 case '_': return "<i>"; 1217 case '^': return "<b><font color=\"#005FFF\">"; // TODO: all caps 1218 case '"': return "<font color=\"#999999\">\u201c"; 1219 default: throw new AssertionError("unknown format '" + ch + "'"); 1220 } 1221 } 1222 1223 private String getFormatEnd(char ch) { 1224 switch (ch) { 1225 case '*': return "</b>"; 1226 case '_': return "</i>"; 1227 case '^': return "</font></b>"; // TODO: all caps 1228 case '"': return "\u201d</font>"; 1229 default: throw new AssertionError("unknown format '" + ch + "'"); 1230 } 1231 } 1232 } 1233 1234 /** Adds the given token to the parsed output. */ 1235 private void addToken(Token token) { 1236 tokens.add(token); 1237 } 1238 1239 /** Converts the entire message into a single HTML display string. */ 1240 public String toHtml() { 1241 StringBuilder html = new StringBuilder(); 1242 1243 for (Part part : parts) { 1244 boolean caps = false; 1245 1246 html.append("<p>"); 1247 for (Token token : part.getTokens()) { 1248 if (token.isHtml()) { 1249 html.append(token.toHtml(caps)); 1250 } else { 1251 switch (token.getType()) { 1252 case LINK: 1253 html.append("<a href=\""); 1254 html.append(((Link)token).getURL()); 1255 html.append("\">"); 1256 html.append(token.getRawText()); 1257 html.append("</a>"); 1258 break; 1259 1260 case SMILEY: 1261 // TODO: link to an appropriate image 1262 html.append(token.getRawText()); 1263 break; 1264 1265 case ACRONYM: 1266 html.append(token.getRawText()); 1267 break; 1268 1269 case MUSIC: 1270 // TODO: include a music glyph 1271 html.append(((MusicTrack)token).getTrack()); 1272 break; 1273 1274 case GOOGLE_VIDEO: 1275 // TODO: include a Google Video icon 1276 html.append("<a href=\""); 1277 html.append(((Video)token).getURL(((Video)token).getDocID())); 1278 html.append("\">"); 1279 html.append(token.getRawText()); 1280 html.append("</a>"); 1281 break; 1282 1283 case YOUTUBE_VIDEO: 1284 // TODO: include a YouTube icon 1285 html.append("<a href=\""); 1286 html.append(((YouTubeVideo)token).getURL( 1287 ((YouTubeVideo)token).getDocID())); 1288 html.append("\">"); 1289 html.append(token.getRawText()); 1290 html.append("</a>"); 1291 break; 1292 1293 case PHOTO: { 1294 // TODO: include a Picasa Web icon 1295 html.append("<a href=\""); 1296 html.append(Photo.getAlbumURL( 1297 ((Photo)token).getUser(), ((Photo)token).getAlbum())); 1298 html.append("\">"); 1299 html.append(token.getRawText()); 1300 html.append("</a>"); 1301 break; 1302 } 1303 1304 case FLICKR: 1305 // TODO: include a Flickr icon 1306 Photo p = (Photo) token; 1307 html.append("<a href=\""); 1308 html.append(((FlickrPhoto)token).getUrl()); 1309 html.append("\">"); 1310 html.append(token.getRawText()); 1311 html.append("</a>"); 1312 break; 1313 1314 default: 1315 throw new AssertionError("unknown token type: " + token.getType()); 1316 } 1317 } 1318 1319 if (token.controlCaps()) { 1320 caps = token.setCaps(); 1321 } 1322 } 1323 html.append("</p>\n"); 1324 } 1325 1326 return html.toString(); 1327 } 1328 1329 /** Returns the reverse of the given string. */ 1330 protected static String reverse(String str) { 1331 StringBuilder buf = new StringBuilder(); 1332 for (int i = str.length() - 1; i >= 0; --i) { 1333 buf.append(str.charAt(i)); 1334 } 1335 return buf.toString(); 1336 } 1337 1338 public static class TrieNode { 1339 private final HashMap<Character,TrieNode> children = 1340 new HashMap<Character,TrieNode>(); 1341 private String text; 1342 private String value; 1343 1344 public TrieNode() { this(""); } 1345 public TrieNode(String text) { 1346 this.text = text; 1347 } 1348 1349 public final boolean exists() { return value != null; } 1350 public final String getText() { return text; } 1351 public final String getValue() { return value; } 1352 public void setValue(String value) { this.value = value; } 1353 1354 public TrieNode getChild(char ch) { 1355 return children.get(Character.valueOf(ch)); 1356 } 1357 1358 public TrieNode getOrCreateChild(char ch) { 1359 Character key = Character.valueOf(ch); 1360 TrieNode node = children.get(key); 1361 if (node == null) { 1362 node = new TrieNode(text + String.valueOf(ch)); 1363 children.put(key, node); 1364 } 1365 return node; 1366 } 1367 1368 /** Adds the given string into the trie. */ 1369 public static void addToTrie(TrieNode root, String str, String value) { 1370 int index = 0; 1371 while (index < str.length()) { 1372 root = root.getOrCreateChild(str.charAt(index++)); 1373 } 1374 root.setValue(value); 1375 } 1376 } 1377 1378 1379 1380 /** Determines whether the given string is in the given trie. */ 1381 private static boolean matches(TrieNode root, String str) { 1382 int index = 0; 1383 while (index < str.length()) { 1384 root = root.getChild(str.charAt(index++)); 1385 if (root == null) { 1386 break; 1387 } else if (root.exists()) { 1388 return true; 1389 } 1390 } 1391 return false; 1392 } 1393 1394 /** 1395 * Returns the longest substring of the given string, starting at the given 1396 * index, that exists in the trie. 1397 */ 1398 private static TrieNode longestMatch( 1399 TrieNode root, AbstractMessageParser p, int start) { 1400 return longestMatch(root, p, start, false); 1401 } 1402 1403 /** 1404 * Returns the longest substring of the given string, starting at the given 1405 * index, that exists in the trie, with a special tokenizing case for 1406 * smileys if specified. 1407 */ 1408 private static TrieNode longestMatch( 1409 TrieNode root, AbstractMessageParser p, int start, boolean smiley) { 1410 int index = start; 1411 TrieNode bestMatch = null; 1412 while (index < p.getRawText().length()) { 1413 root = root.getChild(p.getRawText().charAt(index++)); 1414 if (root == null) { 1415 break; 1416 } else if (root.exists()) { 1417 if (p.isWordBreak(index)) { 1418 bestMatch = root; 1419 } else if (smiley && p.isSmileyBreak(index)) { 1420 bestMatch = root; 1421 } 1422 } 1423 } 1424 return bestMatch; 1425 } 1426 1427 1428 /** Represents set of tokens that are delivered as a single message. */ 1429 public static class Part { 1430 private String meText; 1431 private ArrayList<Token> tokens; 1432 1433 public Part() { 1434 this.tokens = new ArrayList<Token>(); 1435 } 1436 1437 public String getType(boolean isSend) { 1438 return (isSend ? "s" : "r") + getPartType(); 1439 } 1440 1441 private String getPartType() { 1442 if (isMedia()) { 1443 return "d"; 1444 } else if (meText != null) { 1445 return "m"; 1446 } else { 1447 return ""; 1448 } 1449 } 1450 1451 public boolean isMedia() { 1452 return (tokens.size() == 1) && tokens.get(0).isMedia(); 1453 } 1454 /** 1455 * Convenience method for getting the Token of a Part that represents 1456 * a media Token. Parts of this kind will always only have a single Token 1457 * 1458 * @return if this.isMedia(), 1459 * returns the Token representing the media contained in this Part, 1460 * otherwise returns null; 1461 */ 1462 public Token getMediaToken() { 1463 if(isMedia()) { 1464 return tokens.get(0); 1465 } 1466 return null; 1467 } 1468 1469 /** Adds the given token to this part. */ 1470 public void add(Token token) { 1471 if (isMedia()) { 1472 throw new AssertionError("media "); 1473 } 1474 tokens.add(token); 1475 } 1476 1477 public void setMeText(String meText) { 1478 this.meText = meText; 1479 } 1480 1481 /** Returns the original text of this part. */ 1482 public String getRawText() { 1483 StringBuilder buf = new StringBuilder(); 1484 if (meText != null) { 1485 buf.append(meText); 1486 } 1487 for (int i = 0; i < tokens.size(); ++i) { 1488 buf.append(tokens.get(i).getRawText()); 1489 } 1490 return buf.toString(); 1491 } 1492 1493 /** Returns the tokens in this part. */ 1494 public ArrayList<Token> getTokens() { return tokens; } 1495 1496 /** Adds the tokens into the given builder as an array. */ 1497 // public void toArray(JSArrayBuilder array) { 1498 // if (isMedia()) { 1499 // // For media, we send its array (i.e., we don't wrap this in another 1500 // // array as we do for non-media parts). 1501 // tokens.get(0).toArray(array); 1502 // } else { 1503 // array.beginArray(); 1504 // addToArray(array); 1505 // array.endArray(); 1506 // } 1507 // } 1508 } 1509 } 1510