1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.example.android.wiktionary; 18 19 import org.json.JSONArray; 20 import org.json.JSONException; 21 import org.json.JSONObject; 22 23 import android.net.Uri; 24 import android.text.TextUtils; 25 import android.webkit.WebView; 26 27 import java.util.ArrayList; 28 import java.util.HashSet; 29 import java.util.List; 30 import java.util.regex.Matcher; 31 import java.util.regex.Pattern; 32 33 /** 34 * Extended version of {@link SimpleWikiHelper}. This version adds methods to 35 * pick a random word, and to format generic wiki-style text into HTML. 36 */ 37 public class ExtendedWikiHelper extends SimpleWikiHelper { 38 /** 39 * HTML style sheet to include with any {@link #formatWikiText(String)} HTML 40 * results. It formats nicely for a mobile screen, and hides some content 41 * boxes to keep things tidy. 42 */ 43 private static final String STYLE_SHEET = "<style>h2 {font-size:1.2em;font-weight:normal;} " + 44 "a {color:#6688cc;} ol {padding-left:1.5em;} blockquote {margin-left:0em;} " + 45 ".interProject, .noprint {display:none;} " + 46 "li, blockquote {margin-top:0.5em;margin-bottom:0.5em;}</style>"; 47 48 /** 49 * Pattern of section titles we're interested in showing. This trims out 50 * extra sections that can clutter things up on a mobile screen. 51 */ 52 private static final Pattern sValidSections = 53 Pattern.compile("(verb|noun|adjective|pronoun|interjection)", Pattern.CASE_INSENSITIVE); 54 55 /** 56 * Pattern that can be used to split a returned wiki page into its various 57 * sections. Doesn't treat children sections differently. 58 */ 59 private static final Pattern sSectionSplit = 60 Pattern.compile("^=+(.+?)=+.+?(?=^=)", Pattern.MULTILINE | Pattern.DOTALL); 61 62 /** 63 * When picking random words in {@link #getRandomWord()}, we sometimes 64 * encounter special articles or templates. This pattern ignores any words 65 * like those, usually because they have ":" or other punctuation. 66 */ 67 private static final Pattern sInvalidWord = Pattern.compile("[^A-Za-z0-9 ]"); 68 69 /** 70 * {@link Uri} authority to use when creating internal links. 71 */ 72 public static final String WIKI_AUTHORITY = "wiktionary"; 73 74 /** 75 * {@link Uri} host to use when creating internal links. 76 */ 77 public static final String WIKI_LOOKUP_HOST = "lookup"; 78 79 /** 80 * Mime-type to use when showing parsed results in a {@link WebView}. 81 */ 82 public static final String MIME_TYPE = "text/html"; 83 84 /** 85 * Encoding to use when showing parsed results in a {@link WebView}. 86 */ 87 public static final String ENCODING = "utf-8"; 88 89 /** 90 * {@link Uri} to use when requesting a random page. 91 */ 92 private static final String WIKTIONARY_RANDOM = 93 "http://en.wiktionary.org/w/api.php?action=query&list=random&format=json"; 94 95 /** 96 * Fake section to insert at the bottom of a wiki response before parsing. 97 * This ensures that {@link #sSectionSplit} will always catch the last 98 * section, as it uses section headers in its searching. 99 */ 100 private static final String STUB_SECTION = "\n=Stub section="; 101 102 /** 103 * Number of times to try finding a random word in {@link #getRandomWord()}. 104 * These failures are usually when the found word fails the 105 * {@link #sInvalidWord} test, or when a network error happens. 106 */ 107 private static final int RANDOM_TRIES = 3; 108 109 /** 110 * Internal class to hold a wiki formatting rule. It's mostly a wrapper to 111 * simplify {@link Matcher#replaceAll(String)}. 112 */ 113 private static class FormatRule { 114 private Pattern mPattern; 115 private String mReplaceWith; 116 117 /** 118 * Create a wiki formatting rule. 119 * 120 * @param pattern Search string to be compiled into a {@link Pattern}. 121 * @param replaceWith String to replace any found occurances with. This 122 * string can also include back-references into the given 123 * pattern. 124 * @param flags Any flags to compile the {@link Pattern} with. 125 */ 126 public FormatRule(String pattern, String replaceWith, int flags) { 127 mPattern = Pattern.compile(pattern, flags); 128 mReplaceWith = replaceWith; 129 } 130 131 /** 132 * Create a wiki formatting rule. 133 * 134 * @param pattern Search string to be compiled into a {@link Pattern}. 135 * @param replaceWith String to replace any found occurances with. This 136 * string can also include back-references into the given 137 * pattern. 138 */ 139 public FormatRule(String pattern, String replaceWith) { 140 this(pattern, replaceWith, 0); 141 } 142 143 /** 144 * Apply this formatting rule to the given input string, and return the 145 * resulting new string. 146 */ 147 public String apply(String input) { 148 Matcher m = mPattern.matcher(input); 149 return m.replaceAll(mReplaceWith); 150 } 151 152 } 153 154 /** 155 * List of internal formatting rules to apply when parsing wiki text. These 156 * include indenting various bullets, apply italic and bold styles, and 157 * adding internal linking. 158 */ 159 private static final List<FormatRule> sFormatRules = new ArrayList<FormatRule>(); 160 161 static { 162 // Format header blocks and wrap outside content in ordered list 163 sFormatRules.add(new FormatRule("^=+(.+?)=+", "</ol><h2>$1</h2><ol>", 164 Pattern.MULTILINE)); 165 166 // Indent quoted blocks, handle ordered and bullet lists 167 sFormatRules.add(new FormatRule("^#+\\*?:(.+?)$", "<blockquote>$1</blockquote>", 168 Pattern.MULTILINE)); 169 sFormatRules.add(new FormatRule("^#+:?\\*(.+?)$", "<ul><li>$1</li></ul>", 170 Pattern.MULTILINE)); 171 sFormatRules.add(new FormatRule("^#+(.+?)$", "<li>$1</li>", 172 Pattern.MULTILINE)); 173 174 // Add internal links 175 sFormatRules.add(new FormatRule("\\[\\[([^:\\|\\]]+)\\]\\]", 176 String.format("<a href=\"%s://%s/$1\">$1</a>", WIKI_AUTHORITY, WIKI_LOOKUP_HOST))); 177 sFormatRules.add(new FormatRule("\\[\\[([^:\\|\\]]+)\\|([^\\]]+)\\]\\]", 178 String.format("<a href=\"%s://%s/$1\">$2</a>", WIKI_AUTHORITY, WIKI_LOOKUP_HOST))); 179 180 // Add bold and italic formatting 181 sFormatRules.add(new FormatRule("'''(.+?)'''", "<b>$1</b>")); 182 sFormatRules.add(new FormatRule("([^'])''([^'].*?[^'])''([^'])", "$1<i>$2</i>$3")); 183 184 // Remove odd category links and convert remaining links into flat text 185 sFormatRules.add(new FormatRule("(\\{+.+?\\}+|\\[\\[[^:]+:[^\\\\|\\]]+\\]\\]|" + 186 "\\[http.+?\\]|\\[\\[Category:.+?\\]\\])", "", Pattern.MULTILINE | Pattern.DOTALL)); 187 sFormatRules.add(new FormatRule("\\[\\[([^\\|\\]]+\\|)?(.+?)\\]\\]", "$2", 188 Pattern.MULTILINE)); 189 190 } 191 192 /** 193 * Query the Wiktionary API to pick a random dictionary word. Will try 194 * multiple times to find a valid word before giving up. 195 * 196 * @return Random dictionary word, or null if no valid word was found. 197 * @throws ApiException If any connection or server error occurs. 198 * @throws ParseException If there are problems parsing the response. 199 */ 200 public static String getRandomWord() throws ApiException, ParseException { 201 // Keep trying a few times until we find a valid word 202 int tries = 0; 203 while (tries++ < RANDOM_TRIES) { 204 // Query the API for a random word 205 String content = getUrlContent(WIKTIONARY_RANDOM); 206 try { 207 // Drill into the JSON response to find the returned word 208 JSONObject response = new JSONObject(content); 209 JSONObject query = response.getJSONObject("query"); 210 JSONArray random = query.getJSONArray("random"); 211 JSONObject word = random.getJSONObject(0); 212 String foundWord = word.getString("title"); 213 214 // If we found an actual word, and it wasn't rejected by our invalid 215 // filter, then accept and return it. 216 if (foundWord != null && 217 !sInvalidWord.matcher(foundWord).find()) { 218 return foundWord; 219 } 220 } catch (JSONException e) { 221 throw new ParseException("Problem parsing API response", e); 222 } 223 } 224 225 // No valid word found in number of tries, so return null 226 return null; 227 } 228 229 /** 230 * Format the given wiki-style text into formatted HTML content. This will 231 * create headers, lists, internal links, and style formatting for any wiki 232 * markup found. 233 * 234 * @param wikiText The raw text to format, with wiki-markup included. 235 * @return HTML formatted content, ready for display in {@link WebView}. 236 */ 237 public static String formatWikiText(String wikiText) { 238 if (wikiText == null) { 239 return null; 240 } 241 242 // Insert a fake last section into the document so our section splitter 243 // can correctly catch the last section. 244 wikiText = wikiText.concat(STUB_SECTION); 245 246 // Read through all sections, keeping only those matching our filter, 247 // and only including the first entry for each title. 248 HashSet<String> foundSections = new HashSet<String>(); 249 StringBuilder builder = new StringBuilder(); 250 251 Matcher sectionMatcher = sSectionSplit.matcher(wikiText); 252 while (sectionMatcher.find()) { 253 String title = sectionMatcher.group(1); 254 if (!foundSections.contains(title) && 255 sValidSections.matcher(title).matches()) { 256 String sectionContent = sectionMatcher.group(); 257 foundSections.add(title); 258 builder.append(sectionContent); 259 } 260 } 261 262 // Our new wiki text is the selected sections only 263 wikiText = builder.toString(); 264 265 // Apply all formatting rules, in order, to the wiki text 266 for (FormatRule rule : sFormatRules) { 267 wikiText = rule.apply(wikiText); 268 } 269 270 // Return the resulting HTML with style sheet, if we have content left 271 if (!TextUtils.isEmpty(wikiText)) { 272 return STYLE_SHEET + wikiText; 273 } else { 274 return null; 275 } 276 } 277 278 } 279