Home | History | Annotate | Download | only in wiktionary
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.example.android.wiktionary;
     18 
     19 import org.json.JSONArray;
     20 import org.json.JSONException;
     21 import org.json.JSONObject;
     22 
     23 import android.net.Uri;
     24 import android.text.TextUtils;
     25 import android.webkit.WebView;
     26 
     27 import java.util.ArrayList;
     28 import java.util.HashSet;
     29 import java.util.List;
     30 import java.util.regex.Matcher;
     31 import java.util.regex.Pattern;
     32 
     33 /**
     34  * Extended version of {@link SimpleWikiHelper}. This version adds methods to
     35  * pick a random word, and to format generic wiki-style text into HTML.
     36  */
     37 public class ExtendedWikiHelper extends SimpleWikiHelper {
     38     /**
     39      * HTML style sheet to include with any {@link #formatWikiText(String)} HTML
     40      * results. It formats nicely for a mobile screen, and hides some content
     41      * boxes to keep things tidy.
     42      */
     43     private static final String STYLE_SHEET = "<style>h2 {font-size:1.2em;font-weight:normal;} " +
     44             "a {color:#6688cc;} ol {padding-left:1.5em;} blockquote {margin-left:0em;} " +
     45             ".interProject, .noprint {display:none;} " +
     46             "li, blockquote {margin-top:0.5em;margin-bottom:0.5em;}</style>";
     47 
     48     /**
     49      * Pattern of section titles we're interested in showing. This trims out
     50      * extra sections that can clutter things up on a mobile screen.
     51      */
     52     private static final Pattern sValidSections =
     53         Pattern.compile("(verb|noun|adjective|pronoun|interjection)", Pattern.CASE_INSENSITIVE);
     54 
     55     /**
     56      * Pattern that can be used to split a returned wiki page into its various
     57      * sections. Doesn't treat children sections differently.
     58      */
     59     private static final Pattern sSectionSplit =
     60         Pattern.compile("^=+(.+?)=+.+?(?=^=)", Pattern.MULTILINE | Pattern.DOTALL);
     61 
     62     /**
     63      * When picking random words in {@link #getRandomWord()}, we sometimes
     64      * encounter special articles or templates. This pattern ignores any words
     65      * like those, usually because they have ":" or other punctuation.
     66      */
     67     private static final Pattern sInvalidWord = Pattern.compile("[^A-Za-z0-9 ]");
     68 
     69     /**
     70      * {@link Uri} authority to use when creating internal links.
     71      */
     72     public static final String WIKI_AUTHORITY = "wiktionary";
     73 
     74     /**
     75      * {@link Uri} host to use when creating internal links.
     76      */
     77     public static final String WIKI_LOOKUP_HOST = "lookup";
     78 
     79     /**
     80      * Mime-type to use when showing parsed results in a {@link WebView}.
     81      */
     82     public static final String MIME_TYPE = "text/html";
     83 
     84     /**
     85      * Encoding to use when showing parsed results in a {@link WebView}.
     86      */
     87     public static final String ENCODING = "utf-8";
     88 
     89     /**
     90      * {@link Uri} to use when requesting a random page.
     91      */
     92     private static final String WIKTIONARY_RANDOM =
     93         "http://en.wiktionary.org/w/api.php?action=query&list=random&format=json";
     94 
     95     /**
     96      * Fake section to insert at the bottom of a wiki response before parsing.
     97      * This ensures that {@link #sSectionSplit} will always catch the last
     98      * section, as it uses section headers in its searching.
     99      */
    100     private static final String STUB_SECTION = "\n=Stub section=";
    101 
    102     /**
    103      * Number of times to try finding a random word in {@link #getRandomWord()}.
    104      * These failures are usually when the found word fails the
    105      * {@link #sInvalidWord} test, or when a network error happens.
    106      */
    107     private static final int RANDOM_TRIES = 3;
    108 
    109     /**
    110      * Internal class to hold a wiki formatting rule. It's mostly a wrapper to
    111      * simplify {@link Matcher#replaceAll(String)}.
    112      */
    113     private static class FormatRule {
    114         private Pattern mPattern;
    115         private String mReplaceWith;
    116 
    117         /**
    118          * Create a wiki formatting rule.
    119          *
    120          * @param pattern Search string to be compiled into a {@link Pattern}.
    121          * @param replaceWith String to replace any found occurances with. This
    122          *            string can also include back-references into the given
    123          *            pattern.
    124          * @param flags Any flags to compile the {@link Pattern} with.
    125          */
    126         public FormatRule(String pattern, String replaceWith, int flags) {
    127             mPattern = Pattern.compile(pattern, flags);
    128             mReplaceWith = replaceWith;
    129         }
    130 
    131         /**
    132          * Create a wiki formatting rule.
    133          *
    134          * @param pattern Search string to be compiled into a {@link Pattern}.
    135          * @param replaceWith String to replace any found occurances with. This
    136          *            string can also include back-references into the given
    137          *            pattern.
    138          */
    139         public FormatRule(String pattern, String replaceWith) {
    140             this(pattern, replaceWith, 0);
    141         }
    142 
    143         /**
    144          * Apply this formatting rule to the given input string, and return the
    145          * resulting new string.
    146          */
    147         public String apply(String input) {
    148             Matcher m = mPattern.matcher(input);
    149             return m.replaceAll(mReplaceWith);
    150         }
    151 
    152     }
    153 
    154     /**
    155      * List of internal formatting rules to apply when parsing wiki text. These
    156      * include indenting various bullets, apply italic and bold styles, and
    157      * adding internal linking.
    158      */
    159     private static final List<FormatRule> sFormatRules = new ArrayList<FormatRule>();
    160 
    161     static {
    162         // Format header blocks and wrap outside content in ordered list
    163         sFormatRules.add(new FormatRule("^=+(.+?)=+", "</ol><h2>$1</h2><ol>",
    164                 Pattern.MULTILINE));
    165 
    166         // Indent quoted blocks, handle ordered and bullet lists
    167         sFormatRules.add(new FormatRule("^#+\\*?:(.+?)$", "<blockquote>$1</blockquote>",
    168                 Pattern.MULTILINE));
    169         sFormatRules.add(new FormatRule("^#+:?\\*(.+?)$", "<ul><li>$1</li></ul>",
    170                 Pattern.MULTILINE));
    171         sFormatRules.add(new FormatRule("^#+(.+?)$", "<li>$1</li>",
    172                 Pattern.MULTILINE));
    173 
    174         // Add internal links
    175         sFormatRules.add(new FormatRule("\\[\\[([^:\\|\\]]+)\\]\\]",
    176                 String.format("<a href=\"%s://%s/$1\">$1</a>", WIKI_AUTHORITY, WIKI_LOOKUP_HOST)));
    177         sFormatRules.add(new FormatRule("\\[\\[([^:\\|\\]]+)\\|([^\\]]+)\\]\\]",
    178                 String.format("<a href=\"%s://%s/$1\">$2</a>", WIKI_AUTHORITY, WIKI_LOOKUP_HOST)));
    179 
    180         // Add bold and italic formatting
    181         sFormatRules.add(new FormatRule("'''(.+?)'''", "<b>$1</b>"));
    182         sFormatRules.add(new FormatRule("([^'])''([^'].*?[^'])''([^'])", "$1<i>$2</i>$3"));
    183 
    184         // Remove odd category links and convert remaining links into flat text
    185         sFormatRules.add(new FormatRule("(\\{+.+?\\}+|\\[\\[[^:]+:[^\\\\|\\]]+\\]\\]|" +
    186                 "\\[http.+?\\]|\\[\\[Category:.+?\\]\\])", "", Pattern.MULTILINE | Pattern.DOTALL));
    187         sFormatRules.add(new FormatRule("\\[\\[([^\\|\\]]+\\|)?(.+?)\\]\\]", "$2",
    188                 Pattern.MULTILINE));
    189 
    190     }
    191 
    192     /**
    193      * Query the Wiktionary API to pick a random dictionary word. Will try
    194      * multiple times to find a valid word before giving up.
    195      *
    196      * @return Random dictionary word, or null if no valid word was found.
    197      * @throws ApiException If any connection or server error occurs.
    198      * @throws ParseException If there are problems parsing the response.
    199      */
    200     public static String getRandomWord() throws ApiException, ParseException {
    201         // Keep trying a few times until we find a valid word
    202         int tries = 0;
    203         while (tries++ < RANDOM_TRIES) {
    204             // Query the API for a random word
    205             String content = getUrlContent(WIKTIONARY_RANDOM);
    206             try {
    207                 // Drill into the JSON response to find the returned word
    208                 JSONObject response = new JSONObject(content);
    209                 JSONObject query = response.getJSONObject("query");
    210                 JSONArray random = query.getJSONArray("random");
    211                 JSONObject word = random.getJSONObject(0);
    212                 String foundWord = word.getString("title");
    213 
    214                 // If we found an actual word, and it wasn't rejected by our invalid
    215                 // filter, then accept and return it.
    216                 if (foundWord != null &&
    217                         !sInvalidWord.matcher(foundWord).find()) {
    218                     return foundWord;
    219                 }
    220             } catch (JSONException e) {
    221                 throw new ParseException("Problem parsing API response", e);
    222             }
    223         }
    224 
    225         // No valid word found in number of tries, so return null
    226         return null;
    227     }
    228 
    229     /**
    230      * Format the given wiki-style text into formatted HTML content. This will
    231      * create headers, lists, internal links, and style formatting for any wiki
    232      * markup found.
    233      *
    234      * @param wikiText The raw text to format, with wiki-markup included.
    235      * @return HTML formatted content, ready for display in {@link WebView}.
    236      */
    237     public static String formatWikiText(String wikiText) {
    238         if (wikiText == null) {
    239             return null;
    240         }
    241 
    242         // Insert a fake last section into the document so our section splitter
    243         // can correctly catch the last section.
    244         wikiText = wikiText.concat(STUB_SECTION);
    245 
    246         // Read through all sections, keeping only those matching our filter,
    247         // and only including the first entry for each title.
    248         HashSet<String> foundSections = new HashSet<String>();
    249         StringBuilder builder = new StringBuilder();
    250 
    251         Matcher sectionMatcher = sSectionSplit.matcher(wikiText);
    252         while (sectionMatcher.find()) {
    253             String title = sectionMatcher.group(1);
    254             if (!foundSections.contains(title) &&
    255                     sValidSections.matcher(title).matches()) {
    256                 String sectionContent = sectionMatcher.group();
    257                 foundSections.add(title);
    258                 builder.append(sectionContent);
    259             }
    260         }
    261 
    262         // Our new wiki text is the selected sections only
    263         wikiText = builder.toString();
    264 
    265         // Apply all formatting rules, in order, to the wiki text
    266         for (FormatRule rule : sFormatRules) {
    267             wikiText = rule.apply(wikiText);
    268         }
    269 
    270         // Return the resulting HTML with style sheet, if we have content left
    271         if (!TextUtils.isEmpty(wikiText)) {
    272             return STYLE_SHEET + wikiText;
    273         } else {
    274             return null;
    275         }
    276     }
    277 
    278 }
    279