Home | History | Annotate | Download | only in base
      1 /*
      2  * Copyright (C) 2013 The Guava Authors
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.common.base;
     18 
     19 import com.google.common.annotations.GwtCompatible;
     20 
     21 import junit.framework.TestCase;
     22 
     23 /**
     24  * Unit tests for {@link Utf8}.
     25  *
     26  * @author Jon Perlow
     27  * @author Martin Buchholz
     28  * @author Clément Roux
     29  */
     30 @GwtCompatible(emulated = true)
     31 public class Utf8Test extends TestCase {
     32   public void testEncodedLength_validStrings() {
     33     assertEquals(0, Utf8.encodedLength(""));
     34     assertEquals(11, Utf8.encodedLength("Hello world"));
     35     assertEquals(8, Utf8.encodedLength("Rsum"));
     36     assertEquals(461, Utf8.encodedLength("William Shakespeare"
     37         + "15644261616423[1]"
     38         + ""
     39         + ""
     40         + "[2]"
     41         + ""));
     42     // A surrogate pair
     43     assertEquals(4, Utf8.encodedLength(
     44         newString(Character.MIN_HIGH_SURROGATE, Character.MIN_LOW_SURROGATE)));
     45   }
     46 
     47   public void testEncodedLength_invalidStrings() {
     48     testEncodedLengthFails(newString(Character.MIN_HIGH_SURROGATE), 0);
     49     testEncodedLengthFails("foobar" + newString(Character.MIN_HIGH_SURROGATE), 6);
     50     testEncodedLengthFails(newString(Character.MIN_LOW_SURROGATE), 0);
     51     testEncodedLengthFails("foobar" + newString(Character.MIN_LOW_SURROGATE), 6);
     52     testEncodedLengthFails(
     53         newString(
     54             Character.MIN_HIGH_SURROGATE,
     55             Character.MIN_HIGH_SURROGATE), 0);
     56   }
     57 
     58   private static void testEncodedLengthFails(String invalidString,
     59       int invalidCodePointIndex) {
     60     try {
     61       Utf8.encodedLength(invalidString);
     62       fail();
     63     } catch (IllegalArgumentException expected) {
     64       assertEquals("Unpaired surrogate at index " + invalidCodePointIndex,
     65           expected.getMessage());
     66     }
     67   }
     68 
     69   // 128 - [chars 0x0000 to 0x007f]
     70   private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
     71       0x007f - 0x0000 + 1;
     72 
     73   // 128
     74   private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
     75       ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
     76 
     77   // 1920 [chars 0x0080 to 0x07FF]
     78   private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS =
     79       0x07FF - 0x0080 + 1;
     80 
     81   // 18,304
     82   private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
     83       // Both bytes are one byte characters
     84       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) +
     85       // The possible number of two byte characters
     86       TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;
     87 
     88   // 2048
     89   private static final long THREE_BYTE_SURROGATES = 2 * 1024;
     90 
     91   // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
     92   private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
     93       0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;
     94 
     95   // 2,650,112
     96   private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
     97       // All one byte characters
     98       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) +
     99       // One two byte character and a one byte character
    100       2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
    101           ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
    102        // Three byte characters
    103       THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
    104 
    105   // 1,048,576 [chars 0x10000L to 0x10FFFF]
    106   private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS =
    107       0x10FFFF - 0x10000L + 1;
    108 
    109   // 289,571,839
    110   private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
    111       // All one byte characters
    112       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) +
    113       // One and three byte characters
    114       2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
    115           ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
    116       // Two two byte characters
    117       TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS +
    118       // Permutations of one and two byte characters
    119       3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
    120           ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
    121           ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
    122       // Four byte characters
    123       FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;
    124 
    125   /**
    126    * Tests that round tripping of a sample of four byte permutations work.
    127    * All permutations are prohibitively expensive to test for automated runs.
    128    * This method tests specific four-byte cases.
    129    */
    130   public void testIsWellFormed_4BytesSamples() {
    131     // Valid 4 byte.
    132     assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2);
    133     // Bad trailing bytes
    134     assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F);
    135     assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0);
    136     // Special cases for byte2
    137     assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2);
    138     assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2);
    139   }
    140 
    141   /** Tests some hard-coded test cases. */
    142   public void testSomeSequences() {
    143     // Empty
    144     assertWellFormed();
    145     // One-byte characters, including control characters
    146     assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f"
    147     // Two-byte characters
    148     assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2"
    149     // Three-byte characters
    150     assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac"
    151     // Four-byte characters
    152     // "\u024B62\u024B62"
    153     assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32);
    154     // Mixed string
    155     // "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62"
    156     assertWellFormed(0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30,
    157         0x32, 0x34, 0x42, 0x36, 0x32, 0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63,
    158         0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32);
    159     // Not a valid string
    160     assertNotWellFormed(-1, 0, -1, 0);
    161   }
    162 
    163   public void testShardsHaveExpectedRoundTrippables() {
    164     // A sanity check.
    165     long actual = 0;
    166     for (long expected : generateFourByteShardsExpectedRunnables()) {
    167       actual += expected;
    168     }
    169     assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual);
    170   }
    171 
    172   private String newString(char... chars) {
    173     return new String(chars);
    174   }
    175 
    176   private byte[] toByteArray(int... bytes) {
    177     byte[] realBytes = new byte[bytes.length];
    178     for (int i = 0; i < bytes.length; i++) {
    179       realBytes[i] = (byte) bytes[i];
    180     }
    181     return realBytes;
    182   }
    183 
    184   private void assertWellFormed(int... bytes) {
    185     assertTrue(Utf8.isWellFormed(toByteArray(bytes)));
    186   }
    187 
    188   private void assertNotWellFormed(int... bytes) {
    189     assertFalse(Utf8.isWellFormed(toByteArray(bytes)));
    190   }
    191 
    192   private static long[] generateFourByteShardsExpectedRunnables() {
    193     long[] expected = new long[128];
    194     // 0-63 are all 5300224
    195     for (int i = 0; i <= 63; i++) {
    196       expected[i] = 5300224;
    197     }
    198     // 97-111 are all 2342912
    199     for (int i = 97; i <= 111; i++) {
    200      expected[i] = 2342912;
    201     }
    202     // 113-117 are all 1048576
    203     for (int i = 113; i <= 117; i++) {
    204       expected[i] = 1048576;
    205     }
    206     // One offs
    207     expected[112] = 786432;
    208     expected[118] = 786432;
    209     expected[119] = 1048576;
    210     expected[120] = 458752;
    211     expected[121] = 524288;
    212     expected[122] = 65536;
    213     // Anything not assigned was the default 0.
    214     return expected;
    215   }
    216 }
    217 
    218