Home | History | Annotate | Download | only in base
      1 /*
      2  * Copyright (C) 2013 The Guava Authors
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.common.base;
     18 
     19 import com.google.common.annotations.GwtCompatible;
     20 import com.google.common.annotations.GwtIncompatible;
     21 
     22 import junit.framework.TestCase;
     23 
     24 import java.util.Arrays;
     25 import java.util.HashMap;
     26 import java.util.Random;
     27 
     28 /**
     29  * Unit tests for {@link Utf8}.
     30  *
     31  * @author Jon Perlow
     32  * @author Martin Buchholz
 * @author Clément Roux
     34  */
     35 @GwtCompatible(emulated = true)
     36 public class Utf8Test extends TestCase {
     37   public void testEncodedLength_validStrings() {
     38     assertEquals(0, Utf8.encodedLength(""));
     39     assertEquals(11, Utf8.encodedLength("Hello world"));
     40     assertEquals(8, Utf8.encodedLength("Rsum"));
     41     assertEquals(461, Utf8.encodedLength("William Shakespeare"
     42         + "15644261616423[1]"
     43         + ""
     44         + ""
     45         + "[2]"
     46         + ""));
     47     // A surrogate pair
     48     assertEquals(4, Utf8.encodedLength(
     49         newString(Character.MIN_HIGH_SURROGATE, Character.MIN_LOW_SURROGATE)));
     50   }
     51 
     52   @GwtIncompatible("StringBuilder.appendCodePoint()")
     53   public void testEncodedLength_validStrings2() {
     54     HashMap<Integer, Integer> utf8Lengths = new HashMap<Integer, Integer>();
     55     utf8Lengths.put(0x00, 1);
     56     utf8Lengths.put(0x7f, 1);
     57     utf8Lengths.put(0x80, 2);
     58     utf8Lengths.put(0x7ff, 2);
     59     utf8Lengths.put(0x800, 3);
     60     utf8Lengths.put(Character.MIN_SUPPLEMENTARY_CODE_POINT - 1, 3);
     61     utf8Lengths.put(Character.MIN_SUPPLEMENTARY_CODE_POINT, 4);
     62     utf8Lengths.put(Character.MAX_CODE_POINT, 4);
     63 
     64     Integer[] codePoints = utf8Lengths.keySet().toArray(new Integer[]{});
     65     StringBuilder sb = new StringBuilder();
     66     Random rnd = new Random();
     67     for (int trial = 0; trial < 100; trial++) {
     68       sb.setLength(0);
     69       int utf8Length = 0;
     70       for (int i = 0; i < 6; i++) {
     71         Integer randomCodePoint = codePoints[rnd.nextInt(codePoints.length)];
     72         sb.appendCodePoint(randomCodePoint);
     73         utf8Length += utf8Lengths.get(randomCodePoint);
     74         if (utf8Length != Utf8.encodedLength(sb)) {
     75           StringBuilder repro = new StringBuilder();
     76           for (int j = 0; j < sb.length(); j++) {
     77             repro.append(" " + (int) sb.charAt(j));  // GWT compatible
     78           }
     79           assertEquals(repro.toString(), utf8Length, Utf8.encodedLength(sb));
     80         }
     81       }
     82     }
     83   }
     84 
     85   public void testEncodedLength_invalidStrings() {
     86     testEncodedLengthFails(newString(Character.MIN_HIGH_SURROGATE), 0);
     87     testEncodedLengthFails("foobar" + newString(Character.MIN_HIGH_SURROGATE), 6);
     88     testEncodedLengthFails(newString(Character.MIN_LOW_SURROGATE), 0);
     89     testEncodedLengthFails("foobar" + newString(Character.MIN_LOW_SURROGATE), 6);
     90     testEncodedLengthFails(
     91         newString(
     92             Character.MIN_HIGH_SURROGATE,
     93             Character.MIN_HIGH_SURROGATE), 0);
     94   }
     95 
     96   private static void testEncodedLengthFails(String invalidString,
     97       int invalidCodePointIndex) {
     98     try {
     99       Utf8.encodedLength(invalidString);
    100       fail();
    101     } catch (IllegalArgumentException expected) {
    102       assertEquals("Unpaired surrogate at index " + invalidCodePointIndex,
    103           expected.getMessage());
    104     }
    105   }
    106 
  // Expected counts of byte sequences that survive a decode/re-encode round trip, for
  // sequences of 1 to 4 bytes. Each EXPECTED_*_COUNT adds up every way of filling that many
  // bytes with shorter characters plus the characters that need exactly that many bytes.

  // 128 - [chars 0x0000 to 0x007f]
  private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
      0x007f - 0x0000 + 1;

  // 128
  private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
      ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  // 1920 [chars 0x0080 to 0x07FF]
  private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS =
      0x07FF - 0x0080 + 1;

  // 18,304
  private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
      // Both bytes are one byte characters
      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) +
      // The possible number of two byte characters
      TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  // 2048 [the surrogate code points, which are excluded from the three byte characters below]
  private static final long THREE_BYTE_SURROGATES = 2 * 1024;

  // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
  private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
      0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;

  // 2,650,112
  private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
      // All one byte characters
      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) +
      // One two byte character and a one byte character, in either order
      2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
      // Three byte characters
      THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  // 1,048,576 [chars 0x10000L to 0x10FFFF]
  private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS =
      0x10FFFF - 0x10000L + 1;

  // 383,270,912
  // (268,435,456 + 15,728,640 + 3,686,400 + 94,371,840 + 1,048,576)
  private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
      // All one byte characters
      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) +
      // One three byte character and a one byte character, in either order
      2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
      // Two two byte characters
      TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS +
      // One two byte character and two one byte characters, in any of the three orders
      3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
      // Four byte characters
      FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;
    162 
  /** Tests that round tripping of all one byte permutations works. */
  @GwtIncompatible("java.nio.charset.Charset")
  public void testIsWellFormed_1Byte() {
    testBytes(1, EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT);
  }
    168 
  /**
   * Tests that round tripping of all two byte permutations works, i.e. every one of the
   * 2^16 possible two-byte arrays is checked.
   */
  @GwtIncompatible("java.nio.charset.Charset")
  public void testIsWellFormed_2Bytes() {
    testBytes(2, EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT);
  }
    174 
  /**
   * Tests that round tripping of all three byte permutations works, i.e. every one of the
   * 2^24 possible three-byte arrays is checked.
   */
  @GwtIncompatible("java.nio.charset.Charset")
  public void testIsWellFormed_3Bytes() {
    testBytes(3, EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT);
  }
    180 
  /**
   * Tests that round tripping of a sample of four byte permutations works.
   * All permutations are prohibitively expensive to test for automated runs.
   * This method tests specific four-byte cases.
   */
  public void testIsWellFormed_4BytesSamples() {
    // Valid 4 byte: the UTF-8 encoding of U+24B62.
    assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2);
    // Bad trailing bytes: a continuation byte must lie in 0x80..0xBF, so 0x7F is just below
    // the range and 0xC0 is just above it.
    assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F);
    assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0);
    // Special cases for byte2:
    // after 0xF0 the second byte must be at least 0x90, otherwise the encoding is overlong;
    assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2);
    // after 0xF4 the second byte must be at most 0x8F, otherwise the value exceeds U+10FFFF.
    assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2);
  }
    196 
    197   /** Tests some hard-coded test cases. */
    198   public void testSomeSequences() {
    199     // Empty
    200     assertWellFormed();
    201     // One-byte characters, including control characters
    202     assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f"
    203     // Two-byte characters
    204     assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2"
    205     // Three-byte characters
    206     assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac"
    207     // Four-byte characters
    208     // "\u024B62\u024B62"
    209     assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32);
    210     // Mixed string
    211     // "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62"
    212     assertWellFormed(0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30,
    213         0x32, 0x34, 0x42, 0x36, 0x32, 0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63,
    214         0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32);
    215     // Not a valid string
    216     assertNotWellFormed(-1, 0, -1, 0);
    217   }
    218 
    219   public void testShardsHaveExpectedRoundTrippables() {
    220     // A sanity check.
    221     long actual = 0;
    222     for (long expected : generateFourByteShardsExpectedRunnables()) {
    223       actual += expected;
    224     }
    225     assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual);
    226   }
    227 
    228   private String newString(char... chars) {
    229     return new String(chars);
    230   }
    231 
    232   private byte[] toByteArray(int... bytes) {
    233     byte[] realBytes = new byte[bytes.length];
    234     for (int i = 0; i < bytes.length; i++) {
    235       realBytes[i] = (byte) bytes[i];
    236     }
    237     return realBytes;
    238   }
    239 
    240   private void assertWellFormed(int... bytes) {
    241     assertTrue(Utf8.isWellFormed(toByteArray(bytes)));
    242   }
    243 
    244   private void assertNotWellFormed(int... bytes) {
    245     assertFalse(Utf8.isWellFormed(toByteArray(bytes)));
    246   }
    247 
    248   private static long[] generateFourByteShardsExpectedRunnables() {
    249     long[] expected = new long[128];
    250     // 0-63 are all 5300224
    251     for (int i = 0; i <= 63; i++) {
    252       expected[i] = 5300224;
    253     }
    254     // 97-111 are all 2342912
    255     for (int i = 97; i <= 111; i++) {
    256      expected[i] = 2342912;
    257     }
    258     // 113-117 are all 1048576
    259     for (int i = 113; i <= 117; i++) {
    260       expected[i] = 1048576;
    261     }
    262     // One offs
    263     expected[112] = 786432;
    264     expected[118] = 786432;
    265     expected[119] = 1048576;
    266     expected[120] = 458752;
    267     expected[121] = 524288;
    268     expected[122] = 65536;
    269     // Anything not assigned was the default 0.
    270     return expected;
    271   }
    272 
  /**
   * Helper to run the loop to test all the permutations for the number of bytes
   * specified. Delegates to the four-argument overload, starting at 0 and passing
   * -1 as the limit so that the whole range for {@code numBytes} is covered.
   *
   * @param numBytes the number of bytes in the byte array
   * @param expectedCount the expected number of roundtrippable permutations
   */
  @GwtIncompatible("java.nio.charset.Charset")
  private static void testBytes(int numBytes, long expectedCount) {
    testBytes(numBytes, expectedCount, 0, -1);
  }
    284 
    285   /**
    286    * Helper to run the loop to test all the permutations for the number of bytes
    287    * specified. This overload is useful for debugging to get the loop to start
    288    * at a certain character.
    289    *
    290    * @param numBytes the number of bytes in the byte array
    291    * @param expectedCount the expected number of roundtrippable permutations
    292    * @param start the starting bytes encoded as a long as big-endian
    293    * @param lim the limit of bytes to process encoded as a long as big-endian,
    294    *     or -1 to mean the max limit for numBytes
    295    */
    296   @GwtIncompatible("java.nio.charset.Charset")
    297   private static void testBytes(int numBytes, long expectedCount, long start,
    298       long lim) {
    299     byte[] bytes = new byte[numBytes];
    300     if (lim == -1) {
    301       lim = 1L << (numBytes * 8);
    302     }
    303     long countRoundTripped = 0;
    304     for (long byteChar = start; byteChar < lim; byteChar++) {
    305       long tmpByteChar = byteChar;
    306       for (int i = 0; i < numBytes; i++) {
    307         bytes[bytes.length - i - 1] = (byte) tmpByteChar;
    308         tmpByteChar = tmpByteChar >> 8;
    309       }
    310       boolean isRoundTrippable = Utf8.isWellFormed(bytes);
    311       assertEquals(isRoundTrippable, Utf8.isWellFormed(bytes, 0, numBytes));
    312       String s = new String(bytes, Charsets.UTF_8);
    313       byte[] bytesReencoded = s.getBytes(Charsets.UTF_8);
    314       boolean bytesEqual = Arrays.equals(bytes, bytesReencoded);
    315 
    316       if (bytesEqual != isRoundTrippable) {
    317         fail();
    318       }
    319       if (isRoundTrippable) {
    320         countRoundTripped++;
    321       }
    322     }
    323     assertEquals(expectedCount, countRoundTripped);
    324   }
    325 }
    326