1 /* 2 * Copyright (C) 2013 The Guava Authors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.common.base; 18 19 import com.google.common.annotations.GwtCompatible; 20 21 import junit.framework.TestCase; 22 23 /** 24 * Unit tests for {@link Utf8}. 25 * 26 * @author Jon Perlow 27 * @author Martin Buchholz 28 * @author Clment Roux 29 */ 30 @GwtCompatible(emulated = true) 31 public class Utf8Test extends TestCase { 32 public void testEncodedLength_validStrings() { 33 assertEquals(0, Utf8.encodedLength("")); 34 assertEquals(11, Utf8.encodedLength("Hello world")); 35 assertEquals(8, Utf8.encodedLength("Rsum")); 36 assertEquals(461, Utf8.encodedLength("William Shakespeare" 37 + "15644261616423[1]" 38 + "" 39 + "" 40 + "[2]" 41 + "")); 42 // A surrogate pair 43 assertEquals(4, Utf8.encodedLength( 44 newString(Character.MIN_HIGH_SURROGATE, Character.MIN_LOW_SURROGATE))); 45 } 46 47 public void testEncodedLength_invalidStrings() { 48 testEncodedLengthFails(newString(Character.MIN_HIGH_SURROGATE), 0); 49 testEncodedLengthFails("foobar" + newString(Character.MIN_HIGH_SURROGATE), 6); 50 testEncodedLengthFails(newString(Character.MIN_LOW_SURROGATE), 0); 51 testEncodedLengthFails("foobar" + newString(Character.MIN_LOW_SURROGATE), 6); 52 testEncodedLengthFails( 53 newString( 54 Character.MIN_HIGH_SURROGATE, 55 Character.MIN_HIGH_SURROGATE), 0); 56 } 57 58 private static void testEncodedLengthFails(String invalidString, 59 int invalidCodePointIndex) { 60 try { 61 Utf8.encodedLength(invalidString); 62 fail(); 63 } catch (IllegalArgumentException expected) { 64 assertEquals("Unpaired surrogate at index " + invalidCodePointIndex, 65 expected.getMessage()); 66 } 67 } 68 69 // 128 - [chars 0x0000 to 0x007f] 70 private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 71 0x007f - 0x0000 + 1; 72 73 // 128 74 private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT = 75 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS; 76 77 // 1920 [chars 0x0080 to 0x07FF] 78 private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 79 0x07FF - 0x0080 + 1; 80 81 // 18,304 82 private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT = 83 // Both bytes are one byte characters 84 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) + 85 // The possible number of two byte characters 86 TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS; 87 88 // 2048 89 private static final long THREE_BYTE_SURROGATES = 2 * 1024; 90 91 // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates] 92 private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 93 0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES; 94 95 // 2,650,112 96 private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT = 97 // All one byte characters 98 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) + 99 // One two byte character and a one byte character 100 2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * 101 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS + 102 // Three byte characters 103 THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS; 104 105 // 1,048,576 [chars 0x10000L to 0x10FFFF] 106 private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 107 0x10FFFF - 0x10000L + 1; 108 109 // 289,571,839 110 private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT = 111 // All one byte characters 112 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) + 113 // One and three byte characters 114 2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS * 115 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS + 116 // Two two byte characters 117 TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS + 118 // Permutations of one and two byte characters 119 3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * 120 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS * 121 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS + 122 // Four byte characters 123 FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS; 124 125 /** 126 * Tests that round tripping of a sample of four byte permutations work. 127 * All permutations are prohibitively expensive to test for automated runs. 128 * This method tests specific four-byte cases. 129 */ 130 public void testIsWellFormed_4BytesSamples() { 131 // Valid 4 byte. 132 assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2); 133 // Bad trailing bytes 134 assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F); 135 assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0); 136 // Special cases for byte2 137 assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2); 138 assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2); 139 } 140 141 /** Tests some hard-coded test cases. */ 142 public void testSomeSequences() { 143 // Empty 144 assertWellFormed(); 145 // One-byte characters, including control characters 146 assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f" 147 // Two-byte characters 148 assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2" 149 // Three-byte characters 150 assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac" 151 // Four-byte characters 152 // "\u024B62\u024B62" 153 assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32); 154 // Mixed string 155 // "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62" 156 assertWellFormed(0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30, 157 0x32, 0x34, 0x42, 0x36, 0x32, 0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63, 158 0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32); 159 // Not a valid string 160 assertNotWellFormed(-1, 0, -1, 0); 161 } 162 163 public void testShardsHaveExpectedRoundTrippables() { 164 // A sanity check. 165 long actual = 0; 166 for (long expected : generateFourByteShardsExpectedRunnables()) { 167 actual += expected; 168 } 169 assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual); 170 } 171 172 private String newString(char... chars) { 173 return new String(chars); 174 } 175 176 private byte[] toByteArray(int... bytes) { 177 byte[] realBytes = new byte[bytes.length]; 178 for (int i = 0; i < bytes.length; i++) { 179 realBytes[i] = (byte) bytes[i]; 180 } 181 return realBytes; 182 } 183 184 private void assertWellFormed(int... bytes) { 185 assertTrue(Utf8.isWellFormed(toByteArray(bytes))); 186 } 187 188 private void assertNotWellFormed(int... bytes) { 189 assertFalse(Utf8.isWellFormed(toByteArray(bytes))); 190 } 191 192 private static long[] generateFourByteShardsExpectedRunnables() { 193 long[] expected = new long[128]; 194 // 0-63 are all 5300224 195 for (int i = 0; i <= 63; i++) { 196 expected[i] = 5300224; 197 } 198 // 97-111 are all 2342912 199 for (int i = 97; i <= 111; i++) { 200 expected[i] = 2342912; 201 } 202 // 113-117 are all 1048576 203 for (int i = 113; i <= 117; i++) { 204 expected[i] = 1048576; 205 } 206 // One offs 207 expected[112] = 786432; 208 expected[118] = 786432; 209 expected[119] = 1048576; 210 expected[120] = 458752; 211 expected[121] = 524288; 212 expected[122] = 65536; 213 // Anything not assigned was the default 0. 214 return expected; 215 } 216 } 217 218