Home | History | Annotate | Download | only in base
      1 /*
      2  * Copyright (C) 2013 The Guava Authors
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
      5  * in compliance with the License. You may obtain a copy of the License at
      6  *
      7  * http://www.apache.org/licenses/LICENSE-2.0
      8  *
      9  * Unless required by applicable law or agreed to in writing, software distributed under the License
     10  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
     11  * or implied. See the License for the specific language governing permissions and limitations under
     12  * the License.
     13  */
     14 
     15 package com.google.common.base;
     16 
     17 import static com.google.common.base.Preconditions.checkPositionIndexes;
     18 
     19 import com.google.common.annotations.Beta;
     20 import com.google.common.annotations.GwtCompatible;
     21 
     22 /**
     23  * Low-level, high-performance utility methods related to the {@linkplain Charsets#UTF_8 UTF-8}
     24  * character encoding. UTF-8 is defined in section D92 of
     25  * <a href="http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf">The Unicode Standard Core
     26  * Specification, Chapter 3</a>.
     27  *
     28  * <p>The variant of UTF-8 implemented by this class is the restricted definition of UTF-8
     29  * introduced in Unicode 3.1. One implication of this is that it rejects
     30  * <a href="http://www.unicode.org/versions/corrigendum1.html">"non-shortest form"</a> byte
     31  * sequences, even though the JDK decoder may accept them.
     32  *
     33  * @author Martin Buchholz
     34  * @author Clment Roux
     35  * @since 16.0
     36  */
     37 @Beta
     38 @GwtCompatible
     39 public final class Utf8 {
     40   /**
     41    * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
     42    * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
     43    * both time and space.
     44    *
     45    * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
     46    *     surrogates)
     47    */
     48   public static int encodedLength(CharSequence sequence) {
     49     // Warning to maintainers: this implementation is highly optimized.
     50     int utf16Length = sequence.length();
     51     int utf8Length = utf16Length;
     52     int i = 0;
     53 
     54     // This loop optimizes for pure ASCII.
     55     while (i < utf16Length && sequence.charAt(i) < 0x80) {
     56       i++;
     57     }
     58 
     59     // This loop optimizes for chars less than 0x800.
     60     for (; i < utf16Length; i++) {
     61       char c = sequence.charAt(i);
     62       if (c < 0x800) {
     63         utf8Length += ((0x7f - c) >>> 31);  // branch free!
     64       } else {
     65         utf8Length += encodedLengthGeneral(sequence, i);
     66         break;
     67       }
     68     }
     69 
     70     if (utf8Length < utf16Length) {
     71       // Necessary and sufficient condition for overflow because of maximum 3x expansion
     72       throw new IllegalArgumentException("UTF-8 length does not fit in int: "
     73                                          + (utf8Length + (1L << 32)));
     74     }
     75     return utf8Length;
     76   }
     77 
     78   private static int encodedLengthGeneral(CharSequence sequence, int start) {
     79     int utf16Length = sequence.length();
     80     int utf8Length = 0;
     81     for (int i = start; i < utf16Length; i++) {
     82       char c = sequence.charAt(i);
     83       if (c < 0x800) {
     84         utf8Length += (0x7f - c) >>> 31; // branch free!
     85       } else {
     86         utf8Length += 2;
     87         // jdk7+: if (Character.isSurrogate(c)) {
     88         if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) {
     89           // Check that we have a well-formed surrogate pair.
     90           int cp = Character.codePointAt(sequence, i);
     91           if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
     92             throw new IllegalArgumentException("Unpaired surrogate at index " + i);
     93           }
     94           i++;
     95         }
     96       }
     97     }
     98     return utf8Length;
     99   }
    100 
    101   /**
    102    * Returns {@code true} if {@code bytes} is a <i>well-formed</i> UTF-8 byte sequence according to
    103    * Unicode 6.0. Note that this is a stronger criterion than simply whether the bytes can be
    104    * decoded. For example, some versions of the JDK decoder will accept "non-shortest form" byte
    105    * sequences, but encoding never reproduces these. Such byte sequences are <i>not</i> considered
    106    * well-formed.
    107    *
    108    * <p>This method returns {@code true} if and only if {@code Arrays.equals(bytes, new
    109    * String(bytes, UTF_8).getBytes(UTF_8))} does, but is more efficient in both time and space.
    110    */
    111   public static boolean isWellFormed(byte[] bytes) {
    112     return isWellFormed(bytes, 0, bytes.length);
    113   }
    114 
    115   /**
    116    * Returns whether the given byte array slice is a well-formed UTF-8 byte sequence, as defined by
    117    * {@link #isWellFormed(byte[])}. Note that this can be false even when {@code
    118    * isWellFormed(bytes)} is true.
    119    *
    120    * @param bytes the input buffer
    121    * @param off the offset in the buffer of the first byte to read
    122    * @param len the number of bytes to read from the buffer
    123    */
    124   public static boolean isWellFormed(byte[] bytes, int off, int len) {
    125     int end = off + len;
    126     checkPositionIndexes(off, end, bytes.length);
    127     // Look for the first non-ASCII character.
    128     for (int i = off; i < end; i++) {
    129       if (bytes[i] < 0) {
    130         return isWellFormedSlowPath(bytes, i, end);
    131       }
    132     }
    133     return true;
    134   }
    135 
    136   private static boolean isWellFormedSlowPath(byte[] bytes, int off, int end) {
    137     int index = off;
    138     while (true) {
    139       int byte1;
    140 
    141       // Optimize for interior runs of ASCII bytes.
    142       do {
    143         if (index >= end) {
    144           return true;
    145         }
    146       } while ((byte1 = bytes[index++]) >= 0);
    147 
    148       if (byte1 < (byte) 0xE0) {
    149         // Two-byte form.
    150         if (index == end) {
    151           return false;
    152         }
    153         // Simultaneously check for illegal trailing-byte in leading position
    154         // and overlong 2-byte form.
    155         if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) {
    156           return false;
    157         }
    158       } else if (byte1 < (byte) 0xF0) {
    159         // Three-byte form.
    160         if (index + 1 >= end) {
    161           return false;
    162         }
    163         int byte2 = bytes[index++];
    164         if (byte2 > (byte) 0xBF
    165             // Overlong? 5 most significant bits must not all be zero.
    166             || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
    167             // Check for illegal surrogate codepoints.
    168             || (byte1 == (byte) 0xED && (byte) 0xA0 <= byte2)
    169             // Third byte trailing-byte test.
    170             || bytes[index++] > (byte) 0xBF) {
    171           return false;
    172         }
    173       } else {
    174         // Four-byte form.
    175         if (index + 2 >= end) {
    176           return false;
    177         }
    178         int byte2 = bytes[index++];
    179         if (byte2 > (byte) 0xBF
    180             // Check that 1 <= plane <= 16. Tricky optimized form of:
    181             // if (byte1 > (byte) 0xF4
    182             //     || byte1 == (byte) 0xF0 && byte2 < (byte) 0x90
    183             //     || byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
    184             || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
    185             // Third byte trailing-byte test
    186             || bytes[index++] > (byte) 0xBF
    187             // Fourth byte trailing-byte test
    188             || bytes[index++] > (byte) 0xBF) {
    189           return false;
    190         }
    191       }
    192     }
    193   }
    194 
    195   private Utf8() {}
    196 }
    197