Home | History | Annotate | Download | only in base
      1 /*
      2  * Copyright (C) 2011 The Guava Authors
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.common.base;
     18 
     19 import com.google.caliper.BeforeExperiment;
     20 import com.google.caliper.Benchmark;
     21 import com.google.caliper.Param;
     22 
     23 import java.util.Random;
     24 
     25 /**
     26  * Benchmark for the {@link Utf8} class.
     27  *
     28  *
     29  * @author Martin Buchholz
     30  */
     31 public class Utf8Benchmark {
     32 
     33   static class MaxCodePoint {
     34     final int value;
     35 
     36     /**
     37      * Convert the input string to a code point.  Accepts regular
     38      * decimal numerals, hex strings, and some symbolic names
     39      * meaningful to humans.
     40      */
     41     private static int decode(String userFriendly) {
     42       try {
     43         return Integer.decode(userFriendly);
     44       } catch (NumberFormatException ignored) {
     45         if (userFriendly.matches("(?i)(?:American|English|ASCII)")) {
     46           // 1-byte UTF-8 sequences - "American" ASCII text
     47           return 0x80;
     48         } else if (userFriendly.matches("(?i)(?:French|Latin|Western.*European)")) {
     49           // Mostly 1-byte UTF-8 sequences, mixed with occasional 2-byte
     50           // sequences - "Western European" text
     51           return 0x90;
     52         } else if (userFriendly.matches("(?i)(?:Branch.*Prediction.*Hostile)")) {
     53           // Defeat branch predictor for: c < 0x80 ; branch taken 50% of the time.
     54           return 0x100;
     55         } else if (userFriendly.matches("(?i)(?:Greek|Cyrillic|European|ISO.?8859)")) {
     56           // Mostly 2-byte UTF-8 sequences - "European" text
     57           return 0x800;
     58         } else if (userFriendly.matches("(?i)(?:Chinese|Han|Asian|BMP)")) {
     59           // Mostly 3-byte UTF-8 sequences - "Asian" text
     60           return Character.MIN_SUPPLEMENTARY_CODE_POINT;
     61         } else if (userFriendly.matches("(?i)(?:Cuneiform|rare|exotic|supplementary.*)")) {
     62           // Mostly 4-byte UTF-8 sequences - "rare exotic" text
     63           return Character.MAX_CODE_POINT;
     64         } else {
     65           throw new IllegalArgumentException("Can't decode codepoint " + userFriendly);
     66         }
     67       }
     68     }
     69 
     70     public static MaxCodePoint valueOf(String userFriendly) {
     71       return new MaxCodePoint(userFriendly);
     72     }
     73 
     74     public MaxCodePoint(String userFriendly) {
     75       value = decode(userFriendly);
     76     }
     77   }
     78 
     79   /**
     80    * The default values of maxCodePoint below provide pretty good
     81    * performance models of different kinds of common human text.
     82    * @see MaxCodePoint#decode
     83    */
     84   @Param({"0x80", "0x90", "0x100", "0x800", "0x10000", "0x10ffff"}) MaxCodePoint maxCodePoint;
     85 
     86   @Param({"100"}) int stringCount;
     87   @Param({"16384"}) int charCount;
     88   private CharSequence[] seqs;  // actually, all StringBuilders
     89   private String[] strings;
     90   private byte[][] byteArrays;
     91 
     92   /**
     93    * Compute arrays of valid unicode text, and store it in 3 forms:
     94    * byte arrays, Strings, and StringBuilders (in a CharSequence[] to
     95    * make it a little harder for the JVM).
     96    */
     97   @BeforeExperiment void setUp() {
     98     final long seed = 99;
     99     final Random rnd = new Random(seed);
    100     seqs = new CharSequence[stringCount];
    101     strings = new String[stringCount];
    102     byteArrays = new byte[stringCount][];
    103     for (int i = 0; i < stringCount; i++) {
    104       StringBuilder sb = new StringBuilder();
    105       for (int j = 0; j < charCount; j++) {
    106         int codePoint;
    107         // discard illegal surrogate "codepoints"
    108         do {
    109           codePoint = rnd.nextInt(maxCodePoint.value);
    110         } while (isSurrogate(codePoint));
    111         sb.appendCodePoint(codePoint);
    112       }
    113       seqs[i] = sb;
    114       strings[i] = sb.toString();
    115       byteArrays[i] = strings[i].getBytes(Charsets.UTF_8);
    116     }
    117   }
    118 
    119   /**
    120    * Benchmarks {@link Utf8#isWellFormed} on valid byte arrays
    121    * containing pseudo-randomly-generated codePoints less than {@code
    122    * maxCodePoint}.  A constant seed is used, so separate runs perform
    123    * identical computations.
    124    */
    125   @Benchmark void isWellFormed(int reps) {
    126     for (int i = 0; i < reps; i++) {
    127       for (byte[] byteArray : byteArrays) {
    128         if (!Utf8.isWellFormed(byteArray)) {
    129           throw new Error("unexpected invalid UTF-8");
    130         }
    131       }
    132     }
    133   }
    134 
    135   /**
    136    * Benchmarks {@link Utf8#length} on valid strings containing
    137    * pseudo-randomly-generated codePoints less than {@code
    138    * maxCodePoint}.  A constant seed is used, so separate runs perform
    139    * identical computations.
    140    */
    141   @Benchmark void lengthOfString(int reps) {
    142     for (int i = 0; i < reps; i++) {
    143       for (String string : strings) {
    144         if (Utf8.encodedLength(string) == 1237482374) {
    145           throw new Error("Unlikely! We're just defeating the optimizer!");
    146         }
    147       }
    148     }
    149   }
    150 
    151   /**
    152    * Benchmarks {@link Utf8#length} on valid StringBuilders containing
    153    * pseudo-randomly-generated codePoints less than {@code
    154    * maxCodePoint}.  A constant seed is used, so separate runs perform
    155    * identical computations.
    156    */
    157   @Benchmark void lengthOfStringBuilder(int reps) {
    158     for (int i = 0; i < reps; i++) {
    159       for (CharSequence seq : seqs) {
    160         if (Utf8.encodedLength(seq) == 1237482374) {
    161           throw new Error("Unlikely! We're just defeating the optimizer!");
    162         }
    163       }
    164     }
    165   }
    166 
    167   /** Character.isSurrogate was added in Java SE 7. */
    168   private boolean isSurrogate(int c) {
    169     return (Character.MIN_HIGH_SURROGATE <= c &&
    170             c <= Character.MAX_LOW_SURROGATE);
    171   }
    172 }
    173