Home | History | Annotate | Download | only in examples
      1 /*
      2  * Copyright (C) 2011 Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package examples;
     18 
     19 import com.google.caliper.BeforeExperiment;
     20 import com.google.caliper.Benchmark;
     21 import com.google.caliper.Param;
     22 
     23 import java.nio.charset.Charset;
     24 import java.util.Random;
     25 
     26 /**
     27  * Benchmark for operations with the UTF-8 charset.
     28  */
     29 public class Utf8Benchmark {
     30 
     31   static final Charset UTF_8 = Charset.forName("UTF-8");
     32 
     33   /**
     34    * The maximum code point used in generated text.  Different values
     35    * provide reasonable models of different real-world human text.
     36    */
     37   static class MaxCodePoint {
     38     final int value;
     39 
     40     /**
     41      * Convert the input string to a code point.  Accepts regular
     42      * decimal numerals, hex strings, and some symbolic names
     43      * meaningful to humans.
     44      */
     45     private static int decode(String userFriendly) {
     46       try {
     47         return Integer.decode(userFriendly);
     48       } catch (NumberFormatException ignored) {
     49         if (userFriendly.matches("(?i)(?:American|English|ASCII)")) {
     50           // 1-byte UTF-8 sequences - "American" ASCII text
     51           return 0x80;
     52         } else if (userFriendly.matches("(?i)(?:French|Latin|Western.*European)")) {
     53           // Mostly 1-byte UTF-8 sequences, mixed with occasional 2-byte
     54           // sequences - "Western European" text
     55           return 0x90;
     56         } else if (userFriendly.matches("(?i)(?:Branch.*Prediction.*Hostile)")) {
     57           // Defeat branch predictor for: c < 0x80 ; branch taken 50% of the time.
     58           return 0x100;
     59         } else if (userFriendly.matches("(?i)(?:Greek|Cyrillic|European|ISO.?8859)")) {
     60           // Mostly 2-byte UTF-8 sequences - "European" text
     61           return 0x800;
     62         } else if (userFriendly.matches("(?i)(?:Chinese|Han|Asian|BMP)")) {
     63           // Mostly 3-byte UTF-8 sequences - "Asian" text
     64           return Character.MIN_SUPPLEMENTARY_CODE_POINT;
     65         } else if (userFriendly.matches("(?i)(?:Cuneiform|rare|exotic|supplementary.*)")) {
     66           // Mostly 4-byte UTF-8 sequences - "rare exotic" text
     67           return Character.MAX_CODE_POINT;
     68         } else {
     69           throw new IllegalArgumentException("Can't decode codepoint " + userFriendly);
     70         }
     71       }
     72     }
     73 
     74     public static MaxCodePoint valueOf(String userFriendly) {
     75       return new MaxCodePoint(userFriendly);
     76     }
     77 
     78     private MaxCodePoint(String userFriendly) {
     79       value = decode(userFriendly);
     80     }
     81   }
     82 
     83   /**
     84    * The default values of maxCodePoint below provide pretty good
     85    * performance models of different kinds of common human text.
     86    * @see MaxCodePoint#decode
     87    */
     88   @Param({"0x80", "0x100", "0x800", "0x10000", "0x10ffff"}) MaxCodePoint maxCodePoint;
     89 
     90   static final int STRING_COUNT = 1 << 7;
     91 
     92   @Param({"65536"}) int charCount;
     93   private String[] strings;
     94 
     95   /**
     96    * Computes arrays of valid unicode Strings.
     97    */
     98   @BeforeExperiment void setUp() {
     99     final long seed = 99;
    100     final Random rnd = new Random(seed);
    101     strings = new String[STRING_COUNT];
    102     for (int i = 0; i < STRING_COUNT; i++) {
    103       StringBuilder sb = new StringBuilder();
    104       for (int j = 0; j < charCount; j++) {
    105         int codePoint;
    106         // discard illegal surrogate "codepoints"
    107         do {
    108           codePoint = rnd.nextInt(maxCodePoint.value);
    109         } while (isSurrogate(codePoint));
    110         sb.appendCodePoint(codePoint);
    111       }
    112       strings[i] = sb.toString();
    113     }
    114     // The reps will continue until the non-determinism detector is pacified!
    115     getBytes(100);
    116   }
    117 
    118   /**
    119    * Benchmarks {@link String#getBytes} on valid strings containing
    120    * pseudo-randomly-generated codePoints less than {@code
    121    * maxCodePoint}.  A constant seed is used, so separate runs perform
    122    * identical computations.
    123    */
    124   @Benchmark void getBytes(int reps) {
    125     final String[] strings = this.strings;
    126     final int mask = STRING_COUNT - 1;
    127     for (int i = 0; i < reps; i++) {
    128       String string = strings[i & mask];
    129       byte[] bytes = string.getBytes(UTF_8);
    130       if (bytes[0] == 86 && bytes[bytes.length - 1] == 99) {
    131         throw new Error("Unlikely! We're just defeating the optimizer!");
    132       }
    133     }
    134   }
    135 
    136   /** Character.isSurrogate was added in Java SE 7. */
    137   private boolean isSurrogate(int c) {
    138     return (Character.MIN_HIGH_SURROGATE <= c &&
    139             c <= Character.MAX_LOW_SURROGATE);
    140   }
    141 }
    142