Home | History | Annotate | Download | only in hash
      1 /*
      2  * Copyright (C) 2011 The Guava Authors
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
      5  * in compliance with the License. You may obtain a copy of the License at
      6  *
      7  * http://www.apache.org/licenses/LICENSE-2.0
      8  *
      9  * Unless required by applicable law or agreed to in writing, software distributed under the License
     10  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
     11  * or implied. See the License for the specific language governing permissions and limitations under
     12  * the License.
     13  */
     14 
     15 package com.google.common.hash;
     16 
     17 import static com.google.common.base.Preconditions.checkArgument;
     18 
     19 import com.google.common.math.LongMath;
     20 import com.google.common.primitives.Ints;
     21 import com.google.common.primitives.Longs;
     22 
     23 import java.math.RoundingMode;
     24 import java.util.Arrays;
     25 
     26 /**
     27  * Collections of strategies of generating the k * log(M) bits required for an element to
     28  * be mapped to a BloomFilter of M bits and k hash functions. These
     29  * strategies are part of the serialized form of the Bloom filters that use them, thus they must be
     30  * preserved as is (no updates allowed, only introduction of new versions).
     31  *
     32  * Important: the order of the constants cannot change, and they cannot be deleted - we depend
     33  * on their ordinal for BloomFilter serialization.
     34  *
     35  * @author Dimitris Andreou
     36  * @author Kurt Alfred Kluever
     37  */
     38 enum BloomFilterStrategies implements BloomFilter.Strategy {
     39   /**
     40    * See "Less Hashing, Same Performance: Building a Better Bloom Filter" by Adam Kirsch and
     41    * Michael Mitzenmacher. The paper argues that this trick doesn't significantly deteriorate the
     42    * performance of a Bloom filter (yet only needs two 32bit hash functions).
     43    */
     44   MURMUR128_MITZ_32() {
     45     @Override public <T> boolean put(T object, Funnel<? super T> funnel,
     46         int numHashFunctions, BitArray bits) {
     47       long bitSize = bits.bitSize();
     48       long hash64 = Hashing.murmur3_128().hashObject(object, funnel).asLong();
     49       int hash1 = (int) hash64;
     50       int hash2 = (int) (hash64 >>> 32);
     51 
     52       boolean bitsChanged = false;
     53       for (int i = 1; i <= numHashFunctions; i++) {
     54         int combinedHash = hash1 + (i * hash2);
     55         // Flip all the bits if it's negative (guaranteed positive number)
     56         if (combinedHash < 0) {
     57           combinedHash = ~combinedHash;
     58         }
     59         bitsChanged |= bits.set(combinedHash % bitSize);
     60       }
     61       return bitsChanged;
     62     }
     63 
     64     @Override public <T> boolean mightContain(T object, Funnel<? super T> funnel,
     65         int numHashFunctions, BitArray bits) {
     66       long bitSize = bits.bitSize();
     67       long hash64 = Hashing.murmur3_128().hashObject(object, funnel).asLong();
     68       int hash1 = (int) hash64;
     69       int hash2 = (int) (hash64 >>> 32);
     70 
     71       for (int i = 1; i <= numHashFunctions; i++) {
     72         int combinedHash = hash1 + (i * hash2);
     73         // Flip all the bits if it's negative (guaranteed positive number)
     74         if (combinedHash < 0) {
     75           combinedHash = ~combinedHash;
     76         }
     77         if (!bits.get(combinedHash % bitSize)) {
     78           return false;
     79         }
     80       }
     81       return true;
     82     }
     83   },
     84   /**
     85    * This strategy uses all 128 bits of {@link Hashing#murmur3_128} when hashing. It looks
     86    * different than the implementation in MURMUR128_MITZ_32 because we're avoiding the
     87    * multiplication in the loop and doing a (much simpler) += hash2. We're also changing the
     88    * index to a positive number by AND'ing with Long.MAX_VALUE instead of flipping the bits.
     89    */
     90   MURMUR128_MITZ_64() {
     91     @Override
     92     public <T> boolean put(T object, Funnel<? super T> funnel,
     93         int numHashFunctions, BitArray bits) {
     94       long bitSize = bits.bitSize();
     95       byte[] bytes = Hashing.murmur3_128().hashObject(object, funnel).getBytesInternal();
     96       long hash1 = lowerEight(bytes);
     97       long hash2 = upperEight(bytes);
     98 
     99       boolean bitsChanged = false;
    100       long combinedHash = hash1;
    101       for (int i = 0; i < numHashFunctions; i++) {
    102         // Make the combined hash positive and indexable
    103         bitsChanged |= bits.set((combinedHash & Long.MAX_VALUE) % bitSize);
    104         combinedHash += hash2;
    105       }
    106       return bitsChanged;
    107     }
    108 
    109     @Override
    110     public <T> boolean mightContain(T object, Funnel<? super T> funnel,
    111         int numHashFunctions, BitArray bits) {
    112       long bitSize = bits.bitSize();
    113       byte[] bytes = Hashing.murmur3_128().hashObject(object, funnel).getBytesInternal();
    114       long hash1 = lowerEight(bytes);
    115       long hash2 = upperEight(bytes);
    116 
    117       long combinedHash = hash1;
    118       for (int i = 0; i < numHashFunctions; i++) {
    119         // Make the combined hash positive and indexable
    120         if (!bits.get((combinedHash & Long.MAX_VALUE) % bitSize)) {
    121           return false;
    122         }
    123         combinedHash += hash2;
    124       }
    125       return true;
    126     }
    127 
    128     private /* static */ long lowerEight(byte[] bytes) {
    129       return Longs.fromBytes(
    130           bytes[7], bytes[6], bytes[5], bytes[4], bytes[3], bytes[2], bytes[1], bytes[0]);
    131     }
    132 
    133     private /* static */ long upperEight(byte[] bytes) {
    134       return Longs.fromBytes(
    135           bytes[15], bytes[14], bytes[13], bytes[12], bytes[11], bytes[10], bytes[9], bytes[8]);
    136     }
    137   };
    138 
    139   // Note: We use this instead of java.util.BitSet because we need access to the long[] data field
    140   static final class BitArray {
    141     final long[] data;
    142     long bitCount;
    143 
    144     BitArray(long bits) {
    145       this(new long[Ints.checkedCast(LongMath.divide(bits, 64, RoundingMode.CEILING))]);
    146     }
    147 
    148     // Used by serialization
    149     BitArray(long[] data) {
    150       checkArgument(data.length > 0, "data length is zero!");
    151       this.data = data;
    152       long bitCount = 0;
    153       for (long value : data) {
    154         bitCount += Long.bitCount(value);
    155       }
    156       this.bitCount = bitCount;
    157     }
    158 
    159     /** Returns true if the bit changed value. */
    160     boolean set(long index) {
    161       if (!get(index)) {
    162         data[(int) (index >>> 6)] |= (1L << index);
    163         bitCount++;
    164         return true;
    165       }
    166       return false;
    167     }
    168 
    169     boolean get(long index) {
    170       return (data[(int) (index >>> 6)] & (1L << index)) != 0;
    171     }
    172 
    173     /** Number of bits */
    174     long bitSize() {
    175       return (long) data.length * Long.SIZE;
    176     }
    177 
    178     /** Number of set bits (1s) */
    179     long bitCount() {
    180       return bitCount;
    181     }
    182 
    183     BitArray copy() {
    184       return new BitArray(data.clone());
    185     }
    186 
    187     /** Combines the two BitArrays using bitwise OR. */
    188     void putAll(BitArray array) {
    189       checkArgument(data.length == array.data.length,
    190           "BitArrays must be of equal length (%s != %s)", data.length, array.data.length);
    191       bitCount = 0;
    192       for (int i = 0; i < data.length; i++) {
    193         data[i] |= array.data[i];
    194         bitCount += Long.bitCount(data[i]);
    195       }
    196     }
    197 
    198     @Override public boolean equals(Object o) {
    199       if (o instanceof BitArray) {
    200         BitArray bitArray = (BitArray) o;
    201         return Arrays.equals(data, bitArray.data);
    202       }
    203       return false;
    204     }
    205 
    206     @Override public int hashCode() {
    207       return Arrays.hashCode(data);
    208     }
    209   }
    210 }
    211