/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Our goal is to measure the cost of various C++ atomic operations.
// Android doesn't really control those. But since some of these operations can be quite
// expensive, this may be useful input for development of higher-level code.
// Expected mappings from C++ atomics to hardware primitives can be found at
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html .

#include <benchmark/benchmark.h>
#include <atomic>
#include <mutex>

// We time atomic operations separated by a volatile (not atomic!) increment.  This ensures
// that the compiler emits memory instructions (e.g. load or store) prior to any fence or the
// like.  That in turn ensures that the CPU has outstanding memory operations when the fence
// is executed.

// In most respects, we compute best-case values. Since there is only one thread, there are no
// coherence misses.

// We assume that the compiler is not smart enough to optimize away fences in a single-threaded
// program. If that changes, we'll need to add a second thread.

volatile unsigned counter;

std::atomic<int> test_loc(0);

volatile unsigned sink;

std::mutex mtx;

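// Baseline: just the volatile counter increment, with no atomic operation.
// The remaining results are best interpreted relative to this one.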
void BM_empty(benchmark::State& state) {
  while (state.KeepRunning()) {
    ++counter;
  }
}
BENCHMARK(BM_empty);

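// A relaxed load is expected to compile to a plain load instruction on
// common architectures (e.g. ldr on ARM, mov on x86).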
static void BM_load_relaxed(benchmark::State& state) {
  unsigned result = 0;
  while (state.KeepRunning()) {
    result += test_loc.load(std::memory_order_relaxed);
    ++counter;
  }
  sink = result;
}
BENCHMARK(BM_load_relaxed);

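// An acquire load is expected to map to ldar on ARMv8, a load followed by
// dmb on ARMv7, and a plain load on x86.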
static void BM_load_acquire(benchmark::State& state) {
  unsigned result = 0;
  while (state.KeepRunning()) {
    result += test_loc.load(std::memory_order_acquire);
    ++counter;
  }
  sink = result;
}
BENCHMARK(BM_load_acquire);

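// A release store is expected to map to stlr on ARMv8, dmb followed by a
// plain store on ARMv7, and a plain store on x86.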
static void BM_store_release(benchmark::State& state) {
  int i = counter;
  while (state.KeepRunning()) {
    test_loc.store(++i, std::memory_order_release);
    ++counter;
  }
}
BENCHMARK(BM_store_release);

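// A sequentially consistent store must additionally be ordered with respect
// to later loads; expected to map to stlr on ARMv8, dmb; str; dmb on ARMv7,
// and xchg (or mov + mfence) on x86.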
static void BM_store_seq_cst(benchmark::State& state) {
  int i = counter;
  while (state.KeepRunning()) {
    test_loc.store(++i, std::memory_order_seq_cst);
    ++counter;
  }
}
BENCHMARK(BM_store_seq_cst);

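// A relaxed read-modify-write is expected to map to an ldrex/strex
// (or ldxr/stxr) loop on ARM and to lock xadd on x86.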
static void BM_fetch_add_relaxed(benchmark::State& state) {
  unsigned result = 0;
  while (state.KeepRunning()) {
    result += test_loc.fetch_add(1, std::memory_order_relaxed);
    ++counter;
  }
  sink = result;
}
BENCHMARK(BM_fetch_add_relaxed);

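// The sequentially consistent version of the above; expected to use
// acquire/release exclusives (ldaxr/stlxr) on ARMv8, while lock xadd
// already suffices on x86.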
static void BM_fetch_add_seq_cst(benchmark::State& state) {
  unsigned result = 0;
  while (state.KeepRunning()) {
    result += test_loc.fetch_add(1, std::memory_order_seq_cst);
    ++counter;
  }
  sink = result;
}
BENCHMARK(BM_fetch_add_seq_cst);

// The fence benchmarks include a relaxed load to make it much harder to optimize away
// the fence.

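// An acquire fence is expected to map to dmb ishld on ARMv8, dmb on ARMv7,
// and only a compiler barrier on x86.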
static void BM_acquire_fence(benchmark::State& state) {
  unsigned result = 0;
  while (state.KeepRunning()) {
    result += test_loc.load(std::memory_order_relaxed);
    std::atomic_thread_fence(std::memory_order_acquire);
    ++counter;
  }
  sink = result;
}
BENCHMARK(BM_acquire_fence);

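// A sequentially consistent fence is expected to map to dmb ish on ARM and
// mfence on x86.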
static void BM_seq_cst_fence(benchmark::State& state) {
  unsigned result = 0;
  while (state.KeepRunning()) {
    result += test_loc.load(std::memory_order_relaxed);
    std::atomic_thread_fence(std::memory_order_seq_cst);
    ++counter;
  }
  sink = result;
}
BENCHMARK(BM_seq_cst_fence);

// For comparison, also throw in a critical section version:

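// An uncontended lock/unlock pair around the increment; the cost is
// dominated by the atomic operations inside std::mutex plus the call
// overhead.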
static void BM_fetch_add_cs(benchmark::State& state) {
  unsigned result = 0;
  while (state.KeepRunning()) {
    {
      std::lock_guard<std::mutex> _(mtx);
      result += ++counter;
    }
  }
  sink = result;
}
BENCHMARK(BM_fetch_add_cs);