// File: benchmarks/atomic_benchmark.cpp
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 // Our goal is to measure the cost of various C++ atomic operations.
     18 // Android doesn't really control those. But since some of these operations can be quite
     19 // expensive, this may be useful input for development of higher level code.
     20 // Expected mappings from C++ atomics to hardware primitives can be found at
     21 // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html .
     22 
     23 #include <atomic>
     24 #include <mutex>
     25 
     26 #include <benchmark/benchmark.h>
     27 #include "util.h"
     28 
     29 // We time atomic operations separated by a volatile (not atomic!) increment.  This ensures
     30 // that the compiler emits memory instructions (e.g. load or store) prior to any fence or the
     31 // like.  That in turn ensures that the CPU has outstanding memory operations when the fence
     32 // is executed.
     33 
     34 // In most respects, we compute best case values. Since there is only one thread, there are no
     35 // coherence misses.
     36 
     37 // We assume that the compiler is not smart enough to optimize away fences in a single-threaded
     38 // program. If that changes, we'll need to add a second thread.
     39 
// Incremented between the atomic operations under test.  Being volatile, the
// compiler must emit a real memory access for it, guaranteeing outstanding
// memory operations around each atomic op / fence (see comment above).
volatile unsigned counter;

// The shared atomic location every benchmark loads from / stores to.
std::atomic<int> test_loc(0);

// Accumulated load results are published here so the compiler cannot treat
// the benchmarked loads as dead code.
volatile unsigned sink;

// Used only by the critical-section comparison benchmark (BM_atomic_fetch_add_cs).
std::mutex mtx;
     47 
     48 void BM_atomic_empty(benchmark::State& state) {
     49   while (state.KeepRunning()) {
     50     ++counter;
     51   }
     52 }
     53 BIONIC_BENCHMARK(BM_atomic_empty);
     54 
     55 static void BM_atomic_load_relaxed(benchmark::State& state) {
     56   unsigned result = 0;
     57   while (state.KeepRunning()) {
     58     result += test_loc.load(std::memory_order_relaxed);
     59     ++counter;
     60   }
     61   sink = result;
     62 }
     63 BIONIC_BENCHMARK(BM_atomic_load_relaxed);
     64 
     65 static void BM_atomic_load_acquire(benchmark::State& state) {
     66   unsigned result = 0;
     67   while (state.KeepRunning()) {
     68     result += test_loc.load(std::memory_order_acquire);
     69     ++counter;
     70   }
     71   sink = result;
     72 }
     73 BIONIC_BENCHMARK(BM_atomic_load_acquire);
     74 
     75 static void BM_atomic_store_release(benchmark::State& state) {
     76   int i = counter;
     77   while (state.KeepRunning()) {
     78     test_loc.store(++i, std::memory_order_release);
     79     ++counter;
     80   }
     81 }
     82 BIONIC_BENCHMARK(BM_atomic_store_release);
     83 
     84 static void BM_atomic_store_seq_cst(benchmark::State& state) {
     85   int i = counter;
     86   while (state.KeepRunning()) {
     87     test_loc.store(++i, std::memory_order_seq_cst);
     88     ++counter;
     89   }
     90 }
     91 BIONIC_BENCHMARK(BM_atomic_store_seq_cst);
     92 
     93 static void BM_atomic_fetch_add_relaxed(benchmark::State& state) {
     94   unsigned result = 0;
     95   while (state.KeepRunning()) {
     96     result += test_loc.fetch_add(1, std::memory_order_relaxed);
     97     ++counter;
     98   }
     99   sink = result;
    100 }
    101 BIONIC_BENCHMARK(BM_atomic_fetch_add_relaxed);
    102 
    103 static void BM_atomic_fetch_add_seq_cst(benchmark::State& state) {
    104   unsigned result = 0;
    105   while (state.KeepRunning()) {
    106     result += test_loc.fetch_add(1, std::memory_order_seq_cst);
    107     ++counter;
    108   }
    109   sink = result;
    110 }
    111 BIONIC_BENCHMARK(BM_atomic_fetch_add_seq_cst);
    112 
    113 // The fence benchmarks include a relaxed load to make it much harder to optimize away
    114 // the fence.
    115 
    116 static void BM_atomic_acquire_fence(benchmark::State& state) {
    117   unsigned result = 0;
    118   while (state.KeepRunning()) {
    119     result += test_loc.load(std::memory_order_relaxed);
    120     std::atomic_thread_fence(std::memory_order_acquire);
    121     ++counter;
    122   }
    123   sink = result;
    124 }
    125 BIONIC_BENCHMARK(BM_atomic_acquire_fence);
    126 
    127 static void BM_atomic_seq_cst_fence(benchmark::State& state) {
    128   unsigned result = 0;
    129   while (state.KeepRunning()) {
    130     result += test_loc.load(std::memory_order_relaxed);
    131     std::atomic_thread_fence(std::memory_order_seq_cst);
    132     ++counter;
    133   }
    134   sink = result;
    135 }
    136 BIONIC_BENCHMARK(BM_atomic_seq_cst_fence);
    137 
    138 // For comparison, also throw in a critical section version:
    139 
    140 static void BM_atomic_fetch_add_cs(benchmark::State& state) {
    141   unsigned result = 0;
    142   while (state.KeepRunning()) {
    143     {
    144       std::lock_guard<std::mutex> _(mtx);
    145       result += ++counter;
    146     }
    147   }
    148   sink = result;
    149 }
    150 BIONIC_BENCHMARK(BM_atomic_fetch_add_cs);
    151