// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// This file is an internal atomic implementation, use base/atomicops.h instead.
//
// LinuxKernelCmpxchg and Barrier_AtomicIncrement are from Google Gears.

#ifndef BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_
#define BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_

namespace base {
namespace subtle {

// Memory barriers on ARM are funky, but the kernel is here to help:
//
// * ARMv5 did not support SMP, so there is no memory barrier instruction at
//   all on this architecture, or when targeting its machine code.
//
// * Some ARMv6 CPUs support SMP. A full memory barrier can be produced by
//   writing to a very specific coprocessor register.
//
// * On ARMv7, the "dmb" instruction is used to perform a full memory
//   barrier (though writing to the co-processor will still work).
//   However, on single-core devices (e.g. Nexus One or Nexus S),
//   this instruction will take up to 200 ns, which is huge, even though
//   it is completely unneeded on these devices.
//
// * There is no easy way to determine at runtime if the device is
//   single-core or multi-core. However, the kernel provides a useful helper
//   function at a fixed memory address (0xffff0fa0), which will always
//   perform a memory barrier in the most efficient way. I.e. on single-core
//   devices, this is an empty function that returns immediately.
//   On multi-core devices, it implements a full memory barrier.
//
// * This source could be compiled to ARMv5 machine code that runs on a
//   multi-core ARMv6 or ARMv7 device. In this case, memory barriers
//   are needed for correct execution. Always call the kernel helper, even
//   when targeting ARMv5TE.
//

inline void MemoryBarrier() {
  // Note: This is a function call, which is also an implicit compiler
  // barrier.
  typedef void (*KernelMemoryBarrierFunc)();
  ((KernelMemoryBarrierFunc)0xffff0fa0)();
}

// An ARM toolchain would only define one of these depending on which
// variant of the target architecture is being used. This tests against
// any known ARMv6 or ARMv7 variant, where it is possible to directly
// use ldrex/strex instructions to implement fast atomic operations.
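//
// All of the ldrex/strex-based functions below follow the same retry shape.
// As a rough sketch (pseudocode only; LDREX(), STREX() and Modify() are
// illustrative names, not helpers defined in this file):
//
//   int reloop;
//   do {
//     Atomic32 observed = LDREX(ptr);   // load and open the exclusive monitor
//     Atomic32 desired = Modify(observed);
//     reloop = STREX(ptr, desired);     // 0 only if the monitor was still held
//   } while (reloop != 0);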
#if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
    defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || \
    defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
    defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || \
    defined(__ARM_ARCH_6KZ__) || defined(__ARM_ARCH_6T2__)

inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
                                         Atomic32 old_value,
                                         Atomic32 new_value) {
  Atomic32 prev_value;
  int reloop;
  do {
    // The following is equivalent to:
    //
    //   prev_value = LDREX(ptr)
    //   reloop = 0
    //   if (prev_value == old_value)
    //     reloop = STREX(ptr, new_value)
    __asm__ __volatile__("    ldrex %0, [%3]\n"
                         "    mov %1, #0\n"
                         "    cmp %0, %4\n"
#ifdef __thumb2__
                         "    it eq\n"
#endif
                         "    strexeq %1, %5, [%3]\n"
                         : "=&r"(prev_value), "=&r"(reloop), "+m"(*ptr)
                         : "r"(ptr), "r"(old_value), "r"(new_value)
                         : "cc", "memory");
  } while (reloop != 0);
  return prev_value;
}

inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  Atomic32 result = NoBarrier_CompareAndSwap(ptr, old_value, new_value);
  MemoryBarrier();
  return result;
}

inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  MemoryBarrier();
  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
}

inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,
                                          Atomic32 increment) {
  Atomic32 value;
  int reloop;
  do {
    // Equivalent to:
    //
    //   value = LDREX(ptr)
    //   value += increment
    //   reloop = STREX(ptr, value)
    //
    __asm__ __volatile__("    ldrex %0, [%3]\n"
                         "    add %0, %0, %4\n"
                         "    strex %1, %0, [%3]\n"
                         : "=&r"(value), "=&r"(reloop), "+m"(*ptr)
                         : "r"(ptr), "r"(increment)
                         : "cc", "memory");
  } while (reloop);
  return value;
}

inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
                                        Atomic32 increment) {
  // TODO(digit): Investigate if it's possible to implement this with
  // a single MemoryBarrier() operation between the LDREX and STREX.
  // See http://crbug.com/246514
  MemoryBarrier();
  Atomic32 result = NoBarrier_AtomicIncrement(ptr, increment);
  MemoryBarrier();
  return result;
}

inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
                                         Atomic32 new_value) {
  Atomic32 old_value;
  int reloop;
  do {
    //   old_value = LDREX(ptr)
    //   reloop = STREX(ptr, new_value)
    __asm__ __volatile__("    ldrex %0, [%3]\n"
                         "    strex %1, %4, [%3]\n"
                         : "=&r"(old_value), "=&r"(reloop), "+m"(*ptr)
                         : "r"(ptr), "r"(new_value)
                         : "cc", "memory");
  } while (reloop != 0);
  return old_value;
}

// This tests against any known ARMv5 variant.
#elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \
      defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)

// The kernel also provides a helper function to perform an atomic
// compare-and-swap operation at the hard-wired address 0xffff0fc0.
// On ARMv5, this is implemented by a special code path that the kernel
// detects and handles specially when thread pre-emption happens.
// On ARMv6 and higher, it uses LDREX/STREX instructions instead.
//
// Note that this always performs a full memory barrier, so there is no
// need to add calls to MemoryBarrier() before or after it. It also
// returns 0 on success, and non-zero on failure.
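//
// Stated as a prototype (a sketch of the documented kernel contract; this
// declaration is not actually used by this file, which calls through a
// function-pointer cast instead):
//
//   // Returns 0 if |*ptr| was equal to |old_value| and has been set to
//   // |new_value|; returns non-zero otherwise. Implies a full barrier.
//   int __kernel_cmpxchg(Atomic32 old_value, Atomic32 new_value,
//                        volatile Atomic32* ptr);   // lives at 0xffff0fc0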
//
// Available and reliable since Linux 2.6.24. Both Android and ChromeOS
// use newer kernel revisions, so this should not be a concern.
namespace {

inline int LinuxKernelCmpxchg(Atomic32 old_value,
                              Atomic32 new_value,
                              volatile Atomic32* ptr) {
  typedef int (*KernelCmpxchgFunc)(Atomic32, Atomic32, volatile Atomic32*);
  return ((KernelCmpxchgFunc)0xffff0fc0)(old_value, new_value, ptr);
}

}  // namespace

inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
                                         Atomic32 old_value,
                                         Atomic32 new_value) {
  Atomic32 prev_value;
  for (;;) {
    prev_value = *ptr;
    if (prev_value != old_value)
      return prev_value;
    if (!LinuxKernelCmpxchg(old_value, new_value, ptr))
      return old_value;
  }
}

inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
                                         Atomic32 new_value) {
  Atomic32 old_value;
  do {
    old_value = *ptr;
  } while (LinuxKernelCmpxchg(old_value, new_value, ptr));
  return old_value;
}

inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,
                                          Atomic32 increment) {
  return Barrier_AtomicIncrement(ptr, increment);
}

inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
                                        Atomic32 increment) {
  for (;;) {
    // Atomically exchange the old value with an incremented one.
    Atomic32 old_value = *ptr;
    Atomic32 new_value = old_value + increment;
    if (!LinuxKernelCmpxchg(old_value, new_value, ptr)) {
      // The exchange took place as expected.
      return new_value;
    }
    // Otherwise, *ptr changed mid-loop and we need to retry.
  }
}

inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  Atomic32 prev_value;
  for (;;) {
    prev_value = *ptr;
    if (prev_value != old_value) {
      // Always ensure acquire semantics.
      MemoryBarrier();
      return prev_value;
    }
    if (!LinuxKernelCmpxchg(old_value, new_value, ptr))
      return old_value;
  }
}

inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  // This could be implemented as:
  //
  //   MemoryBarrier();
  //   return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
  //
  // But that would use three barriers per successful CAS. For better
  // performance, use Acquire_CompareAndSwap(), whose implementation
  // guarantees that:
  // - A successful swap uses only two barriers (in the kernel helper).
  // - An early return due to (prev_value != old_value) performs
  //   a memory barrier with no store, which is equivalent to the
  //   generic implementation above.
  return Acquire_CompareAndSwap(ptr, old_value, new_value);
}

#else
# error "Your CPU's ARM architecture is not supported yet"
#endif

// NOTE: Atomicity of the following load and store operations is only
// guaranteed when |ptr| values are 32-bit aligned.
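//
// Illustrative usage sketch for the operations below (|payload|, |flag| and
// DoSomethingWith() are hypothetical names, not part of this header): a
// producer/consumer handoff pairs Release_Store() with Acquire_Load().
//
//   // Producer thread:
//   payload = 42;                   // plain store to the payload
//   Release_Store(&flag, 1);        // barrier, then publish the flag
//
//   // Consumer thread:
//   if (Acquire_Load(&flag) == 1)   // read the flag, then barrier
//     DoSomethingWith(payload);     // the payload store is now visible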

inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) {
  *ptr = value;
}

inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) {
  *ptr = value;
  MemoryBarrier();
}

inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) {
  MemoryBarrier();
  *ptr = value;
}

inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { return *ptr; }

inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) {
  Atomic32 value = *ptr;
  MemoryBarrier();
  return value;
}

inline Atomic32 Release_Load(volatile const Atomic32* ptr) {
  MemoryBarrier();
  return *ptr;
}

}  // namespace base::subtle
}  // namespace base

#endif  // BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_