; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix X64
; RUN: llc < %s -mtriple=i686-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix X32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=slow-incdec -verify-machineinstrs | FileCheck %s --check-prefix SLOW_INC

; This file checks that atomic (non-seq_cst) stores of immediate values are
; done in one mov instruction and not two. More precisely, it makes sure that
; the immediate is not first copied uselessly into a register.

; Similarly, it checks that a binary operation of an immediate with an atomic
; variable that is stored back in that variable is done as a single instruction.
; For example: x.store(42 + x.load(memory_order_acquire), memory_order_release)
; should be just an add instruction, instead of loading x into a register, doing
; an add and storing the result back.
; The binary operations currently supported are add, and, or, and xor.
; sub is not supported because it is translated into an addition of the
; negated immediate.
;
; We also check the same patterns:
; - For inc/dec.
; - For register instead of immediate operands.
; - For floating point operations.

; seq_cst stores are left as (lock) xchgl, but we try to check every other
; attribute at least once.

; Please note that these operations do not require the lock prefix: only
; sequentially consistent stores require this kind of protection on X86.
; And even for seq_cst operations, LLVM uses the xchg instruction, which has
; an implicit lock prefix, so making it explicit is not required.

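; As a rough illustration (not part of the test itself), the C++11 source
; pattern these tests model, and the single-instruction lowering they expect,
; might look like the following (the variable 'x' is hypothetical):
;
;   std::atomic<int> x;
;   x.store(2 + x.load(std::memory_order_acquire), std::memory_order_release);
;
; which should compile to something like 'addl $2, (%rdi)' rather than a
; separate load, register add, and store.
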
define void @store_atomic_imm_8(i8* %p) {
; X64-LABEL: store_atomic_imm_8:
; X64: movb
; X64-NOT: movb
; X32-LABEL: store_atomic_imm_8:
; X32: movb
; X32-NOT: movb
  store atomic i8 42, i8* %p release, align 1
  ret void
}

define void @store_atomic_imm_16(i16* %p) {
; X64-LABEL: store_atomic_imm_16:
; X64: movw
; X64-NOT: movw
; X32-LABEL: store_atomic_imm_16:
; X32: movw
; X32-NOT: movw
  store atomic i16 42, i16* %p monotonic, align 2
  ret void
}

define void @store_atomic_imm_32(i32* %p) {
; X64-LABEL: store_atomic_imm_32:
; X64: movl
; X64-NOT: movl
;   On 32-bit targets, there is an extra movl for each of these functions,
;   to load the pointer argument from the stack.
; X32-LABEL: store_atomic_imm_32:
; X32: movl 4(%esp), %eax
; X32: movl
; X32-NOT: movl
  store atomic i32 42, i32* %p release, align 4
  ret void
}

define void @store_atomic_imm_64(i64* %p) {
; X64-LABEL: store_atomic_imm_64:
; X64: movq
; X64-NOT: movq
;   These are implemented with a CAS loop on 32-bit architectures, and thus
;   cannot be optimized in the same way as the others (see the sketch below).
; X32-LABEL: store_atomic_imm_64:
; X32: cmpxchg8b
  store atomic i64 42, i64* %p release, align 8
  ret void
}

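; A rough sketch of that CAS-loop expansion on x86-32 (illustrative only; the
; exact register assignment is dictated by cmpxchg8b, which compares EDX:EAX
; against the 64-bit memory operand and, on success, stores ECX:EBX into it):
;
;     movl (%mem), %eax
;     movl 4(%mem), %edx      ; current value of the i64
;   .retry:
;     movl $42, %ebx
;     movl $0, %ecx           ; desired value
;     lock cmpxchg8b (%mem)   ; on failure, EDX:EAX is reloaded automatically
;     jne .retry
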
; If an immediate is too big to fit in 32 bits, it cannot be stored in one mov:
; even on X64, one must use movabsq, which can only target a register.
define void @store_atomic_imm_64_big(i64* %p) {
; X64-LABEL: store_atomic_imm_64_big:
; X64: movabsq
; X64: movq
  store atomic i64 100000000000, i64* %p monotonic, align 8
  ret void
}

; It would be incorrect to replace a lock xchgl with a movl.
define void @store_atomic_imm_32_seq_cst(i32* %p) {
; X64-LABEL: store_atomic_imm_32_seq_cst:
; X64: xchgl
; X32-LABEL: store_atomic_imm_32_seq_cst:
; X32: xchgl
  store atomic i32 42, i32* %p seq_cst, align 4
  ret void
}

; ----- ADD -----

define void @add_8i(i8* %p) {
; X64-LABEL: add_8i:
; X64-NOT: lock
; X64: addb
; X64-NOT: movb
; X32-LABEL: add_8i:
; X32-NOT: lock
; X32: addb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p seq_cst, align 1
  %2 = add i8 %1, 2
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @add_8r(i8* %p, i8 %v) {
; X64-LABEL: add_8r:
; X64-NOT: lock
; X64: addb
; X64-NOT: movb
; X32-LABEL: add_8r:
; X32-NOT: lock
; X32: addb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p seq_cst, align 1
  %2 = add i8 %1, %v
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @add_16i(i16* %p) {
;   Currently the transformation is not done on 16-bit accesses, as the backend
;   treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: add_16i:
; X64-NOT: addw
; X32-LABEL: add_16i:
; X32-NOT: addw
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = add i16 %1, 2
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @add_16r(i16* %p, i16 %v) {
;   Currently the transformation is not done on 16-bit accesses, as the backend
;   treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: add_16r:
; X64-NOT: addw
; X32-LABEL: add_16r:
; X32-NOT: addw {{.*}}, (
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = add i16 %1, %v
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @add_32i(i32* %p) {
; X64-LABEL: add_32i:
; X64-NOT: lock
; X64: addl
; X64-NOT: movl
; X32-LABEL: add_32i:
; X32-NOT: lock
; X32: addl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = add i32 %1, 2
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

define void @add_32r(i32* %p, i32 %v) {
; X64-LABEL: add_32r:
; X64-NOT: lock
; X64: addl
; X64-NOT: movl
; X32-LABEL: add_32r:
; X32-NOT: lock
; X32: addl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = add i32 %1, %v
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

; The following is a corner case where the load is added to itself. The pattern
; matching should not fold this. We only test with 32-bit add, but the same
; applies to other sizes and operations.
define void @add_32r_self(i32* %p) {
; X64-LABEL: add_32r_self:
; X64-NOT: lock
; X64: movl (%[[M:[a-z]+]]), %[[R:[a-z]+]]
; X64: addl %[[R]], %[[R]]
; X64: movl %[[R]], (%[[M]])
; X32-LABEL: add_32r_self:
; X32-NOT: lock
; X32: movl (%[[M:[a-z]+]]), %[[R:[a-z]+]]
; X32: addl %[[R]], %[[R]]
; X32: movl %[[R]], (%[[M]])
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = add i32 %1, %1
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

; The following is a corner case where the load's result is returned. The
; optimizer isn't allowed to duplicate the load because it's atomic.
define i32 @add_32r_ret_load(i32* %p, i32 %v) {
; X64-LABEL: add_32r_ret_load:
; X64-NOT: lock
; X64:      movl (%rdi), %eax
; X64-NEXT: addl %eax, %esi
; X64-NEXT: movl %esi, (%rdi)
; X64-NEXT: retq
; X32-LABEL: add_32r_ret_load:
; X32-NOT: lock
; X32:      movl 4(%esp), %[[P:[a-z]+]]
; X32-NEXT: movl (%[[P]]),
; X32-NOT: %[[P]]
; More code here; we just don't want it to load from P again.
; X32: movl %{{.*}}, (%[[P]])
; X32-NEXT: retl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = add i32 %1, %v
  store atomic i32 %2, i32* %p monotonic, align 4
  ret i32 %1
}

define void @add_64i(i64* %p) {
; X64-LABEL: add_64i:
; X64-NOT: lock
; X64: addq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'addq'.
; X32-LABEL: add_64i:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = add i64 %1, 2
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @add_64r(i64* %p, i64 %v) {
; X64-LABEL: add_64r:
; X64-NOT: lock
; X64: addq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'addq'.
; X32-LABEL: add_64r:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = add i64 %1, %v
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @add_32i_seq_cst(i32* %p) {
; X64-LABEL: add_32i_seq_cst:
; X64: xchgl
; X32-LABEL: add_32i_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = add i32 %1, 2
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

define void @add_32r_seq_cst(i32* %p, i32 %v) {
; X64-LABEL: add_32r_seq_cst:
; X64: xchgl
; X32-LABEL: add_32r_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = add i32 %1, %v
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- AND -----

define void @and_8i(i8* %p) {
; X64-LABEL: and_8i:
; X64-NOT: lock
; X64: andb
; X64-NOT: movb
; X32-LABEL: and_8i:
; X32-NOT: lock
; X32: andb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p monotonic, align 1
  %2 = and i8 %1, 2
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @and_8r(i8* %p, i8 %v) {
; X64-LABEL: and_8r:
; X64-NOT: lock
; X64: andb
; X64-NOT: movb
; X32-LABEL: and_8r:
; X32-NOT: lock
; X32: andb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p monotonic, align 1
  %2 = and i8 %1, %v
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @and_16i(i16* %p) {
;   Currently the transformation is not done on 16-bit accesses, as the backend
;   treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: and_16i:
; X64-NOT: andw
; X32-LABEL: and_16i:
; X32-NOT: andw
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = and i16 %1, 2
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @and_16r(i16* %p, i16 %v) {
;   Currently the transformation is not done on 16-bit accesses, as the backend
;   treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: and_16r:
; X64-NOT: andw
; X32-LABEL: and_16r:
; X32-NOT: andw {{.*}}, (
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = and i16 %1, %v
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @and_32i(i32* %p) {
; X64-LABEL: and_32i:
; X64-NOT: lock
; X64: andl
; X64-NOT: movl
; X32-LABEL: and_32i:
; X32-NOT: lock
; X32: andl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = and i32 %1, 2
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @and_32r(i32* %p, i32 %v) {
; X64-LABEL: and_32r:
; X64-NOT: lock
; X64: andl
; X64-NOT: movl
; X32-LABEL: and_32r:
; X32-NOT: lock
; X32: andl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = and i32 %1, %v
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @and_64i(i64* %p) {
; X64-LABEL: and_64i:
; X64-NOT: lock
; X64: andq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'andq'.
; X32-LABEL: and_64i:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = and i64 %1, 2
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @and_64r(i64* %p, i64 %v) {
; X64-LABEL: and_64r:
; X64-NOT: lock
; X64: andq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'andq'.
; X32-LABEL: and_64r:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = and i64 %1, %v
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @and_32i_seq_cst(i32* %p) {
; X64-LABEL: and_32i_seq_cst:
; X64: xchgl
; X32-LABEL: and_32i_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = and i32 %1, 2
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

define void @and_32r_seq_cst(i32* %p, i32 %v) {
; X64-LABEL: and_32r_seq_cst:
; X64: xchgl
; X32-LABEL: and_32r_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = and i32 %1, %v
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- OR -----

define void @or_8i(i8* %p) {
; X64-LABEL: or_8i:
; X64-NOT: lock
; X64: orb
; X64-NOT: movb
; X32-LABEL: or_8i:
; X32-NOT: lock
; X32: orb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p acquire, align 1
  %2 = or i8 %1, 2
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @or_8r(i8* %p, i8 %v) {
; X64-LABEL: or_8r:
; X64-NOT: lock
; X64: orb
; X64-NOT: movb
; X32-LABEL: or_8r:
; X32-NOT: lock
; X32: orb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p acquire, align 1
  %2 = or i8 %1, %v
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @or_16i(i16* %p) {
; X64-LABEL: or_16i:
; X64-NOT: orw
; X32-LABEL: or_16i:
; X32-NOT: orw
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = or i16 %1, 2
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @or_16r(i16* %p, i16 %v) {
; X64-LABEL: or_16r:
; X64-NOT: orw
; X32-LABEL: or_16r:
; X32-NOT: orw {{.*}}, (
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = or i16 %1, %v
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @or_32i(i32* %p) {
; X64-LABEL: or_32i:
; X64-NOT: lock
; X64: orl
; X64-NOT: movl
; X32-LABEL: or_32i:
; X32-NOT: lock
; X32: orl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = or i32 %1, 2
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @or_32r(i32* %p, i32 %v) {
; X64-LABEL: or_32r:
; X64-NOT: lock
; X64: orl
; X64-NOT: movl
; X32-LABEL: or_32r:
; X32-NOT: lock
; X32: orl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = or i32 %1, %v
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @or_64i(i64* %p) {
; X64-LABEL: or_64i:
; X64-NOT: lock
; X64: orq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'orq'.
; X32-LABEL: or_64i:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = or i64 %1, 2
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @or_64r(i64* %p, i64 %v) {
; X64-LABEL: or_64r:
; X64-NOT: lock
; X64: orq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'orq'.
; X32-LABEL: or_64r:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = or i64 %1, %v
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @or_32i_seq_cst(i32* %p) {
; X64-LABEL: or_32i_seq_cst:
; X64: xchgl
; X32-LABEL: or_32i_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = or i32 %1, 2
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

define void @or_32r_seq_cst(i32* %p, i32 %v) {
; X64-LABEL: or_32r_seq_cst:
; X64: xchgl
; X32-LABEL: or_32r_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = or i32 %1, %v
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- XOR -----

define void @xor_8i(i8* %p) {
; X64-LABEL: xor_8i:
; X64-NOT: lock
; X64: xorb
; X64-NOT: movb
; X32-LABEL: xor_8i:
; X32-NOT: lock
; X32: xorb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p acquire, align 1
  %2 = xor i8 %1, 2
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @xor_8r(i8* %p, i8 %v) {
; X64-LABEL: xor_8r:
; X64-NOT: lock
; X64: xorb
; X64-NOT: movb
; X32-LABEL: xor_8r:
; X32-NOT: lock
; X32: xorb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p acquire, align 1
  %2 = xor i8 %1, %v
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @xor_16i(i16* %p) {
; X64-LABEL: xor_16i:
; X64-NOT: xorw
; X32-LABEL: xor_16i:
; X32-NOT: xorw
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = xor i16 %1, 2
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @xor_16r(i16* %p, i16 %v) {
; X64-LABEL: xor_16r:
; X64-NOT: xorw
; X32-LABEL: xor_16r:
; X32-NOT: xorw {{.*}}, (
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = xor i16 %1, %v
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @xor_32i(i32* %p) {
; X64-LABEL: xor_32i:
; X64-NOT: lock
; X64: xorl
; X64-NOT: movl
; X32-LABEL: xor_32i:
; X32-NOT: lock
; X32: xorl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = xor i32 %1, 2
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @xor_32r(i32* %p, i32 %v) {
; X64-LABEL: xor_32r:
; X64-NOT: lock
; X64: xorl
; X64-NOT: movl
; X32-LABEL: xor_32r:
; X32-NOT: lock
; X32: xorl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = xor i32 %1, %v
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @xor_64i(i64* %p) {
; X64-LABEL: xor_64i:
; X64-NOT: lock
; X64: xorq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'xorq'.
; X32-LABEL: xor_64i:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = xor i64 %1, 2
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @xor_64r(i64* %p, i64 %v) {
; X64-LABEL: xor_64r:
; X64-NOT: lock
; X64: xorq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'xorq'.
; X32-LABEL: xor_64r:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = xor i64 %1, %v
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @xor_32i_seq_cst(i32* %p) {
; X64-LABEL: xor_32i_seq_cst:
; X64: xchgl
; X32-LABEL: xor_32i_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = xor i32 %1, 2
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

define void @xor_32r_seq_cst(i32* %p, i32 %v) {
; X64-LABEL: xor_32r_seq_cst:
; X64: xchgl
; X32-LABEL: xor_32r_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = xor i32 %1, %v
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- INC -----

define void @inc_8(i8* %p) {
; X64-LABEL: inc_8:
; X64-NOT: lock
; X64: incb
; X64-NOT: movb
; X32-LABEL: inc_8:
; X32-NOT: lock
; X32: incb
; X32-NOT: movb
; SLOW_INC-LABEL: inc_8:
; SLOW_INC-NOT: incb
; SLOW_INC-NOT: movb
  %1 = load atomic i8, i8* %p seq_cst, align 1
  %2 = add i8 %1, 1
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @inc_16(i16* %p) {
;   Currently the transformation is not done on 16-bit accesses, as the backend
;   treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: inc_16:
; X64-NOT: incw
; X32-LABEL: inc_16:
; X32-NOT: incw
; SLOW_INC-LABEL: inc_16:
; SLOW_INC-NOT: incw
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = add i16 %1, 1
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @inc_32(i32* %p) {
; X64-LABEL: inc_32:
; X64-NOT: lock
; X64: incl
; X64-NOT: movl
; X32-LABEL: inc_32:
; X32-NOT: lock
; X32: incl
; X32-NOT: movl
; SLOW_INC-LABEL: inc_32:
; SLOW_INC-NOT: incl
; SLOW_INC-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = add i32 %1, 1
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

define void @inc_64(i64* %p) {
; X64-LABEL: inc_64:
; X64-NOT: lock
; X64: incq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'incq'.
; X32-LABEL: inc_64:
; SLOW_INC-LABEL: inc_64:
; SLOW_INC-NOT: incq
; SLOW_INC-NOT: movq
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = add i64 %1, 1
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @inc_32_seq_cst(i32* %p) {
; X64-LABEL: inc_32_seq_cst:
; X64: xchgl
; X32-LABEL: inc_32_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = add i32 %1, 1
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- DEC -----

define void @dec_8(i8* %p) {
; X64-LABEL: dec_8:
; X64-NOT: lock
; X64: decb
; X64-NOT: movb
; X32-LABEL: dec_8:
; X32-NOT: lock
; X32: decb
; X32-NOT: movb
; SLOW_INC-LABEL: dec_8:
; SLOW_INC-NOT: decb
; SLOW_INC-NOT: movb
  %1 = load atomic i8, i8* %p seq_cst, align 1
  %2 = sub i8 %1, 1
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @dec_16(i16* %p) {
;   Currently the transformation is not done on 16-bit accesses, as the backend
;   treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: dec_16:
; X64-NOT: decw
; X32-LABEL: dec_16:
; X32-NOT: decw
; SLOW_INC-LABEL: dec_16:
; SLOW_INC-NOT: decw
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = sub i16 %1, 1
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @dec_32(i32* %p) {
; X64-LABEL: dec_32:
; X64-NOT: lock
; X64: decl
; X64-NOT: movl
; X32-LABEL: dec_32:
; X32-NOT: lock
; X32: decl
; X32-NOT: movl
; SLOW_INC-LABEL: dec_32:
; SLOW_INC-NOT: decl
; SLOW_INC-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = sub i32 %1, 1
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

define void @dec_64(i64* %p) {
; X64-LABEL: dec_64:
; X64-NOT: lock
; X64: decq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'decq'.
; X32-LABEL: dec_64:
; SLOW_INC-LABEL: dec_64:
; SLOW_INC-NOT: decq
; SLOW_INC-NOT: movq
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = sub i64 %1, 1
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @dec_32_seq_cst(i32* %p) {
; X64-LABEL: dec_32_seq_cst:
; X64: xchgl
; X32-LABEL: dec_32_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = sub i32 %1, 1
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- FADD -----

define void @fadd_32r(float* %loc, float %val) {
; X64-LABEL: fadd_32r:
; X64-NOT: lock
; X64-NOT: mov
; X64: addss (%[[M:[a-z]+]]), %[[XMM:xmm[0-9]+]]
; X64-NEXT: movss %[[XMM]], (%[[M]])
; X32-LABEL: fadd_32r:
; Don't check x86-32.
; LLVM's SSE handling is conservative on x86-32 even without using atomics.
  %floc = bitcast float* %loc to i32*
  %1 = load atomic i32, i32* %floc seq_cst, align 4
  %2 = bitcast i32 %1 to float
  %add = fadd float %2, %val
  %3 = bitcast float %add to i32
  store atomic i32 %3, i32* %floc release, align 4
  ret void
}

define void @fadd_64r(double* %loc, double %val) {
; X64-LABEL: fadd_64r:
; X64-NOT: lock
; X64-NOT: mov
; X64: addsd (%[[M:[a-z]+]]), %[[XMM:xmm[0-9]+]]
; X64-NEXT: movsd %[[XMM]], (%[[M]])
; X32-LABEL: fadd_64r:
; Don't check x86-32 (see comment above).
  %floc = bitcast double* %loc to i64*
  %1 = load atomic i64, i64* %floc seq_cst, align 8
  %2 = bitcast i64 %1 to double
  %add = fadd double %2, %val
  %3 = bitcast double %add to i64
  store atomic i64 %3, i64* %floc release, align 8
  ret void
}

@glob32 = global float 0.000000e+00, align 4
@glob64 = global double 0.000000e+00, align 8

; Floating-point add to a global using an immediate.
define void @fadd_32g() {
; X64-LABEL: fadd_32g:
; X64-NOT: lock
; X64:      movss .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addss glob32(%rip), %[[XMM]]
; X64-NEXT: movss %[[XMM]], glob32(%rip)
; X32-LABEL: fadd_32g:
; Don't check x86-32 (see comment above).
  %i = load atomic i32, i32* bitcast (float* @glob32 to i32*) monotonic, align 4
  %f = bitcast i32 %i to float
  %add = fadd float %f, 1.000000e+00
  %s = bitcast float %add to i32
  store atomic i32 %s, i32* bitcast (float* @glob32 to i32*) monotonic, align 4
  ret void
}

define void @fadd_64g() {
; X64-LABEL: fadd_64g:
; X64-NOT: lock
; X64:      movsd .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addsd glob64(%rip), %[[XMM]]
; X64-NEXT: movsd %[[XMM]], glob64(%rip)
; X32-LABEL: fadd_64g:
; Don't check x86-32 (see comment above).
  %i = load atomic i64, i64* bitcast (double* @glob64 to i64*) monotonic, align 8
  %f = bitcast i64 %i to double
  %add = fadd double %f, 1.000000e+00
  %s = bitcast double %add to i64
  store atomic i64 %s, i64* bitcast (double* @glob64 to i64*) monotonic, align 8
  ret void
}

; Floating-point add to a hard-coded immediate location using an immediate.
define void @fadd_32imm() {
; X64-LABEL: fadd_32imm:
; X64-NOT: lock
; X64:      movl $3735928559, %e[[M:[a-z]+]]
; X64:      movss .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addss (%r[[M]]), %[[XMM]]
; X64-NEXT: movss %[[XMM]], (%r[[M]])
; X32-LABEL: fadd_32imm:
; Don't check x86-32 (see comment above).
  %i = load atomic i32, i32* inttoptr (i32 3735928559 to i32*) monotonic, align 4
  %f = bitcast i32 %i to float
  %add = fadd float %f, 1.000000e+00
  %s = bitcast float %add to i32
  store atomic i32 %s, i32* inttoptr (i32 3735928559 to i32*) monotonic, align 4
  ret void
}

define void @fadd_64imm() {
; X64-LABEL: fadd_64imm:
; X64-NOT: lock
; X64:      movl $3735928559, %e[[M:[a-z]+]]
; X64:      movsd .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addsd (%r[[M]]), %[[XMM]]
; X64-NEXT: movsd %[[XMM]], (%r[[M]])
; X32-LABEL: fadd_64imm:
; Don't check x86-32 (see comment above).
  %i = load atomic i64, i64* inttoptr (i64 3735928559 to i64*) monotonic, align 8
  %f = bitcast i64 %i to double
  %add = fadd double %f, 1.000000e+00
  %s = bitcast double %add to i64
  store atomic i64 %s, i64* inttoptr (i64 3735928559 to i64*) monotonic, align 8
  ret void
}

; Floating-point add to a stack location.
define void @fadd_32stack() {
; X64-LABEL: fadd_32stack:
; X64-NOT: lock
; X64:      movss .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addss [[STACKOFF:-?[0-9]+]](%rsp), %[[XMM]]
; X64-NEXT: movss %[[XMM]], [[STACKOFF]](%rsp)
; X32-LABEL: fadd_32stack:
; Don't check x86-32 (see comment above).
  %ptr = alloca i32, align 4
  %bc3 = bitcast i32* %ptr to float*
  %load = load atomic i32, i32* %ptr acquire, align 4
  %bc0 = bitcast i32 %load to float
  %fadd = fadd float 1.000000e+00, %bc0
  %bc1 = bitcast float %fadd to i32
  store atomic i32 %bc1, i32* %ptr release, align 4
  ret void
}

define void @fadd_64stack() {
; X64-LABEL: fadd_64stack:
; X64-NOT: lock
; X64:      movsd .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addsd [[STACKOFF:-?[0-9]+]](%rsp), %[[XMM]]
; X64-NEXT: movsd %[[XMM]], [[STACKOFF]](%rsp)
; X32-LABEL: fadd_64stack:
; Don't check x86-32 (see comment above).
  %ptr = alloca i64, align 8
  %bc3 = bitcast i64* %ptr to double*
  %load = load atomic i64, i64* %ptr acquire, align 8
  %bc0 = bitcast i64 %load to double
  %fadd = fadd double 1.000000e+00, %bc0
  %bc1 = bitcast double %fadd to i64
  store atomic i64 %bc1, i64* %ptr release, align 8
  ret void
}