; Test that loads/stores don't move across a nacl.atomic.fence.all.
; This should apply to both atomic and non-atomic loads/stores
; (unlike the non-"all" variety of nacl.atomic.fence, which only
; applies to atomic loads/stores).
;
; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 | FileCheck %s

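; The RUN line translates this file with Subzero at -O2, disassembles the
; resulting object file, and matches the output against the CHECK lines below.
; The trailing i32 argument to the atomic load/store intrinsics declared here
; is the memory-order constant; this test uses 6 (sequentially consistent).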
declare void @llvm.nacl.atomic.fence.all()
declare i32 @llvm.nacl.atomic.load.i32(i32*, i32)
declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32)

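; Each global is a zero-initialized 4-byte array (PNaCl bitcode represents
; globals as byte arrays); each one is bitcast to i32* at its point of use.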
@g32_a = internal global [4 x i8] zeroinitializer, align 4
@g32_b = internal global [4 x i8] zeroinitializer, align 4
@g32_c = internal global [4 x i8] zeroinitializer, align 4
@g32_d = internal global [4 x i8] zeroinitializer, align 4

define internal i32 @test_fused_load_sub_a() {
entry:
  %p_alloca = alloca i8, i32 4, align 4
  %p_alloca_bc = bitcast i8* %p_alloca to i32*
  store i32 999, i32* %p_alloca_bc, align 1

  %p_a = bitcast [4 x i8]* @g32_a to i32*
  %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
  %l_a2 = sub i32 1, %l_a
  call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)

  %p_b = bitcast [4 x i8]* @g32_b to i32*
  %l_b = load i32, i32* %p_b, align 1
  %l_b2 = sub i32 1, %l_b
  store i32 %l_b2, i32* %p_b, align 1

  %p_c = bitcast [4 x i8]* @g32_c to i32*
  %l_c = load i32, i32* %p_c, align 1
  %l_c2 = sub i32 1, %l_c
  call void @llvm.nacl.atomic.fence.all()
  store i32 %l_c2, i32* %p_c, align 1

  ret i32 %l_c2
}
; CHECK-LABEL: test_fused_load_sub_a
;    alloca store
; CHECK: mov DWORD PTR {{.*}},0x3e7
;    atomic store (w/ its own mfence)
; The load + sub are optimized into a single instruction everywhere:
; the load is folded in as the sub's memory operand.
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_a)|(.bss)}}
; CHECK: mov {{(DWORD PTR)?}}
; CHECK: mfence
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_b)|(.bss)}}
; CHECK: mov {{(DWORD PTR)?}}
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_c)|(.bss)}}
; CHECK: mfence
; CHECK: mov {{(DWORD PTR)?}}

; Test with the fence moved up a bit.
define internal i32 @test_fused_load_sub_b() {
entry:
  %p_alloca = alloca i8, i32 4, align 4
  %p_alloca_bc = bitcast i8* %p_alloca to i32*
  store i32 999, i32* %p_alloca_bc, align 1

  %p_a = bitcast [4 x i8]* @g32_a to i32*
  %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
  %l_a2 = sub i32 1, %l_a
  call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)

  %p_b = bitcast [4 x i8]* @g32_b to i32*
  %l_b = load i32, i32* %p_b, align 1
  %l_b2 = sub i32 1, %l_b
  store i32 %l_b2, i32* %p_b, align 1

  %p_c = bitcast [4 x i8]* @g32_c to i32*
  call void @llvm.nacl.atomic.fence.all()
  %l_c = load i32, i32* %p_c, align 1
  %l_c2 = sub i32 1, %l_c
  store i32 %l_c2, i32* %p_c, align 1

  ret i32 %l_c2
}
; CHECK-LABEL: test_fused_load_sub_b
;    alloca store
; CHECK: mov DWORD PTR {{.*}},0x3e7
;    atomic store (w/ its own mfence)
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_a)|(.bss)}}
; CHECK: mov {{(DWORD PTR)?}}
; CHECK: mfence
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_b)|(.bss)}}
; CHECK: mov {{(DWORD PTR)?}}
; CHECK: mfence
; The load + sub can still be optimized into one instruction
; because the pair is not split by the fence.
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_c)|(.bss)}}
; CHECK: mov {{(DWORD PTR)?}}

; Test with the fence splitting a load/sub.
define internal i32 @test_fused_load_sub_c() {
entry:
  %p_alloca = alloca i8, i32 4, align 4
  %p_alloca_bc = bitcast i8* %p_alloca to i32*
  store i32 999, i32* %p_alloca_bc, align 1

  %p_a = bitcast [4 x i8]* @g32_a to i32*
  %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
  %l_a2 = sub i32 1, %l_a
  call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)

  %p_b = bitcast [4 x i8]* @g32_b to i32*
  %l_b = load i32, i32* %p_b, align 1
  call void @llvm.nacl.atomic.fence.all()
  %l_b2 = sub i32 1, %l_b
  store i32 %l_b2, i32* %p_b, align 1

  %p_c = bitcast [4 x i8]* @g32_c to i32*
  %l_c = load i32, i32* %p_c, align 1
  %l_c2 = sub i32 1, %l_c
  store i32 %l_c2, i32* %p_c, align 1

  ret i32 %l_c2
}
; CHECK-LABEL: test_fused_load_sub_c
;    alloca store
; CHECK: mov DWORD PTR {{.*}},0x3e7
;    atomic store (w/ its own mfence)
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_a)|(.bss)}}
; CHECK: mov {{(DWORD PTR)?}}
; CHECK: mfence
; This load + sub pair is no longer optimized into one instruction,
; though perhaps that would still be legal as long as the load stays
; on the same side of the fence.
; CHECK: mov {{.*}},{{(DWORD PTR )?}}{{.*}}{{(g32_b)|(.bss)}}
; CHECK: mfence
; CHECK: mov {{.*}},0x1
; CHECK: sub
; CHECK: mov {{(DWORD PTR)?}}
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_c)|(.bss)}}
; CHECK: mov {{(DWORD PTR)?}}


; Test where a bunch of i8 loads could have been fused into one
; i32 load, but a fence blocks that.
define internal i32 @could_have_fused_loads() {
entry:
  %ptr1 = bitcast [4 x i8]* @g32_d to i8*
  %b1 = load i8, i8* %ptr1, align 1

  %int_ptr2 = ptrtoint [4 x i8]* @g32_d to i32
  %int_ptr_bump2 = add i32 %int_ptr2, 1
  %ptr2 = inttoptr i32 %int_ptr_bump2 to i8*
  %b2 = load i8, i8* %ptr2, align 1

  %int_ptr_bump3 = add i32 %int_ptr2, 2
  %ptr3 = inttoptr i32 %int_ptr_bump3 to i8*
  %b3 = load i8, i8* %ptr3, align 1

  call void @llvm.nacl.atomic.fence.all()

  %int_ptr_bump4 = add i32 %int_ptr2, 3
  %ptr4 = inttoptr i32 %int_ptr_bump4 to i8*
  %b4 = load i8, i8* %ptr4, align 1

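  ; Assemble the four loaded bytes into a single i32 result
  ; (%b1 is the least-significant byte).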
  %b1.ext = zext i8 %b1 to i32
  %b2.ext = zext i8 %b2 to i32
  %b2.shift = shl i32 %b2.ext, 8
  %b12 = or i32 %b1.ext, %b2.shift
  %b3.ext = zext i8 %b3 to i32
  %b3.shift = shl i32 %b3.ext, 16
  %b123 = or i32 %b12, %b3.shift
  %b4.ext = zext i8 %b4 to i32
  %b4.shift = shl i32 %b4.ext, 24
  %b1234 = or i32 %b123, %b4.shift
  ret i32 %b1234
}
; CHECK-LABEL: could_have_fused_loads
; CHECK: mov {{.*}},{{(BYTE PTR)?}}
; CHECK: mov {{.*}},BYTE PTR
; CHECK: mov {{.*}},BYTE PTR
; CHECK: mfence
; CHECK: mov {{.*}},BYTE PTR


; Test where an identical load in two branches could have been hoisted
; into the common predecessor (and the code then merged), but a fence
; prevents it.
define internal i32 @could_have_hoisted_loads(i32 %x) {
entry:
  %ptr = bitcast [4 x i8]* @g32_d to i32*
  %cmp = icmp eq i32 %x, 1
  br i1 %cmp, label %branch1, label %branch2
branch1:
  %y = load i32, i32* %ptr, align 1
  ret i32 %y
branch2:
  call void @llvm.nacl.atomic.fence.all()
  %z = load i32, i32* %ptr, align 1
  ret i32 %z
}
; CHECK-LABEL: could_have_hoisted_loads
; CHECK: jne {{.*}}
; CHECK: mov {{.*}},{{(DWORD PTR )?}}{{.*}}{{(g32_d)|(.bss)}}
; CHECK: ret
; CHECK: mfence
; CHECK: mov {{.*}},{{(DWORD PTR )?}}{{.*}}{{(g32_d)|(.bss)}}
; CHECK: ret