; Test that loads/stores don't move across a nacl.atomic.fence.all.
; This should apply to both atomic and non-atomic loads/stores
; (unlike the non-"all" variety of nacl.atomic.fence, which only
; applies to atomic load/stores).
;
; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 | FileCheck %s

declare void @llvm.nacl.atomic.fence.all()
declare i32 @llvm.nacl.atomic.load.i32(i32*, i32)
declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32)

@g32_a = internal global [4 x i8] zeroinitializer, align 4
@g32_b = internal global [4 x i8] zeroinitializer, align 4
@g32_c = internal global [4 x i8] zeroinitializer, align 4
@g32_d = internal global [4 x i8] zeroinitializer, align 4

define internal i32 @test_fused_load_sub_a() {
entry:
  %p_alloca = alloca i8, i32 4, align 4
  %p_alloca_bc = bitcast i8* %p_alloca to i32*
  store i32 999, i32* %p_alloca_bc, align 1

  %p_a = bitcast [4 x i8]* @g32_a to i32*
  %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
  %l_a2 = sub i32 1, %l_a
  call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)

  %p_b = bitcast [4 x i8]* @g32_b to i32*
  %l_b = load i32, i32* %p_b, align 1
  %l_b2 = sub i32 1, %l_b
  store i32 %l_b2, i32* %p_b, align 1

  %p_c = bitcast [4 x i8]* @g32_c to i32*
  %l_c = load i32, i32* %p_c, align 1
  %l_c2 = sub i32 1, %l_c
  call void @llvm.nacl.atomic.fence.all()
  store i32 %l_c2, i32* %p_c, align 1

  ret i32 %l_c2
}
; CHECK-LABEL: test_fused_load_sub_a
; alloca store
; CHECK: mov DWORD PTR {{.*}},0x3e7
; atomic store (w/ its own mfence)
; The load + sub are optimized into one everywhere.
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_a)|(.bss)}}
; CHECK: mov {{(DWORD PTR)?}}
; CHECK: mfence
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_b)|(.bss)}}
; CHECK: mov {{(DWORD PTR)?}}
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_c)|(.bss)}}
; CHECK: mfence
; CHECK: mov {{(DWORD PTR)?}}

; Test with the fence moved up a bit.
define internal i32 @test_fused_load_sub_b() {
entry:
  %p_alloca = alloca i8, i32 4, align 4
  %p_alloca_bc = bitcast i8* %p_alloca to i32*
  store i32 999, i32* %p_alloca_bc, align 1

  %p_a = bitcast [4 x i8]* @g32_a to i32*
  %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
  %l_a2 = sub i32 1, %l_a
  call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)

  %p_b = bitcast [4 x i8]* @g32_b to i32*
  %l_b = load i32, i32* %p_b, align 1
  %l_b2 = sub i32 1, %l_b
  store i32 %l_b2, i32* %p_b, align 1

  %p_c = bitcast [4 x i8]* @g32_c to i32*
  call void @llvm.nacl.atomic.fence.all()
  %l_c = load i32, i32* %p_c, align 1
  %l_c2 = sub i32 1, %l_c
  store i32 %l_c2, i32* %p_c, align 1

  ret i32 %l_c2
}
; CHECK-LABEL: test_fused_load_sub_b
; alloca store
; CHECK: mov DWORD PTR {{.*}},0x3e7
; atomic store (w/ its own mfence)
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_a)|(.bss)}}
; CHECK: mov {{(DWORD PTR)?}}
; CHECK: mfence
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_b)|(.bss)}}
; CHECK: mov {{(DWORD PTR)?}}
; CHECK: mfence
; Load + sub can still be optimized into one instruction
; because it is not separated by a fence.
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_c)|(.bss)}}
; CHECK: mov {{(DWORD PTR)?}}

; Test with the fence splitting a load/sub.
define internal i32 @test_fused_load_sub_c() {
entry:
  %p_alloca = alloca i8, i32 4, align 4
  %p_alloca_bc = bitcast i8* %p_alloca to i32*
  store i32 999, i32* %p_alloca_bc, align 1

  %p_a = bitcast [4 x i8]* @g32_a to i32*
  %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
  %l_a2 = sub i32 1, %l_a
  call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)

  %p_b = bitcast [4 x i8]* @g32_b to i32*
  %l_b = load i32, i32* %p_b, align 1
  call void @llvm.nacl.atomic.fence.all()
  %l_b2 = sub i32 1, %l_b
  store i32 %l_b2, i32* %p_b, align 1

  %p_c = bitcast [4 x i8]* @g32_c to i32*
  %l_c = load i32, i32* %p_c, align 1
  %l_c2 = sub i32 1, %l_c
  store i32 %l_c2, i32* %p_c, align 1

  ret i32 %l_c2
}
; CHECK-LABEL: test_fused_load_sub_c
; alloca store
; CHECK: mov DWORD PTR {{.*}},0x3e7
; atomic store (w/ its own mfence)
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_a)|(.bss)}}
; CHECK: mov {{(DWORD PTR)?}}
; CHECK: mfence
; This load + sub is no longer optimized into one instruction,
; though perhaps it should be legal as long as
; the load stays on the same side of the fence.
; CHECK: mov {{.*}},{{(DWORD PTR )?}}{{.*}}{{(g32_b)|(.bss)}}
; CHECK: mfence
; CHECK: mov {{.*}},0x1
; CHECK: sub
; CHECK: mov {{(DWORD PTR)?}}
; CHECK: sub {{.*}},DWORD PTR {{.*}}{{(g32_c)|(.bss)}}
; CHECK: mov {{(DWORD PTR)?}}


; Test where a bunch of i8 loads could have been fused into one
; i32 load, but a fence blocks that.
define internal i32 @could_have_fused_loads() {
entry:
  %ptr1 = bitcast [4 x i8]* @g32_d to i8*
  %b1 = load i8, i8* %ptr1, align 1

  %int_ptr2 = ptrtoint [4 x i8]* @g32_d to i32
  %int_ptr_bump2 = add i32 %int_ptr2, 1
  %ptr2 = inttoptr i32 %int_ptr_bump2 to i8*
  %b2 = load i8, i8* %ptr2, align 1

  %int_ptr_bump3 = add i32 %int_ptr2, 2
  %ptr3 = inttoptr i32 %int_ptr_bump3 to i8*
  %b3 = load i8, i8* %ptr3, align 1

  call void @llvm.nacl.atomic.fence.all()

  %int_ptr_bump4 = add i32 %int_ptr2, 3
  %ptr4 = inttoptr i32 %int_ptr_bump4 to i8*
  %b4 = load i8, i8* %ptr4, align 1

  %b1.ext = zext i8 %b1 to i32
  %b2.ext = zext i8 %b2 to i32
  %b2.shift = shl i32 %b2.ext, 8
  %b12 = or i32 %b1.ext, %b2.shift
  %b3.ext = zext i8 %b3 to i32
  %b3.shift = shl i32 %b3.ext, 16
  %b123 = or i32 %b12, %b3.shift
  %b4.ext = zext i8 %b4 to i32
  %b4.shift = shl i32 %b4.ext, 24
  %b1234 = or i32 %b123, %b4.shift
  ret i32 %b1234
}
; CHECK-LABEL: could_have_fused_loads
; CHECK: mov {{.*}},{{(BYTE PTR)?}}
; CHECK: mov {{.*}},BYTE PTR
; CHECK: mov {{.*}},BYTE PTR
; CHECK: mfence
; CHECK: mov {{.*}},BYTE PTR


; Test where an identical load from two branches could have been hoisted
; up, and then the code merged, but a fence prevents it.
define internal i32 @could_have_hoisted_loads(i32 %x) {
entry:
  %ptr = bitcast [4 x i8]* @g32_d to i32*
  %cmp = icmp eq i32 %x, 1
  br i1 %cmp, label %branch1, label %branch2
branch1:
  %y = load i32, i32* %ptr, align 1
  ret i32 %y
branch2:
  call void @llvm.nacl.atomic.fence.all()
  %z = load i32, i32* %ptr, align 1
  ret i32 %z
}
; CHECK-LABEL: could_have_hoisted_loads
; CHECK: jne {{.*}}
; CHECK: mov {{.*}},{{(DWORD PTR )?}}{{.*}}{{(g32_d)|(.bss)}}
; CHECK: ret
; CHECK: mfence
; CHECK: mov {{.*}},{{(DWORD PTR )?}}{{.*}}{{(g32_d)|(.bss)}}
; CHECK: ret