; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2

; Make sure that we generate non-temporal stores for the test cases below.
; We use xorps for zeroing, so domain information isn't available anymore.

define void @test_zero_v4f32(<4 x float>* %dst) {
; CHECK-LABEL: test_zero_v4f32:
; SSE: movntps
; AVX: vmovntps
  store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v4i32(<4 x i32>* %dst) {
; CHECK-LABEL: test_zero_v4i32:
; SSE: movntps
; AVX: vmovntps
  store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v2f64(<2 x double>* %dst) {
; CHECK-LABEL: test_zero_v2f64:
; SSE: movntps
; AVX: vmovntps
  store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v2i64(<2 x i64>* %dst) {
; CHECK-LABEL: test_zero_v2i64:
; SSE: movntps
; AVX: vmovntps
  store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v8i16(<8 x i16>* %dst) {
; CHECK-LABEL: test_zero_v8i16:
; SSE: movntps
; AVX: vmovntps
  store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v16i8(<16 x i8>* %dst) {
; CHECK-LABEL: test_zero_v16i8:
; SSE: movntps
; AVX: vmovntps
  store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 16, !nontemporal !1
  ret void
}

; And now YMM versions.

define void @test_zero_v8f32(<8 x float>* %dst) {
; CHECK-LABEL: test_zero_v8f32:
; AVX: vmovntps %ymm
  store <8 x float> zeroinitializer, <8 x float>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v8i32(<8 x i32>* %dst) {
; CHECK-LABEL: test_zero_v8i32:
; AVX2: vmovntps %ymm
  store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v4f64(<4 x double>* %dst) {
; CHECK-LABEL: test_zero_v4f64:
; AVX: vmovntps %ymm
  store <4 x double> zeroinitializer, <4 x double>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v4i64(<4 x i64>* %dst) {
; CHECK-LABEL: test_zero_v4i64:
; AVX2: vmovntps %ymm
  store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v16i16(<16 x i16>* %dst) {
; CHECK-LABEL: test_zero_v16i16:
; AVX2: vmovntps %ymm
  store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v32i8(<32 x i8>* %dst) {
; CHECK-LABEL: test_zero_v32i8:
; AVX2: vmovntps %ymm
  store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 32, !nontemporal !1
  ret void
}


; Check that we also handle arguments. Here the type survives longer.

define void @test_arg_v4f32(<4 x float> %arg, <4 x float>* %dst) {
; CHECK-LABEL: test_arg_v4f32:
; SSE: movntps
; AVX: vmovntps
  store <4 x float> %arg, <4 x float>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_arg_v4i32(<4 x i32> %arg, <4 x i32>* %dst) {
; CHECK-LABEL: test_arg_v4i32:
; SSE: movntps
; AVX: vmovntps
  store <4 x i32> %arg, <4 x i32>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_arg_v2f64(<2 x double> %arg, <2 x double>* %dst) {
; CHECK-LABEL: test_arg_v2f64:
; SSE: movntps
; AVX: vmovntps
  store <2 x double> %arg, <2 x double>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_arg_v2i64(<2 x i64> %arg, <2 x i64>* %dst) {
; CHECK-LABEL: test_arg_v2i64:
; SSE: movntps
; AVX: vmovntps
  store <2 x i64> %arg, <2 x i64>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_arg_v8i16(<8 x i16> %arg, <8 x i16>* %dst) {
; CHECK-LABEL: test_arg_v8i16:
; SSE: movntps
; AVX: vmovntps
  store <8 x i16> %arg, <8 x i16>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %dst) {
; CHECK-LABEL: test_arg_v16i8:
; SSE: movntps
; AVX: vmovntps
  store <16 x i8> %arg, <16 x i8>* %dst, align 16, !nontemporal !1
  ret void
}

; And now YMM versions.

define void @test_arg_v8f32(<8 x float> %arg, <8 x float>* %dst) {
; CHECK-LABEL: test_arg_v8f32:
; AVX: vmovntps %ymm
  store <8 x float> %arg, <8 x float>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %dst) {
; CHECK-LABEL: test_arg_v8i32:
; AVX2: vmovntps %ymm
  store <8 x i32> %arg, <8 x i32>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_arg_v4f64(<4 x double> %arg, <4 x double>* %dst) {
; CHECK-LABEL: test_arg_v4f64:
; AVX: vmovntps %ymm
  store <4 x double> %arg, <4 x double>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %dst) {
; CHECK-LABEL: test_arg_v4i64:
; AVX2: vmovntps %ymm
  store <4 x i64> %arg, <4 x i64>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %dst) {
; CHECK-LABEL: test_arg_v16i16:
; AVX2: vmovntps %ymm
  store <16 x i16> %arg, <16 x i16>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %dst) {
; CHECK-LABEL: test_arg_v32i8:
; AVX2: vmovntps %ymm
  store <32 x i8> %arg, <32 x i8>* %dst, align 32, !nontemporal !1
  ret void
}


; Now check that if the execution domain is trivially visible, we use it.
; We use an add to make the type survive all the way to the MOVNT.

define void @test_op_v4f32(<4 x float> %a, <4 x float> %b, <4 x float>* %dst) {
; CHECK-LABEL: test_op_v4f32:
; SSE: movntps
; AVX: vmovntps
  %r = fadd <4 x float> %a, %b
  store <4 x float> %r, <4 x float>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_op_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32>* %dst) {
; CHECK-LABEL: test_op_v4i32:
; SSE: movntdq
; AVX: vmovntdq
  %r = add <4 x i32> %a, %b
  store <4 x i32> %r, <4 x i32>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_op_v2f64(<2 x double> %a, <2 x double> %b, <2 x double>* %dst) {
; CHECK-LABEL: test_op_v2f64:
; SSE: movntpd
; AVX: vmovntpd
  %r = fadd <2 x double> %a, %b
  store <2 x double> %r, <2 x double>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_op_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64>* %dst) {
; CHECK-LABEL: test_op_v2i64:
; SSE: movntdq
; AVX: vmovntdq
  %r = add <2 x i64> %a, %b
  store <2 x i64> %r, <2 x i64>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_op_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16>* %dst) {
; CHECK-LABEL: test_op_v8i16:
; SSE: movntdq
; AVX: vmovntdq
  %r = add <8 x i16> %a, %b
  store <8 x i16> %r, <8 x i16>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_op_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8>* %dst) {
; CHECK-LABEL: test_op_v16i8:
; SSE: movntdq
; AVX: vmovntdq
  %r = add <16 x i8> %a, %b
  store <16 x i8> %r, <16 x i8>* %dst, align 16, !nontemporal !1
  ret void
}

; And now YMM versions.

define void @test_op_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
; CHECK-LABEL: test_op_v8f32:
; AVX: vmovntps %ymm
  %r = fadd <8 x float> %a, %b
  store <8 x float> %r, <8 x float>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_op_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %dst) {
; CHECK-LABEL: test_op_v8i32:
; AVX2: vmovntdq %ymm
  %r = add <8 x i32> %a, %b
  store <8 x i32> %r, <8 x i32>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_op_v4f64(<4 x double> %a, <4 x double> %b, <4 x double>* %dst) {
; CHECK-LABEL: test_op_v4f64:
; AVX: vmovntpd %ymm
  %r = fadd <4 x double> %a, %b
  store <4 x double> %r, <4 x double>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_op_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %dst) {
; CHECK-LABEL: test_op_v4i64:
; AVX2: vmovntdq %ymm
  %r = add <4 x i64> %a, %b
  store <4 x i64> %r, <4 x i64>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_op_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %dst) {
; CHECK-LABEL: test_op_v16i16:
; AVX2: vmovntdq %ymm
  %r = add <16 x i16> %a, %b
  store <16 x i16> %r, <16 x i16>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
; CHECK-LABEL: test_op_v32i8:
; AVX2: vmovntdq %ymm
  %r = add <32 x i8> %a, %b
  store <32 x i8> %r, <32 x i8>* %dst, align 32, !nontemporal !1
  ret void
}

!1 = !{i32 1}
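
; Note: per the LLVM LangRef, the !nontemporal metadata node must consist of
; a single i32 operand equal to 1 (the !1 node above). Attaching it to a
; store tells the optimizer and code generator that the value is not expected
; to be reused from the cache soon, which is what permits the selection of
; the cache-bypassing MOVNT*/VMOVNT* instructions checked throughout this file.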