; RUN: llc -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE2
; RUN: llc -mcpu=corei7-avx < %s | FileCheck %s -check-prefix=AVX1
; RUN: llc -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2

; Each function below is a loop that loads a vector, applies a
; compare / arithmetic / select idiom that amounts to an unsigned saturating
; subtraction, and stores the result back in place. The check directives expect
; the idiom to be selected as psubus[bw] (vpsubus[bw] on the AVX targets).
;
; test1-test6 use 128-bit vectors and are checked on all three CPUs;
; test7-test12 use 256-bit vectors and are checked on the AVX2 CPU only.

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

; <8 x i16>: (x <s 0) ? (x ^ 0x8000) : 0, i.e. unsigned saturating x - 32768.
define void @test1(i16* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16* %head, i64 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %2 = load <8 x i16>* %1, align 2
  %3 = icmp slt <8 x i16> %2, zeroinitializer
  %4 = xor <8 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
  store <8 x i16> %5, <8 x i16>* %1, align 2
  %index.next = add i64 %index, 8
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; SSE2: @test1
; SSE2: psubusw LCPI0_0(%rip), %xmm0

; AVX1: @test1
; AVX1: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0

; AVX2: @test1
; AVX2: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
}

; <8 x i16>: (x >u 32766) ? (x + -32767) : 0, i.e. unsigned saturating x - 32767.
define void @test2(i16* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16* %head, i64 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %2 = load <8 x i16>* %1, align 2
  %3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  %4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
  store <8 x i16> %5, <8 x i16>* %1, align 2
  %index.next = add i64 %index, 8
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; SSE2: @test2
; SSE2: psubusw LCPI1_0(%rip), %xmm0

; AVX1: @test2
; AVX1: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0

; AVX2: @test2
; AVX2: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
}

; <8 x i16>: (x <u w) ? 0 : (x - w), with the scalar %w splatted to all lanes
; outside the loop.
define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
vector.ph:
  %0 = insertelement <8 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds i16* %head, i64 %index
  %2 = bitcast i16* %1 to <8 x i16>*
  %3 = load <8 x i16>* %2, align 2
  %4 = icmp ult <8 x i16> %3, %broadcast15
  %5 = sub <8 x i16> %3, %broadcast15
  %6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5
  store <8 x i16> %6, <8 x i16>* %2, align 2
  %index.next = add i64 %index, 8
  %7 = icmp eq i64 %index.next, 16384
  br i1 %7, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; SSE2: @test3
; SSE2: psubusw %xmm0, %xmm1

; AVX1: @test3
; AVX1: vpsubusw %xmm0, %xmm1, %xmm1

; AVX2: @test3
; AVX2: vpsubusw %xmm0, %xmm1, %xmm1
}

; <16 x i8>: (x <s 0) ? (x ^ 0x80) : 0, i.e. unsigned saturating x - 128.
define void @test4(i8* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8* %head, i64 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %2 = load <16 x i8>* %1, align 1
  %3 = icmp slt <16 x i8> %2, zeroinitializer
  %4 = xor <16 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
  store <16 x i8> %5, <16 x i8>* %1, align 1
  %index.next = add i64 %index, 16
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; SSE2: @test4
; SSE2: psubusb LCPI3_0(%rip), %xmm0

; AVX1: @test4
; AVX1: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0

; AVX2: @test4
; AVX2: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
}

; <16 x i8>: (x >u 126) ? (x + -127) : 0, i.e. unsigned saturating x - 127.
define void @test5(i8* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8* %head, i64 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %2 = load <16 x i8>* %1, align 1
  %3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
  store <16 x i8> %5, <16 x i8>* %1, align 1
  %index.next = add i64 %index, 16
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; SSE2: @test5
; SSE2: psubusb LCPI4_0(%rip), %xmm0

; AVX1: @test5
; AVX1: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0

; AVX2: @test5
; AVX2: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0
}

; <16 x i8>: (x <u w) ? 0 : (x - w), with the scalar %w splatted to all lanes
; outside the loop.
define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
vector.ph:
  %0 = insertelement <16 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds i8* %head, i64 %index
  %2 = bitcast i8* %1 to <16 x i8>*
  %3 = load <16 x i8>* %2, align 1
  %4 = icmp ult <16 x i8> %3, %broadcast15
  %5 = sub <16 x i8> %3, %broadcast15
  %6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5
  store <16 x i8> %6, <16 x i8>* %2, align 1
  %index.next = add i64 %index, 16
  %7 = icmp eq i64 %index.next, 16384
  br i1 %7, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; SSE2: @test6
; SSE2: psubusb %xmm0, %xmm1

; AVX1: @test6
; AVX1: vpsubusb %xmm0, %xmm1, %xmm1

; AVX2: @test6
; AVX2: vpsubusb %xmm0, %xmm1, %xmm1
}

; <16 x i16> variant of test1 (256-bit vector).
; NOTE(review): the induction step below is 8 while each iteration accesses 16
; elements, so consecutive iterations overlap. Presumably irrelevant to the
; instruction being checked - confirm before changing.
define void @test7(i16* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16* %head, i64 %index
  %1 = bitcast i16* %0 to <16 x i16>*
  %2 = load <16 x i16>* %1, align 2
  %3 = icmp slt <16 x i16> %2, zeroinitializer
  %4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
  store <16 x i16> %5, <16 x i16>* %1, align 2
  %index.next = add i64 %index, 8
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; AVX2: @test7
; AVX2: vpsubusw LCPI6_0(%rip), %ymm0, %ymm0
}

; <16 x i16> variant of test2 (256-bit vector).
; NOTE(review): step 8 vs. 16 elements per access, as in test7.
define void @test8(i16* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16* %head, i64 %index
  %1 = bitcast i16* %0 to <16 x i16>*
  %2 = load <16 x i16>* %1, align 2
  %3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  %4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
  store <16 x i16> %5, <16 x i16>* %1, align 2
  %index.next = add i64 %index, 8
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; AVX2: @test8
; AVX2: vpsubusw LCPI7_0(%rip), %ymm0, %ymm0
}

; <16 x i16> variant of test3 (256-bit vector, splatted scalar %w).
; NOTE(review): step 8 vs. 16 elements per access, as in test7.
define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
vector.ph:
  %0 = insertelement <16 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds i16* %head, i64 %index
  %2 = bitcast i16* %1 to <16 x i16>*
  %3 = load <16 x i16>* %2, align 2
  %4 = icmp ult <16 x i16> %3, %broadcast15
  %5 = sub <16 x i16> %3, %broadcast15
  %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5
  store <16 x i16> %6, <16 x i16>* %2, align 2
  %index.next = add i64 %index, 8
  %7 = icmp eq i64 %index.next, 16384
  br i1 %7, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; AVX2: @test9
; AVX2: vpsubusw %ymm0, %ymm1, %ymm1
}

; <32 x i8> variant of test4 (256-bit vector).
; NOTE(review): the induction step below is 16 while each iteration accesses 32
; elements, so consecutive iterations overlap. Presumably irrelevant to the
; instruction being checked - confirm before changing.
define void @test10(i8* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8* %head, i64 %index
  %1 = bitcast i8* %0 to <32 x i8>*
  %2 = load <32 x i8>* %1, align 1
  %3 = icmp slt <32 x i8> %2, zeroinitializer
  %4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
  store <32 x i8> %5, <32 x i8>* %1, align 1
  %index.next = add i64 %index, 16
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; AVX2: @test10
; AVX2: vpsubusb LCPI9_0(%rip), %ymm0, %ymm0
}

; <32 x i8> variant of test5 (256-bit vector).
; NOTE(review): step 16 vs. 32 elements per access, as in test10.
define void @test11(i8* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8* %head, i64 %index
  %1 = bitcast i8* %0 to <32 x i8>*
  %2 = load <32 x i8>* %1, align 1
  %3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
  store <32 x i8> %5, <32 x i8>* %1, align 1
  %index.next = add i64 %index, 16
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; AVX2: @test11
; AVX2: vpsubusb LCPI10_0(%rip), %ymm0, %ymm0
}

; <32 x i8> variant of test6 (256-bit vector, splatted scalar %w).
; NOTE(review): step 16 vs. 32 elements per access, as in test10.
define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
vector.ph:
  %0 = insertelement <32 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds i8* %head, i64 %index
  %2 = bitcast i8* %1 to <32 x i8>*
  %3 = load <32 x i8>* %2, align 1
  %4 = icmp ult <32 x i8> %3, %broadcast15
  %5 = sub <32 x i8> %3, %broadcast15
  %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5
  store <32 x i8> %6, <32 x i8>* %2, align 1
  %index.next = add i64 %index, 16
  %7 = icmp eq i64 %index.next, 16384
  br i1 %7, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; AVX2: @test12
; AVX2: vpsubusb %ymm0, %ymm1, %ymm1
}