1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW 7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST 8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 9 10 define <8 x i16> @test1(<8 x i16> %x) nounwind { 11 ; SSE-LABEL: test1: 12 ; SSE: # %bb.0: # %vector.ph 13 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0 14 ; SSE-NEXT: retq 15 ; 16 ; AVX-LABEL: test1: 17 ; AVX: # %bb.0: # %vector.ph 18 ; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0 19 ; AVX-NEXT: retq 20 vector.ph: 21 %0 = icmp slt <8 x i16> %x, zeroinitializer 22 %1 = xor <8 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768> 23 %res = select <8 x i1> %0, <8 x i16> %1, <8 x i16> zeroinitializer 24 ret <8 x i16> %res 25 } 26 27 define <8 x i16> @test2(<8 x i16> %x) nounwind { 28 ; SSE-LABEL: test2: 29 ; SSE: # %bb.0: # %vector.ph 30 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0 31 ; SSE-NEXT: retq 32 ; 33 ; AVX-LABEL: test2: 34 ; AVX: # %bb.0: # %vector.ph 35 ; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0 36 ; AVX-NEXT: retq 37 vector.ph: 38 %0 = icmp ugt <8 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766> 39 %1 = add <8 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767> 40 %res = select <8 x i1> %0, <8 x i16> %1, <8 x i16> zeroinitializer 41 ret <8 x i16> %res 42 } 43 44 define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind { 45 ; SSE-LABEL: test3: 46 ; SSE: # %bb.0: # %vector.ph 47 ; SSE-NEXT: movd %edi, %xmm1 48 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] 49 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 50 ; SSE-NEXT: psubusw %xmm1, %xmm0 51 ; SSE-NEXT: retq 52 ; 53 ; AVX1-LABEL: test3: 54 ; AVX1: # %bb.0: # %vector.ph 55 ; AVX1-NEXT: vmovd %edi, %xmm1 56 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] 57 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 58 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 59 ; AVX1-NEXT: retq 60 ; 61 ; AVX2-LABEL: test3: 62 ; AVX2: # %bb.0: # %vector.ph 63 ; AVX2-NEXT: vmovd %edi, %xmm1 64 ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 65 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 66 ; AVX2-NEXT: retq 67 ; 68 ; AVX512-LABEL: test3: 69 ; AVX512: # %bb.0: # %vector.ph 70 ; AVX512-NEXT: vpbroadcastw %edi, %xmm1 71 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 72 ; AVX512-NEXT: retq 73 vector.ph: 74 %0 = insertelement <8 x i16> undef, i16 %w, i32 0 75 %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer 76 %1 = icmp ult <8 x i16> %x, %broadcast15 77 %2 = sub <8 x i16> %x, %broadcast15 78 %res = select <8 x i1> %1, <8 x i16> zeroinitializer, <8 x i16> %2 79 ret <8 x i16> %res 80 } 81 82 define <16 x i8> @test4(<16 x i8> %x) nounwind { 83 ; SSE-LABEL: test4: 84 ; SSE: # %bb.0: # %vector.ph 85 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0 86 ; SSE-NEXT: retq 87 ; 88 ; AVX-LABEL: test4: 89 ; AVX: # %bb.0: # %vector.ph 90 ; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0 91 ; AVX-NEXT: retq 92 vector.ph: 93 %0 = icmp slt <16 x i8> %x, zeroinitializer 94 %1 = xor <16 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128> 95 %res = select <16 x i1> %0, <16 x i8> %1, <16 x i8> zeroinitializer 96 ret <16 x i8> %res 97 } 98 99 define <16 x i8> @test5(<16 x i8> %x) nounwind { 100 ; SSE-LABEL: test5: 101 ; SSE: # %bb.0: # %vector.ph 102 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0 103 ; SSE-NEXT: retq 104 ; 105 ; AVX-LABEL: test5: 106 ; AVX: # %bb.0: # %vector.ph 107 ; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0 108 ; AVX-NEXT: retq 109 vector.ph: 110 %0 = icmp ugt <16 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126> 111 %1 = add <16 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127> 112 %res = select <16 x i1> %0, <16 x i8> %1, <16 x i8> zeroinitializer 113 ret <16 x i8> %res 114 } 115 116 define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind { 117 ; SSE2-LABEL: test6: 118 ; SSE2: # %bb.0: # %vector.ph 119 ; SSE2-NEXT: movd %edi, %xmm1 120 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 121 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] 122 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 123 ; SSE2-NEXT: psubusb %xmm1, %xmm0 124 ; SSE2-NEXT: retq 125 ; 126 ; SSSE3-LABEL: test6: 127 ; SSSE3: # %bb.0: # %vector.ph 128 ; SSSE3-NEXT: movd %edi, %xmm1 129 ; SSSE3-NEXT: pxor %xmm2, %xmm2 130 ; SSSE3-NEXT: pshufb %xmm2, %xmm1 131 ; SSSE3-NEXT: psubusb %xmm1, %xmm0 132 ; SSSE3-NEXT: retq 133 ; 134 ; SSE41-LABEL: test6: 135 ; SSE41: # %bb.0: # %vector.ph 136 ; SSE41-NEXT: movd %edi, %xmm1 137 ; SSE41-NEXT: pxor %xmm2, %xmm2 138 ; SSE41-NEXT: pshufb %xmm2, %xmm1 139 ; SSE41-NEXT: psubusb %xmm1, %xmm0 140 ; SSE41-NEXT: retq 141 ; 142 ; AVX1-LABEL: test6: 143 ; AVX1: # %bb.0: # %vector.ph 144 ; AVX1-NEXT: vmovd %edi, %xmm1 145 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 146 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 147 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 148 ; AVX1-NEXT: retq 149 ; 150 ; AVX2-LABEL: test6: 151 ; AVX2: # %bb.0: # %vector.ph 152 ; AVX2-NEXT: vmovd %edi, %xmm1 153 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 154 ; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 155 ; AVX2-NEXT: retq 156 ; 157 ; AVX512-LABEL: test6: 158 ; AVX512: # %bb.0: # %vector.ph 159 ; AVX512-NEXT: vpbroadcastb %edi, %xmm1 160 ; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 161 ; AVX512-NEXT: retq 162 vector.ph: 163 %0 = insertelement <16 x i8> undef, i8 %w, i32 0 164 %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer 165 %1 = icmp ult <16 x i8> %x, %broadcast15 166 %2 = sub <16 x i8> %x, %broadcast15 167 %res = select <16 x i1> %1, <16 x i8> zeroinitializer, <16 x i8> %2 168 ret <16 x i8> %res 169 } 170 171 define <16 x i16> @test7(<16 x i16> %x) nounwind { 172 ; SSE-LABEL: test7: 173 ; SSE: # %bb.0: # %vector.ph 174 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] 175 ; SSE-NEXT: psubusw %xmm2, %xmm0 176 ; SSE-NEXT: psubusw %xmm2, %xmm1 177 ; SSE-NEXT: retq 178 ; 179 ; AVX1-LABEL: test7: 180 ; AVX1: # %bb.0: # %vector.ph 181 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 182 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] 183 ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1 184 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0 185 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 186 ; AVX1-NEXT: retq 187 ; 188 ; AVX2-LABEL: test7: 189 ; AVX2: # %bb.0: # %vector.ph 190 ; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0 191 ; AVX2-NEXT: retq 192 ; 193 ; AVX512-LABEL: test7: 194 ; AVX512: # %bb.0: # %vector.ph 195 ; AVX512-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0 196 ; AVX512-NEXT: retq 197 vector.ph: 198 %0 = icmp slt <16 x i16> %x, zeroinitializer 199 %1 = xor <16 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768> 200 %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer 201 ret <16 x i16> %res 202 } 203 204 define <16 x i16> @test8(<16 x i16> %x) nounwind { 205 ; SSE-LABEL: test8: 206 ; SSE: # %bb.0: # %vector.ph 207 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767] 208 ; SSE-NEXT: psubusw %xmm2, %xmm0 209 ; SSE-NEXT: psubusw %xmm2, %xmm1 210 ; SSE-NEXT: retq 211 ; 212 ; AVX1-LABEL: test8: 213 ; AVX1: # %bb.0: # %vector.ph 214 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 215 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767] 216 ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1 217 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0 218 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 219 ; AVX1-NEXT: retq 220 ; 221 ; AVX2-LABEL: test8: 222 ; AVX2: # %bb.0: # %vector.ph 223 ; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0 224 ; AVX2-NEXT: retq 225 ; 226 ; AVX512-LABEL: test8: 227 ; AVX512: # %bb.0: # %vector.ph 228 ; AVX512-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0 229 ; AVX512-NEXT: retq 230 vector.ph: 231 %0 = icmp ugt <16 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766> 232 %1 = add <16 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767> 233 %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer 234 ret <16 x i16> %res 235 } 236 237 define <16 x i16> @test8a(<16 x i16> %x) nounwind { 238 ; SSE-LABEL: test8a: 239 ; SSE: # %bb.0: # %vector.ph 240 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0 241 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm1 242 ; SSE-NEXT: retq 243 ; 244 ; AVX1-LABEL: test8a: 245 ; AVX1: # %bb.0: # %vector.ph 246 ; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm1 247 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 248 ; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0 249 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 250 ; AVX1-NEXT: retq 251 ; 252 ; AVX2-LABEL: test8a: 253 ; AVX2: # %bb.0: # %vector.ph 254 ; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0 255 ; AVX2-NEXT: retq 256 ; 257 ; AVX512-LABEL: test8a: 258 ; AVX512: # %bb.0: # %vector.ph 259 ; AVX512-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0 260 ; AVX512-NEXT: retq 261 vector.ph: 262 %0 = icmp ugt <16 x i16> %x, <i16 32766, i16 32765, i16 32764, i16 32763, i16 32762, i16 32761, i16 32760, i16 32759, i16 32758, i16 32757, i16 32756, i16 32755, i16 32754, i16 32753, i16 32752, i16 32751> 263 %1 = add <16 x i16> %x, <i16 -32767, i16 -32766, i16 -32765, i16 -32764, i16 -32763, i16 -32762, i16 -32761, i16 -32760, i16 -32759, i16 -32758, i16 -32757, i16 -32756, i16 -32755, i16 -32754, i16 -32753, i16 -32752> 264 %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer 265 ret <16 x i16> %res 266 } 267 268 define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind { 269 ; SSE-LABEL: test9: 270 ; SSE: # %bb.0: # %vector.ph 271 ; SSE-NEXT: movd %edi, %xmm2 272 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] 273 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 274 ; SSE-NEXT: psubusw %xmm2, %xmm0 275 ; SSE-NEXT: psubusw %xmm2, %xmm1 276 ; SSE-NEXT: retq 277 ; 278 ; AVX1-LABEL: test9: 279 ; AVX1: # %bb.0: # %vector.ph 280 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 281 ; AVX1-NEXT: vmovd %edi, %xmm2 282 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] 283 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 284 ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1 285 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0 286 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 287 ; AVX1-NEXT: retq 288 ; 289 ; AVX2-LABEL: test9: 290 ; AVX2: # %bb.0: # %vector.ph 291 ; AVX2-NEXT: vmovd %edi, %xmm1 292 ; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1 293 ; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 294 ; AVX2-NEXT: retq 295 ; 296 ; AVX512-LABEL: test9: 297 ; AVX512: # %bb.0: # %vector.ph 298 ; AVX512-NEXT: vpbroadcastw %edi, %ymm1 299 ; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 300 ; AVX512-NEXT: retq 301 vector.ph: 302 %0 = insertelement <16 x i16> undef, i16 %w, i32 0 303 %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer 304 %1 = icmp ult <16 x i16> %x, %broadcast15 305 %2 = sub <16 x i16> %x, %broadcast15 306 %res = select <16 x i1> %1, <16 x i16> zeroinitializer, <16 x i16> %2 307 ret <16 x i16> %res 308 } 309 310 define <32 x i8> @test10(<32 x i8> %x) nounwind { 311 ; SSE-LABEL: test10: 312 ; SSE: # %bb.0: # %vector.ph 313 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 314 ; SSE-NEXT: psubusb %xmm2, %xmm0 315 ; SSE-NEXT: psubusb %xmm2, %xmm1 316 ; SSE-NEXT: retq 317 ; 318 ; AVX1-LABEL: test10: 319 ; AVX1: # %bb.0: # %vector.ph 320 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 321 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 322 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1 323 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0 324 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 325 ; AVX1-NEXT: retq 326 ; 327 ; AVX2-LABEL: test10: 328 ; AVX2: # %bb.0: # %vector.ph 329 ; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0 330 ; AVX2-NEXT: retq 331 ; 332 ; AVX512-LABEL: test10: 333 ; AVX512: # %bb.0: # %vector.ph 334 ; AVX512-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0 335 ; AVX512-NEXT: retq 336 vector.ph: 337 %0 = icmp slt <32 x i8> %x, zeroinitializer 338 %1 = xor <32 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128> 339 %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer 340 ret <32 x i8> %res 341 } 342 343 define <32 x i8> @test11(<32 x i8> %x) nounwind { 344 ; SSE-LABEL: test11: 345 ; SSE: # %bb.0: # %vector.ph 346 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 347 ; SSE-NEXT: psubusb %xmm2, %xmm0 348 ; SSE-NEXT: psubusb %xmm2, %xmm1 349 ; SSE-NEXT: retq 350 ; 351 ; AVX1-LABEL: test11: 352 ; AVX1: # %bb.0: # %vector.ph 353 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 354 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 355 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1 356 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0 357 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 358 ; AVX1-NEXT: retq 359 ; 360 ; AVX2-LABEL: test11: 361 ; AVX2: # %bb.0: # %vector.ph 362 ; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0 363 ; AVX2-NEXT: retq 364 ; 365 ; AVX512-LABEL: test11: 366 ; AVX512: # %bb.0: # %vector.ph 367 ; AVX512-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0 368 ; AVX512-NEXT: retq 369 vector.ph: 370 %0 = icmp ugt <32 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126> 371 %1 = add <32 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127> 372 %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer 373 ret <32 x i8> %res 374 } 375 376 define <32 x i8> @test11a(<32 x i8> %x) nounwind { 377 ; SSE-LABEL: test11a: 378 ; SSE: # %bb.0: # %vector.ph 379 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0 380 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm1 381 ; SSE-NEXT: retq 382 ; 383 ; AVX1-LABEL: test11a: 384 ; AVX1: # %bb.0: # %vector.ph 385 ; AVX1-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm1 386 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 387 ; AVX1-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0 388 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 389 ; AVX1-NEXT: retq 390 ; 391 ; AVX2-LABEL: test11a: 392 ; AVX2: # %bb.0: # %vector.ph 393 ; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0 394 ; AVX2-NEXT: retq 395 ; 396 ; AVX512-LABEL: test11a: 397 ; AVX512: # %bb.0: # %vector.ph 398 ; AVX512-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0 399 ; AVX512-NEXT: retq 400 vector.ph: 401 %0 = icmp ugt <32 x i8> %x, <i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 117, i8 116, i8 115, i8 114, i8 113, i8 112, i8 111, i8 110, i8 109, i8 108, i8 107, i8 106, i8 105, i8 104, i8 103, i8 102, i8 101, i8 100, i8 99, i8 98, i8 97, i8 96, i8 95> 402 %1 = add <32 x i8> %x, <i8 -127, i8 -126, i8 -125, i8 -124, i8 -123, i8 -122, i8 -121, i8 -120, i8 -119, i8 -118, i8 -117, i8 -116, i8 -115, i8 -114, i8 -113, i8 -112, i8 -111, i8 -110, i8 -109, i8 -108, i8 -107, i8 -106, i8 -105, i8 -104, i8 -103, i8 -102, i8 -101, i8 -100, i8 -99, i8 -98, i8 -97, i8 -96> 403 %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer 404 ret <32 x i8> %res 405 } 406 407 define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind { 408 ; SSE2-LABEL: test12: 409 ; SSE2: # %bb.0: # %vector.ph 410 ; SSE2-NEXT: movd %edi, %xmm2 411 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 412 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] 413 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 414 ; SSE2-NEXT: psubusb %xmm2, %xmm0 415 ; SSE2-NEXT: psubusb %xmm2, %xmm1 416 ; SSE2-NEXT: retq 417 ; 418 ; SSSE3-LABEL: test12: 419 ; SSSE3: # %bb.0: # %vector.ph 420 ; SSSE3-NEXT: movd %edi, %xmm2 421 ; SSSE3-NEXT: pxor %xmm3, %xmm3 422 ; SSSE3-NEXT: pshufb %xmm3, %xmm2 423 ; SSSE3-NEXT: psubusb %xmm2, %xmm0 424 ; SSSE3-NEXT: psubusb %xmm2, %xmm1 425 ; SSSE3-NEXT: retq 426 ; 427 ; SSE41-LABEL: test12: 428 ; SSE41: # %bb.0: # %vector.ph 429 ; SSE41-NEXT: movd %edi, %xmm2 430 ; SSE41-NEXT: pxor %xmm3, %xmm3 431 ; SSE41-NEXT: pshufb %xmm3, %xmm2 432 ; SSE41-NEXT: psubusb %xmm2, %xmm0 433 ; SSE41-NEXT: psubusb %xmm2, %xmm1 434 ; SSE41-NEXT: retq 435 ; 436 ; AVX1-LABEL: test12: 437 ; AVX1: # %bb.0: # %vector.ph 438 ; AVX1-NEXT: vmovd %edi, %xmm1 439 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 440 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 441 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 442 ; AVX1-NEXT: vpsubusb %xmm1, %xmm2, %xmm2 443 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 444 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 445 ; AVX1-NEXT: retq 446 ; 447 ; AVX2-LABEL: test12: 448 ; AVX2: # %bb.0: # %vector.ph 449 ; AVX2-NEXT: vmovd %edi, %xmm1 450 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 451 ; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 452 ; AVX2-NEXT: retq 453 ; 454 ; AVX512-LABEL: test12: 455 ; AVX512: # %bb.0: # %vector.ph 456 ; AVX512-NEXT: vpbroadcastb %edi, %ymm1 457 ; AVX512-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 458 ; AVX512-NEXT: retq 459 vector.ph: 460 %0 = insertelement <32 x i8> undef, i8 %w, i32 0 461 %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer 462 %1 = icmp ult <32 x i8> %x, %broadcast15 463 %2 = sub <32 x i8> %x, %broadcast15 464 %res = select <32 x i1> %1, <32 x i8> zeroinitializer, <32 x i8> %2 465 ret <32 x i8> %res 466 } 467 468 define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind { 469 ; SSE2-LABEL: test13: 470 ; SSE2: # %bb.0: # %vector.ph 471 ; SSE2-NEXT: pxor %xmm4, %xmm4 472 ; SSE2-NEXT: movdqa %xmm0, %xmm3 473 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 474 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 475 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] 476 ; SSE2-NEXT: movdqa %xmm0, %xmm5 477 ; SSE2-NEXT: psubd %xmm2, %xmm0 478 ; SSE2-NEXT: movdqa %xmm2, %xmm6 479 ; SSE2-NEXT: pxor %xmm4, %xmm6 480 ; SSE2-NEXT: por %xmm4, %xmm5 481 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 482 ; SSE2-NEXT: movdqa %xmm1, %xmm2 483 ; SSE2-NEXT: pxor %xmm4, %xmm2 484 ; SSE2-NEXT: por %xmm3, %xmm4 485 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 486 ; SSE2-NEXT: packssdw %xmm6, %xmm2 487 ; SSE2-NEXT: psubd %xmm1, %xmm3 488 ; SSE2-NEXT: pslld $16, %xmm0 489 ; SSE2-NEXT: psrad $16, %xmm0 490 ; SSE2-NEXT: pslld $16, %xmm3 491 ; SSE2-NEXT: psrad $16, %xmm3 492 ; SSE2-NEXT: packssdw %xmm0, %xmm3 493 ; SSE2-NEXT: pandn %xmm3, %xmm2 494 ; SSE2-NEXT: movdqa %xmm2, %xmm0 495 ; SSE2-NEXT: retq 496 ; 497 ; SSSE3-LABEL: test13: 498 ; SSSE3: # %bb.0: # %vector.ph 499 ; SSSE3-NEXT: pxor %xmm3, %xmm3 500 ; SSSE3-NEXT: movdqa %xmm0, %xmm4 501 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 502 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 503 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] 504 ; SSSE3-NEXT: movdqa %xmm0, %xmm5 505 ; SSSE3-NEXT: psubd %xmm2, %xmm0 506 ; SSSE3-NEXT: movdqa %xmm2, %xmm6 507 ; SSSE3-NEXT: pxor %xmm3, %xmm6 508 ; SSSE3-NEXT: por %xmm3, %xmm5 509 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 510 ; SSSE3-NEXT: movdqa %xmm1, %xmm2 511 ; SSSE3-NEXT: pxor %xmm3, %xmm2 512 ; SSSE3-NEXT: por %xmm4, %xmm3 513 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 514 ; SSSE3-NEXT: packssdw %xmm6, %xmm2 515 ; SSSE3-NEXT: psubd %xmm1, %xmm4 516 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 517 ; SSSE3-NEXT: pshufb %xmm1, %xmm0 518 ; SSSE3-NEXT: pshufb %xmm1, %xmm4 519 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] 520 ; SSSE3-NEXT: pandn %xmm4, %xmm2 521 ; SSSE3-NEXT: movdqa %xmm2, %xmm0 522 ; SSSE3-NEXT: retq 523 ; 524 ; SSE41-LABEL: test13: 525 ; SSE41: # %bb.0: # %vector.ph 526 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] 527 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero 528 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 529 ; SSE41-NEXT: movdqa %xmm4, %xmm0 530 ; SSE41-NEXT: pmaxud %xmm1, %xmm0 531 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 532 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 533 ; SSE41-NEXT: pxor %xmm5, %xmm0 534 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 535 ; SSE41-NEXT: pshufb %xmm6, %xmm0 536 ; SSE41-NEXT: movdqa %xmm3, %xmm7 537 ; SSE41-NEXT: pmaxud %xmm2, %xmm7 538 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 539 ; SSE41-NEXT: pxor %xmm5, %xmm7 540 ; SSE41-NEXT: pshufb %xmm6, %xmm7 541 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] 542 ; SSE41-NEXT: psubd %xmm2, %xmm3 543 ; SSE41-NEXT: psubd %xmm1, %xmm4 544 ; SSE41-NEXT: pshufb %xmm6, %xmm4 545 ; SSE41-NEXT: pshufb %xmm6, %xmm3 546 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] 547 ; SSE41-NEXT: pandn %xmm4, %xmm0 548 ; SSE41-NEXT: retq 549 ; 550 ; AVX1-LABEL: test13: 551 ; AVX1: # %bb.0: # %vector.ph 552 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] 553 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 554 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 555 ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm3 556 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3 557 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 558 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 559 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 560 ; AVX1-NEXT: vpmaxud %xmm5, %xmm2, %xmm6 561 ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm2, %xmm6 562 ; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm4 563 ; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 564 ; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2 565 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 566 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 567 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 568 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 569 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 570 ; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0 571 ; AVX1-NEXT: vzeroupper 572 ; AVX1-NEXT: retq 573 ; 574 ; AVX2-LABEL: test13: 575 ; AVX2: # %bb.0: # %vector.ph 576 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 577 ; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm2 578 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm2 579 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 580 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 581 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 582 ; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 583 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 584 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 585 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 586 ; AVX2-NEXT: vpandn %xmm0, %xmm2, %xmm0 587 ; AVX2-NEXT: vzeroupper 588 ; AVX2-NEXT: retq 589 ; 590 ; AVX512-LABEL: test13: 591 ; AVX512: # %bb.0: # %vector.ph 592 ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 593 ; AVX512-NEXT: vpcmpnltud %ymm1, %ymm0, %k1 594 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 595 ; AVX512-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z} 596 ; AVX512-NEXT: vzeroupper 597 ; AVX512-NEXT: retq 598 vector.ph: 599 %lhs = zext <8 x i16> %x to <8 x i32> 600 %cond = icmp ult <8 x i32> %lhs, %y 601 %sub = sub <8 x i32> %lhs, %y 602 %trunc = trunc <8 x i32> %sub to <8 x i16> 603 %res = select <8 x i1> %cond, <8 x i16> zeroinitializer, <8 x i16> %trunc 604 ret <8 x i16> %res 605 } 606 607 define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind { 608 ; SSE2-LABEL: test14: 609 ; SSE2: # %bb.0: # %vector.ph 610 ; SSE2-NEXT: movdqa %xmm0, %xmm5 611 ; SSE2-NEXT: pxor %xmm0, %xmm0 612 ; SSE2-NEXT: movdqa %xmm5, %xmm6 613 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] 614 ; SSE2-NEXT: movdqa %xmm6, %xmm8 615 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] 616 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] 617 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] 618 ; SSE2-NEXT: movdqa %xmm5, %xmm10 619 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] 620 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] 621 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] 622 ; SSE2-NEXT: movdqa %xmm4, %xmm9 623 ; SSE2-NEXT: pxor %xmm0, %xmm9 624 ; SSE2-NEXT: psubd %xmm5, %xmm4 625 ; SSE2-NEXT: por %xmm0, %xmm5 626 ; SSE2-NEXT: pcmpgtd %xmm9, %xmm5 627 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255] 628 ; SSE2-NEXT: pand %xmm9, %xmm5 629 ; SSE2-NEXT: movdqa %xmm3, %xmm7 630 ; SSE2-NEXT: pxor %xmm0, %xmm7 631 ; SSE2-NEXT: psubd %xmm10, %xmm3 632 ; SSE2-NEXT: por %xmm0, %xmm10 633 ; SSE2-NEXT: pcmpgtd %xmm7, %xmm10 634 ; SSE2-NEXT: pand %xmm9, %xmm10 635 ; SSE2-NEXT: packuswb %xmm5, %xmm10 636 ; SSE2-NEXT: movdqa %xmm2, %xmm5 637 ; SSE2-NEXT: pxor %xmm0, %xmm5 638 ; SSE2-NEXT: psubd %xmm6, %xmm2 639 ; SSE2-NEXT: por %xmm0, %xmm6 640 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 641 ; SSE2-NEXT: pand %xmm9, %xmm6 642 ; SSE2-NEXT: movdqa %xmm1, %xmm5 643 ; SSE2-NEXT: pxor %xmm0, %xmm5 644 ; SSE2-NEXT: por %xmm8, %xmm0 645 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 646 ; SSE2-NEXT: pand %xmm9, %xmm0 647 ; SSE2-NEXT: packuswb %xmm6, %xmm0 648 ; SSE2-NEXT: packuswb %xmm10, %xmm0 649 ; SSE2-NEXT: psubd %xmm8, %xmm1 650 ; SSE2-NEXT: pand %xmm9, %xmm4 651 ; SSE2-NEXT: pand %xmm9, %xmm3 652 ; SSE2-NEXT: packuswb %xmm4, %xmm3 653 ; SSE2-NEXT: pand %xmm9, %xmm2 654 ; SSE2-NEXT: pand %xmm9, %xmm1 655 ; SSE2-NEXT: packuswb %xmm2, %xmm1 656 ; SSE2-NEXT: packuswb %xmm3, %xmm1 657 ; SSE2-NEXT: pandn %xmm1, %xmm0 658 ; SSE2-NEXT: retq 659 ; 660 ; SSSE3-LABEL: test14: 661 ; SSSE3: # %bb.0: # %vector.ph 662 ; SSSE3-NEXT: movdqa %xmm0, %xmm5 663 ; SSSE3-NEXT: pxor %xmm0, %xmm0 664 ; SSSE3-NEXT: movdqa %xmm5, %xmm7 665 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] 666 ; SSSE3-NEXT: movdqa %xmm7, %xmm8 667 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] 668 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] 669 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] 670 ; SSSE3-NEXT: movdqa %xmm5, %xmm10 671 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] 672 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] 673 ; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] 674 ; SSSE3-NEXT: movdqa %xmm2, %xmm9 675 ; SSSE3-NEXT: pxor %xmm0, %xmm9 676 ; SSSE3-NEXT: psubd %xmm5, %xmm2 677 ; SSSE3-NEXT: por %xmm0, %xmm5 678 ; SSSE3-NEXT: pcmpgtd %xmm9, %xmm5 679 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> 680 ; SSSE3-NEXT: pshufb %xmm9, %xmm5 681 ; SSSE3-NEXT: movdqa %xmm1, %xmm6 682 ; SSSE3-NEXT: pxor %xmm0, %xmm6 683 ; SSSE3-NEXT: psubd %xmm10, %xmm1 684 ; SSSE3-NEXT: por %xmm0, %xmm10 685 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm10 686 ; SSSE3-NEXT: pshufb %xmm9, %xmm10 687 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] 688 ; SSSE3-NEXT: movdqa %xmm4, %xmm5 689 ; SSSE3-NEXT: pxor %xmm0, %xmm5 690 ; SSSE3-NEXT: psubd %xmm7, %xmm4 691 ; SSSE3-NEXT: por %xmm0, %xmm7 692 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 693 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> 694 ; SSSE3-NEXT: pshufb %xmm5, %xmm7 695 ; SSSE3-NEXT: movdqa %xmm3, %xmm6 696 ; SSSE3-NEXT: pxor %xmm0, %xmm6 697 ; SSSE3-NEXT: por %xmm8, %xmm0 698 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0 699 ; SSSE3-NEXT: pshufb %xmm5, %xmm0 700 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] 701 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] 702 ; SSSE3-NEXT: psubd %xmm8, %xmm3 703 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 704 ; SSSE3-NEXT: pand %xmm5, %xmm4 705 ; SSSE3-NEXT: pand %xmm5, %xmm3 706 ; SSSE3-NEXT: packuswb %xmm4, %xmm3 707 ; SSSE3-NEXT: pand %xmm5, %xmm2 708 ; SSSE3-NEXT: pand %xmm5, %xmm1 709 ; SSSE3-NEXT: packuswb %xmm2, %xmm1 710 ; SSSE3-NEXT: packuswb %xmm3, %xmm1 711 ; SSSE3-NEXT: andnpd %xmm1, %xmm0 712 ; SSSE3-NEXT: retq 713 ; 714 ; SSE41-LABEL: test14: 715 ; SSE41: # %bb.0: # %vector.ph 716 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,2,3] 717 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero 718 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 719 ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1] 720 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero 721 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] 722 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 723 ; SSE41-NEXT: movdqa %xmm4, %xmm0 724 ; SSE41-NEXT: pmaxud %xmm10, %xmm0 725 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 726 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 727 ; SSE41-NEXT: pxor %xmm6, %xmm0 728 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> 729 ; SSE41-NEXT: pshufb %xmm7, %xmm0 730 ; SSE41-NEXT: movdqa %xmm3, %xmm5 731 ; SSE41-NEXT: pmaxud %xmm9, %xmm5 732 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 733 ; SSE41-NEXT: pxor %xmm6, %xmm5 734 ; SSE41-NEXT: pshufb %xmm7, %xmm5 735 ; SSE41-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 736 ; SSE41-NEXT: movdqa %xmm1, %xmm0 737 ; SSE41-NEXT: pmaxud %xmm8, %xmm0 738 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 739 ; SSE41-NEXT: pxor %xmm6, %xmm0 740 ; SSE41-NEXT: movdqa {{.*#+}} xmm12 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> 741 ; SSE41-NEXT: pshufb %xmm12, %xmm0 742 ; SSE41-NEXT: movdqa %xmm2, %xmm7 743 ; SSE41-NEXT: pmaxud %xmm11, %xmm7 744 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 745 ; SSE41-NEXT: pxor %xmm6, %xmm7 746 ; SSE41-NEXT: pshufb %xmm12, %xmm7 747 ; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] 748 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7] 749 ; SSE41-NEXT: psubd %xmm11, %xmm2 750 ; SSE41-NEXT: psubd %xmm8, %xmm1 751 ; SSE41-NEXT: psubd %xmm9, %xmm3 752 ; SSE41-NEXT: psubd %xmm10, %xmm4 753 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 754 ; SSE41-NEXT: pand %xmm5, %xmm4 755 ; SSE41-NEXT: pand %xmm5, %xmm3 756 ; SSE41-NEXT: packusdw %xmm4, %xmm3 757 ; SSE41-NEXT: pand %xmm5, %xmm1 758 ; SSE41-NEXT: pand %xmm5, %xmm2 759 ; SSE41-NEXT: packusdw %xmm2, %xmm1 760 ; SSE41-NEXT: packuswb %xmm3, %xmm1 761 ; SSE41-NEXT: pandn %xmm1, %xmm0 762 ; SSE41-NEXT: retq 763 ; 764 ; AVX1-LABEL: test14: 765 ; AVX1: # %bb.0: # %vector.ph 766 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] 767 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 768 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 769 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] 770 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero 771 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] 772 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 773 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 774 ; AVX1-NEXT: vpmaxud %xmm0, %xmm6, %xmm7 775 ; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm7 776 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 777 ; AVX1-NEXT: vpxor %xmm3, %xmm7, %xmm7 778 ; AVX1-NEXT: vpmaxud %xmm11, %xmm2, %xmm4 779 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm4 780 ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 781 ; AVX1-NEXT: vpackssdw %xmm7, %xmm4, %xmm10 782 ; AVX1-NEXT: vpmaxud %xmm9, %xmm1, %xmm7 783 ; AVX1-NEXT: vpcmpeqd %xmm7, %xmm1, %xmm7 784 ; AVX1-NEXT: vpxor %xmm3, %xmm7, %xmm7 785 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 786 ; AVX1-NEXT: vpmaxud %xmm8, %xmm4, %xmm5 787 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm5 788 ; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm3 789 ; AVX1-NEXT: vpackssdw %xmm3, %xmm7, %xmm3 790 ; AVX1-NEXT: vpacksswb %xmm10, %xmm3, %xmm3 791 ; AVX1-NEXT: vpsubd %xmm8, %xmm4, %xmm4 792 ; AVX1-NEXT: vpsubd %xmm9, %xmm1, %xmm1 793 ; AVX1-NEXT: vpsubd %xmm11, %xmm2, %xmm2 794 ; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm0 795 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 796 ; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 797 ; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 798 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 799 ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 800 ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm2 801 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 802 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 803 ; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0 804 ; AVX1-NEXT: vzeroupper 805 ; AVX1-NEXT: retq 806 ; 807 ; AVX2-LABEL: test14: 808 ; AVX2: # %bb.0: # %vector.ph 809 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] 810 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero 811 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 812 ; AVX2-NEXT: vpmaxud %ymm0, %ymm1, %ymm4 813 ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm1, %ymm4 814 ; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 815 ; AVX2-NEXT: vpxor %ymm5, %ymm4, %ymm4 816 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6 817 ; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4 818 ; AVX2-NEXT: vpmaxud %ymm3, %ymm2, %ymm6 819 ; AVX2-NEXT: vpcmpeqd %ymm6, %ymm2, %ymm6 820 ; AVX2-NEXT: vpxor %ymm5, %ymm6, %ymm5 821 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 822 ; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 823 ; AVX2-NEXT: vpacksswb %xmm5, %xmm4, %xmm4 824 ; AVX2-NEXT: vpsubd %ymm3, %ymm2, %ymm2 825 ; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0 826 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 827 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 828 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 829 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 830 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 831 ; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1 832 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 833 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 834 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 835 ; AVX2-NEXT: vpandn %xmm0, %xmm4, %xmm0 836 ; AVX2-NEXT: vzeroupper 837 ; AVX2-NEXT: retq 838 ; 839 ; AVX512-LABEL: test14: 840 ; AVX512: # %bb.0: # %vector.ph 841 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 842 ; AVX512-NEXT: vpcmpnltud %zmm0, %zmm1, %k1 843 ; AVX512-NEXT: vpsubd %zmm0, %zmm1, %zmm0 844 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z} 845 ; AVX512-NEXT: vzeroupper 846 ; AVX512-NEXT: retq 847 vector.ph: 848 %rhs = zext <16 x i8> %x to <16 x i32> 849 %cond = icmp ult <16 x i32> %y, %rhs 850 %sub = sub <16 x i32> %y, %rhs 851 %truncsub = trunc <16 x i32> %sub to <16 x i8> 852 %res = select <16 x i1> %cond, <16 x i8> zeroinitializer, <16 x i8> %truncsub 853 ret <16 x i8> %res 854 } 855 856 define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind { 857 ; SSE2-LABEL: test15: 858 ; SSE2: # %bb.0: # %vector.ph 859 ; SSE2-NEXT: movdqa %xmm0, %xmm3 860 ; SSE2-NEXT: pxor %xmm4, %xmm4 861 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] 862 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 863 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] 864 ; SSE2-NEXT: movdqa %xmm3, %xmm5 865 ; SSE2-NEXT: psubd %xmm2, %xmm3 866 ; SSE2-NEXT: pxor %xmm4, %xmm2 867 ; SSE2-NEXT: por %xmm4, %xmm5 868 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 869 ; SSE2-NEXT: movdqa %xmm1, %xmm2 870 ; SSE2-NEXT: pxor %xmm4, %xmm2 871 ; SSE2-NEXT: por %xmm0, %xmm4 872 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 873 ; SSE2-NEXT: packssdw %xmm5, %xmm4 874 ; SSE2-NEXT: psubd %xmm1, %xmm0 875 ; SSE2-NEXT: pslld $16, %xmm3 876 ; SSE2-NEXT: psrad $16, %xmm3 877 ; SSE2-NEXT: pslld $16, %xmm0 878 ; SSE2-NEXT: psrad $16, %xmm0 879 ; SSE2-NEXT: packssdw %xmm3, %xmm0 880 ; SSE2-NEXT: pand %xmm4, %xmm0 881 ; SSE2-NEXT: retq 882 ; 883 ; SSSE3-LABEL: test15: 884 ; SSSE3: # %bb.0: # %vector.ph 885 ; SSSE3-NEXT: pxor %xmm4, %xmm4 886 ; SSSE3-NEXT: movdqa %xmm0, %xmm3 887 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 888 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 889 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] 890 ; SSSE3-NEXT: movdqa %xmm0, %xmm5 891 ; SSSE3-NEXT: psubd %xmm2, %xmm0 892 ; SSSE3-NEXT: pxor %xmm4, %xmm2 893 ; SSSE3-NEXT: por %xmm4, %xmm5 894 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 895 ; SSSE3-NEXT: movdqa %xmm1, %xmm2 896 ; SSSE3-NEXT: pxor %xmm4, %xmm2 897 ; SSSE3-NEXT: por %xmm3, %xmm4 898 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 899 ; SSSE3-NEXT: packssdw %xmm5, %xmm4 900 ; SSSE3-NEXT: psubd %xmm1, %xmm3 901 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 902 ; SSSE3-NEXT: pshufb %xmm1, %xmm0 903 ; SSSE3-NEXT: pshufb %xmm1, %xmm3 904 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] 905 ; SSSE3-NEXT: pand %xmm4, %xmm3 906 ; SSSE3-NEXT: movdqa %xmm3, %xmm0 907 ; SSSE3-NEXT: retq 908 ; 909 ; SSE41-LABEL: test15: 910 ; SSE41: # %bb.0: # %vector.ph 911 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] 912 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero 913 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 914 ; SSE41-NEXT: movdqa %xmm0, %xmm4 915 ; SSE41-NEXT: pminud %xmm1, %xmm4 916 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 917 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 918 ; SSE41-NEXT: pxor %xmm5, %xmm4 919 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 920 ; SSE41-NEXT: pshufb %xmm6, %xmm4 921 ; SSE41-NEXT: movdqa %xmm3, %xmm7 922 ; SSE41-NEXT: pminud %xmm2, %xmm7 923 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 924 ; SSE41-NEXT: pxor %xmm5, %xmm7 925 ; SSE41-NEXT: pshufb %xmm6, %xmm7 926 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0] 927 ; SSE41-NEXT: psubd %xmm2, %xmm3 928 ; SSE41-NEXT: psubd %xmm1, %xmm0 929 ; SSE41-NEXT: pshufb %xmm6, %xmm0 930 ; SSE41-NEXT: pshufb %xmm6, %xmm3 931 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] 932 ; SSE41-NEXT: pand %xmm4, %xmm0 933 ; SSE41-NEXT: retq 934 ; 935 ; AVX1-LABEL: test15: 936 ; AVX1: # %bb.0: # %vector.ph 937 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] 938 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 939 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 940 ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm3 941 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3 942 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 943 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 944 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 945 ; AVX1-NEXT: vpminud %xmm5, %xmm2, %xmm6 946 ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm2, %xmm6 947 ; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm4 948 ; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 949 ; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2 950 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 951 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 952 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 953 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 954 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 955 ; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 956 ; AVX1-NEXT: vzeroupper 957 ; AVX1-NEXT: retq 958 ; 959 ; AVX2-LABEL: test15: 960 ; AVX2: # %bb.0: # %vector.ph 961 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 962 ; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm2 963 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm2 964 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 965 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 966 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 967 ; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 968 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 969 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 970 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 971 ; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 972 ; AVX2-NEXT: vzeroupper 973 ; AVX2-NEXT: retq 974 ; 975 ; AVX512-LABEL: test15: 976 ; AVX512: # %bb.0: # %vector.ph 977 ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 978 ; AVX512-NEXT: vpcmpnleud %ymm1, %ymm0, %k1 979 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 980 ; AVX512-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z} 981 ; AVX512-NEXT: vzeroupper 982 ; AVX512-NEXT: retq 983 vector.ph: 984 %lhs = zext <8 x i16> %x to <8 x i32> 985 %cond = icmp ugt <8 x i32> %lhs, %y 986 %sub = sub <8 x i32> %lhs, %y 987 %truncsub = trunc <8 x i32> %sub to <8 x i16> 988 %res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer 989 ret <8 x i16> %res 990 } 991 992 define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind { 993 ; SSE2-LABEL: test16: 994 ; SSE2: # %bb.0: # %vector.ph 995 ; SSE2-NEXT: movdqa %xmm0, %xmm3 996 ; SSE2-NEXT: pxor %xmm4, %xmm4 997 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] 998 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 999 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] 1000 ; SSE2-NEXT: movdqa %xmm3, %xmm5 1001 ; SSE2-NEXT: psubd %xmm2, %xmm3 1002 ; SSE2-NEXT: pxor %xmm4, %xmm2 1003 ; SSE2-NEXT: por %xmm4, %xmm5 1004 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 1005 ; SSE2-NEXT: movdqa %xmm1, %xmm2 1006 ; SSE2-NEXT: pxor %xmm4, %xmm2 1007 ; SSE2-NEXT: por %xmm0, %xmm4 1008 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 1009 ; SSE2-NEXT: packssdw %xmm5, %xmm4 1010 ; SSE2-NEXT: psubd %xmm1, %xmm0 1011 ; SSE2-NEXT: pslld $16, %xmm3 1012 ; SSE2-NEXT: psrad $16, %xmm3 1013 ; SSE2-NEXT: pslld $16, %xmm0 1014 ; SSE2-NEXT: psrad $16, %xmm0 1015 ; SSE2-NEXT: packssdw %xmm3, %xmm0 1016 ; SSE2-NEXT: pand %xmm4, %xmm0 1017 ; SSE2-NEXT: retq 1018 ; 1019 ; SSSE3-LABEL: test16: 1020 ; SSSE3: # %bb.0: # %vector.ph 1021 ; SSSE3-NEXT: pxor %xmm4, %xmm4 1022 ; SSSE3-NEXT: movdqa %xmm0, %xmm3 1023 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 1024 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 1025 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] 1026 ; SSSE3-NEXT: movdqa %xmm0, %xmm5 1027 ; SSSE3-NEXT: psubd %xmm2, %xmm0 1028 ; SSSE3-NEXT: pxor %xmm4, %xmm2 1029 ; SSSE3-NEXT: por %xmm4, %xmm5 1030 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 1031 ; SSSE3-NEXT: movdqa %xmm1, %xmm2 1032 ; SSSE3-NEXT: pxor %xmm4, %xmm2 1033 ; SSSE3-NEXT: por %xmm3, %xmm4 1034 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 1035 ; SSSE3-NEXT: packssdw %xmm5, %xmm4 1036 ; SSSE3-NEXT: psubd %xmm1, %xmm3 1037 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1038 ; SSSE3-NEXT: pshufb %xmm1, %xmm0 1039 ; SSSE3-NEXT: pshufb %xmm1, %xmm3 1040 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] 1041 ; SSSE3-NEXT: pand %xmm4, %xmm3 1042 ; SSSE3-NEXT: movdqa %xmm3, %xmm0 1043 ; SSSE3-NEXT: retq 1044 ; 1045 ; SSE41-LABEL: test16: 1046 ; SSE41: # %bb.0: # %vector.ph 1047 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] 1048 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero 1049 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1050 ; SSE41-NEXT: movdqa %xmm1, %xmm4 1051 ; SSE41-NEXT: pmaxud %xmm0, %xmm4 1052 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm4 1053 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 1054 ; SSE41-NEXT: pxor %xmm5, %xmm4 1055 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1056 ; SSE41-NEXT: pshufb %xmm6, %xmm4 1057 ; SSE41-NEXT: movdqa %xmm2, %xmm7 1058 ; SSE41-NEXT: pmaxud %xmm3, %xmm7 1059 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 1060 ; SSE41-NEXT: pxor %xmm5, %xmm7 1061 ; SSE41-NEXT: pshufb %xmm6, %xmm7 1062 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0] 1063 ; SSE41-NEXT: psubd %xmm2, %xmm3 1064 ; SSE41-NEXT: psubd %xmm1, %xmm0 1065 ; SSE41-NEXT: pshufb %xmm6, %xmm0 1066 ; SSE41-NEXT: pshufb %xmm6, %xmm3 1067 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] 1068 ; SSE41-NEXT: pand %xmm4, %xmm0 1069 ; SSE41-NEXT: retq 1070 ; 1071 ; AVX1-LABEL: test16: 1072 ; AVX1: # %bb.0: # %vector.ph 1073 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] 1074 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 1075 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1076 ; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm3 1077 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3 1078 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 1079 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 1080 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 1081 ; AVX1-NEXT: vpmaxud %xmm2, %xmm5, %xmm6 1082 ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm5, %xmm6 1083 ; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm4 1084 ; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 1085 ; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2 1086 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 1087 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1088 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 1089 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 1090 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1091 ; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 1092 ; AVX1-NEXT: vzeroupper 1093 ; AVX1-NEXT: retq 1094 ; 1095 ; AVX2-LABEL: test16: 1096 ; AVX2: # %bb.0: # %vector.ph 1097 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1098 ; AVX2-NEXT: vpmaxud %ymm0, %ymm1, %ymm2 1099 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm2 1100 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 1101 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 1102 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 1103 ; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 1104 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 1105 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 1106 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1107 ; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 1108 ; AVX2-NEXT: vzeroupper 1109 ; AVX2-NEXT: retq 1110 ; 1111 ; AVX512-LABEL: test16: 1112 ; AVX512: # %bb.0: # %vector.ph 1113 ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1114 ; AVX512-NEXT: vpcmpltud %ymm0, %ymm1, %k1 1115 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 1116 ; AVX512-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z} 1117 ; AVX512-NEXT: vzeroupper 1118 ; AVX512-NEXT: retq 1119 vector.ph: 1120 %lhs = zext <8 x i16> %x to <8 x i32> 1121 %cond = icmp ult <8 x i32> %y, %lhs 1122 %sub = sub <8 x i32> %lhs, %y 1123 %truncsub = trunc <8 x i32> %sub to <8 x i16> 1124 %res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer 1125 ret <8 x i16> %res 1126 } 1127 1128 define <8 x i16> @psubus_8i16_max(<8 x i16> %x, <8 x i16> %y) nounwind { 1129 ; SSE-LABEL: psubus_8i16_max: 1130 ; SSE: # %bb.0: # %vector.ph 1131 ; SSE-NEXT: psubusw %xmm1, %xmm0 1132 ; SSE-NEXT: retq 1133 ; 1134 ; AVX-LABEL: psubus_8i16_max: 1135 ; AVX: # %bb.0: # %vector.ph 1136 ; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1137 ; AVX-NEXT: retq 1138 vector.ph: 1139 %cmp = icmp ult <8 x i16> %x, %y 1140 %max = select <8 x i1> %cmp, <8 x i16> %y, <8 x i16> %x 1141 %res = sub <8 x i16> %max, %y 1142 ret <8 x i16> %res 1143 } 1144 1145 define <16 x i8> @psubus_16i8_max(<16 x i8> %x, <16 x i8> %y) nounwind { 1146 ; SSE-LABEL: psubus_16i8_max: 1147 ; SSE: # %bb.0: # %vector.ph 1148 ; SSE-NEXT: psubusb %xmm1, %xmm0 1149 ; SSE-NEXT: retq 1150 ; 1151 ; AVX-LABEL: psubus_16i8_max: 1152 ; AVX: # %bb.0: # %vector.ph 1153 ; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 1154 ; AVX-NEXT: retq 1155 vector.ph: 1156 %cmp = icmp ult <16 x i8> %x, %y 1157 %max = select <16 x i1> %cmp, <16 x i8> %y, <16 x i8> %x 1158 %res = sub <16 x i8> %max, %y 1159 ret <16 x i8> %res 1160 } 1161 1162 define <16 x i16> @psubus_16i16_max(<16 x i16> %x, <16 x i16> %y) nounwind { 1163 ; SSE-LABEL: psubus_16i16_max: 1164 ; SSE: # %bb.0: # %vector.ph 1165 ; SSE-NEXT: psubusw %xmm2, %xmm0 1166 ; SSE-NEXT: psubusw %xmm3, %xmm1 1167 ; SSE-NEXT: retq 1168 ; 1169 ; AVX1-LABEL: psubus_16i16_max: 1170 ; AVX1: # %bb.0: # %vector.ph 1171 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1172 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1173 ; AVX1-NEXT: vpsubusw %xmm2, %xmm3, %xmm2 1174 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1175 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1176 ; AVX1-NEXT: retq 1177 ; 1178 ; AVX2-LABEL: psubus_16i16_max: 1179 ; AVX2: # %bb.0: # %vector.ph 1180 ; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 1181 ; AVX2-NEXT: retq 1182 ; 1183 ; AVX512-LABEL: psubus_16i16_max: 1184 ; AVX512: # %bb.0: # %vector.ph 1185 ; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 1186 ; AVX512-NEXT: retq 1187 vector.ph: 1188 %cmp = icmp ult <16 x i16> %x, %y 1189 %max = select <16 x i1> %cmp, <16 x i16> %y, <16 x i16> %x 1190 %res = sub <16 x i16> %max, %y 1191 ret <16 x i16> %res 1192 } 1193 1194 define <32 x i16> @psubus_32i16_max(<32 x i16> %x, <32 x i16> %y) nounwind { 1195 ; SSE-LABEL: psubus_32i16_max: 1196 ; SSE: # %bb.0: # %vector.ph 1197 ; SSE-NEXT: psubusw %xmm4, %xmm0 1198 ; SSE-NEXT: psubusw %xmm5, %xmm1 1199 ; SSE-NEXT: psubusw %xmm6, %xmm2 1200 ; SSE-NEXT: psubusw %xmm7, %xmm3 1201 ; SSE-NEXT: retq 1202 ; 1203 ; AVX1-LABEL: psubus_32i16_max: 1204 ; AVX1: # %bb.0: # %vector.ph 1205 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 1206 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 1207 ; AVX1-NEXT: vpsubusw %xmm4, %xmm5, %xmm4 1208 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0 1209 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 1210 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 1211 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 1212 ; AVX1-NEXT: vpsubusw %xmm2, %xmm4, %xmm2 1213 ; AVX1-NEXT: vpsubusw %xmm3, %xmm1, %xmm1 1214 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1215 ; AVX1-NEXT: retq 1216 ; 1217 ; AVX2-LABEL: psubus_32i16_max: 1218 ; AVX2: # %bb.0: # %vector.ph 1219 ; AVX2-NEXT: vpsubusw %ymm2, %ymm0, %ymm0 1220 ; AVX2-NEXT: vpsubusw %ymm3, %ymm1, %ymm1 1221 ; AVX2-NEXT: retq 1222 ; 1223 ; AVX512-LABEL: psubus_32i16_max: 1224 ; AVX512: # %bb.0: # %vector.ph 1225 ; AVX512-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 1226 ; AVX512-NEXT: retq 1227 vector.ph: 1228 %cmp = icmp ult <32 x i16> %x, %y 1229 %max = select <32 x i1> %cmp, <32 x i16> %y, <32 x i16> %x 1230 %res = sub <32 x i16> %max, %y 1231 ret <32 x i16> %res 1232 } 1233 1234 define <64 x i8> @psubus_64i8_max(<64 x i8> %x, <64 x i8> %y) nounwind { 1235 ; SSE-LABEL: psubus_64i8_max: 1236 ; SSE: # %bb.0: # %vector.ph 1237 ; SSE-NEXT: psubusb %xmm4, %xmm0 1238 ; SSE-NEXT: psubusb %xmm5, %xmm1 1239 ; SSE-NEXT: psubusb %xmm6, %xmm2 1240 ; SSE-NEXT: psubusb %xmm7, %xmm3 1241 ; SSE-NEXT: retq 1242 ; 1243 ; AVX1-LABEL: psubus_64i8_max: 1244 ; AVX1: # %bb.0: # %vector.ph 1245 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 1246 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 1247 ; AVX1-NEXT: vpsubusb %xmm4, %xmm5, %xmm4 1248 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0 1249 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 1250 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 1251 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 1252 ; AVX1-NEXT: vpsubusb %xmm2, %xmm4, %xmm2 1253 ; AVX1-NEXT: vpsubusb %xmm3, %xmm1, %xmm1 1254 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1255 ; AVX1-NEXT: retq 1256 ; 1257 ; AVX2-LABEL: psubus_64i8_max: 1258 ; AVX2: # %bb.0: # %vector.ph 1259 ; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0 1260 ; AVX2-NEXT: vpsubusb %ymm3, %ymm1, %ymm1 1261 ; AVX2-NEXT: retq 1262 ; 1263 ; AVX512-LABEL: psubus_64i8_max: 1264 ; AVX512: # %bb.0: # %vector.ph 1265 ; AVX512-NEXT: vpsubusb %zmm1, %zmm0, %zmm0 1266 ; AVX512-NEXT: retq 1267 vector.ph: 1268 %cmp = icmp ult <64 x i8> %x, %y 1269 %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x 1270 %res = sub <64 x i8> %max, %y 1271 ret <64 x i8> %res 1272 } 1273 1274 define <32 x i8> @psubus_32i8_max(<32 x i8> %x, <32 x i8> %y) nounwind { 1275 ; SSE-LABEL: psubus_32i8_max: 1276 ; SSE: # %bb.0: # %vector.ph 1277 ; SSE-NEXT: psubusb %xmm2, %xmm0 1278 ; SSE-NEXT: psubusb %xmm3, %xmm1 1279 ; SSE-NEXT: retq 1280 ; 1281 ; AVX1-LABEL: psubus_32i8_max: 1282 ; AVX1: # %bb.0: # %vector.ph 1283 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1284 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1285 ; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm2 1286 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 1287 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1288 ; AVX1-NEXT: retq 1289 ; 1290 ; AVX2-LABEL: psubus_32i8_max: 1291 ; AVX2: # %bb.0: # %vector.ph 1292 ; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 1293 ; AVX2-NEXT: retq 1294 ; 1295 ; AVX512-LABEL: psubus_32i8_max: 1296 ; AVX512: # %bb.0: # %vector.ph 1297 ; AVX512-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 1298 ; AVX512-NEXT: retq 1299 vector.ph: 1300 %cmp = icmp ult <32 x i8> %x, %y 1301 %max = select <32 x i1> %cmp, <32 x i8> %y, <32 x i8> %x 1302 %res = sub <32 x i8> %max, %y 1303 ret <32 x i8> %res 1304 } 1305 1306 define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind { 1307 ; SSE2-LABEL: psubus_8i32_max: 1308 ; SSE2: # %bb.0: # %vector.ph 1309 ; SSE2-NEXT: movdqa %xmm0, %xmm3 1310 ; SSE2-NEXT: pxor %xmm4, %xmm4 1311 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] 1312 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 1313 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] 1314 ; SSE2-NEXT: movdqa %xmm2, %xmm6 1315 ; SSE2-NEXT: pxor %xmm5, %xmm6 1316 ; SSE2-NEXT: movdqa %xmm3, %xmm4 1317 ; SSE2-NEXT: por %xmm5, %xmm4 1318 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 1319 ; SSE2-NEXT: pand %xmm4, %xmm3 1320 ; SSE2-NEXT: pandn %xmm2, %xmm4 1321 ; SSE2-NEXT: por %xmm3, %xmm4 1322 ; SSE2-NEXT: movdqa %xmm1, %xmm3 1323 ; SSE2-NEXT: pxor %xmm5, %xmm3 1324 ; SSE2-NEXT: por %xmm0, %xmm5 1325 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 1326 ; SSE2-NEXT: pand %xmm5, %xmm0 1327 ; SSE2-NEXT: pandn %xmm1, %xmm5 1328 ; SSE2-NEXT: por %xmm5, %xmm0 1329 ; SSE2-NEXT: psubd %xmm1, %xmm0 1330 ; SSE2-NEXT: psubd %xmm2, %xmm4 1331 ; SSE2-NEXT: pslld $16, %xmm4 1332 ; SSE2-NEXT: psrad $16, %xmm4 1333 ; SSE2-NEXT: pslld $16, %xmm0 1334 ; SSE2-NEXT: psrad $16, %xmm0 1335 ; SSE2-NEXT: packssdw %xmm4, %xmm0 1336 ; SSE2-NEXT: retq 1337 ; 1338 ; SSSE3-LABEL: psubus_8i32_max: 1339 ; SSSE3: # %bb.0: # %vector.ph 1340 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1341 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] 1342 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 1343 ; SSSE3-NEXT: pxor %xmm4, %xmm5 1344 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] 1345 ; SSSE3-NEXT: movdqa %xmm6, %xmm7 1346 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 1347 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535] 1348 ; SSSE3-NEXT: pand %xmm7, %xmm2 1349 ; SSSE3-NEXT: pandn %xmm5, %xmm7 1350 ; SSSE3-NEXT: por %xmm2, %xmm7 1351 ; SSSE3-NEXT: pshufb %xmm3, %xmm7 1352 ; SSSE3-NEXT: pxor %xmm1, %xmm4 1353 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 1354 ; SSSE3-NEXT: pand %xmm6, %xmm1 1355 ; SSSE3-NEXT: pandn %xmm5, %xmm6 1356 ; SSSE3-NEXT: por %xmm1, %xmm6 1357 ; SSSE3-NEXT: pshufb %xmm3, %xmm6 1358 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1359 ; SSSE3-NEXT: psubusw %xmm6, %xmm0 1360 ; SSSE3-NEXT: retq 1361 ; 1362 ; SSE41-LABEL: psubus_8i32_max: 1363 ; SSE41: # %bb.0: # %vector.ph 1364 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] 1365 ; SSE41-NEXT: pminud %xmm3, %xmm2 1366 ; SSE41-NEXT: pminud %xmm3, %xmm1 1367 ; SSE41-NEXT: packusdw %xmm2, %xmm1 1368 ; SSE41-NEXT: psubusw %xmm1, %xmm0 1369 ; SSE41-NEXT: retq 1370 ; 1371 ; AVX1-LABEL: psubus_8i32_max: 1372 ; AVX1: # %bb.0: # %vector.ph 1373 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1374 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] 1375 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 1376 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 1377 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 1378 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1379 ; AVX1-NEXT: vzeroupper 1380 ; AVX1-NEXT: retq 1381 ; 1382 ; AVX2-LABEL: psubus_8i32_max: 1383 ; AVX2: # %bb.0: # %vector.ph 1384 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] 1385 ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 1386 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1387 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 1388 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1389 ; AVX2-NEXT: vzeroupper 1390 ; AVX2-NEXT: retq 1391 ; 1392 ; AVX512-LABEL: psubus_8i32_max: 1393 ; AVX512: # %bb.0: # %vector.ph 1394 ; AVX512-NEXT: vpmovusdw %ymm1, %xmm1 1395 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1396 ; AVX512-NEXT: vzeroupper 1397 ; AVX512-NEXT: retq 1398 vector.ph: 1399 %lhs = zext <8 x i16> %x to <8 x i32> 1400 %cond = icmp ult <8 x i32> %lhs, %y 1401 %max = select <8 x i1> %cond, <8 x i32> %y, <8 x i32> %lhs 1402 %sub = sub <8 x i32> %max, %y 1403 %res = trunc <8 x i32> %sub to <8 x i16> 1404 ret <8 x i16> %res 1405 } 1406 1407 define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { 1408 ; SSE2-LABEL: psubus_8i64_max: 1409 ; SSE2: # %bb.0: # %vector.ph 1410 ; SSE2-NEXT: pxor %xmm5, %xmm5 1411 ; SSE2-NEXT: movdqa %xmm0, %xmm10 1412 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] 1413 ; SSE2-NEXT: movdqa %xmm10, %xmm8 1414 ; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] 1415 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] 1416 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] 1417 ; SSE2-NEXT: movdqa %xmm0, %xmm9 1418 ; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] 1419 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] 1420 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648] 1421 ; SSE2-NEXT: movdqa %xmm2, %xmm6 1422 ; SSE2-NEXT: pxor %xmm11, %xmm6 1423 ; SSE2-NEXT: movdqa %xmm0, %xmm7 1424 ; SSE2-NEXT: por %xmm11, %xmm7 1425 ; SSE2-NEXT: movdqa %xmm7, %xmm5 1426 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 1427 ; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2] 1428 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 1429 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 1430 ; SSE2-NEXT: pand %xmm12, %xmm7 1431 ; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3] 1432 ; SSE2-NEXT: por %xmm7, %xmm13 1433 ; SSE2-NEXT: pand %xmm13, %xmm0 1434 ; SSE2-NEXT: pandn %xmm2, %xmm13 1435 ; SSE2-NEXT: por %xmm0, %xmm13 1436 ; SSE2-NEXT: movdqa %xmm1, %xmm0 1437 ; SSE2-NEXT: pxor %xmm11, %xmm0 1438 ; SSE2-NEXT: movdqa %xmm9, %xmm5 1439 ; SSE2-NEXT: por %xmm11, %xmm5 1440 ; SSE2-NEXT: movdqa %xmm5, %xmm7 1441 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 1442 ; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,0,2,2] 1443 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm5 1444 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 1445 ; SSE2-NEXT: pand %xmm12, %xmm5 1446 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] 1447 ; SSE2-NEXT: por %xmm5, %xmm0 1448 ; SSE2-NEXT: pand %xmm0, %xmm9 1449 ; SSE2-NEXT: pandn %xmm1, %xmm0 1450 ; SSE2-NEXT: por %xmm9, %xmm0 1451 ; SSE2-NEXT: movdqa %xmm4, %xmm5 1452 ; SSE2-NEXT: pxor %xmm11, %xmm5 1453 ; SSE2-NEXT: movdqa %xmm10, %xmm7 1454 ; SSE2-NEXT: por %xmm11, %xmm7 1455 ; SSE2-NEXT: movdqa %xmm7, %xmm6 1456 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 1457 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] 1458 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm7 1459 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] 1460 ; SSE2-NEXT: pand %xmm9, %xmm5 1461 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] 1462 ; SSE2-NEXT: por %xmm5, %xmm7 1463 ; SSE2-NEXT: pand %xmm7, %xmm10 1464 ; SSE2-NEXT: pandn %xmm4, %xmm7 1465 ; SSE2-NEXT: por %xmm10, %xmm7 1466 ; SSE2-NEXT: movdqa %xmm3, %xmm5 1467 ; SSE2-NEXT: pxor %xmm11, %xmm5 1468 ; SSE2-NEXT: por %xmm8, %xmm11 1469 ; SSE2-NEXT: movdqa %xmm11, %xmm6 1470 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 1471 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] 1472 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm11 1473 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,3,3] 1474 ; SSE2-NEXT: pand %xmm9, %xmm5 1475 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] 1476 ; SSE2-NEXT: por %xmm5, %xmm6 1477 ; SSE2-NEXT: pand %xmm6, %xmm8 1478 ; SSE2-NEXT: pandn %xmm3, %xmm6 1479 ; SSE2-NEXT: por %xmm8, %xmm6 1480 ; SSE2-NEXT: psubq %xmm3, %xmm6 1481 ; SSE2-NEXT: psubq %xmm4, %xmm7 1482 ; SSE2-NEXT: psubq %xmm1, %xmm0 1483 ; SSE2-NEXT: psubq %xmm2, %xmm13 1484 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,2,3] 1485 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 1486 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1487 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] 1488 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 1489 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] 1490 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 1491 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] 1492 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 1493 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1494 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] 1495 ; SSE2-NEXT: retq 1496 ; 1497 ; SSSE3-LABEL: psubus_8i64_max: 1498 ; SSSE3: # %bb.0: # %vector.ph 1499 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] 1500 ; SSSE3-NEXT: movdqa %xmm2, %xmm7 1501 ; SSSE3-NEXT: pxor %xmm5, %xmm7 1502 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991] 1503 ; SSSE3-NEXT: movdqa %xmm8, %xmm6 1504 ; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6 1505 ; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] 1506 ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm7 1507 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 1508 ; SSSE3-NEXT: pand %xmm9, %xmm7 1509 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] 1510 ; SSSE3-NEXT: por %xmm7, %xmm6 1511 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535] 1512 ; SSSE3-NEXT: pand %xmm6, %xmm2 1513 ; SSSE3-NEXT: pandn %xmm9, %xmm6 1514 ; SSSE3-NEXT: por %xmm2, %xmm6 1515 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] 1516 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm10 = xmm2[0,2,2,3,4,5,6,7] 1517 ; SSSE3-NEXT: movdqa %xmm1, %xmm6 1518 ; SSSE3-NEXT: pxor %xmm5, %xmm6 1519 ; SSSE3-NEXT: movdqa %xmm8, %xmm7 1520 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 1521 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] 1522 ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm6 1523 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] 1524 ; SSSE3-NEXT: pand %xmm2, %xmm6 1525 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] 1526 ; SSSE3-NEXT: por %xmm6, %xmm2 1527 ; SSSE3-NEXT: pand %xmm2, %xmm1 1528 ; SSSE3-NEXT: pandn %xmm9, %xmm2 1529 ; SSSE3-NEXT: por %xmm1, %xmm2 1530 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] 1531 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 1532 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] 1533 ; SSSE3-NEXT: movdqa %xmm4, %xmm2 1534 ; SSSE3-NEXT: pxor %xmm5, %xmm2 1535 ; SSSE3-NEXT: movdqa %xmm8, %xmm6 1536 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 1537 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] 1538 ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2 1539 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 1540 ; SSSE3-NEXT: pand %xmm7, %xmm2 1541 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] 1542 ; SSSE3-NEXT: por %xmm2, %xmm6 1543 ; SSSE3-NEXT: pand %xmm6, %xmm4 1544 ; SSSE3-NEXT: pandn %xmm9, %xmm6 1545 ; SSSE3-NEXT: por %xmm4, %xmm6 1546 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] 1547 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] 1548 ; SSSE3-NEXT: pxor %xmm3, %xmm5 1549 ; SSSE3-NEXT: movdqa %xmm8, %xmm4 1550 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 1551 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] 1552 ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm5 1553 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 1554 ; SSSE3-NEXT: pand %xmm6, %xmm5 1555 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 1556 ; SSSE3-NEXT: por %xmm5, %xmm4 1557 ; SSSE3-NEXT: pand %xmm4, %xmm3 1558 ; SSSE3-NEXT: pandn %xmm9, %xmm4 1559 ; SSSE3-NEXT: por %xmm3, %xmm4 1560 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] 1561 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] 1562 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 1563 ; SSSE3-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] 1564 ; SSSE3-NEXT: psubusw %xmm3, %xmm0 1565 ; SSSE3-NEXT: retq 1566 ; 1567 ; SSE41-LABEL: psubus_8i64_max: 1568 ; SSE41: # %bb.0: # %vector.ph 1569 ; SSE41-NEXT: movdqa %xmm0, %xmm10 1570 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] 1571 ; SSE41-NEXT: movdqa %xmm4, %xmm0 1572 ; SSE41-NEXT: pxor %xmm6, %xmm0 1573 ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991] 1574 ; SSE41-NEXT: movdqa %xmm8, %xmm7 1575 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 1576 ; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] 1577 ; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 1578 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 1579 ; SSE41-NEXT: pand %xmm9, %xmm5 1580 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] 1581 ; SSE41-NEXT: por %xmm5, %xmm0 1582 ; SSE41-NEXT: movapd {{.*#+}} xmm7 = [65535,65535] 1583 ; SSE41-NEXT: movapd %xmm7, %xmm11 1584 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm11 1585 ; SSE41-NEXT: movdqa %xmm3, %xmm0 1586 ; SSE41-NEXT: pxor %xmm6, %xmm0 1587 ; SSE41-NEXT: movdqa %xmm8, %xmm4 1588 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 1589 ; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2] 1590 ; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 1591 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 1592 ; SSE41-NEXT: pand %xmm9, %xmm5 1593 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] 1594 ; SSE41-NEXT: por %xmm5, %xmm0 1595 ; SSE41-NEXT: movapd %xmm7, %xmm4 1596 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 1597 ; SSE41-NEXT: packusdw %xmm11, %xmm4 1598 ; SSE41-NEXT: movdqa %xmm2, %xmm0 1599 ; SSE41-NEXT: pxor %xmm6, %xmm0 1600 ; SSE41-NEXT: movdqa %xmm8, %xmm3 1601 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 1602 ; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,0,2,2] 1603 ; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 1604 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 1605 ; SSE41-NEXT: pand %xmm9, %xmm5 1606 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] 1607 ; SSE41-NEXT: por %xmm5, %xmm0 1608 ; SSE41-NEXT: movapd %xmm7, %xmm3 1609 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 1610 ; SSE41-NEXT: pxor %xmm1, %xmm6 1611 ; SSE41-NEXT: movdqa %xmm8, %xmm0 1612 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 1613 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] 1614 ; SSE41-NEXT: pcmpeqd %xmm8, %xmm6 1615 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] 1616 ; SSE41-NEXT: pand %xmm2, %xmm5 1617 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1618 ; SSE41-NEXT: por %xmm5, %xmm0 1619 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 1620 ; SSE41-NEXT: packusdw %xmm3, %xmm7 1621 ; SSE41-NEXT: packusdw %xmm4, %xmm7 1622 ; SSE41-NEXT: psubusw %xmm7, %xmm10 1623 ; SSE41-NEXT: pxor %xmm1, %xmm1 1624 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero 1625 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] 1626 ; SSE41-NEXT: packusdw %xmm10, %xmm0 1627 ; SSE41-NEXT: retq 1628 ; 1629 ; AVX1-LABEL: psubus_8i64_max: 1630 ; AVX1: # %bb.0: # %vector.ph 1631 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 1632 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] 1633 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 1634 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343] 1635 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 1636 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm6 1637 ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm6 1638 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 1639 ; AVX1-NEXT: vmovapd {{.*#+}} ymm6 = [65535,65535,65535,65535] 1640 ; AVX1-NEXT: vblendvpd %ymm3, %ymm2, %ymm6, %ymm2 1641 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 1642 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1643 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1644 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 1645 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 1646 ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm4 1647 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 1648 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 1649 ; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1 1650 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1651 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 1652 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 1653 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1654 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1655 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1656 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1657 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1658 ; AVX1-NEXT: vzeroupper 1659 ; AVX1-NEXT: retq 1660 ; 1661 ; AVX2-LABEL: psubus_8i64_max: 1662 ; AVX2: # %bb.0: # %vector.ph 1663 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] 1664 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm4 1665 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343] 1666 ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 1667 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [65535,65535,65535,65535] 1668 ; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 1669 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3 1670 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 1671 ; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1 1672 ; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1 1673 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] 1674 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1675 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 1676 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1677 ; AVX2-NEXT: vzeroupper 1678 ; AVX2-NEXT: retq 1679 ; 1680 ; AVX512-LABEL: psubus_8i64_max: 1681 ; AVX512: # %bb.0: # %vector.ph 1682 ; AVX512-NEXT: vpmovusqw %zmm1, %xmm1 1683 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1684 ; AVX512-NEXT: vzeroupper 1685 ; AVX512-NEXT: retq 1686 vector.ph: 1687 %lhs = zext <8 x i16> %x to <8 x i64> 1688 %cond = icmp ult <8 x i64> %lhs, %y 1689 %max = select <8 x i1> %cond, <8 x i64> %y, <8 x i64> %lhs 1690 %sub = sub <8 x i64> %max, %y 1691 %res = trunc <8 x i64> %sub to <8 x i16> 1692 ret <8 x i16> %res 1693 } 1694 1695 define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind { 1696 ; SSE2-LABEL: psubus_16i32_max: 1697 ; SSE2: # %bb.0: # %vector.ph 1698 ; SSE2-NEXT: movdqa %xmm1, %xmm8 1699 ; SSE2-NEXT: pxor %xmm7, %xmm7 1700 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] 1701 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] 1702 ; SSE2-NEXT: movdqa %xmm0, %xmm10 1703 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] 1704 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] 1705 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] 1706 ; SSE2-NEXT: movdqa %xmm3, %xmm6 1707 ; SSE2-NEXT: pxor %xmm7, %xmm6 1708 ; SSE2-NEXT: movdqa %xmm0, %xmm9 1709 ; SSE2-NEXT: por %xmm7, %xmm9 1710 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm9 1711 ; SSE2-NEXT: pand %xmm9, %xmm0 1712 ; SSE2-NEXT: pandn %xmm3, %xmm9 1713 ; SSE2-NEXT: por %xmm0, %xmm9 1714 ; SSE2-NEXT: movdqa %xmm2, %xmm6 1715 ; SSE2-NEXT: pxor %xmm7, %xmm6 1716 ; SSE2-NEXT: movdqa %xmm10, %xmm0 1717 ; SSE2-NEXT: por %xmm7, %xmm0 1718 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 1719 ; SSE2-NEXT: pand %xmm0, %xmm10 1720 ; SSE2-NEXT: pandn %xmm2, %xmm0 1721 ; SSE2-NEXT: por %xmm10, %xmm0 1722 ; SSE2-NEXT: movdqa %xmm5, %xmm10 1723 ; SSE2-NEXT: pxor %xmm7, %xmm10 1724 ; SSE2-NEXT: movdqa %xmm8, %xmm6 1725 ; SSE2-NEXT: por %xmm7, %xmm6 1726 ; SSE2-NEXT: pcmpgtd %xmm10, %xmm6 1727 ; SSE2-NEXT: pand %xmm6, %xmm8 1728 ; SSE2-NEXT: pandn %xmm5, %xmm6 1729 ; SSE2-NEXT: por %xmm8, %xmm6 1730 ; SSE2-NEXT: movdqa %xmm4, %xmm8 1731 ; SSE2-NEXT: pxor %xmm7, %xmm8 1732 ; SSE2-NEXT: por %xmm1, %xmm7 1733 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm7 1734 ; SSE2-NEXT: pand %xmm7, %xmm1 1735 ; SSE2-NEXT: pandn %xmm4, %xmm7 1736 ; SSE2-NEXT: por %xmm7, %xmm1 1737 ; SSE2-NEXT: psubd %xmm4, %xmm1 1738 ; SSE2-NEXT: psubd %xmm5, %xmm6 1739 ; SSE2-NEXT: psubd %xmm2, %xmm0 1740 ; SSE2-NEXT: psubd %xmm3, %xmm9 1741 ; SSE2-NEXT: pslld $16, %xmm9 1742 ; SSE2-NEXT: psrad $16, %xmm9 1743 ; SSE2-NEXT: pslld $16, %xmm0 1744 ; SSE2-NEXT: psrad $16, %xmm0 1745 ; SSE2-NEXT: packssdw %xmm9, %xmm0 1746 ; SSE2-NEXT: pslld $16, %xmm6 1747 ; SSE2-NEXT: psrad $16, %xmm6 1748 ; SSE2-NEXT: pslld $16, %xmm1 1749 ; SSE2-NEXT: psrad $16, %xmm1 1750 ; SSE2-NEXT: packssdw %xmm6, %xmm1 1751 ; SSE2-NEXT: retq 1752 ; 1753 ; SSSE3-LABEL: psubus_16i32_max: 1754 ; SSSE3: # %bb.0: # %vector.ph 1755 ; SSSE3-NEXT: movdqa %xmm1, %xmm8 1756 ; SSSE3-NEXT: pxor %xmm7, %xmm7 1757 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] 1758 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] 1759 ; SSSE3-NEXT: movdqa %xmm0, %xmm10 1760 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] 1761 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] 1762 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] 1763 ; SSSE3-NEXT: movdqa %xmm3, %xmm6 1764 ; SSSE3-NEXT: pxor %xmm7, %xmm6 1765 ; SSSE3-NEXT: movdqa %xmm0, %xmm9 1766 ; SSSE3-NEXT: por %xmm7, %xmm9 1767 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm9 1768 ; SSSE3-NEXT: pand %xmm9, %xmm0 1769 ; SSSE3-NEXT: pandn %xmm3, %xmm9 1770 ; SSSE3-NEXT: por %xmm0, %xmm9 1771 ; SSSE3-NEXT: movdqa %xmm2, %xmm6 1772 ; SSSE3-NEXT: pxor %xmm7, %xmm6 1773 ; SSSE3-NEXT: movdqa %xmm10, %xmm0 1774 ; SSSE3-NEXT: por %xmm7, %xmm0 1775 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0 1776 ; SSSE3-NEXT: pand %xmm0, %xmm10 1777 ; SSSE3-NEXT: pandn %xmm2, %xmm0 1778 ; SSSE3-NEXT: por %xmm10, %xmm0 1779 ; SSSE3-NEXT: movdqa %xmm5, %xmm10 1780 ; SSSE3-NEXT: pxor %xmm7, %xmm10 1781 ; SSSE3-NEXT: movdqa %xmm8, %xmm6 1782 ; SSSE3-NEXT: por %xmm7, %xmm6 1783 ; SSSE3-NEXT: pcmpgtd %xmm10, %xmm6 1784 ; SSSE3-NEXT: pand %xmm6, %xmm8 1785 ; SSSE3-NEXT: pandn %xmm5, %xmm6 1786 ; SSSE3-NEXT: por %xmm8, %xmm6 1787 ; SSSE3-NEXT: movdqa %xmm4, %xmm8 1788 ; SSSE3-NEXT: pxor %xmm7, %xmm8 1789 ; SSSE3-NEXT: por %xmm1, %xmm7 1790 ; SSSE3-NEXT: pcmpgtd %xmm8, %xmm7 1791 ; SSSE3-NEXT: pand %xmm7, %xmm1 1792 ; SSSE3-NEXT: pandn %xmm4, %xmm7 1793 ; SSSE3-NEXT: por %xmm7, %xmm1 1794 ; SSSE3-NEXT: psubd %xmm4, %xmm1 1795 ; SSSE3-NEXT: psubd %xmm5, %xmm6 1796 ; SSSE3-NEXT: psubd %xmm2, %xmm0 1797 ; SSSE3-NEXT: psubd %xmm3, %xmm9 1798 ; SSSE3-NEXT: pslld $16, %xmm9 1799 ; SSSE3-NEXT: psrad $16, %xmm9 1800 ; SSSE3-NEXT: pslld $16, %xmm0 1801 ; SSSE3-NEXT: psrad $16, %xmm0 1802 ; SSSE3-NEXT: packssdw %xmm9, %xmm0 1803 ; SSSE3-NEXT: pslld $16, %xmm6 1804 ; SSSE3-NEXT: psrad $16, %xmm6 1805 ; SSSE3-NEXT: pslld $16, %xmm1 1806 ; SSSE3-NEXT: psrad $16, %xmm1 1807 ; SSSE3-NEXT: packssdw %xmm6, %xmm1 1808 ; SSSE3-NEXT: retq 1809 ; 1810 ; SSE41-LABEL: psubus_16i32_max: 1811 ; SSE41: # %bb.0: # %vector.ph 1812 ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,0,1] 1813 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero 1814 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 1815 ; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1] 1816 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero 1817 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1818 ; SSE41-NEXT: pmaxud %xmm2, %xmm0 1819 ; SSE41-NEXT: pmaxud %xmm3, %xmm7 1820 ; SSE41-NEXT: pmaxud %xmm4, %xmm1 1821 ; SSE41-NEXT: pmaxud %xmm5, %xmm6 1822 ; SSE41-NEXT: psubd %xmm5, %xmm6 1823 ; SSE41-NEXT: psubd %xmm4, %xmm1 1824 ; SSE41-NEXT: psubd %xmm3, %xmm7 1825 ; SSE41-NEXT: psubd %xmm2, %xmm0 1826 ; SSE41-NEXT: pxor %xmm2, %xmm2 1827 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] 1828 ; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0],xmm2[1],xmm7[2],xmm2[3],xmm7[4],xmm2[5],xmm7[6],xmm2[7] 1829 ; SSE41-NEXT: packusdw %xmm7, %xmm0 1830 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] 1831 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0],xmm2[1],xmm6[2],xmm2[3],xmm6[4],xmm2[5],xmm6[6],xmm2[7] 1832 ; SSE41-NEXT: packusdw %xmm6, %xmm1 1833 ; SSE41-NEXT: retq 1834 ; 1835 ; AVX1-LABEL: psubus_16i32_max: 1836 ; AVX1: # %bb.0: # %vector.ph 1837 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1838 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] 1839 ; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3 1840 ; AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1 1841 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 1842 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 1843 ; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3 1844 ; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2 1845 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1846 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1847 ; AVX1-NEXT: vpsubusw %xmm2, %xmm3, %xmm2 1848 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1849 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1850 ; AVX1-NEXT: retq 1851 ; 1852 ; AVX2-LABEL: psubus_16i32_max: 1853 ; AVX2: # %bb.0: # %vector.ph 1854 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535] 1855 ; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1 1856 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 1857 ; AVX2-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 1858 ; AVX2-NEXT: vpminud %ymm3, %ymm2, %ymm2 1859 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 1860 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1861 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1862 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 1863 ; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 1864 ; AVX2-NEXT: vpsubusw %xmm1, %xmm3, %xmm1 1865 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 1866 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1867 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 1868 ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 1869 ; AVX2-NEXT: vpsubusw %xmm2, %xmm0, %xmm0 1870 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 1871 ; AVX2-NEXT: retq 1872 ; 1873 ; AVX512-LABEL: psubus_16i32_max: 1874 ; AVX512: # %bb.0: # %vector.ph 1875 ; AVX512-NEXT: vpmovusdw %zmm1, %ymm1 1876 ; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 1877 ; AVX512-NEXT: retq 1878 vector.ph: 1879 %lhs = zext <16 x i16> %x to <16 x i32> 1880 %cond = icmp ult <16 x i32> %lhs, %y 1881 %max = select <16 x i1> %cond, <16 x i32> %y, <16 x i32> %lhs 1882 %sub = sub <16 x i32> %max, %y 1883 %res = trunc <16 x i32> %sub to <16 x i16> 1884 ret <16 x i16> %res 1885 } 1886 1887 define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwind { 1888 ; SSE2-LABEL: psubus_i16_i32_max_swapped: 1889 ; SSE2: # %bb.0: # %vector.ph 1890 ; SSE2-NEXT: pxor %xmm3, %xmm3 1891 ; SSE2-NEXT: movdqa %xmm0, %xmm4 1892 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 1893 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 1894 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] 1895 ; SSE2-NEXT: movdqa %xmm2, %xmm3 1896 ; SSE2-NEXT: pxor %xmm5, %xmm3 1897 ; SSE2-NEXT: movdqa %xmm0, %xmm6 1898 ; SSE2-NEXT: por %xmm5, %xmm6 1899 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm3 1900 ; SSE2-NEXT: movdqa %xmm2, %xmm6 1901 ; SSE2-NEXT: pand %xmm3, %xmm6 1902 ; SSE2-NEXT: pandn %xmm0, %xmm3 1903 ; SSE2-NEXT: por %xmm6, %xmm3 1904 ; SSE2-NEXT: movdqa %xmm1, %xmm0 1905 ; SSE2-NEXT: pxor %xmm5, %xmm0 1906 ; SSE2-NEXT: por %xmm4, %xmm5 1907 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 1908 ; SSE2-NEXT: movdqa %xmm1, %xmm5 1909 ; SSE2-NEXT: pand %xmm0, %xmm5 1910 ; SSE2-NEXT: pandn %xmm4, %xmm0 1911 ; SSE2-NEXT: por %xmm5, %xmm0 1912 ; SSE2-NEXT: psubd %xmm1, %xmm0 1913 ; SSE2-NEXT: psubd %xmm2, %xmm3 1914 ; SSE2-NEXT: pslld $16, %xmm3 1915 ; SSE2-NEXT: psrad $16, %xmm3 1916 ; SSE2-NEXT: pslld $16, %xmm0 1917 ; SSE2-NEXT: psrad $16, %xmm0 1918 ; SSE2-NEXT: packssdw %xmm3, %xmm0 1919 ; SSE2-NEXT: retq 1920 ; 1921 ; SSSE3-LABEL: psubus_i16_i32_max_swapped: 1922 ; SSSE3: # %bb.0: # %vector.ph 1923 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1924 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] 1925 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 1926 ; SSSE3-NEXT: pxor %xmm4, %xmm5 1927 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] 1928 ; SSSE3-NEXT: movdqa %xmm6, %xmm7 1929 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 1930 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535] 1931 ; SSSE3-NEXT: pand %xmm7, %xmm2 1932 ; SSSE3-NEXT: pandn %xmm5, %xmm7 1933 ; SSSE3-NEXT: por %xmm2, %xmm7 1934 ; SSSE3-NEXT: pshufb %xmm3, %xmm7 1935 ; SSSE3-NEXT: pxor %xmm1, %xmm4 1936 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 1937 ; SSSE3-NEXT: pand %xmm6, %xmm1 1938 ; SSSE3-NEXT: pandn %xmm5, %xmm6 1939 ; SSSE3-NEXT: por %xmm1, %xmm6 1940 ; SSSE3-NEXT: pshufb %xmm3, %xmm6 1941 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1942 ; SSSE3-NEXT: psubusw %xmm6, %xmm0 1943 ; SSSE3-NEXT: retq 1944 ; 1945 ; SSE41-LABEL: psubus_i16_i32_max_swapped: 1946 ; SSE41: # %bb.0: # %vector.ph 1947 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] 1948 ; SSE41-NEXT: pminud %xmm3, %xmm2 1949 ; SSE41-NEXT: pminud %xmm3, %xmm1 1950 ; SSE41-NEXT: packusdw %xmm2, %xmm1 1951 ; SSE41-NEXT: psubusw %xmm1, %xmm0 1952 ; SSE41-NEXT: retq 1953 ; 1954 ; AVX1-LABEL: psubus_i16_i32_max_swapped: 1955 ; AVX1: # %bb.0: # %vector.ph 1956 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1957 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] 1958 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 1959 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 1960 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 1961 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1962 ; AVX1-NEXT: vzeroupper 1963 ; AVX1-NEXT: retq 1964 ; 1965 ; AVX2-LABEL: psubus_i16_i32_max_swapped: 1966 ; AVX2: # %bb.0: # %vector.ph 1967 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] 1968 ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 1969 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1970 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 1971 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1972 ; AVX2-NEXT: vzeroupper 1973 ; AVX2-NEXT: retq 1974 ; 1975 ; AVX512-LABEL: psubus_i16_i32_max_swapped: 1976 ; AVX512: # %bb.0: # %vector.ph 1977 ; AVX512-NEXT: vpmovusdw %ymm1, %xmm1 1978 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 1979 ; AVX512-NEXT: vzeroupper 1980 ; AVX512-NEXT: retq 1981 vector.ph: 1982 %lhs = zext <8 x i16> %x to <8 x i32> 1983 %cond = icmp ult <8 x i32> %y, %lhs 1984 %max = select <8 x i1> %cond, <8 x i32> %lhs, <8 x i32> %y 1985 %sub = sub <8 x i32> %max, %y 1986 %res = trunc <8 x i32> %sub to <8 x i16> 1987 ret <8 x i16> %res 1988 } 1989 1990 define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind { 1991 ; SSE2-LABEL: psubus_i16_i32_min: 1992 ; SSE2: # %bb.0: # %vector.ph 1993 ; SSE2-NEXT: pxor %xmm4, %xmm4 1994 ; SSE2-NEXT: movdqa %xmm0, %xmm3 1995 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 1996 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 1997 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] 1998 ; SSE2-NEXT: movdqa %xmm2, %xmm5 1999 ; SSE2-NEXT: pxor %xmm4, %xmm5 2000 ; SSE2-NEXT: movdqa %xmm0, %xmm6 2001 ; SSE2-NEXT: por %xmm4, %xmm6 2002 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 2003 ; SSE2-NEXT: movdqa %xmm0, %xmm6 2004 ; SSE2-NEXT: pand %xmm5, %xmm6 2005 ; SSE2-NEXT: pandn %xmm2, %xmm5 2006 ; SSE2-NEXT: por %xmm6, %xmm5 2007 ; SSE2-NEXT: movdqa %xmm1, %xmm2 2008 ; SSE2-NEXT: pxor %xmm4, %xmm2 2009 ; SSE2-NEXT: por %xmm3, %xmm4 2010 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 2011 ; SSE2-NEXT: movdqa %xmm3, %xmm4 2012 ; SSE2-NEXT: pand %xmm2, %xmm4 2013 ; SSE2-NEXT: pandn %xmm1, %xmm2 2014 ; SSE2-NEXT: por %xmm4, %xmm2 2015 ; SSE2-NEXT: psubd %xmm2, %xmm3 2016 ; SSE2-NEXT: psubd %xmm5, %xmm0 2017 ; SSE2-NEXT: pslld $16, %xmm0 2018 ; SSE2-NEXT: psrad $16, %xmm0 2019 ; SSE2-NEXT: pslld $16, %xmm3 2020 ; SSE2-NEXT: psrad $16, %xmm3 2021 ; SSE2-NEXT: packssdw %xmm0, %xmm3 2022 ; SSE2-NEXT: movdqa %xmm3, %xmm0 2023 ; SSE2-NEXT: retq 2024 ; 2025 ; SSSE3-LABEL: psubus_i16_i32_min: 2026 ; SSSE3: # %bb.0: # %vector.ph 2027 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 2028 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] 2029 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 2030 ; SSSE3-NEXT: pxor %xmm4, %xmm5 2031 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] 2032 ; SSSE3-NEXT: movdqa %xmm6, %xmm7 2033 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 2034 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535] 2035 ; SSSE3-NEXT: pand %xmm7, %xmm2 2036 ; SSSE3-NEXT: pandn %xmm5, %xmm7 2037 ; SSSE3-NEXT: por %xmm2, %xmm7 2038 ; SSSE3-NEXT: pshufb %xmm3, %xmm7 2039 ; SSSE3-NEXT: pxor %xmm1, %xmm4 2040 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 2041 ; SSSE3-NEXT: pand %xmm6, %xmm1 2042 ; SSSE3-NEXT: pandn %xmm5, %xmm6 2043 ; SSSE3-NEXT: por %xmm1, %xmm6 2044 ; SSSE3-NEXT: pshufb %xmm3, %xmm6 2045 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] 2046 ; SSSE3-NEXT: psubusw %xmm6, %xmm0 2047 ; SSSE3-NEXT: retq 2048 ; 2049 ; SSE41-LABEL: psubus_i16_i32_min: 2050 ; SSE41: # %bb.0: # %vector.ph 2051 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] 2052 ; SSE41-NEXT: pminud %xmm3, %xmm2 2053 ; SSE41-NEXT: pminud %xmm3, %xmm1 2054 ; SSE41-NEXT: packusdw %xmm2, %xmm1 2055 ; SSE41-NEXT: psubusw %xmm1, %xmm0 2056 ; SSE41-NEXT: retq 2057 ; 2058 ; AVX1-LABEL: psubus_i16_i32_min: 2059 ; AVX1: # %bb.0: # %vector.ph 2060 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2061 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] 2062 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 2063 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 2064 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 2065 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 2066 ; AVX1-NEXT: vzeroupper 2067 ; AVX1-NEXT: retq 2068 ; 2069 ; AVX2-LABEL: psubus_i16_i32_min: 2070 ; AVX2: # %bb.0: # %vector.ph 2071 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] 2072 ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 2073 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2074 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 2075 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 2076 ; AVX2-NEXT: vzeroupper 2077 ; AVX2-NEXT: retq 2078 ; 2079 ; AVX512-LABEL: psubus_i16_i32_min: 2080 ; AVX512: # %bb.0: # %vector.ph 2081 ; AVX512-NEXT: vpmovusdw %ymm1, %xmm1 2082 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 2083 ; AVX512-NEXT: vzeroupper 2084 ; AVX512-NEXT: retq 2085 vector.ph: 2086 %lhs = zext <8 x i16> %x to <8 x i32> 2087 %cond = icmp ult <8 x i32> %lhs, %y 2088 %min = select <8 x i1> %cond, <8 x i32> %lhs, <8 x i32> %y 2089 %sub = sub <8 x i32> %lhs, %min 2090 %res = trunc <8 x i32> %sub to <8 x i16> 2091 ret <8 x i16> %res 2092 } 2093