1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE 3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX256,AVX2 5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512F 6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512BW 7 8 ; NOTE: We're testing with loads because ABI lowering creates a concat_vectors that extract_vector_elt creation can see through. 9 ; This would require the combine to recreate the concat_vectors. 10 define <8 x i16> @pmaddubsw_128(<16 x i8>* %Aptr, <16 x i8>* %Bptr) { 11 ; SSE-LABEL: pmaddubsw_128: 12 ; SSE: # %bb.0: 13 ; SSE-NEXT: movdqa (%rsi), %xmm0 14 ; SSE-NEXT: pmaddubsw (%rdi), %xmm0 15 ; SSE-NEXT: retq 16 ; 17 ; AVX-LABEL: pmaddubsw_128: 18 ; AVX: # %bb.0: 19 ; AVX-NEXT: vmovdqa (%rsi), %xmm0 20 ; AVX-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 21 ; AVX-NEXT: retq 22 %A = load <16 x i8>, <16 x i8>* %Aptr 23 %B = load <16 x i8>, <16 x i8>* %Bptr 24 %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 25 %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 26 %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 27 %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 28 %A_even_ext = sext <8 x i8> %A_even to <8 x i32> 29 %B_even_ext = zext <8 x i8> %B_even to <8 x i32> 30 %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32> 31 %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32> 32 %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext 33 %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext 34 %add = add <8 x i32> %even_mul, %odd_mul 35 %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> 36 %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> 37 %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> 38 %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> 39 %trunc = trunc <8 x i32> %min to <8 x i16> 40 ret <8 x i16> %trunc 41 } 42 43 define <16 x i16> @pmaddubsw_256(<32 x i8>* %Aptr, <32 x i8>* %Bptr) { 44 ; SSE-LABEL: pmaddubsw_256: 45 ; SSE: # %bb.0: 46 ; SSE-NEXT: movdqa (%rsi), %xmm0 47 ; SSE-NEXT: movdqa 16(%rsi), %xmm1 48 ; SSE-NEXT: pmaddubsw (%rdi), %xmm0 49 ; SSE-NEXT: pmaddubsw 16(%rdi), %xmm1 50 ; SSE-NEXT: retq 51 ; 52 ; AVX1-LABEL: pmaddubsw_256: 53 ; AVX1: # %bb.0: 54 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 55 ; AVX1-NEXT: vmovdqa (%rsi), %ymm1 56 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 57 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 58 ; AVX1-NEXT: vpmaddubsw %xmm2, %xmm3, %xmm2 59 ; AVX1-NEXT: vpmaddubsw %xmm0, %xmm1, %xmm0 60 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 61 ; AVX1-NEXT: retq 62 ; 63 ; AVX256-LABEL: pmaddubsw_256: 64 ; AVX256: # %bb.0: 65 ; AVX256-NEXT: vmovdqa (%rsi), %ymm0 66 ; AVX256-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0 67 ; AVX256-NEXT: retq 68 %A = load <32 x i8>, <32 x i8>* %Aptr 69 %B = load <32 x i8>, <32 x i8>* %Bptr 70 %A_even = shufflevector <32 x i8> %A, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> 71 %A_odd = shufflevector <32 x i8> %A, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> 72 %B_even = shufflevector <32 x i8> %B, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> 73 %B_odd = shufflevector <32 x i8> %B, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> 74 %A_even_ext = sext <16 x i8> %A_even to <16 x i32> 75 %B_even_ext = zext <16 x i8> %B_even to <16 x i32> 76 %A_odd_ext = sext <16 x i8> %A_odd to <16 x i32> 77 %B_odd_ext = zext <16 x i8> %B_odd to <16 x i32> 78 %even_mul = mul <16 x i32> %A_even_ext, %B_even_ext 79 %odd_mul = mul <16 x i32> %A_odd_ext, %B_odd_ext 80 %add = add <16 x i32> %even_mul, %odd_mul 81 %cmp_max = icmp sgt <16 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> 82 %max = select <16 x i1> %cmp_max, <16 x i32> %add, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> 83 %cmp_min = icmp slt <16 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> 84 %min = select <16 x i1> %cmp_min, <16 x i32> %max, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> 85 %trunc = trunc <16 x i32> %min to <16 x i16> 86 ret <16 x i16> %trunc 87 } 88 89 define <64 x i16> @pmaddubsw_512(<128 x i8>* %Aptr, <128 x i8>* %Bptr) { 90 ; SSE-LABEL: pmaddubsw_512: 91 ; SSE: # %bb.0: 92 ; SSE-NEXT: movdqa 112(%rdx), %xmm0 93 ; SSE-NEXT: movdqa 96(%rdx), %xmm1 94 ; SSE-NEXT: movdqa 80(%rdx), %xmm2 95 ; SSE-NEXT: movdqa 64(%rdx), %xmm3 96 ; SSE-NEXT: movdqa (%rdx), %xmm4 97 ; SSE-NEXT: movdqa 16(%rdx), %xmm5 98 ; SSE-NEXT: movdqa 32(%rdx), %xmm6 99 ; SSE-NEXT: movdqa 48(%rdx), %xmm7 100 ; SSE-NEXT: pmaddubsw (%rsi), %xmm4 101 ; SSE-NEXT: pmaddubsw 16(%rsi), %xmm5 102 ; SSE-NEXT: pmaddubsw 32(%rsi), %xmm6 103 ; SSE-NEXT: pmaddubsw 48(%rsi), %xmm7 104 ; SSE-NEXT: pmaddubsw 64(%rsi), %xmm3 105 ; SSE-NEXT: pmaddubsw 80(%rsi), %xmm2 106 ; SSE-NEXT: pmaddubsw 96(%rsi), %xmm1 107 ; SSE-NEXT: pmaddubsw 112(%rsi), %xmm0 108 ; SSE-NEXT: movdqa %xmm0, 112(%rdi) 109 ; SSE-NEXT: movdqa %xmm1, 96(%rdi) 110 ; SSE-NEXT: movdqa %xmm2, 80(%rdi) 111 ; SSE-NEXT: movdqa %xmm3, 64(%rdi) 112 ; SSE-NEXT: movdqa %xmm7, 48(%rdi) 113 ; SSE-NEXT: movdqa %xmm6, 32(%rdi) 114 ; SSE-NEXT: movdqa %xmm5, 16(%rdi) 115 ; SSE-NEXT: movdqa %xmm4, (%rdi) 116 ; SSE-NEXT: movq %rdi, %rax 117 ; SSE-NEXT: retq 118 ; 119 ; AVX1-LABEL: pmaddubsw_512: 120 ; AVX1: # %bb.0: 121 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 122 ; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 123 ; AVX1-NEXT: vmovdqa 64(%rdi), %ymm2 124 ; AVX1-NEXT: vmovdqa 96(%rdi), %ymm8 125 ; AVX1-NEXT: vmovdqa (%rsi), %ymm4 126 ; AVX1-NEXT: vmovdqa 32(%rsi), %ymm5 127 ; AVX1-NEXT: vmovdqa 64(%rsi), %ymm6 128 ; AVX1-NEXT: vmovdqa 96(%rsi), %ymm9 129 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 130 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm7 131 ; AVX1-NEXT: vpmaddubsw %xmm3, %xmm7, %xmm3 132 ; AVX1-NEXT: vpmaddubsw %xmm0, %xmm4, %xmm0 133 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 134 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 135 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm4 136 ; AVX1-NEXT: vpmaddubsw %xmm3, %xmm4, %xmm3 137 ; AVX1-NEXT: vpmaddubsw %xmm1, %xmm5, %xmm1 138 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 139 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 140 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm4 141 ; AVX1-NEXT: vpmaddubsw %xmm3, %xmm4, %xmm3 142 ; AVX1-NEXT: vpmaddubsw %xmm2, %xmm6, %xmm2 143 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 144 ; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm3 145 ; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm4 146 ; AVX1-NEXT: vpmaddubsw %xmm3, %xmm4, %xmm3 147 ; AVX1-NEXT: vpmaddubsw %xmm8, %xmm9, %xmm4 148 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 149 ; AVX1-NEXT: retq 150 ; 151 ; AVX2-LABEL: pmaddubsw_512: 152 ; AVX2: # %bb.0: 153 ; AVX2-NEXT: vmovdqa (%rsi), %ymm0 154 ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 155 ; AVX2-NEXT: vmovdqa 64(%rsi), %ymm2 156 ; AVX2-NEXT: vmovdqa 96(%rsi), %ymm3 157 ; AVX2-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0 158 ; AVX2-NEXT: vpmaddubsw 32(%rdi), %ymm1, %ymm1 159 ; AVX2-NEXT: vpmaddubsw 64(%rdi), %ymm2, %ymm2 160 ; AVX2-NEXT: vpmaddubsw 96(%rdi), %ymm3, %ymm3 161 ; AVX2-NEXT: retq 162 ; 163 ; AVX512F-LABEL: pmaddubsw_512: 164 ; AVX512F: # %bb.0: 165 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 166 ; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 167 ; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm2 168 ; AVX512F-NEXT: vmovdqa 96(%rsi), %ymm3 169 ; AVX512F-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0 170 ; AVX512F-NEXT: vpmaddubsw 32(%rdi), %ymm1, %ymm1 171 ; AVX512F-NEXT: vpmaddubsw 64(%rdi), %ymm2, %ymm2 172 ; AVX512F-NEXT: vpmaddubsw 96(%rdi), %ymm3, %ymm3 173 ; AVX512F-NEXT: retq 174 ; 175 ; AVX512BW-LABEL: pmaddubsw_512: 176 ; AVX512BW: # %bb.0: 177 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 178 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm1 179 ; AVX512BW-NEXT: vpmaddubsw (%rdi), %zmm0, %zmm0 180 ; AVX512BW-NEXT: vpmaddubsw 64(%rdi), %zmm1, %zmm1 181 ; AVX512BW-NEXT: retq 182 %A = load <128 x i8>, <128 x i8>* %Aptr 183 %B = load <128 x i8>, <128 x i8>* %Bptr 184 %A_even = shufflevector <128 x i8> %A, <128 x i8> undef, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126> 185 %A_odd = shufflevector <128 x i8> %A, <128 x i8> undef, <64 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127> 186 %B_even = shufflevector <128 x i8> %B, <128 x i8> undef, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126> 187 %B_odd = shufflevector <128 x i8> %B, <128 x i8> undef, <64 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127> 188 %A_even_ext = sext <64 x i8> %A_even to <64 x i32> 189 %B_even_ext = zext <64 x i8> %B_even to <64 x i32> 190 %A_odd_ext = sext <64 x i8> %A_odd to <64 x i32> 191 %B_odd_ext = zext <64 x i8> %B_odd to <64 x i32> 192 %even_mul = mul <64 x i32> %A_even_ext, %B_even_ext 193 %odd_mul = mul <64 x i32> %A_odd_ext, %B_odd_ext 194 %add = add <64 x i32> %even_mul, %odd_mul 195 %cmp_max = icmp sgt <64 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> 196 %max = select <64 x i1> %cmp_max, <64 x i32> %add, <64 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> 197 %cmp_min = icmp slt <64 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> 198 %min = select <64 x i1> %cmp_min, <64 x i32> %max, <64 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> 199 %trunc = trunc <64 x i32> %min to <64 x i16> 200 ret <64 x i16> %trunc 201 } 202 203 define <8 x i16> @pmaddubsw_swapped_indices(<16 x i8>* %Aptr, <16 x i8>* %Bptr) { 204 ; SSE-LABEL: pmaddubsw_swapped_indices: 205 ; SSE: # %bb.0: 206 ; SSE-NEXT: movdqa (%rsi), %xmm0 207 ; SSE-NEXT: pmaddubsw (%rdi), %xmm0 208 ; SSE-NEXT: retq 209 ; 210 ; AVX-LABEL: pmaddubsw_swapped_indices: 211 ; AVX: # %bb.0: 212 ; AVX-NEXT: vmovdqa (%rsi), %xmm0 213 ; AVX-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 214 ; AVX-NEXT: retq 215 %A = load <16 x i8>, <16 x i8>* %Aptr 216 %B = load <16 x i8>, <16 x i8>* %Bptr 217 %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;indices aren't all even 218 %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;indices aren't all odd 219 %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;same indices as A 220 %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;same indices as A 221 %A_even_ext = sext <8 x i8> %A_even to <8 x i32> 222 %B_even_ext = zext <8 x i8> %B_even to <8 x i32> 223 %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32> 224 %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32> 225 %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext 226 %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext 227 %add = add <8 x i32> %even_mul, %odd_mul 228 %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> 229 %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> 230 %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> 231 %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> 232 %trunc = trunc <8 x i32> %min to <8 x i16> 233 ret <8 x i16> %trunc 234 } 235 236 define <8 x i16> @pmaddubsw_swapped_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) { 237 ; SSE-LABEL: pmaddubsw_swapped_extend: 238 ; SSE: # %bb.0: 239 ; SSE-NEXT: movdqa (%rdi), %xmm0 240 ; SSE-NEXT: pmaddubsw (%rsi), %xmm0 241 ; SSE-NEXT: retq 242 ; 243 ; AVX-LABEL: pmaddubsw_swapped_extend: 244 ; AVX: # %bb.0: 245 ; AVX-NEXT: vmovdqa (%rdi), %xmm0 246 ; AVX-NEXT: vpmaddubsw (%rsi), %xmm0, %xmm0 247 ; AVX-NEXT: retq 248 %A = load <16 x i8>, <16 x i8>* %Aptr 249 %B = load <16 x i8>, <16 x i8>* %Bptr 250 %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 251 %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 252 %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 253 %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 254 %A_even_ext = zext <8 x i8> %A_even to <8 x i32> 255 %B_even_ext = sext <8 x i8> %B_even to <8 x i32> 256 %A_odd_ext = zext <8 x i8> %A_odd to <8 x i32> 257 %B_odd_ext = sext <8 x i8> %B_odd to <8 x i32> 258 %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext 259 %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext 260 %add = add <8 x i32> %even_mul, %odd_mul 261 %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> 262 %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> 263 %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> 264 %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> 265 %trunc = trunc <8 x i32> %min to <8 x i16> 266 ret <8 x i16> %trunc 267 } 268 269 define <8 x i16> @pmaddubsw_commuted_mul(<16 x i8>* %Aptr, <16 x i8>* %Bptr) { 270 ; SSE-LABEL: pmaddubsw_commuted_mul: 271 ; SSE: # %bb.0: 272 ; SSE-NEXT: movdqa (%rsi), %xmm0 273 ; SSE-NEXT: pmaddubsw (%rdi), %xmm0 274 ; SSE-NEXT: retq 275 ; 276 ; AVX-LABEL: pmaddubsw_commuted_mul: 277 ; AVX: # %bb.0: 278 ; AVX-NEXT: vmovdqa (%rsi), %xmm0 279 ; AVX-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 280 ; AVX-NEXT: retq 281 %A = load <16 x i8>, <16 x i8>* %Aptr 282 %B = load <16 x i8>, <16 x i8>* %Bptr 283 %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 284 %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 285 %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 286 %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 287 %A_even_ext = sext <8 x i8> %A_even to <8 x i32> 288 %B_even_ext = zext <8 x i8> %B_even to <8 x i32> 289 %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32> 290 %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32> 291 %even_mul = mul <8 x i32> %B_even_ext, %A_even_ext 292 %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext 293 %add = add <8 x i32> %even_mul, %odd_mul 294 %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> 295 %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> 296 %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> 297 %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> 298 %trunc = trunc <8 x i32> %min to <8 x i16> 299 ret <8 x i16> %trunc 300 } 301 302 define <8 x i16> @pmaddubsw_bad_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) { 303 ; SSE-LABEL: pmaddubsw_bad_extend: 304 ; SSE: # %bb.0: 305 ; SSE-NEXT: movdqa (%rdi), %xmm1 306 ; SSE-NEXT: movdqa (%rsi), %xmm0 307 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 308 ; SSE-NEXT: pand %xmm0, %xmm2 309 ; SSE-NEXT: movdqa %xmm1, %xmm3 310 ; SSE-NEXT: psllw $8, %xmm3 311 ; SSE-NEXT: psraw $8, %xmm3 312 ; SSE-NEXT: movdqa %xmm3, %xmm4 313 ; SSE-NEXT: pmulhw %xmm2, %xmm4 314 ; SSE-NEXT: pmullw %xmm2, %xmm3 315 ; SSE-NEXT: movdqa %xmm3, %xmm2 316 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 317 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 318 ; SSE-NEXT: psraw $8, %xmm0 319 ; SSE-NEXT: psrlw $8, %xmm1 320 ; SSE-NEXT: movdqa %xmm1, %xmm4 321 ; SSE-NEXT: pmulhw %xmm0, %xmm4 322 ; SSE-NEXT: pmullw %xmm0, %xmm1 323 ; SSE-NEXT: movdqa %xmm1, %xmm0 324 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] 325 ; SSE-NEXT: paddd %xmm2, %xmm0 326 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 327 ; SSE-NEXT: paddd %xmm3, %xmm1 328 ; SSE-NEXT: packssdw %xmm1, %xmm0 329 ; SSE-NEXT: retq 330 ; 331 ; AVX1-LABEL: pmaddubsw_bad_extend: 332 ; AVX1: # %bb.0: 333 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 334 ; AVX1-NEXT: vmovdqa (%rsi), %xmm1 335 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u> 336 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3 337 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 338 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u> 339 ; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm5 340 ; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5 341 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2 342 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero 343 ; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2 344 ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm3 345 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 346 ; AVX1-NEXT: vpmulld %xmm3, %xmm5, %xmm3 347 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <9,11,13,15,u,u,u,u,u,u,u,u,u,u,u,u> 348 ; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm5 349 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero 350 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <1,3,5,7,u,u,u,u,u,u,u,u,u,u,u,u> 351 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 352 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 353 ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm4 354 ; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 355 ; AVX1-NEXT: vpmulld %xmm4, %xmm5, %xmm4 356 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 357 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1 358 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 359 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 360 ; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0 361 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 362 ; AVX1-NEXT: retq 363 ; 364 ; AVX2-LABEL: pmaddubsw_bad_extend: 365 ; AVX2: # %bb.0: 366 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 367 ; AVX2-NEXT: vmovdqa (%rsi), %xmm1 368 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 369 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm3 370 ; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 371 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm2 372 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero 373 ; AVX2-NEXT: vpmulld %ymm2, %ymm3, %ymm2 374 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> 375 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 376 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 377 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 378 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 379 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 380 ; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 381 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 382 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 383 ; AVX2-NEXT: vzeroupper 384 ; AVX2-NEXT: retq 385 ; 386 ; AVX512-LABEL: pmaddubsw_bad_extend: 387 ; AVX512: # %bb.0: 388 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 389 ; AVX512-NEXT: vmovdqa (%rsi), %xmm1 390 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 391 ; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm3 392 ; AVX512-NEXT: vpmovsxbd %xmm3, %ymm3 393 ; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm2 394 ; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero 395 ; AVX512-NEXT: vpmulld %ymm2, %ymm3, %ymm2 396 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> 397 ; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 398 ; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 399 ; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 400 ; AVX512-NEXT: vpmovsxbd %xmm1, %ymm1 401 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 402 ; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0 403 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528] 404 ; AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 405 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767] 406 ; AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 407 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 408 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 409 ; AVX512-NEXT: vzeroupper 410 ; AVX512-NEXT: retq 411 %A = load <16 x i8>, <16 x i8>* %Aptr 412 %B = load <16 x i8>, <16 x i8>* %Bptr 413 %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 414 %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 415 %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 416 %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 417 %A_even_ext = sext <8 x i8> %A_even to <8 x i32> 418 %B_even_ext = zext <8 x i8> %B_even to <8 x i32> 419 %A_odd_ext = zext <8 x i8> %A_odd to <8 x i32> 420 %B_odd_ext = sext <8 x i8> %B_odd to <8 x i32> 421 %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext 422 %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext 423 %add = add <8 x i32> %even_mul, %odd_mul 424 %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> 425 %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> 426 %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> 427 %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> 428 %trunc = trunc <8 x i32> %min to <8 x i16> 429 ret <8 x i16> %trunc 430 } 431 432 define <8 x i16> @pmaddubsw_bad_indices(<16 x i8>* %Aptr, <16 x i8>* %Bptr) { 433 ; SSE-LABEL: pmaddubsw_bad_indices: 434 ; SSE: # %bb.0: 435 ; SSE-NEXT: movdqa (%rdi), %xmm1 436 ; SSE-NEXT: movdqa (%rsi), %xmm0 437 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 438 ; SSE-NEXT: pand %xmm0, %xmm2 439 ; SSE-NEXT: movdqa %xmm1, %xmm3 440 ; SSE-NEXT: pshufb {{.*#+}} xmm3 = xmm3[u,1,u,2,u,5,u,6,u,9,u,10,u,13,u,14] 441 ; SSE-NEXT: psraw $8, %xmm3 442 ; SSE-NEXT: movdqa %xmm3, %xmm4 443 ; SSE-NEXT: pmulhw %xmm2, %xmm4 444 ; SSE-NEXT: pmullw %xmm2, %xmm3 445 ; SSE-NEXT: movdqa %xmm3, %xmm2 446 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] 447 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 448 ; SSE-NEXT: psrlw $8, %xmm0 449 ; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,0,u,3,u,4,u,7,u,8,u,11,u,12,u,15] 450 ; SSE-NEXT: psraw $8, %xmm1 451 ; SSE-NEXT: movdqa %xmm1, %xmm4 452 ; SSE-NEXT: pmulhw %xmm0, %xmm4 453 ; SSE-NEXT: pmullw %xmm0, %xmm1 454 ; SSE-NEXT: movdqa %xmm1, %xmm0 455 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] 456 ; SSE-NEXT: paddd %xmm2, %xmm0 457 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 458 ; SSE-NEXT: paddd %xmm3, %xmm1 459 ; SSE-NEXT: packssdw %xmm1, %xmm0 460 ; SSE-NEXT: retq 461 ; 462 ; AVX1-LABEL: pmaddubsw_bad_indices: 463 ; AVX1: # %bb.0: 464 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 465 ; AVX1-NEXT: vmovdqa (%rsi), %xmm1 466 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[9,10,13,14,u,u,u,u,u,u,u,u,u,u,u,u] 467 ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 468 ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,2,5,6,u,u,u,u,u,u,u,u,u,u,u,u] 469 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 470 ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u] 471 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero 472 ; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2 473 ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] 474 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero 475 ; AVX1-NEXT: vpmulld %xmm4, %xmm3, %xmm3 476 ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[8,11,12,15,u,u,u,u,u,u,u,u,u,u,u,u] 477 ; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 478 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,u,u,u,u,u,u,u,u,u,u,u,u] 479 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 480 ; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[9,11,13,15,u,u,u,u,u,u,u,u,u,u,u,u] 481 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero 482 ; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4 483 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 484 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,u,u,u,u,u,u,u,u,u,u,u,u] 485 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 486 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 487 ; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0 488 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 489 ; AVX1-NEXT: retq 490 ; 491 ; AVX2-LABEL: pmaddubsw_bad_indices: 492 ; AVX2: # %bb.0: 493 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 494 ; AVX2-NEXT: vmovdqa (%rsi), %xmm1 495 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u] 496 ; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2 497 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 498 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero 499 ; AVX2-NEXT: vpmulld %ymm3, %ymm2, %ymm2 500 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u] 501 ; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 502 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] 503 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero 504 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 505 ; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 506 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 507 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 508 ; AVX2-NEXT: vzeroupper 509 ; AVX2-NEXT: retq 510 ; 511 ; AVX512-LABEL: pmaddubsw_bad_indices: 512 ; AVX512: # %bb.0: 513 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 514 ; AVX512-NEXT: vmovdqa (%rsi), %xmm1 515 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u] 516 ; AVX512-NEXT: vpmovsxbd %xmm2, %ymm2 517 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 518 ; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero 519 ; AVX512-NEXT: vpmulld %ymm3, %ymm2, %ymm2 520 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u] 521 ; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0 522 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] 523 ; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero 524 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 525 ; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0 526 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528] 527 ; AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 528 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767] 529 ; AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 530 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 531 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 532 ; AVX512-NEXT: vzeroupper 533 ; AVX512-NEXT: retq 534 %A = load <16 x i8>, <16 x i8>* %Aptr 535 %B = load <16 x i8>, <16 x i8>* %Bptr 536 %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;indices aren't all even 537 %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;indices aren't all odd 538 %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ;different than A 539 %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ;different than A 540 %A_even_ext = sext <8 x i8> %A_even to <8 x i32> 541 %B_even_ext = zext <8 x i8> %B_even to <8 x i32> 542 %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32> 543 %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32> 544 %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext 545 %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext 546 %add = add <8 x i32> %even_mul, %odd_mul 547 %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> 548 %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> 549 %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> 550 %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> 551 %trunc = trunc <8 x i32> %min to <8 x i16> 552 ret <8 x i16> %trunc 553 } 554