; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX256,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512BW

; NOTE: We're testing with loads because, if the vectors were passed directly as arguments, ABI lowering would create a concat_vectors that extract_vector_elt creation can see through.
; The combine would then have to recreate the concat_vectors.
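; Basic case: the even and odd i8 elements of A (sign-extended) and B (zero-extended) are
; multiplied, adjacent products are added, and the sums are saturated to i16. This should fold
; to a single (v)pmaddubsw on all targets.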
define <8 x i16> @pmaddubsw_128(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_128:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    pmaddubsw (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmaddubsw_128:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

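; Same computation at 256-bit width: SSE splits it into two 128-bit pmaddubsw ops, AVX1 splits
; the 256-bit inputs and uses two vpmaddubsw, and AVX2/AVX512 use a single 256-bit vpmaddubsw.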
define <16 x i16> @pmaddubsw_256(<32 x i8>* %Aptr, <32 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_256:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    movdqa 16(%rsi), %xmm1
; SSE-NEXT:    pmaddubsw (%rdi), %xmm0
; SSE-NEXT:    pmaddubsw 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: pmaddubsw_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vmovdqa (%rsi), %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpmaddubsw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmaddubsw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX256-LABEL: pmaddubsw_256:
; AVX256:       # %bb.0:
; AVX256-NEXT:    vmovdqa (%rsi), %ymm0
; AVX256-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0
; AVX256-NEXT:    retq
  %A = load <32 x i8>, <32 x i8>* %Aptr
  %B = load <32 x i8>, <32 x i8>* %Bptr
  %A_even = shufflevector <32 x i8> %A, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %A_odd = shufflevector <32 x i8> %A, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %B_even = shufflevector <32 x i8> %B, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %B_odd = shufflevector <32 x i8> %B, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %A_even_ext = sext <16 x i8> %A_even to <16 x i32>
  %B_even_ext = zext <16 x i8> %B_even to <16 x i32>
  %A_odd_ext = sext <16 x i8> %A_odd to <16 x i32>
  %B_odd_ext = zext <16 x i8> %B_odd to <16 x i32>
  %even_mul = mul <16 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <16 x i32> %A_odd_ext, %B_odd_ext
  %add = add <16 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <16 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <16 x i1> %cmp_max, <16 x i32> %add, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <16 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <16 x i1> %cmp_min, <16 x i32> %max, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <16 x i32> %min to <16 x i16>
  ret <16 x i16> %trunc
}

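; Same computation at 512-bit width (<128 x i8> inputs): SSE and AVX1 use eight 128-bit
; pmaddubsw ops, AVX2 and AVX512F use four 256-bit vpmaddubsw, and AVX512BW uses two 512-bit
; vpmaddubsw.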
define <64 x i16> @pmaddubsw_512(<128 x i8>* %Aptr, <128 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_512:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa 112(%rdx), %xmm0
; SSE-NEXT:    movdqa 96(%rdx), %xmm1
; SSE-NEXT:    movdqa 80(%rdx), %xmm2
; SSE-NEXT:    movdqa 64(%rdx), %xmm3
; SSE-NEXT:    movdqa (%rdx), %xmm4
; SSE-NEXT:    movdqa 16(%rdx), %xmm5
; SSE-NEXT:    movdqa 32(%rdx), %xmm6
; SSE-NEXT:    movdqa 48(%rdx), %xmm7
; SSE-NEXT:    pmaddubsw (%rsi), %xmm4
; SSE-NEXT:    pmaddubsw 16(%rsi), %xmm5
; SSE-NEXT:    pmaddubsw 32(%rsi), %xmm6
; SSE-NEXT:    pmaddubsw 48(%rsi), %xmm7
; SSE-NEXT:    pmaddubsw 64(%rsi), %xmm3
; SSE-NEXT:    pmaddubsw 80(%rsi), %xmm2
; SSE-NEXT:    pmaddubsw 96(%rsi), %xmm1
; SSE-NEXT:    pmaddubsw 112(%rsi), %xmm0
; SSE-NEXT:    movdqa %xmm0, 112(%rdi)
; SSE-NEXT:    movdqa %xmm1, 96(%rdi)
; SSE-NEXT:    movdqa %xmm2, 80(%rdi)
; SSE-NEXT:    movdqa %xmm3, 64(%rdi)
; SSE-NEXT:    movdqa %xmm7, 48(%rdi)
; SSE-NEXT:    movdqa %xmm6, 32(%rdi)
; SSE-NEXT:    movdqa %xmm5, 16(%rdi)
; SSE-NEXT:    movdqa %xmm4, (%rdi)
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    retq
;
; AVX1-LABEL: pmaddubsw_512:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX1-NEXT:    vmovdqa 64(%rdi), %ymm2
; AVX1-NEXT:    vmovdqa 96(%rdi), %ymm8
; AVX1-NEXT:    vmovdqa (%rsi), %ymm4
; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm5
; AVX1-NEXT:    vmovdqa 64(%rsi), %ymm6
; AVX1-NEXT:    vmovdqa 96(%rsi), %ymm9
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm7
; AVX1-NEXT:    vpmaddubsw %xmm3, %xmm7, %xmm3
; AVX1-NEXT:    vpmaddubsw %xmm0, %xmm4, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm4
; AVX1-NEXT:    vpmaddubsw %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpmaddubsw %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm4
; AVX1-NEXT:    vpmaddubsw %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpmaddubsw %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm4
; AVX1-NEXT:    vpmaddubsw %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpmaddubsw %xmm8, %xmm9, %xmm4
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT:    retq
;
; AVX2-LABEL: pmaddubsw_512:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rsi), %ymm0
; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm1
; AVX2-NEXT:    vmovdqa 64(%rsi), %ymm2
; AVX2-NEXT:    vmovdqa 96(%rsi), %ymm3
; AVX2-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0
; AVX2-NEXT:    vpmaddubsw 32(%rdi), %ymm1, %ymm1
; AVX2-NEXT:    vpmaddubsw 64(%rdi), %ymm2, %ymm2
; AVX2-NEXT:    vpmaddubsw 96(%rdi), %ymm3, %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: pmaddubsw_512:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rsi), %ymm1
; AVX512F-NEXT:    vmovdqa 64(%rsi), %ymm2
; AVX512F-NEXT:    vmovdqa 96(%rsi), %ymm3
; AVX512F-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0
; AVX512F-NEXT:    vpmaddubsw 32(%rdi), %ymm1, %ymm1
; AVX512F-NEXT:    vpmaddubsw 64(%rdi), %ymm2, %ymm2
; AVX512F-NEXT:    vpmaddubsw 96(%rdi), %ymm3, %ymm3
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: pmaddubsw_512:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm0
; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm1
; AVX512BW-NEXT:    vpmaddubsw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT:    vpmaddubsw 64(%rdi), %zmm1, %zmm1
; AVX512BW-NEXT:    retq
  %A = load <128 x i8>, <128 x i8>* %Aptr
  %B = load <128 x i8>, <128 x i8>* %Bptr
  %A_even = shufflevector <128 x i8> %A, <128 x i8> undef, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126>
  %A_odd = shufflevector <128 x i8> %A, <128 x i8> undef, <64 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127>
  %B_even = shufflevector <128 x i8> %B, <128 x i8> undef, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126>
  %B_odd = shufflevector <128 x i8> %B, <128 x i8> undef, <64 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127>
  %A_even_ext = sext <64 x i8> %A_even to <64 x i32>
  %B_even_ext = zext <64 x i8> %B_even to <64 x i32>
  %A_odd_ext = sext <64 x i8> %A_odd to <64 x i32>
  %B_odd_ext = zext <64 x i8> %B_odd to <64 x i32>
  %even_mul = mul <64 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <64 x i32> %A_odd_ext, %B_odd_ext
  %add = add <64 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <64 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <64 x i1> %cmp_max, <64 x i32> %add, <64 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <64 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <64 x i1> %cmp_min, <64 x i32> %max, <64 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <64 x i32> %min to <64 x i16>
  ret <64 x i16> %trunc
}

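; The even/odd shuffle masks are scrambled within each pair of elements, but A and B use the
; same masks, so matching elements are still multiplied together and pmaddubsw is still formed.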
define <8 x i16> @pmaddubsw_swapped_indices(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_swapped_indices:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    pmaddubsw (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmaddubsw_swapped_indices:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;indices aren't all even
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;indices aren't all odd
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;same indices as A
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;same indices as A
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

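; The extensions are swapped relative to the basic case (A is zero-extended, B is sign-extended);
; pmaddubsw is still formed, with its operands exchanged.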
define <8 x i16> @pmaddubsw_swapped_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_swapped_extend:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    pmaddubsw (%rsi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmaddubsw_swapped_extend:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpmaddubsw (%rsi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = zext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = sext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = zext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = sext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

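; The operands of the even multiply are commuted relative to the odd multiply; the pattern is
; still recognized and pmaddubsw is formed.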
define <8 x i16> @pmaddubsw_commuted_mul(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_commuted_mul:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    pmaddubsw (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmaddubsw_commuted_mul:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %B_even_ext, %A_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

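; Negative test: the even pair multiplies sext(A) by zext(B) while the odd pair multiplies
; zext(A) by sext(B), so the signedness is inconsistent and pmaddubsw must not be formed.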
define <8 x i16> @pmaddubsw_bad_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_bad_extend:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psllw $8, %xmm3
; SSE-NEXT:    psraw $8, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    pmulhw %xmm2, %xmm4
; SSE-NEXT:    pmullw %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm2
; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    pmulhw %xmm0, %xmm4
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: pmaddubsw_bad_extend:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa (%rsi), %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
; AVX1-NEXT:    vpmovsxbd %xmm5, %xmm5
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm3
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm3, %xmm5, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <9,11,13,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = <1,3,5,7,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm4
; AVX1-NEXT:    vpmulld %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: pmaddubsw_bad_extend:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa (%rsi), %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
; AVX2-NEXT:    vpmovsxbd %xmm3, %ymm3
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
; AVX2-NEXT:    vpmulld %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: pmaddubsw_bad_extend:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
; AVX512-NEXT:    vpmovsxbd %xmm3, %ymm3
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
; AVX512-NEXT:    vpmulld %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528]
; AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; AVX512-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = zext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = sext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

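; Negative test: A and B are shuffled with different index patterns, so the multiplied elements
; no longer come from matching positions and pmaddubsw must not be formed.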
define <8 x i16> @pmaddubsw_bad_indices(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_bad_indices:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[u,1,u,2,u,5,u,6,u,9,u,10,u,13,u,14]
; SSE-NEXT:    psraw $8, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    pmulhw %xmm2, %xmm4
; SSE-NEXT:    pmullw %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm2
; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,0,u,3,u,4,u,7,u,8,u,11,u,12,u,15]
; SSE-NEXT:    psraw $8, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    pmulhw %xmm0, %xmm4
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: pmaddubsw_bad_indices:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa (%rsi), %xmm1
; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[9,10,13,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[1,2,5,6,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[8,11,12,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm4
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[9,11,13,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: pmaddubsw_bad_indices:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa (%rsi), %xmm1
; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpmovsxbd %xmm2, %ymm2
; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX2-NEXT:    vpmulld %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: pmaddubsw_bad_indices:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpmovsxbd %xmm2, %ymm2
; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX512-NEXT:    vpmulld %ymm3, %ymm2, %ymm2
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpmovsxbd %xmm0, %ymm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528]
; AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; AVX512-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;indices aren't all even
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;indices aren't all odd
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ;different than A
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ;different than A
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}