; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX256,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512BW

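; These tests check that a vectorized sum-of-products loop (sign-extend two
; i16 or i8 vectors to i32, multiply, accumulate, then horizontally reduce
; in %middle.block) is lowered to the (v)pmaddwd multiply-add idiom wherever
; the vectors are wide enough, across SSE2, AVX1, AVX2 and AVX-512.
;
; Judging by the mangled names (_Z10test_shortPsS_i demangles to
; test_short(short*, short*, int)), the IR resembles what the loop
; vectorizer would emit for a dot product along these lines; the exact C
; source and build flags are an assumption, not part of this file:
;
;   int test_short(short *a, short *b, int n) {
;     int sum = 0;
;     for (int i = 0; i < n; i++)
;       sum += a[i] * b[i];
;     return sum;
;   }
;
; The _128 variant directly below works on 4 x i16 per iteration, too narrow
; for pmaddwd: SSE2 widens the multiply with pmullw/pmulhw and AVX uses
; vpmovsxwd + vpmulld instead.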
define i32 @_Z10test_shortPsS_i_128(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z10test_shortPsS_i_128:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movl %edx, %eax
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    xorl %ecx, %ecx
; SSE2-NEXT:    .p2align 4, 0x90
; SSE2-NEXT:  .LBB0_1: # %vector.body
; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pmulhw %xmm1, %xmm3
; SSE2-NEXT:    pmullw %xmm1, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    addq $8, %rcx
; SSE2-NEXT:    cmpq %rcx, %rax
; SSE2-NEXT:    jne .LBB0_1
; SSE2-NEXT:  # %bb.2: # %middle.block
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; AVX-LABEL: _Z10test_shortPsS_i_128:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    movl %edx, %eax
; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT:    xorl %ecx, %ecx
; AVX-NEXT:    .p2align 4, 0x90
; AVX-NEXT:  .LBB0_1: # %vector.body
; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX-NEXT:    vpmovsxwd (%rdi,%rcx,2), %xmm1
; AVX-NEXT:    vpmovsxwd (%rsi,%rcx,2), %xmm2
; AVX-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    addq $8, %rcx
; AVX-NEXT:    cmpq %rcx, %rax
; AVX-NEXT:    jne .LBB0_1
; AVX-NEXT:  # %bb.2: # %middle.block
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <4 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i16, i16* %0, i64 %index
  %5 = bitcast i16* %4 to <4 x i16>*
  %wide.load = load <4 x i16>, <4 x i16>* %5, align 2
  %6 = sext <4 x i16> %wide.load to <4 x i32>
  %7 = getelementptr inbounds i16, i16* %1, i64 %index
  %8 = bitcast i16* %7 to <4 x i16>*
  %wide.load14 = load <4 x i16>, <4 x i16>* %8, align 2
  %9 = sext <4 x i16> %wide.load14 to <4 x i32>
  %10 = mul nsw <4 x i32> %9, %6
  %11 = add nsw <4 x i32> %10, %vec.phi
  %index.next = add i64 %index, 8
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf15 = shufflevector <4 x i32> %11, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx16 = add <4 x i32> %11, %rdx.shuf15
  %rdx.shuf17 = shufflevector <4 x i32> %bin.rdx16, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <4 x i32> %bin.rdx16, %rdx.shuf17
  %13 = extractelement <4 x i32> %bin.rdx18, i32 0
  ret i32 %13
}

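; 8 x i16 per iteration: wide enough for the pmaddwd idiom, so every target
; multiplies and accumulates with a single (v)pmaddwd per 128 bits of input.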
define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z10test_shortPsS_i_256:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movl %edx, %eax
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    xorl %ecx, %ecx
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    .p2align 4, 0x90
; SSE2-NEXT:  .LBB1_1: # %vector.body
; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE2-NEXT:    movdqu (%rdi,%rcx,2), %xmm2
; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm3
; SSE2-NEXT:    pmaddwd %xmm2, %xmm3
; SSE2-NEXT:    paddd %xmm3, %xmm1
; SSE2-NEXT:    addq $8, %rcx
; SSE2-NEXT:    cmpq %rcx, %rax
; SSE2-NEXT:    jne .LBB1_1
; SSE2-NEXT:  # %bb.2: # %middle.block
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; AVX1-LABEL: _Z10test_shortPsS_i_256:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    movl %edx, %eax
; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    xorl %ecx, %ecx
; AVX1-NEXT:    .p2align 4, 0x90
; AVX1-NEXT:  .LBB1_1: # %vector.body
; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX1-NEXT:    vmovdqu (%rsi,%rcx,2), %xmm1
; AVX1-NEXT:    vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT:    addq $8, %rcx
; AVX1-NEXT:    cmpq %rcx, %rax
; AVX1-NEXT:    jne .LBB1_1
; AVX1-NEXT:  # %bb.2: # %middle.block
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX256-LABEL: _Z10test_shortPsS_i_256:
; AVX256:       # %bb.0: # %entry
; AVX256-NEXT:    movl %edx, %eax
; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX256-NEXT:    xorl %ecx, %ecx
; AVX256-NEXT:    .p2align 4, 0x90
; AVX256-NEXT:  .LBB1_1: # %vector.body
; AVX256-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX256-NEXT:    vmovdqu (%rsi,%rcx,2), %xmm1
; AVX256-NEXT:    vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1
; AVX256-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX256-NEXT:    addq $8, %rcx
; AVX256-NEXT:    cmpq %rcx, %rax
; AVX256-NEXT:    jne .LBB1_1
; AVX256-NEXT:  # %bb.2: # %middle.block
; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX256-NEXT:    vmovd %xmm0, %eax
; AVX256-NEXT:    vzeroupper
; AVX256-NEXT:    retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <8 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i16, i16* %0, i64 %index
  %5 = bitcast i16* %4 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %5, align 2
  %6 = sext <8 x i16> %wide.load to <8 x i32>
  %7 = getelementptr inbounds i16, i16* %1, i64 %index
  %8 = bitcast i16* %7 to <8 x i16>*
  %wide.load14 = load <8 x i16>, <8 x i16>* %8, align 2
  %9 = sext <8 x i16> %wide.load14 to <8 x i32>
  %10 = mul nsw <8 x i32> %9, %6
  %11 = add nsw <8 x i32> %10, %vec.phi
  %index.next = add i64 %index, 8
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf = shufflevector <8 x i32> %11, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %11, %rdx.shuf
  %rdx.shuf15 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx16 = add <8 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <8 x i32> %bin.rdx16, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <8 x i32> %bin.rdx16, %rdx.shuf17
  %13 = extractelement <8 x i32> %bin.rdx18, i32 0
  ret i32 %13
}

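; 16 x i16 per iteration: SSE2 and AVX1 use pmaddwd on 128-bit halves;
; AVX2 and AVX-512 use a single 256-bit vpmaddwd (AVX-512 accumulating in zmm).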
define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z10test_shortPsS_i_512:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movl %edx, %eax
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    xorl %ecx, %ecx
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    .p2align 4, 0x90
; SSE2-NEXT:  .LBB2_1: # %vector.body
; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE2-NEXT:    movdqu (%rdi,%rcx,2), %xmm3
; SSE2-NEXT:    movdqu 16(%rdi,%rcx,2), %xmm4
; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm5
; SSE2-NEXT:    pmaddwd %xmm3, %xmm5
; SSE2-NEXT:    paddd %xmm5, %xmm2
; SSE2-NEXT:    movdqu 16(%rsi,%rcx,2), %xmm3
; SSE2-NEXT:    pmaddwd %xmm4, %xmm3
; SSE2-NEXT:    paddd %xmm3, %xmm1
; SSE2-NEXT:    addq $16, %rcx
; SSE2-NEXT:    cmpq %rcx, %rax
; SSE2-NEXT:    jne .LBB2_1
; SSE2-NEXT:  # %bb.2: # %middle.block
; SSE2-NEXT:    paddd %xmm0, %xmm2
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    paddd %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; AVX1-LABEL: _Z10test_shortPsS_i_512:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    movl %edx, %eax
; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    xorl %ecx, %ecx
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    .p2align 4, 0x90
; AVX1-NEXT:  .LBB2_1: # %vector.body
; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX1-NEXT:    vmovdqu (%rdi,%rcx,2), %ymm2
; AVX1-NEXT:    vmovdqu (%rsi,%rcx,2), %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT:    vpmaddwd %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vpmaddwd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT:    addq $16, %rcx
; AVX1-NEXT:    cmpq %rcx, %rax
; AVX1-NEXT:    jne .LBB2_1
; AVX1-NEXT:  # %bb.2: # %middle.block
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: _Z10test_shortPsS_i_512:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    movl %edx, %eax
; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    xorl %ecx, %ecx
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    .p2align 4, 0x90
; AVX2-NEXT:  .LBB2_1: # %vector.body
; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX2-NEXT:    vmovdqu (%rsi,%rcx,2), %ymm2
; AVX2-NEXT:    vpmaddwd (%rdi,%rcx,2), %ymm2, %ymm2
; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    addq $16, %rcx
; AVX2-NEXT:    cmpq %rcx, %rax
; AVX2-NEXT:    jne .LBB2_1
; AVX2-NEXT:  # %bb.2: # %middle.block
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: _Z10test_shortPsS_i_512:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    movl %edx, %eax
; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    xorl %ecx, %ecx
; AVX512-NEXT:    .p2align 4, 0x90
; AVX512-NEXT:  .LBB2_1: # %vector.body
; AVX512-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX512-NEXT:    vmovdqu (%rsi,%rcx,2), %ymm1
; AVX512-NEXT:    vpmaddwd (%rdi,%rcx,2), %ymm1, %ymm1
; AVX512-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    addq $16, %rcx
; AVX512-NEXT:    cmpq %rcx, %rax
; AVX512-NEXT:    jne .LBB2_1
; AVX512-NEXT:  # %bb.2: # %middle.block
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <16 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i16, i16* %0, i64 %index
  %5 = bitcast i16* %4 to <16 x i16>*
  %wide.load = load <16 x i16>, <16 x i16>* %5, align 2
  %6 = sext <16 x i16> %wide.load to <16 x i32>
  %7 = getelementptr inbounds i16, i16* %1, i64 %index
  %8 = bitcast i16* %7 to <16 x i16>*
  %wide.load14 = load <16 x i16>, <16 x i16>* %8, align 2
  %9 = sext <16 x i16> %wide.load14 to <16 x i32>
  %10 = mul nsw <16 x i32> %9, %6
  %11 = add nsw <16 x i32> %10, %vec.phi
  %index.next = add i64 %index, 16
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf1 = shufflevector <16 x i32> %11, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx1 = add <16 x i32> %11, %rdx.shuf1
  %rdx.shuf = shufflevector <16 x i32> %bin.rdx1, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i32> %bin.rdx1, %rdx.shuf
  %rdx.shuf15 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx16 = add <16 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <16 x i32> %bin.rdx16, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <16 x i32> %bin.rdx16, %rdx.shuf17
  %13 = extractelement <16 x i32> %bin.rdx18, i32 0
  ret i32 %13
}

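; 32 x i16 per iteration: SSE2 and AVX1 split the work into four 128-bit
; pmaddwd, AVX2 and AVX512F into two 256-bit vpmaddwd; only AVX512BW can use
; a single 512-bit vpmaddwd.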
define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z10test_shortPsS_i_1024:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movl %edx, %eax
; SSE2-NEXT:    pxor %xmm8, %xmm8
; SSE2-NEXT:    xorl %ecx, %ecx
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    .p2align 4, 0x90
; SSE2-NEXT:  .LBB3_1: # %vector.body
; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE2-NEXT:    movdqu (%rdi,%rcx,2), %xmm5
; SSE2-NEXT:    movdqu 16(%rdi,%rcx,2), %xmm6
; SSE2-NEXT:    movdqu 32(%rdi,%rcx,2), %xmm7
; SSE2-NEXT:    movdqu 48(%rdi,%rcx,2), %xmm9
; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm0
; SSE2-NEXT:    pmaddwd %xmm5, %xmm0
; SSE2-NEXT:    paddd %xmm0, %xmm2
; SSE2-NEXT:    movdqu 16(%rsi,%rcx,2), %xmm0
; SSE2-NEXT:    pmaddwd %xmm6, %xmm0
; SSE2-NEXT:    paddd %xmm0, %xmm4
; SSE2-NEXT:    movdqu 32(%rsi,%rcx,2), %xmm0
; SSE2-NEXT:    pmaddwd %xmm7, %xmm0
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movdqu 48(%rsi,%rcx,2), %xmm0
; SSE2-NEXT:    pmaddwd %xmm9, %xmm0
; SSE2-NEXT:    paddd %xmm0, %xmm3
; SSE2-NEXT:    addq $16, %rcx
; SSE2-NEXT:    cmpq %rcx, %rax
; SSE2-NEXT:    jne .LBB3_1
; SSE2-NEXT:  # %bb.2: # %middle.block
; SSE2-NEXT:    paddd %xmm8, %xmm4
; SSE2-NEXT:    paddd %xmm8, %xmm3
; SSE2-NEXT:    paddd %xmm4, %xmm3
; SSE2-NEXT:    paddd %xmm8, %xmm2
; SSE2-NEXT:    paddd %xmm8, %xmm1
; SSE2-NEXT:    paddd %xmm3, %xmm1
; SSE2-NEXT:    paddd %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; AVX1-LABEL: _Z10test_shortPsS_i_1024:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    movl %edx, %eax
; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
; AVX1-NEXT:    xorl %ecx, %ecx
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    .p2align 4, 0x90
; AVX1-NEXT:  .LBB3_1: # %vector.body
; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX1-NEXT:    vmovdqu (%rdi,%rcx,2), %ymm3
; AVX1-NEXT:    vmovdqu 32(%rdi,%rcx,2), %ymm4
; AVX1-NEXT:    vmovdqu (%rsi,%rcx,2), %ymm5
; AVX1-NEXT:    vmovdqu 32(%rsi,%rcx,2), %ymm6
; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm7
; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm0
; AVX1-NEXT:    vpmaddwd %xmm7, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
; AVX1-NEXT:    vpaddd %xmm7, %xmm0, %xmm0
; AVX1-NEXT:    vpmaddwd %xmm4, %xmm6, %xmm4
; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm2
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm4
; AVX1-NEXT:    vpmaddwd %xmm0, %xmm4, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpmaddwd %xmm3, %xmm5, %xmm3
; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX1-NEXT:    addq $16, %rcx
; AVX1-NEXT:    cmpq %rcx, %rax
; AVX1-NEXT:    jne .LBB3_1
; AVX1-NEXT:  # %bb.2: # %middle.block
; AVX1-NEXT:    vpaddd %xmm8, %xmm2, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm5
; AVX1-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddd %xmm0, %xmm8, %xmm0
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: _Z10test_shortPsS_i_1024:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    movl %edx, %eax
; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    xorl %ecx, %ecx
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    .p2align 4, 0x90
; AVX2-NEXT:  .LBB3_1: # %vector.body
; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX2-NEXT:    vmovdqu (%rsi,%rcx,2), %ymm3
; AVX2-NEXT:    vmovdqu 32(%rsi,%rcx,2), %ymm4
; AVX2-NEXT:    vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm4
; AVX2-NEXT:    vpaddd %ymm2, %ymm4, %ymm2
; AVX2-NEXT:    vpmaddwd (%rdi,%rcx,2), %ymm3, %ymm3
; AVX2-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    addq $16, %rcx
; AVX2-NEXT:    cmpq %rcx, %rax
; AVX2-NEXT:    jne .LBB3_1
; AVX2-NEXT:  # %bb.2: # %middle.block
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: _Z10test_shortPsS_i_1024:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    movl %edx, %eax
; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    xorl %ecx, %ecx
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    .p2align 4, 0x90
; AVX512F-NEXT:  .LBB3_1: # %vector.body
; AVX512F-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT:    vmovdqu (%rsi,%rcx,2), %ymm2
; AVX512F-NEXT:    vmovdqu 32(%rsi,%rcx,2), %ymm3
; AVX512F-NEXT:    vpmaddwd 32(%rdi,%rcx,2), %ymm3, %ymm3
; AVX512F-NEXT:    vpmaddwd (%rdi,%rcx,2), %ymm2, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512F-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    addq $16, %rcx
; AVX512F-NEXT:    cmpq %rcx, %rax
; AVX512F-NEXT:    jne .LBB3_1
; AVX512F-NEXT:  # %bb.2: # %middle.block
; AVX512F-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vmovd %xmm0, %eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: _Z10test_shortPsS_i_1024:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    movl %edx, %eax
; AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT:    xorl %ecx, %ecx
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    .p2align 4, 0x90
; AVX512BW-NEXT:  .LBB3_1: # %vector.body
; AVX512BW-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT:    vmovdqu64 (%rsi,%rcx,2), %zmm2
; AVX512BW-NEXT:    vpmaddwd (%rdi,%rcx,2), %zmm2, %zmm2
; AVX512BW-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT:    addq $16, %rcx
; AVX512BW-NEXT:    cmpq %rcx, %rax
; AVX512BW-NEXT:    jne .LBB3_1
; AVX512BW-NEXT:  # %bb.2: # %middle.block
; AVX512BW-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovd %xmm0, %eax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i16, i16* %0, i64 %index
  %5 = bitcast i16* %4 to <32 x i16>*
  %wide.load = load <32 x i16>, <32 x i16>* %5, align 2
  %6 = sext <32 x i16> %wide.load to <32 x i32>
  %7 = getelementptr inbounds i16, i16* %1, i64 %index
  %8 = bitcast i16* %7 to <32 x i16>*
  %wide.load14 = load <32 x i16>, <32 x i16>* %8, align 2
  %9 = sext <32 x i16> %wide.load14 to <32 x i32>
  %10 = mul nsw <32 x i32> %9, %6
  %11 = add nsw <32 x i32> %10, %vec.phi
  %index.next = add i64 %index, 16
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf2 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <32 x i32> %11, %rdx.shuf2
  %rdx.shuf1 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx1 = add <32 x i32> %bin.rdx2, %rdx.shuf1
  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx16 = add <32 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx16, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <32 x i32> %bin.rdx16, %rdx.shuf17
  %13 = extractelement <32 x i32> %bin.rdx18, i32 0
  ret i32 %13
}

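; 4 x i8 per iteration: too narrow for pmaddwd. SSE2 sign-extends the bytes
; and multiplies piecewise; AVX uses vpmovsxbd + vpmulld.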
define i32 @_Z9test_charPcS_i_128(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z9test_charPcS_i_128:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movl %edx, %eax
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    xorl %ecx, %ecx
; SSE2-NEXT:    .p2align 4, 0x90
; SSE2-NEXT:  .LBB4_1: # %vector.body
; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    psrad $24, %xmm1
; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    psrad $24, %xmm2
; SSE2-NEXT:    pmullw %xmm1, %xmm2
; SSE2-NEXT:    pslld $16, %xmm2
; SSE2-NEXT:    psrad $16, %xmm2
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    addq $16, %rcx
; SSE2-NEXT:    cmpq %rcx, %rax
; SSE2-NEXT:    jne .LBB4_1
; SSE2-NEXT:  # %bb.2: # %middle.block
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; AVX-LABEL: _Z9test_charPcS_i_128:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    movl %edx, %eax
; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT:    xorl %ecx, %ecx
; AVX-NEXT:    .p2align 4, 0x90
; AVX-NEXT:  .LBB4_1: # %vector.body
; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX-NEXT:    vpmovsxbd (%rdi,%rcx), %xmm1
; AVX-NEXT:    vpmovsxbd (%rsi,%rcx), %xmm2
; AVX-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    addq $16, %rcx
; AVX-NEXT:    cmpq %rcx, %rax
; AVX-NEXT:    jne .LBB4_1
; AVX-NEXT:  # %bb.2: # %middle.block
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <4 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, i8* %0, i64 %index
  %5 = bitcast i8* %4 to <4 x i8>*
  %wide.load = load <4 x i8>, <4 x i8>* %5, align 1
  %6 = sext <4 x i8> %wide.load to <4 x i32>
  %7 = getelementptr inbounds i8, i8* %1, i64 %index
  %8 = bitcast i8* %7 to <4 x i8>*
  %wide.load14 = load <4 x i8>, <4 x i8>* %8, align 1
  %9 = sext <4 x i8> %wide.load14 to <4 x i32>
  %10 = mul nsw <4 x i32> %9, %6
  %11 = add nsw <4 x i32> %10, %vec.phi
  %index.next = add i64 %index, 16
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf17 = shufflevector <4 x i32> %11, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx18 = add <4 x i32> %11, %rdx.shuf17
  %rdx.shuf19 = shufflevector <4 x i32> %bin.rdx18, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = add <4 x i32> %bin.rdx18, %rdx.shuf19
  %13 = extractelement <4 x i32> %bin.rdx20, i32 0
  ret i32 %13
}

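; 8 x i8 per iteration: the bytes are sign-extended to i16 so the
; multiply-accumulate can use a single (v)pmaddwd.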
define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z9test_charPcS_i_256:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movl %edx, %eax
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    xorl %ecx, %ecx
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    .p2align 4, 0x90
; SSE2-NEXT:  .LBB5_1: # %vector.body
; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    pmaddwd %xmm2, %xmm3
; SSE2-NEXT:    paddd %xmm3, %xmm1
; SSE2-NEXT:    addq $16, %rcx
; SSE2-NEXT:    cmpq %rcx, %rax
; SSE2-NEXT:    jne .LBB5_1
; SSE2-NEXT:  # %bb.2: # %middle.block
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; AVX1-LABEL: _Z9test_charPcS_i_256:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    movl %edx, %eax
; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    xorl %ecx, %ecx
; AVX1-NEXT:    .p2align 4, 0x90
; AVX1-NEXT:  .LBB5_1: # %vector.body
; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX1-NEXT:    vpmovsxbw (%rdi,%rcx), %xmm1
; AVX1-NEXT:    vpmovsxbw (%rsi,%rcx), %xmm2
; AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT:    addq $16, %rcx
; AVX1-NEXT:    cmpq %rcx, %rax
; AVX1-NEXT:    jne .LBB5_1
; AVX1-NEXT:  # %bb.2: # %middle.block
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX256-LABEL: _Z9test_charPcS_i_256:
; AVX256:       # %bb.0: # %entry
; AVX256-NEXT:    movl %edx, %eax
; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX256-NEXT:    xorl %ecx, %ecx
; AVX256-NEXT:    .p2align 4, 0x90
; AVX256-NEXT:  .LBB5_1: # %vector.body
; AVX256-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX256-NEXT:    vpmovsxbw (%rdi,%rcx), %xmm1
; AVX256-NEXT:    vpmovsxbw (%rsi,%rcx), %xmm2
; AVX256-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
; AVX256-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX256-NEXT:    addq $16, %rcx
; AVX256-NEXT:    cmpq %rcx, %rax
; AVX256-NEXT:    jne .LBB5_1
; AVX256-NEXT:  # %bb.2: # %middle.block
; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX256-NEXT:    vmovd %xmm0, %eax
; AVX256-NEXT:    vzeroupper
; AVX256-NEXT:    retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <8 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, i8* %0, i64 %index
  %5 = bitcast i8* %4 to <8 x i8>*
  %wide.load = load <8 x i8>, <8 x i8>* %5, align 1
  %6 = sext <8 x i8> %wide.load to <8 x i32>
  %7 = getelementptr inbounds i8, i8* %1, i64 %index
  %8 = bitcast i8* %7 to <8 x i8>*
  %wide.load14 = load <8 x i8>, <8 x i8>* %8, align 1
  %9 = sext <8 x i8> %wide.load14 to <8 x i32>
  %10 = mul nsw <8 x i32> %9, %6
  %11 = add nsw <8 x i32> %10, %vec.phi
  %index.next = add i64 %index, 16
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf15 = shufflevector <8 x i32> %11, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx16 = add <8 x i32> %11, %rdx.shuf15
  %rdx.shuf17 = shufflevector <8 x i32> %bin.rdx16, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <8 x i32> %bin.rdx16, %rdx.shuf17
  %rdx.shuf19 = shufflevector <8 x i32> %bin.rdx18, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = add <8 x i32> %bin.rdx18, %rdx.shuf19
  %13 = extractelement <8 x i32> %bin.rdx20, i32 0
  ret i32 %13
}

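; 16 x i8 per iteration: SSE2 and AVX1 sign-extend to i16 in 128-bit halves
; and use two pmaddwd; AVX2 and AVX-512 use vpmovsxbw plus one 256-bit
; vpmaddwd.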
define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z9test_charPcS_i_512:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movl %edx, %eax
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    xorl %ecx, %ecx
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    .p2align 4, 0x90
; SSE2-NEXT:  .LBB6_1: # %vector.body
; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE2-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    movq {{.*#+}} xmm4 = mem[0],zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm4
; SSE2-NEXT:    movq {{.*#+}} xmm5 = mem[0],zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm5
; SSE2-NEXT:    pmaddwd %xmm3, %xmm5
; SSE2-NEXT:    paddd %xmm5, %xmm2
; SSE2-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    pmaddwd %xmm4, %xmm3
; SSE2-NEXT:    paddd %xmm3, %xmm1
; SSE2-NEXT:    addq $16, %rcx
; SSE2-NEXT:    cmpq %rcx, %rax
; SSE2-NEXT:    jne .LBB6_1
; SSE2-NEXT:  # %bb.2: # %middle.block
; SSE2-NEXT:    paddd %xmm0, %xmm2
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    paddd %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; AVX1-LABEL: _Z9test_charPcS_i_512:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    movl %edx, %eax
; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    xorl %ecx, %ecx
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    .p2align 4, 0x90
; AVX1-NEXT:  .LBB6_1: # %vector.body
; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX1-NEXT:    vpmovsxbw (%rdi,%rcx), %xmm2
; AVX1-NEXT:    vpmovsxbw 8(%rdi,%rcx), %xmm3
; AVX1-NEXT:    vpmovsxbw (%rsi,%rcx), %xmm4
; AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpmovsxbw 8(%rsi,%rcx), %xmm4
; AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    addq $16, %rcx
; AVX1-NEXT:    cmpq %rcx, %rax
; AVX1-NEXT:    jne .LBB6_1
; AVX1-NEXT:  # %bb.2: # %middle.block
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: _Z9test_charPcS_i_512:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    movl %edx, %eax
; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    xorl %ecx, %ecx
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    .p2align 4, 0x90
; AVX2-NEXT:  .LBB6_1: # %vector.body
; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX2-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm2
; AVX2-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm3
; AVX2-NEXT:    vpmaddwd %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    addq $16, %rcx
; AVX2-NEXT:    cmpq %rcx, %rax
; AVX2-NEXT:    jne .LBB6_1
; AVX2-NEXT:  # %bb.2: # %middle.block
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: _Z9test_charPcS_i_512:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    movl %edx, %eax
; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    xorl %ecx, %ecx
; AVX512-NEXT:    .p2align 4, 0x90
; AVX512-NEXT:  .LBB6_1: # %vector.body
; AVX512-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX512-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm1
; AVX512-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm2
; AVX512-NEXT:    vpmaddwd %ymm1, %ymm2, %ymm1
; AVX512-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    addq $16, %rcx
; AVX512-NEXT:    cmpq %rcx, %rax
; AVX512-NEXT:    jne .LBB6_1
; AVX512-NEXT:  # %bb.2: # %middle.block
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <16 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, i8* %0, i64 %index
  %5 = bitcast i8* %4 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %5, align 1
  %6 = sext <16 x i8> %wide.load to <16 x i32>
  %7 = getelementptr inbounds i8, i8* %1, i64 %index
  %8 = bitcast i8* %7 to <16 x i8>*
  %wide.load14 = load <16 x i8>, <16 x i8>* %8, align 1
  %9 = sext <16 x i8> %wide.load14 to <16 x i32>
  %10 = mul nsw <16 x i32> %9, %6
  %11 = add nsw <16 x i32> %10, %vec.phi
  %index.next = add i64 %index, 16
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf = shufflevector <16 x i32> %11, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i32> %11, %rdx.shuf
  %rdx.shuf15 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx16 = add <16 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <16 x i32> %bin.rdx16, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <16 x i32> %bin.rdx16, %rdx.shuf17
  %rdx.shuf19 = shufflevector <16 x i32> %bin.rdx18, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = add <16 x i32> %bin.rdx18, %rdx.shuf19
  %13 = extractelement <16 x i32> %bin.rdx20, i32 0
  ret i32 %13
}

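; 32 x i8 per iteration: split into four (SSE2/AVX1) or two (AVX2/AVX512F)
; pmaddwd after sign-extending to i16; AVX512BW extends to 32 x i16 and
; uses one 512-bit vpmaddwd.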
define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z9test_charPcS_i_1024:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movl %edx, %eax
; SSE2-NEXT:    pxor %xmm8, %xmm8
; SSE2-NEXT:    xorl %ecx, %ecx
; SSE2-NEXT:    pxor %xmm9, %xmm9
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    .p2align 4, 0x90
; SSE2-NEXT:  .LBB7_1: # %vector.body
; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE2-NEXT:    movq {{.*#+}} xmm5 = mem[0],zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm5
; SSE2-NEXT:    movq {{.*#+}} xmm6 = mem[0],zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm6
; SSE2-NEXT:    movq {{.*#+}} xmm7 = mem[0],zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm7
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pmaddwd %xmm5, %xmm2
; SSE2-NEXT:    paddd %xmm2, %xmm9
; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pmaddwd %xmm6, %xmm2
; SSE2-NEXT:    paddd %xmm2, %xmm4
; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pmaddwd %xmm7, %xmm2
; SSE2-NEXT:    paddd %xmm2, %xmm1
; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pmaddwd %xmm0, %xmm2
; SSE2-NEXT:    paddd %xmm2, %xmm3
; SSE2-NEXT:    addq $32, %rcx
; SSE2-NEXT:    cmpq %rcx, %rax
; SSE2-NEXT:    jne .LBB7_1
; SSE2-NEXT:  # %bb.2: # %middle.block
; SSE2-NEXT:    paddd %xmm8, %xmm4
; SSE2-NEXT:    paddd %xmm8, %xmm3
; SSE2-NEXT:    paddd %xmm4, %xmm3
; SSE2-NEXT:    paddd %xmm8, %xmm9
; SSE2-NEXT:    paddd %xmm8, %xmm1
; SSE2-NEXT:    paddd %xmm3, %xmm1
; SSE2-NEXT:    paddd %xmm9, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; AVX1-LABEL: _Z9test_charPcS_i_1024:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    movl %edx, %eax
; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    xorl %ecx, %ecx
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    .p2align 4, 0x90
; AVX1-NEXT:  .LBB7_1: # %vector.body
; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX1-NEXT:    vpmovsxbw (%rdi,%rcx), %xmm3
; AVX1-NEXT:    vpmovsxbw 8(%rdi,%rcx), %xmm4
; AVX1-NEXT:    vpmovsxbw 16(%rdi,%rcx), %xmm5
; AVX1-NEXT:    vpmovsxbw 24(%rdi,%rcx), %xmm6
; AVX1-NEXT:    vpmovsxbw (%rsi,%rcx), %xmm7
; AVX1-NEXT:    vpmaddwd %xmm3, %xmm7, %xmm3
; AVX1-NEXT:    vpmovsxbw 8(%rsi,%rcx), %xmm7
; AVX1-NEXT:    vpmaddwd %xmm4, %xmm7, %xmm4
; AVX1-NEXT:    vpmovsxbw 16(%rsi,%rcx), %xmm7
; AVX1-NEXT:    vpmaddwd %xmm5, %xmm7, %xmm5
; AVX1-NEXT:    vpmovsxbw 24(%rsi,%rcx), %xmm7
; AVX1-NEXT:    vpmaddwd %xmm6, %xmm7, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT:    addq $32, %rcx
; AVX1-NEXT:    cmpq %rcx, %rax
; AVX1-NEXT:    jne .LBB7_1
; AVX1-NEXT:  # %bb.2: # %middle.block
; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT:    vpaddd %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: _Z9test_charPcS_i_1024:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    movl %edx, %eax
; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    xorl %ecx, %ecx
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    .p2align 4, 0x90
; AVX2-NEXT:  .LBB7_1: # %vector.body
; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX2-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm3
; AVX2-NEXT:    vpmovsxbw 16(%rdi,%rcx), %ymm4
; AVX2-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm5
; AVX2-NEXT:    vpmaddwd %ymm3, %ymm5, %ymm3
; AVX2-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vpmovsxbw 16(%rsi,%rcx), %ymm3
; AVX2-NEXT:    vpmaddwd %ymm4, %ymm3, %ymm3
; AVX2-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    addq $32, %rcx
; AVX2-NEXT:    cmpq %rcx, %rax
; AVX2-NEXT:    jne .LBB7_1
; AVX2-NEXT:  # %bb.2: # %middle.block
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: _Z9test_charPcS_i_1024:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    movl %edx, %eax
; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    xorl %ecx, %ecx
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    .p2align 4, 0x90
; AVX512F-NEXT:  .LBB7_1: # %vector.body
; AVX512F-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm2
; AVX512F-NEXT:    vpmovsxbw 16(%rdi,%rcx), %ymm3
; AVX512F-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm4
; AVX512F-NEXT:    vpmaddwd %ymm2, %ymm4, %ymm2
; AVX512F-NEXT:    vpmovsxbw 16(%rsi,%rcx), %ymm4
; AVX512F-NEXT:    vpmaddwd %ymm3, %ymm4, %ymm3
; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512F-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    addq $32, %rcx
; AVX512F-NEXT:    cmpq %rcx, %rax
; AVX512F-NEXT:    jne .LBB7_1
; AVX512F-NEXT:  # %bb.2: # %middle.block
; AVX512F-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vmovd %xmm0, %eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: _Z9test_charPcS_i_1024:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    movl %edx, %eax
; AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT:    xorl %ecx, %ecx
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    .p2align 4, 0x90
; AVX512BW-NEXT:  .LBB7_1: # %vector.body
; AVX512BW-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT:    vpmovsxbw (%rdi,%rcx), %zmm2
; AVX512BW-NEXT:    vpmovsxbw (%rsi,%rcx), %zmm3
; AVX512BW-NEXT:    vpmaddwd %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT:    addq $32, %rcx
; AVX512BW-NEXT:    cmpq %rcx, %rax
; AVX512BW-NEXT:    jne .LBB7_1
; AVX512BW-NEXT:  # %bb.2: # %middle.block
; AVX512BW-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovd %xmm0, %eax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, i8* %0, i64 %index
  %5 = bitcast i8* %4 to <32 x i8>*
  %wide.load = load <32 x i8>, <32 x i8>* %5, align 1
  %6 = sext <32 x i8> %wide.load to <32 x i32>
  %7 = getelementptr inbounds i8, i8* %1, i64 %index
  %8 = bitcast i8* %7 to <32 x i8>*
  %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1
  %9 = sext <32 x i8> %wide.load14 to <32 x i32>
  %10 = mul nsw <32 x i32> %9, %6
  %11 = add nsw <32 x i32> %10, %vec.phi
  %index.next = add i64 %index, 32
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1167   %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
   1168   %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1169   %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
   1170   %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1171   %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
   1172   %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1173   %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
   1174   %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1175   %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
   1176   %13 = extractelement <32 x i32> %bin.rdx20, i32 0
   1177   ret i32 %13
   1178 }
   1179 
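         ; Note: the test_unsigned_short_* functions below zero-extend their i16
         ; inputs, so the multiply is unsigned and PMADDWD (a signed word multiply)
         ; must not be formed. As the CHECK lines verify, SSE2 instead widens the
         ; products with pmulhuw/pmullw + punpck, and AVX uses vpmovzxwd + vpmulld.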
   1180 define i32 @test_unsigned_short_128(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
   1181 ; SSE2-LABEL: test_unsigned_short_128:
   1182 ; SSE2:       # %bb.0: # %entry
   1183 ; SSE2-NEXT:    movl %edx, %eax
   1184 ; SSE2-NEXT:    pxor %xmm0, %xmm0
   1185 ; SSE2-NEXT:    xorl %ecx, %ecx
   1186 ; SSE2-NEXT:    .p2align 4, 0x90
   1187 ; SSE2-NEXT:  .LBB8_1: # %vector.body
   1188 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
   1189 ; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
   1190 ; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
   1191 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
   1192 ; SSE2-NEXT:    pmulhuw %xmm1, %xmm3
   1193 ; SSE2-NEXT:    pmullw %xmm1, %xmm2
   1194 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
   1195 ; SSE2-NEXT:    paddd %xmm2, %xmm0
   1196 ; SSE2-NEXT:    addq $16, %rcx
   1197 ; SSE2-NEXT:    cmpq %rcx, %rax
   1198 ; SSE2-NEXT:    jne .LBB8_1
   1199 ; SSE2-NEXT:  # %bb.2: # %middle.block
   1200 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1201 ; SSE2-NEXT:    paddd %xmm0, %xmm1
   1202 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
   1203 ; SSE2-NEXT:    paddd %xmm1, %xmm0
   1204 ; SSE2-NEXT:    movd %xmm0, %eax
   1205 ; SSE2-NEXT:    retq
   1206 ;
   1207 ; AVX-LABEL: test_unsigned_short_128:
   1208 ; AVX:       # %bb.0: # %entry
   1209 ; AVX-NEXT:    movl %edx, %eax
   1210 ; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
   1211 ; AVX-NEXT:    xorl %ecx, %ecx
   1212 ; AVX-NEXT:    .p2align 4, 0x90
   1213 ; AVX-NEXT:  .LBB8_1: # %vector.body
   1214 ; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
   1215 ; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1216 ; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1217 ; AVX-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
   1218 ; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
   1219 ; AVX-NEXT:    addq $16, %rcx
   1220 ; AVX-NEXT:    cmpq %rcx, %rax
   1221 ; AVX-NEXT:    jne .LBB8_1
   1222 ; AVX-NEXT:  # %bb.2: # %middle.block
   1223 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1224 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
   1225 ; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
   1226 ; AVX-NEXT:    vmovd %xmm0, %eax
   1227 ; AVX-NEXT:    retq
   1228 entry:
   1229   %3 = zext i32 %2 to i64
   1230   br label %vector.body
   1231 
   1232 vector.body:
   1233   %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
   1234   %vec.phi = phi <4 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
   1235   %4 = getelementptr inbounds i16, i16* %0, i64 %index
   1236   %5 = bitcast i16* %4 to <4 x i16>*
   1237   %wide.load = load <4 x i16>, <4 x i16>* %5, align 2
   1238   %6 = zext <4 x i16> %wide.load to <4 x i32>
   1239   %7 = getelementptr inbounds i16, i16* %1, i64 %index
   1240   %8 = bitcast i16* %7 to <4 x i16>*
   1241   %wide.load14 = load <4 x i16>, <4 x i16>* %8, align 2
   1242   %9 = zext <4 x i16> %wide.load14 to <4 x i32>
   1243   %10 = mul nsw <4 x i32> %9, %6
   1244   %11 = add nsw <4 x i32> %10, %vec.phi
   1245   %index.next = add i64 %index, 16
   1246   %12 = icmp eq i64 %index.next, %3
   1247   br i1 %12, label %middle.block, label %vector.body
   1248 
   1249 middle.block:
   1250   %rdx.shuf15 = shufflevector <4 x i32> %11, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
   1251   %bin.rdx16 = add <4 x i32> %11, %rdx.shuf15
   1252   %rdx.shuf17 = shufflevector <4 x i32> %bin.rdx16, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   1253   %bin.rdx18 = add <4 x i32> %bin.rdx16, %rdx.shuf17
   1254   %13 = extractelement <4 x i32> %bin.rdx18, i32 0
   1255   ret i32 %13
   1256 }
   1257 
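         ; <8 x i32> version: AVX1 splits the ymm accumulator into xmm halves
         ; inside the loop, while AVX2/AVX512 (the AVX256 prefixes) keep a single
         ; ymm multiply-accumulate.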
   1258 define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
   1259 ; SSE2-LABEL: test_unsigned_short_256:
   1260 ; SSE2:       # %bb.0: # %entry
   1261 ; SSE2-NEXT:    movl %edx, %eax
   1262 ; SSE2-NEXT:    pxor %xmm0, %xmm0
   1263 ; SSE2-NEXT:    xorl %ecx, %ecx
   1264 ; SSE2-NEXT:    pxor %xmm1, %xmm1
   1265 ; SSE2-NEXT:    .p2align 4, 0x90
   1266 ; SSE2-NEXT:  .LBB9_1: # %vector.body
   1267 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
   1268 ; SSE2-NEXT:    movdqu (%rdi,%rcx,2), %xmm2
   1269 ; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm3
   1270 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
   1271 ; SSE2-NEXT:    pmulhuw %xmm2, %xmm4
   1272 ; SSE2-NEXT:    pmullw %xmm2, %xmm3
   1273 ; SSE2-NEXT:    movdqa %xmm3, %xmm2
   1274 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
   1275 ; SSE2-NEXT:    paddd %xmm2, %xmm0
   1276 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
   1277 ; SSE2-NEXT:    paddd %xmm3, %xmm1
   1278 ; SSE2-NEXT:    addq $16, %rcx
   1279 ; SSE2-NEXT:    cmpq %rcx, %rax
   1280 ; SSE2-NEXT:    jne .LBB9_1
   1281 ; SSE2-NEXT:  # %bb.2: # %middle.block
   1282 ; SSE2-NEXT:    paddd %xmm1, %xmm0
   1283 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1284 ; SSE2-NEXT:    paddd %xmm0, %xmm1
   1285 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
   1286 ; SSE2-NEXT:    paddd %xmm1, %xmm0
   1287 ; SSE2-NEXT:    movd %xmm0, %eax
   1288 ; SSE2-NEXT:    retq
   1289 ;
   1290 ; AVX1-LABEL: test_unsigned_short_256:
   1291 ; AVX1:       # %bb.0: # %entry
   1292 ; AVX1-NEXT:    movl %edx, %eax
   1293 ; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
   1294 ; AVX1-NEXT:    xorl %ecx, %ecx
   1295 ; AVX1-NEXT:    .p2align 4, 0x90
   1296 ; AVX1-NEXT:  .LBB9_1: # %vector.body
   1297 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
   1298 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1299 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1300 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1301 ; AVX1-NEXT:    vpmulld %xmm1, %xmm3, %xmm1
   1302 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1303 ; AVX1-NEXT:    vpmulld %xmm2, %xmm3, %xmm2
   1304 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1305 ; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
   1306 ; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
   1307 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1308 ; AVX1-NEXT:    addq $16, %rcx
   1309 ; AVX1-NEXT:    cmpq %rcx, %rax
   1310 ; AVX1-NEXT:    jne .LBB9_1
   1311 ; AVX1-NEXT:  # %bb.2: # %middle.block
   1312 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1313 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
   1314 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1315 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
   1316 ; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
   1317 ; AVX1-NEXT:    vmovd %xmm0, %eax
   1318 ; AVX1-NEXT:    vzeroupper
   1319 ; AVX1-NEXT:    retq
   1320 ;
   1321 ; AVX256-LABEL: test_unsigned_short_256:
   1322 ; AVX256:       # %bb.0: # %entry
   1323 ; AVX256-NEXT:    movl %edx, %eax
   1324 ; AVX256-NEXT:    vpxor %xmm0, %xmm0, %xmm0
   1325 ; AVX256-NEXT:    xorl %ecx, %ecx
   1326 ; AVX256-NEXT:    .p2align 4, 0x90
   1327 ; AVX256-NEXT:  .LBB9_1: # %vector.body
   1328 ; AVX256-NEXT:    # =>This Inner Loop Header: Depth=1
   1329 ; AVX256-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
   1330 ; AVX256-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
   1331 ; AVX256-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
   1332 ; AVX256-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
   1333 ; AVX256-NEXT:    addq $16, %rcx
   1334 ; AVX256-NEXT:    cmpq %rcx, %rax
   1335 ; AVX256-NEXT:    jne .LBB9_1
   1336 ; AVX256-NEXT:  # %bb.2: # %middle.block
   1337 ; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1338 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
   1339 ; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1340 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
   1341 ; AVX256-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
   1342 ; AVX256-NEXT:    vmovd %xmm0, %eax
   1343 ; AVX256-NEXT:    vzeroupper
   1344 ; AVX256-NEXT:    retq
   1345 entry:
   1346   %3 = zext i32 %2 to i64
   1347   br label %vector.body
   1348 
   1349 vector.body:
   1350   %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
   1351   %vec.phi = phi <8 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
   1352   %4 = getelementptr inbounds i16, i16* %0, i64 %index
   1353   %5 = bitcast i16* %4 to <8 x i16>*
   1354   %wide.load = load <8 x i16>, <8 x i16>* %5, align 2
   1355   %6 = zext <8 x i16> %wide.load to <8 x i32>
   1356   %7 = getelementptr inbounds i16, i16* %1, i64 %index
   1357   %8 = bitcast i16* %7 to <8 x i16>*
   1358   %wide.load14 = load <8 x i16>, <8 x i16>* %8, align 2
   1359   %9 = zext <8 x i16> %wide.load14 to <8 x i32>
   1360   %10 = mul nsw <8 x i32> %9, %6
   1361   %11 = add nsw <8 x i32> %10, %vec.phi
   1362   %index.next = add i64 %index, 16
   1363   %12 = icmp eq i64 %index.next, %3
   1364   br i1 %12, label %middle.block, label %vector.body
   1365 
   1366 middle.block:
   1367   %rdx.shuf = shufflevector <8 x i32> %11, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
   1368   %bin.rdx = add <8 x i32> %11, %rdx.shuf
   1369   %rdx.shuf15 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1370   %bin.rdx16 = add <8 x i32> %bin.rdx, %rdx.shuf15
   1371   %rdx.shuf17 = shufflevector <8 x i32> %bin.rdx16, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1372   %bin.rdx18 = add <8 x i32> %bin.rdx16, %rdx.shuf17
   1373   %13 = extractelement <8 x i32> %bin.rdx18, i32 0
   1374   ret i32 %13
   1375 }
   1376 
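         ; <16 x i32> version: SSE2 carries four xmm accumulators; AVX512 does the
         ; whole multiply-accumulate in zmm registers.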
   1377 define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
   1378 ; SSE2-LABEL: test_unsigned_short_512:
   1379 ; SSE2:       # %bb.0: # %entry
   1380 ; SSE2-NEXT:    movl %edx, %eax
   1381 ; SSE2-NEXT:    pxor %xmm0, %xmm0
   1382 ; SSE2-NEXT:    xorl %ecx, %ecx
   1383 ; SSE2-NEXT:    pxor %xmm1, %xmm1
   1384 ; SSE2-NEXT:    pxor %xmm3, %xmm3
   1385 ; SSE2-NEXT:    pxor %xmm2, %xmm2
   1386 ; SSE2-NEXT:    .p2align 4, 0x90
   1387 ; SSE2-NEXT:  .LBB10_1: # %vector.body
   1388 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
   1389 ; SSE2-NEXT:    movdqu (%rdi,%rcx,2), %xmm4
   1390 ; SSE2-NEXT:    movdqu 16(%rdi,%rcx,2), %xmm8
   1391 ; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm6
   1392 ; SSE2-NEXT:    movdqu 16(%rsi,%rcx,2), %xmm7
   1393 ; SSE2-NEXT:    movdqa %xmm6, %xmm5
   1394 ; SSE2-NEXT:    pmulhuw %xmm4, %xmm5
   1395 ; SSE2-NEXT:    pmullw %xmm4, %xmm6
   1396 ; SSE2-NEXT:    movdqa %xmm6, %xmm4
   1397 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
   1398 ; SSE2-NEXT:    paddd %xmm4, %xmm0
   1399 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
   1400 ; SSE2-NEXT:    paddd %xmm6, %xmm1
   1401 ; SSE2-NEXT:    movdqa %xmm7, %xmm4
   1402 ; SSE2-NEXT:    pmulhuw %xmm8, %xmm4
   1403 ; SSE2-NEXT:    pmullw %xmm8, %xmm7
   1404 ; SSE2-NEXT:    movdqa %xmm7, %xmm5
   1405 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
   1406 ; SSE2-NEXT:    paddd %xmm5, %xmm3
   1407 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
   1408 ; SSE2-NEXT:    paddd %xmm7, %xmm2
   1409 ; SSE2-NEXT:    addq $16, %rcx
   1410 ; SSE2-NEXT:    cmpq %rcx, %rax
   1411 ; SSE2-NEXT:    jne .LBB10_1
   1412 ; SSE2-NEXT:  # %bb.2: # %middle.block
   1413 ; SSE2-NEXT:    paddd %xmm3, %xmm0
   1414 ; SSE2-NEXT:    paddd %xmm2, %xmm1
   1415 ; SSE2-NEXT:    paddd %xmm0, %xmm1
   1416 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
   1417 ; SSE2-NEXT:    paddd %xmm1, %xmm0
   1418 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   1419 ; SSE2-NEXT:    paddd %xmm0, %xmm1
   1420 ; SSE2-NEXT:    movd %xmm1, %eax
   1421 ; SSE2-NEXT:    retq
   1422 ;
   1423 ; AVX1-LABEL: test_unsigned_short_512:
   1424 ; AVX1:       # %bb.0: # %entry
   1425 ; AVX1-NEXT:    movl %edx, %eax
   1426 ; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
   1427 ; AVX1-NEXT:    xorl %ecx, %ecx
   1428 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1429 ; AVX1-NEXT:    .p2align 4, 0x90
   1430 ; AVX1-NEXT:  .LBB10_1: # %vector.body
   1431 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
   1432 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1433 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1434 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1435 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1436 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1437 ; AVX1-NEXT:    vpmulld %xmm2, %xmm6, %xmm2
   1438 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1439 ; AVX1-NEXT:    vpmulld %xmm3, %xmm6, %xmm3
   1440 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1441 ; AVX1-NEXT:    vpmulld %xmm4, %xmm6, %xmm4
   1442 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1443 ; AVX1-NEXT:    vpmulld %xmm5, %xmm6, %xmm5
   1444 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
   1445 ; AVX1-NEXT:    vpaddd %xmm6, %xmm2, %xmm2
   1446 ; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
   1447 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
   1448 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   1449 ; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
   1450 ; AVX1-NEXT:    vpaddd %xmm0, %xmm5, %xmm0
   1451 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1452 ; AVX1-NEXT:    addq $16, %rcx
   1453 ; AVX1-NEXT:    cmpq %rcx, %rax
   1454 ; AVX1-NEXT:    jne .LBB10_1
   1455 ; AVX1-NEXT:  # %bb.2: # %middle.block
   1456 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   1457 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
   1458 ; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
   1459 ; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
   1460 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
   1461 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1462 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
   1463 ; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
   1464 ; AVX1-NEXT:    vmovd %xmm0, %eax
   1465 ; AVX1-NEXT:    vzeroupper
   1466 ; AVX1-NEXT:    retq
   1467 ;
   1468 ; AVX2-LABEL: test_unsigned_short_512:
   1469 ; AVX2:       # %bb.0: # %entry
   1470 ; AVX2-NEXT:    movl %edx, %eax
   1471 ; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
   1472 ; AVX2-NEXT:    xorl %ecx, %ecx
   1473 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1474 ; AVX2-NEXT:    .p2align 4, 0x90
   1475 ; AVX2-NEXT:  .LBB10_1: # %vector.body
   1476 ; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
   1477 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
   1478 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
   1479 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
   1480 ; AVX2-NEXT:    vpmulld %ymm2, %ymm4, %ymm2
   1481 ; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
   1482 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
   1483 ; AVX2-NEXT:    vpmulld %ymm3, %ymm2, %ymm2
   1484 ; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
   1485 ; AVX2-NEXT:    addq $16, %rcx
   1486 ; AVX2-NEXT:    cmpq %rcx, %rax
   1487 ; AVX2-NEXT:    jne .LBB10_1
   1488 ; AVX2-NEXT:  # %bb.2: # %middle.block
   1489 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
   1490 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1491 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
   1492 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1493 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
   1494 ; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
   1495 ; AVX2-NEXT:    vmovd %xmm0, %eax
   1496 ; AVX2-NEXT:    vzeroupper
   1497 ; AVX2-NEXT:    retq
   1498 ;
   1499 ; AVX512-LABEL: test_unsigned_short_512:
   1500 ; AVX512:       # %bb.0: # %entry
   1501 ; AVX512-NEXT:    movl %edx, %eax
   1502 ; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
   1503 ; AVX512-NEXT:    xorl %ecx, %ecx
   1504 ; AVX512-NEXT:    .p2align 4, 0x90
   1505 ; AVX512-NEXT:  .LBB10_1: # %vector.body
   1506 ; AVX512-NEXT:    # =>This Inner Loop Header: Depth=1
   1507 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
   1508 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
   1509 ; AVX512-NEXT:    vpmulld %zmm1, %zmm2, %zmm1
   1510 ; AVX512-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
   1511 ; AVX512-NEXT:    addq $16, %rcx
   1512 ; AVX512-NEXT:    cmpq %rcx, %rax
   1513 ; AVX512-NEXT:    jne .LBB10_1
   1514 ; AVX512-NEXT:  # %bb.2: # %middle.block
   1515 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   1516 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
   1517 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1518 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
   1519 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1520 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
   1521 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   1522 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
   1523 ; AVX512-NEXT:    vmovd %xmm0, %eax
   1524 ; AVX512-NEXT:    vzeroupper
   1525 ; AVX512-NEXT:    retq
   1526 entry:
   1527   %3 = zext i32 %2 to i64
   1528   br label %vector.body
   1529 
   1530 vector.body:
   1531   %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
   1532   %vec.phi = phi <16 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
   1533   %4 = getelementptr inbounds i16, i16* %0, i64 %index
   1534   %5 = bitcast i16* %4 to <16 x i16>*
   1535   %wide.load = load <16 x i16>, <16 x i16>* %5, align 2
   1536   %6 = zext <16 x i16> %wide.load to <16 x i32>
   1537   %7 = getelementptr inbounds i16, i16* %1, i64 %index
   1538   %8 = bitcast i16* %7 to <16 x i16>*
   1539   %wide.load14 = load <16 x i16>, <16 x i16>* %8, align 2
   1540   %9 = zext <16 x i16> %wide.load14 to <16 x i32>
   1541   %10 = mul nsw <16 x i32> %9, %6
   1542   %11 = add nsw <16 x i32> %10, %vec.phi
   1543   %index.next = add i64 %index, 16
   1544   %12 = icmp eq i64 %index.next, %3
   1545   br i1 %12, label %middle.block, label %vector.body
   1546 
   1547 middle.block:
   1548   %rdx.shuf1 = shufflevector <16 x i32> %11, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1549   %bin.rdx1 = add <16 x i32> %11, %rdx.shuf1
   1550   %rdx.shuf = shufflevector <16 x i32> %bin.rdx1, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1551   %bin.rdx = add <16 x i32> %bin.rdx1, %rdx.shuf
   1552   %rdx.shuf15 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1553   %bin.rdx16 = add <16 x i32> %bin.rdx, %rdx.shuf15
   1554   %rdx.shuf17 = shufflevector <16 x i32> %bin.rdx16, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1555   %bin.rdx18 = add <16 x i32> %bin.rdx16, %rdx.shuf17
   1556   %13 = extractelement <16 x i32> %bin.rdx18, i32 0
   1557   ret i32 %13
   1558 }
   1559 
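         ; <32 x i32> version: SSE2 needs eight xmm accumulators and AVX1/AVX2 four
         ; ymm accumulators; AVX512 gets by with two zmm accumulators.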
   1560 define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
   1561 ; SSE2-LABEL: test_unsigned_short_1024:
   1562 ; SSE2:       # %bb.0: # %entry
   1563 ; SSE2-NEXT:    movl %edx, %eax
   1564 ; SSE2-NEXT:    pxor %xmm8, %xmm8
   1565 ; SSE2-NEXT:    xorl %ecx, %ecx
   1566 ; SSE2-NEXT:    pxor %xmm3, %xmm3
   1567 ; SSE2-NEXT:    pxor %xmm9, %xmm9
   1568 ; SSE2-NEXT:    pxor %xmm10, %xmm10
   1569 ; SSE2-NEXT:    pxor %xmm4, %xmm4
   1570 ; SSE2-NEXT:    pxor %xmm6, %xmm6
   1571 ; SSE2-NEXT:    pxor %xmm5, %xmm5
   1572 ; SSE2-NEXT:    pxor %xmm7, %xmm7
   1573 ; SSE2-NEXT:    .p2align 4, 0x90
   1574 ; SSE2-NEXT:  .LBB11_1: # %vector.body
   1575 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
   1576 ; SSE2-NEXT:    movdqu 48(%rdi,%rcx,2), %xmm0
   1577 ; SSE2-NEXT:    movdqu 48(%rsi,%rcx,2), %xmm1
   1578 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
   1579 ; SSE2-NEXT:    pmulhuw %xmm0, %xmm2
   1580 ; SSE2-NEXT:    pmullw %xmm0, %xmm1
   1581 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   1582 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
   1583 ; SSE2-NEXT:    paddd %xmm0, %xmm7
   1584 ; SSE2-NEXT:    movdqu 32(%rdi,%rcx,2), %xmm0
   1585 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
   1586 ; SSE2-NEXT:    movdqu 32(%rsi,%rcx,2), %xmm2
   1587 ; SSE2-NEXT:    paddd %xmm1, %xmm5
   1588 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
   1589 ; SSE2-NEXT:    pmulhuw %xmm0, %xmm1
   1590 ; SSE2-NEXT:    pmullw %xmm0, %xmm2
   1591 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
   1592 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   1593 ; SSE2-NEXT:    paddd %xmm0, %xmm6
   1594 ; SSE2-NEXT:    movdqu (%rdi,%rcx,2), %xmm0
   1595 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
   1596 ; SSE2-NEXT:    movdqu (%rsi,%rcx,2), %xmm1
   1597 ; SSE2-NEXT:    paddd %xmm2, %xmm4
   1598 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
   1599 ; SSE2-NEXT:    pmulhuw %xmm0, %xmm2
   1600 ; SSE2-NEXT:    pmullw %xmm0, %xmm1
   1601 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   1602 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
   1603 ; SSE2-NEXT:    paddd %xmm0, %xmm8
   1604 ; SSE2-NEXT:    movdqu 16(%rdi,%rcx,2), %xmm0
   1605 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
   1606 ; SSE2-NEXT:    movdqu 16(%rsi,%rcx,2), %xmm2
   1607 ; SSE2-NEXT:    paddd %xmm1, %xmm3
   1608 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
   1609 ; SSE2-NEXT:    pmulhuw %xmm0, %xmm1
   1610 ; SSE2-NEXT:    pmullw %xmm0, %xmm2
   1611 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
   1612 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   1613 ; SSE2-NEXT:    paddd %xmm0, %xmm9
   1614 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
   1615 ; SSE2-NEXT:    paddd %xmm2, %xmm10
   1616 ; SSE2-NEXT:    addq $16, %rcx
   1617 ; SSE2-NEXT:    cmpq %rcx, %rax
   1618 ; SSE2-NEXT:    jne .LBB11_1
   1619 ; SSE2-NEXT:  # %bb.2: # %middle.block
   1620 ; SSE2-NEXT:    paddd %xmm6, %xmm3
   1621 ; SSE2-NEXT:    paddd %xmm7, %xmm10
   1622 ; SSE2-NEXT:    paddd %xmm3, %xmm10
   1623 ; SSE2-NEXT:    paddd %xmm4, %xmm8
   1624 ; SSE2-NEXT:    paddd %xmm5, %xmm9
   1625 ; SSE2-NEXT:    paddd %xmm10, %xmm9
   1626 ; SSE2-NEXT:    paddd %xmm8, %xmm9
   1627 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[2,3,0,1]
   1628 ; SSE2-NEXT:    paddd %xmm9, %xmm0
   1629 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   1630 ; SSE2-NEXT:    paddd %xmm0, %xmm1
   1631 ; SSE2-NEXT:    movd %xmm1, %eax
   1632 ; SSE2-NEXT:    retq
   1633 ;
   1634 ; AVX1-LABEL: test_unsigned_short_1024:
   1635 ; AVX1:       # %bb.0: # %entry
   1636 ; AVX1-NEXT:    movl %edx, %eax
   1637 ; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
   1638 ; AVX1-NEXT:    xorl %ecx, %ecx
   1639 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
   1640 ; AVX1-NEXT:    vpxor %xmm9, %xmm9, %xmm9
   1641 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
   1642 ; AVX1-NEXT:    .p2align 4, 0x90
   1643 ; AVX1-NEXT:  .LBB11_1: # %vector.body
   1644 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
   1645 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1646 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1647 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1648 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1649 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1650 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1651 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm10 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1652 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm11 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1653 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1654 ; AVX1-NEXT:    vpmulld %xmm4, %xmm1, %xmm1
   1655 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1656 ; AVX1-NEXT:    vpmulld %xmm5, %xmm4, %xmm4
   1657 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1658 ; AVX1-NEXT:    vpmulld %xmm6, %xmm5, %xmm5
   1659 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1660 ; AVX1-NEXT:    vpmulld %xmm7, %xmm6, %xmm6
   1661 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1662 ; AVX1-NEXT:    vpmulld %xmm0, %xmm7, %xmm13
   1663 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1664 ; AVX1-NEXT:    vpmulld %xmm12, %xmm7, %xmm7
   1665 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1666 ; AVX1-NEXT:    vpmulld %xmm10, %xmm0, %xmm10
   1667 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   1668 ; AVX1-NEXT:    vpmulld %xmm11, %xmm0, %xmm11
   1669 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
   1670 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
   1671 ; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm1
   1672 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm2
   1673 ; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm0
   1674 ; AVX1-NEXT:    vpaddd %xmm0, %xmm5, %xmm0
   1675 ; AVX1-NEXT:    vpaddd %xmm8, %xmm6, %xmm1
   1676 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm8
   1677 ; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm0
   1678 ; AVX1-NEXT:    vpaddd %xmm0, %xmm13, %xmm0
   1679 ; AVX1-NEXT:    vpaddd %xmm9, %xmm7, %xmm1
   1680 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm9
   1681 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm0
   1682 ; AVX1-NEXT:    vpaddd %xmm0, %xmm10, %xmm0
   1683 ; AVX1-NEXT:    vpaddd %xmm3, %xmm11, %xmm1
   1684 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm3
   1685 ; AVX1-NEXT:    addq $16, %rcx
   1686 ; AVX1-NEXT:    cmpq %rcx, %rax
   1687 ; AVX1-NEXT:    jne .LBB11_1
   1688 ; AVX1-NEXT:  # %bb.2: # %middle.block
   1689 ; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm0
   1690 ; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm1
   1691 ; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm4
   1692 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
   1693 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
   1694 ; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
   1695 ; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
   1696 ; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
   1697 ; AVX1-NEXT:    vpaddd %xmm0, %xmm9, %xmm0
   1698 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
   1699 ; AVX1-NEXT:    vpaddd %xmm0, %xmm8, %xmm0
   1700 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1701 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
   1702 ; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
   1703 ; AVX1-NEXT:    vmovd %xmm0, %eax
   1704 ; AVX1-NEXT:    vzeroupper
   1705 ; AVX1-NEXT:    retq
   1706 ;
   1707 ; AVX2-LABEL: test_unsigned_short_1024:
   1708 ; AVX2:       # %bb.0: # %entry
   1709 ; AVX2-NEXT:    movl %edx, %eax
   1710 ; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
   1711 ; AVX2-NEXT:    xorl %ecx, %ecx
   1712 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1713 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
   1714 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
   1715 ; AVX2-NEXT:    .p2align 4, 0x90
   1716 ; AVX2-NEXT:  .LBB11_1: # %vector.body
   1717 ; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
   1718 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
   1719 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
   1720 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
   1721 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
   1722 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
   1723 ; AVX2-NEXT:    vpmulld %ymm4, %ymm8, %ymm4
   1724 ; AVX2-NEXT:    vpaddd %ymm2, %ymm4, %ymm2
   1725 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
   1726 ; AVX2-NEXT:    vpmulld %ymm5, %ymm4, %ymm4
   1727 ; AVX2-NEXT:    vpaddd %ymm1, %ymm4, %ymm1
   1728 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
   1729 ; AVX2-NEXT:    vpmulld %ymm6, %ymm4, %ymm4
   1730 ; AVX2-NEXT:    vpaddd %ymm0, %ymm4, %ymm0
   1731 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
   1732 ; AVX2-NEXT:    vpmulld %ymm7, %ymm4, %ymm4
   1733 ; AVX2-NEXT:    vpaddd %ymm3, %ymm4, %ymm3
   1734 ; AVX2-NEXT:    addq $16, %rcx
   1735 ; AVX2-NEXT:    cmpq %rcx, %rax
   1736 ; AVX2-NEXT:    jne .LBB11_1
   1737 ; AVX2-NEXT:  # %bb.2: # %middle.block
   1738 ; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
   1739 ; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
   1740 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
   1741 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1742 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
   1743 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1744 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
   1745 ; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
   1746 ; AVX2-NEXT:    vmovd %xmm0, %eax
   1747 ; AVX2-NEXT:    vzeroupper
   1748 ; AVX2-NEXT:    retq
   1749 ;
   1750 ; AVX512-LABEL: test_unsigned_short_1024:
   1751 ; AVX512:       # %bb.0: # %entry
   1752 ; AVX512-NEXT:    movl %edx, %eax
   1753 ; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
   1754 ; AVX512-NEXT:    xorl %ecx, %ecx
   1755 ; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1756 ; AVX512-NEXT:    .p2align 4, 0x90
   1757 ; AVX512-NEXT:  .LBB11_1: # %vector.body
   1758 ; AVX512-NEXT:    # =>This Inner Loop Header: Depth=1
   1759 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
   1760 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
   1761 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
   1762 ; AVX512-NEXT:    vpmulld %zmm2, %zmm4, %zmm2
   1763 ; AVX512-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
   1764 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
   1765 ; AVX512-NEXT:    vpmulld %zmm3, %zmm2, %zmm2
   1766 ; AVX512-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
   1767 ; AVX512-NEXT:    addq $16, %rcx
   1768 ; AVX512-NEXT:    cmpq %rcx, %rax
   1769 ; AVX512-NEXT:    jne .LBB11_1
   1770 ; AVX512-NEXT:  # %bb.2: # %middle.block
   1771 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
   1772 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   1773 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
   1774 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1775 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
   1776 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1777 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
   1778 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   1779 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
   1780 ; AVX512-NEXT:    vmovd %xmm0, %eax
   1781 ; AVX512-NEXT:    vzeroupper
   1782 ; AVX512-NEXT:    retq
   1783 entry:
   1784   %3 = zext i32 %2 to i64
   1785   br label %vector.body
   1786 
   1787 vector.body:
   1788   %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
   1789   %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
   1790   %4 = getelementptr inbounds i16, i16* %0, i64 %index
   1791   %5 = bitcast i16* %4 to <32 x i16>*
   1792   %wide.load = load <32 x i16>, <32 x i16>* %5, align 2
   1793   %6 = zext <32 x i16> %wide.load to <32 x i32>
   1794   %7 = getelementptr inbounds i16, i16* %1, i64 %index
   1795   %8 = bitcast i16* %7 to <32 x i16>*
   1796   %wide.load14 = load <32 x i16>, <32 x i16>* %8, align 2
   1797   %9 = zext <32 x i16> %wide.load14 to <32 x i32>
   1798   %10 = mul nsw <32 x i32> %9, %6
   1799   %11 = add nsw <32 x i32> %10, %vec.phi
   1800   %index.next = add i64 %index, 16
   1801   %12 = icmp eq i64 %index.next, %3
   1802   br i1 %12, label %middle.block, label %vector.body
   1803 
   1804 middle.block:
   1805   %rdx.shuf2 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1806   %bin.rdx2 = add <32 x i32> %11, %rdx.shuf2
   1807   %rdx.shuf1 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1808   %bin.rdx1 = add <32 x i32> %bin.rdx2, %rdx.shuf1
   1809   %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1810   %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
   1811   %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1812   %bin.rdx16 = add <32 x i32> %bin.rdx, %rdx.shuf15
   1813   %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx16, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1814   %bin.rdx18 = add <32 x i32> %bin.rdx16, %rdx.shuf17
   1815   %13 = extractelement <32 x i32> %bin.rdx18, i32 0
   1816   ret i32 %13
   1817 }
   1818 
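         ; The pmaddwd_* tests check that an odd/even shuffle of sign-extended
         ; products folded into an add is recognized as PMADDWD. Per lane, PMADDWD
         ; computes roughly the following (illustrative C-style sketch, not part of
         ; the test):
         ;   for (i = 0; i < Lanes/2; ++i)
         ;     r[i] = (int32_t)a[2*i]   * (int32_t)b[2*i]
         ;          + (int32_t)a[2*i+1] * (int32_t)b[2*i+1];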
   1819 define <4 x i32> @pmaddwd_8(<8 x i16> %A, <8 x i16> %B) {
   1820 ; SSE2-LABEL: pmaddwd_8:
   1821 ; SSE2:       # %bb.0:
   1822 ; SSE2-NEXT:    pmaddwd %xmm1, %xmm0
   1823 ; SSE2-NEXT:    retq
   1824 ;
   1825 ; AVX-LABEL: pmaddwd_8:
   1826 ; AVX:       # %bb.0:
   1827 ; AVX-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
   1828 ; AVX-NEXT:    retq
   1829    %a = sext <8 x i16> %A to <8 x i32>
   1830    %b = sext <8 x i16> %B to <8 x i32>
   1831    %m = mul nsw <8 x i32> %a, %b
   1832    %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   1833    %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   1834    %ret = add <4 x i32> %odd, %even
   1835    ret <4 x i32> %ret
   1836 }
   1837 
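         ; Same pattern with the add operands swapped; add is commutative, so the
         ; even+odd order must still match PMADDWD.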
   1838 define <4 x i32> @pmaddwd_8_swapped(<8 x i16> %A, <8 x i16> %B) {
   1839 ; SSE2-LABEL: pmaddwd_8_swapped:
   1840 ; SSE2:       # %bb.0:
   1841 ; SSE2-NEXT:    pmaddwd %xmm1, %xmm0
   1842 ; SSE2-NEXT:    retq
   1843 ;
   1844 ; AVX-LABEL: pmaddwd_8_swapped:
   1845 ; AVX:       # %bb.0:
   1846 ; AVX-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
   1847 ; AVX-NEXT:    retq
   1848    %a = sext <8 x i16> %A to <8 x i32>
   1849    %b = sext <8 x i16> %B to <8 x i32>
   1850    %m = mul nsw <8 x i32> %a, %b
   1851    %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   1852    %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   1853    %ret = add <4 x i32> %even, %odd
   1854    ret <4 x i32> %ret
   1855 }
   1856 
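         ; Here the products come from a 16-wide multiply, but only the low eight
         ; feed the reduction. AVX1/AVX2 narrow the operands (vpackssdw) and still
         ; form a 128-bit vpmaddwd; the AVX512 lowering currently misses the
         ; pattern and falls back to an extract/insert sequence.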
   1857 define <4 x i32> @larger_mul(<16 x i16> %A, <16 x i16> %B) {
   1858 ; SSE2-LABEL: larger_mul:
   1859 ; SSE2:       # %bb.0:
   1860 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
   1861 ; SSE2-NEXT:    pmulhw %xmm2, %xmm1
   1862 ; SSE2-NEXT:    pmullw %xmm2, %xmm0
   1863 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   1864 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
   1865 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   1866 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
   1867 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
   1868 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
   1869 ; SSE2-NEXT:    paddd %xmm1, %xmm0
   1870 ; SSE2-NEXT:    retq
   1871 ;
   1872 ; AVX1-LABEL: larger_mul:
   1873 ; AVX1:       # %bb.0:
   1874 ; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2
   1875 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
   1876 ; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
   1877 ; AVX1-NEXT:    vpackssdw %xmm0, %xmm2, %xmm0
   1878 ; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm2
   1879 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   1880 ; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
   1881 ; AVX1-NEXT:    vpackssdw %xmm1, %xmm2, %xmm1
   1882 ; AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
   1883 ; AVX1-NEXT:    vzeroupper
   1884 ; AVX1-NEXT:    retq
   1885 ;
   1886 ; AVX2-LABEL: larger_mul:
   1887 ; AVX2:       # %bb.0:
   1888 ; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
   1889 ; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
   1890 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
   1891 ; AVX2-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
   1892 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
   1893 ; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
   1894 ; AVX2-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
   1895 ; AVX2-NEXT:    vzeroupper
   1896 ; AVX2-NEXT:    retq
   1897 ;
   1898 ; AVX512-LABEL: larger_mul:
   1899 ; AVX512:       # %bb.0:
   1900 ; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
   1901 ; AVX512-NEXT:    vpmovsxwd %ymm1, %zmm1
   1902 ; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
   1903 ; AVX512-NEXT:    vpextrd $2, %xmm0, %eax
   1904 ; AVX512-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm1
   1905 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
   1906 ; AVX512-NEXT:    vmovd %xmm2, %eax
   1907 ; AVX512-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
   1908 ; AVX512-NEXT:    vpextrd $2, %xmm2, %eax
   1909 ; AVX512-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
   1910 ; AVX512-NEXT:    vpextrd $3, %xmm0, %eax
   1911 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
   1912 ; AVX512-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
   1913 ; AVX512-NEXT:    vpextrd $1, %xmm2, %eax
   1914 ; AVX512-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
   1915 ; AVX512-NEXT:    vpextrd $3, %xmm2, %eax
   1916 ; AVX512-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
   1917 ; AVX512-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
   1918 ; AVX512-NEXT:    vzeroupper
   1919 ; AVX512-NEXT:    retq
   1920    %a = sext <16 x i16> %A to <16 x i32>
   1921    %b = sext <16 x i16> %B to <16 x i32>
   1922    %m = mul nsw <16 x i32> %a, %b
   1923    %odd = shufflevector <16 x i32> %m, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   1924    %even = shufflevector <16 x i32> %m, <16 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   1925    %ret = add <4 x i32> %odd, %even
   1926    ret <4 x i32> %ret
   1927 }
   1928 
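         ; 256-bit version: AVX1 splits into two 128-bit vpmaddwd ops, while AVX2
         ; and later use a single ymm vpmaddwd.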
   1929 define <8 x i32> @pmaddwd_16(<16 x i16> %A, <16 x i16> %B) {
   1930 ; SSE2-LABEL: pmaddwd_16:
   1931 ; SSE2:       # %bb.0:
   1932 ; SSE2-NEXT:    pmaddwd %xmm2, %xmm0
   1933 ; SSE2-NEXT:    pmaddwd %xmm3, %xmm1
   1934 ; SSE2-NEXT:    retq
   1935 ;
   1936 ; AVX1-LABEL: pmaddwd_16:
   1937 ; AVX1:       # %bb.0:
   1938 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   1939 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
   1940 ; AVX1-NEXT:    vpmaddwd %xmm3, %xmm2, %xmm2
   1941 ; AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
   1942 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1943 ; AVX1-NEXT:    retq
   1944 ;
   1945 ; AVX256-LABEL: pmaddwd_16:
   1946 ; AVX256:       # %bb.0:
   1947 ; AVX256-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
   1948 ; AVX256-NEXT:    retq
   1949    %a = sext <16 x i16> %A to <16 x i32>
   1950    %b = sext <16 x i16> %B to <16 x i32>
   1951    %m = mul nsw <16 x i32> %a, %b
   1952    %odd = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
   1953    %even = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
   1954    %ret = add <8 x i32> %odd, %even
   1955    ret <8 x i32> %ret
   1956 }
   1957 
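         ; 512-bit version: SSE2 needs four xmm pmaddwd ops and AVX2 two ymm ops;
         ; AVX512F (which has no zmm word multiplies) splits into two ymm
         ; vpmaddwd, and only AVX512BW emits a single zmm vpmaddwd.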
   1958 define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) {
   1959 ; SSE2-LABEL: pmaddwd_32:
   1960 ; SSE2:       # %bb.0:
   1961 ; SSE2-NEXT:    pmaddwd %xmm4, %xmm0
   1962 ; SSE2-NEXT:    pmaddwd %xmm5, %xmm1
   1963 ; SSE2-NEXT:    pmaddwd %xmm6, %xmm2
   1964 ; SSE2-NEXT:    pmaddwd %xmm7, %xmm3
   1965 ; SSE2-NEXT:    retq
   1966 ;
   1967 ; AVX1-LABEL: pmaddwd_32:
   1968 ; AVX1:       # %bb.0:
   1969 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
   1970 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
   1971 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
   1972 ; AVX1-NEXT:    vpmaddwd %xmm6, %xmm4, %xmm4
   1973 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
   1974 ; AVX1-NEXT:    vpmaddwd %xmm6, %xmm5, %xmm5
   1975 ; AVX1-NEXT:    vpmaddwd %xmm2, %xmm0, %xmm0
   1976 ; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
   1977 ; AVX1-NEXT:    vpmaddwd %xmm3, %xmm1, %xmm1
   1978 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
   1979 ; AVX1-NEXT:    retq
   1980 ;
   1981 ; AVX2-LABEL: pmaddwd_32:
   1982 ; AVX2:       # %bb.0:
   1983 ; AVX2-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
   1984 ; AVX2-NEXT:    vpmaddwd %ymm3, %ymm1, %ymm1
   1985 ; AVX2-NEXT:    retq
   1986 ;
   1987 ; AVX512F-LABEL: pmaddwd_32:
   1988 ; AVX512F:       # %bb.0:
   1989 ; AVX512F-NEXT:    vpmaddwd %ymm3, %ymm1, %ymm1
   1990 ; AVX512F-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
   1991 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
   1992 ; AVX512F-NEXT:    retq
   1993 ;
   1994 ; AVX512BW-LABEL: pmaddwd_32:
   1995 ; AVX512BW:       # %bb.0:
   1996 ; AVX512BW-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm0
   1997 ; AVX512BW-NEXT:    retq
   1998    %a = sext <32 x i16> %A to <32 x i32>
   1999    %b = sext <32 x i16> %B to <32 x i32>
   2000    %m = mul nsw <32 x i32> %a, %b
   2001    %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
   2002    %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
   2003    %ret = add <16 x i32> %odd, %even
   2004    ret <16 x i32> %ret
   2005 }
   2006 
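         ; A constant multiplicand also matches as long as every element fits in
         ; i16; here the constant vector is folded straight into the pmaddwd
         ; memory operand.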
   2007 define <4 x i32> @pmaddwd_const(<8 x i16> %A) {
   2008 ; SSE2-LABEL: pmaddwd_const:
   2009 ; SSE2:       # %bb.0:
   2010 ; SSE2-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
   2011 ; SSE2-NEXT:    retq
   2012 ;
   2013 ; AVX-LABEL: pmaddwd_const:
   2014 ; AVX:       # %bb.0:
   2015 ; AVX-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
   2016 ; AVX-NEXT:    retq
   2017    %a = sext <8 x i16> %A to <8 x i32>
   2018    %m = mul nsw <8 x i32> %a, <i32 32767, i32 -32768, i32 0, i32 0, i32 1, i32 7, i32 42, i32 32>
   2019    %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   2020    %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   2021    %ret = add <4 x i32> %odd, %even
   2022    ret <4 x i32> %ret
   2023 }
   2024 
    2025 ; Do not select PMADDWD here: the multiply is unsigned (zext + mul nuw), but PMADDWD is a signed-multiply instruction.
   2026 define <4 x i32> @pmaddwd_negative1(<8 x i16> %A, <8 x i16> %B) {
   2027 ; SSE2-LABEL: pmaddwd_negative1:
   2028 ; SSE2:       # %bb.0:
   2029 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   2030 ; SSE2-NEXT:    pmulhuw %xmm1, %xmm2
   2031 ; SSE2-NEXT:    pmullw %xmm1, %xmm0
   2032 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
   2033 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
   2034 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
   2035 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   2036 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
   2037 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
   2038 ; SSE2-NEXT:    paddd %xmm2, %xmm0
   2039 ; SSE2-NEXT:    retq
   2040 ;
   2041 ; AVX1-LABEL: pmaddwd_negative1:
   2042 ; AVX1:       # %bb.0:
   2043 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
   2044 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   2045 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   2046 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
   2047 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
   2048 ; AVX1-NEXT:    vpmulld %xmm3, %xmm2, %xmm2
   2049 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
   2050 ; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
   2051 ; AVX1-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
   2052 ; AVX1-NEXT:    retq
   2053 ;
   2054 ; AVX256-LABEL: pmaddwd_negative1:
   2055 ; AVX256:       # %bb.0:
   2056 ; AVX256-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2057 ; AVX256-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2058 ; AVX256-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
   2059 ; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
   2060 ; AVX256-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
   2061 ; AVX256-NEXT:    vzeroupper
   2062 ; AVX256-NEXT:    retq
   2063    %a = zext <8 x i16> %A to <8 x i32>
   2064    %b = zext <8 x i16> %B to <8 x i32>
   2065    %m = mul nuw <8 x i32> %a, %b
   2066    %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   2067    %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   2068    %ret = add <4 x i32> %odd, %even
   2069    ret <4 x i32> %ret
   2070 }
   2071 
    2072 ; Do not select PMADDWD if a constant operand does not fit in i16 (32768 is out of the signed 16-bit range).
define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) {
; SSE2-LABEL: pmaddwd_negative2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,7,42,32]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,4294934528,0,0]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; AVX1-LABEL: pmaddwd_negative2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX256-LABEL: pmaddwd_negative2:
; AVX256:       # %bb.0:
; AVX256-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX256-NEXT:    vpmulld {{.*}}(%rip), %ymm0, %ymm0
; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT:    vzeroupper
; AVX256-NEXT:    retq
   %a = sext <8 x i16> %A to <8 x i32>
   %m = mul nsw <8 x i32> %a, <i32 32768, i32 -32768, i32 0, i32 0, i32 1, i32 7, i32 42, i32 32>
   %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   %ret = add <4 x i32> %even, %odd
   ret <4 x i32> %ret
}

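; The tests below jumble the order of the even/odd shuffle indices. As long
; as each 32-bit result still adds together one pair of adjacent products,
; pmaddwd should still be selected.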
define <4 x i32> @jumbled_indices4(<8 x i16> %A, <8 x i16> %B) {
; SSE2-LABEL: jumbled_indices4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pmaddwd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; AVX-LABEL: jumbled_indices4:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %exta = sext <8 x i16> %A to <8 x i32>
  %extb = sext <8 x i16> %B to <8 x i32>
  %m = mul <8 x i32> %exta, %extb
  %sa = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 3, i32 1, i32 5, i32 6>
  %sb = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 2, i32 0, i32 4, i32 7>
  %a = add <4 x i32> %sa, %sb
  ret <4 x i32> %a
}

define <8 x i32> @jumbled_indices8(<16 x i16> %A, <16 x i16> %B) {
; SSE2-LABEL: jumbled_indices8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pmaddwd %xmm2, %xmm0
; SSE2-NEXT:    pmaddwd %xmm3, %xmm1
; SSE2-NEXT:    retq
;
; AVX1-LABEL: jumbled_indices8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpmaddwd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX256-LABEL: jumbled_indices8:
; AVX256:       # %bb.0:
; AVX256-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
; AVX256-NEXT:    retq
  %exta = sext <16 x i16> %A to <16 x i32>
  %extb = sext <16 x i16> %B to <16 x i32>
  %m = mul <16 x i32> %exta, %extb
  %sa = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 7, i32 4, i32 11, i32 8, i32 15, i32 12>
  %sb = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 6, i32 5, i32 10, i32 9, i32 14, i32 13>
  %a = add <8 x i32> %sa, %sb
  ret <8 x i32> %a
}

define <16 x i32> @jumbled_indices16(<32 x i16> %A, <32 x i16> %B) {
; SSE2-LABEL: jumbled_indices16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pmaddwd %xmm4, %xmm0
; SSE2-NEXT:    pmaddwd %xmm5, %xmm1
; SSE2-NEXT:    pmaddwd %xmm6, %xmm2
; SSE2-NEXT:    pmaddwd %xmm7, %xmm3
; SSE2-NEXT:    retq
;
; AVX1-LABEL: jumbled_indices16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
; AVX1-NEXT:    vpmaddwd %xmm6, %xmm4, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
; AVX1-NEXT:    vpmaddwd %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vpmaddwd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
; AVX1-NEXT:    vpmaddwd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: jumbled_indices16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmaddwd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: jumbled_indices16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmaddwd %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: jumbled_indices16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %exta = sext <32 x i16> %A to <32 x i32>
  %extb = sext <32 x i16> %B to <32 x i32>
  %m = mul <32 x i32> %exta, %extb
  %sa = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 2, i32 0, i32 5, i32 6, i32 11, i32 9, i32 15, i32 12, i32 17, i32 18, i32 20, i32 23, i32 27, i32 24, i32 31, i32 29>
  %sb = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 3, i32 1, i32 4, i32 7, i32 10, i32 8, i32 14, i32 13, i32 16, i32 19, i32 21, i32 22, i32 26, i32 25, i32 30, i32 28>
  %a = add <16 x i32> %sa, %sb
  ret <16 x i32> %a
}

define <32 x i32> @jumbled_indices32(<64 x i16> %A, <64 x i16> %B) {
; SSE2-LABEL: jumbled_indices32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm0
; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm1
; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm2
; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm3
; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm4
; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm5
; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm6
; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm7
; SSE2-NEXT:    movdqa %xmm7, 112(%rdi)
; SSE2-NEXT:    movdqa %xmm6, 96(%rdi)
; SSE2-NEXT:    movdqa %xmm5, 80(%rdi)
; SSE2-NEXT:    movdqa %xmm4, 64(%rdi)
; SSE2-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE2-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    movq %rdi, %rax
; SSE2-NEXT:    retq
;
; AVX1-LABEL: jumbled_indices32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm8
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm9
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm10
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm11
; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm12
; AVX1-NEXT:    vpmaddwd %xmm12, %xmm8, %xmm8
; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm12
; AVX1-NEXT:    vpmaddwd %xmm12, %xmm9, %xmm9
; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm12
; AVX1-NEXT:    vpmaddwd %xmm12, %xmm10, %xmm10
; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm12
; AVX1-NEXT:    vpmaddwd %xmm12, %xmm11, %xmm11
; AVX1-NEXT:    vpmaddwd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm0
; AVX1-NEXT:    vpmaddwd %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm1, %ymm1
; AVX1-NEXT:    vpmaddwd %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm2, %ymm2
; AVX1-NEXT:    vpmaddwd %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm3, %ymm3
; AVX1-NEXT:    retq
;
; AVX2-LABEL: jumbled_indices32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmaddwd %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpmaddwd %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpmaddwd %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpmaddwd %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: jumbled_indices32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmaddwd %ymm5, %ymm1, %ymm1
; AVX512F-NEXT:    vpmaddwd %ymm4, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpmaddwd %ymm7, %ymm3, %ymm1
; AVX512F-NEXT:    vpmaddwd %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: jumbled_indices32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmaddwd %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmaddwd %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    retq
  %exta = sext <64 x i16> %A to <64 x i32>
  %extb = sext <64 x i16> %B to <64 x i32>
  %m = mul <64 x i32> %exta, %extb
  %sa = shufflevector <64 x i32> %m, <64 x i32> undef, <32 x i32> <i32 1, i32 2, i32 6, i32 5, i32 10, i32 8, i32 14, i32 12, i32 19, i32 17, i32 22, i32 20, i32 25, i32 27, i32 30, i32 28, i32 32, i32 34, i32 37, i32 38, i32 41, i32 43, i32 45, i32 47, i32 50, i32 48, i32 52, i32 54, i32 59, i32 56, i32 61, i32 63>
  %sb = shufflevector <64 x i32> %m, <64 x i32> undef, <32 x i32> <i32 0, i32 3, i32 7, i32 4, i32 11, i32 9, i32 15, i32 13, i32 18, i32 16, i32 23, i32 21, i32 24, i32 26, i32 31, i32 29, i32 33, i32 35, i32 36, i32 39, i32 40, i32 42, i32 44, i32 46, i32 51, i32 49, i32 53, i32 55, i32 58, i32 57, i32 60, i32 62>
  %a = add <32 x i32> %sa, %sb
  ret <32 x i32> %a
}

; NOTE: We're testing with loads because ABI lowering creates a concat_vectors that extract_vector_elt creation can see through.
; This would require the combine to recreate the concat_vectors.
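; The loads feed the canonical pattern: split A and B into even and odd
; elements, sign-extend both halves, multiply, and add the two products.
; This is exactly the operation pmaddwd performs, tested here at result
; widths from 128 up to 1024 bits.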
define <4 x i32> @pmaddwd_128(<8 x i16>* %Aptr, <8 x i16>* %Bptr) {
; SSE2-LABEL: pmaddwd_128:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pmaddwd (%rsi), %xmm0
; SSE2-NEXT:    retq
;
; AVX-LABEL: pmaddwd_128:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <8 x i16>, <8 x i16>* %Aptr
  %B = load <8 x i16>, <8 x i16>* %Bptr
  %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %A_even_ext = sext <4 x i16> %A_even to <4 x i32>
  %B_even_ext = sext <4 x i16> %B_even to <4 x i32>
  %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32>
  %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32>
  %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <4 x i32> %A_odd_ext, %B_odd_ext
  %add = add <4 x i32> %even_mul, %odd_mul
  ret <4 x i32> %add
}

define <8 x i32> @pmaddwd_256(<16 x i16>* %Aptr, <16 x i16>* %Bptr) {
; SSE2-LABEL: pmaddwd_256:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
; SSE2-NEXT:    pmaddwd (%rsi), %xmm0
; SSE2-NEXT:    pmaddwd 16(%rsi), %xmm1
; SSE2-NEXT:    retq
;
; AVX1-LABEL: pmaddwd_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vmovdqa (%rsi), %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpmaddwd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX256-LABEL: pmaddwd_256:
; AVX256:       # %bb.0:
; AVX256-NEXT:    vmovdqa (%rdi), %ymm0
; AVX256-NEXT:    vpmaddwd (%rsi), %ymm0, %ymm0
; AVX256-NEXT:    retq
  %A = load <16 x i16>, <16 x i16>* %Aptr
  %B = load <16 x i16>, <16 x i16>* %Bptr
  %A_even = shufflevector <16 x i16> %A, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i16> %A, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i16> %B, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i16> %B, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = sext <8 x i16> %A_even to <8 x i32>
  %B_even_ext = sext <8 x i16> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i16> %A_odd to <8 x i32>
  %B_odd_ext = sext <8 x i16> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  ret <8 x i32> %add
}

define <16 x i32> @pmaddwd_512(<32 x i16>* %Aptr, <32 x i16>* %Bptr) {
; SSE2-LABEL: pmaddwd_512:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
; SSE2-NEXT:    movdqa 32(%rdi), %xmm2
; SSE2-NEXT:    movdqa 48(%rdi), %xmm3
; SSE2-NEXT:    pmaddwd (%rsi), %xmm0
; SSE2-NEXT:    pmaddwd 16(%rsi), %xmm1
; SSE2-NEXT:    pmaddwd 32(%rsi), %xmm2
; SSE2-NEXT:    pmaddwd 48(%rsi), %xmm3
; SSE2-NEXT:    retq
;
; AVX1-LABEL: pmaddwd_512:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX1-NEXT:    vmovdqa (%rsi), %ymm2
; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpmaddwd %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpmaddwd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpmaddwd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: pmaddwd_512:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT:    vpmaddwd (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpmaddwd 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: pmaddwd_512:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vpmaddwd 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT:    vpmaddwd (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: pmaddwd_512:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpmaddwd (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %A = load <32 x i16>, <32 x i16>* %Aptr
  %B = load <32 x i16>, <32 x i16>* %Bptr
  %A_even = shufflevector <32 x i16> %A, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %A_odd = shufflevector <32 x i16> %A, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %B_even = shufflevector <32 x i16> %B, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %B_odd = shufflevector <32 x i16> %B, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %A_even_ext = sext <16 x i16> %A_even to <16 x i32>
  %B_even_ext = sext <16 x i16> %B_even to <16 x i32>
  %A_odd_ext = sext <16 x i16> %A_odd to <16 x i32>
  %B_odd_ext = sext <16 x i16> %B_odd to <16 x i32>
  %even_mul = mul <16 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <16 x i32> %A_odd_ext, %B_odd_ext
  %add = add <16 x i32> %even_mul, %odd_mul
  ret <16 x i32> %add
}

define <32 x i32> @pmaddwd_1024(<64 x i16>* %Aptr, <64 x i16>* %Bptr) {
; SSE2-LABEL: pmaddwd_1024:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa 112(%rsi), %xmm0
; SSE2-NEXT:    movdqa 96(%rsi), %xmm1
; SSE2-NEXT:    movdqa 80(%rsi), %xmm2
; SSE2-NEXT:    movdqa 64(%rsi), %xmm3
; SSE2-NEXT:    movdqa (%rsi), %xmm4
; SSE2-NEXT:    movdqa 16(%rsi), %xmm5
; SSE2-NEXT:    movdqa 32(%rsi), %xmm6
; SSE2-NEXT:    movdqa 48(%rsi), %xmm7
; SSE2-NEXT:    pmaddwd (%rdx), %xmm4
; SSE2-NEXT:    pmaddwd 16(%rdx), %xmm5
; SSE2-NEXT:    pmaddwd 32(%rdx), %xmm6
; SSE2-NEXT:    pmaddwd 48(%rdx), %xmm7
; SSE2-NEXT:    pmaddwd 64(%rdx), %xmm3
; SSE2-NEXT:    pmaddwd 80(%rdx), %xmm2
; SSE2-NEXT:    pmaddwd 96(%rdx), %xmm1
; SSE2-NEXT:    pmaddwd 112(%rdx), %xmm0
; SSE2-NEXT:    movdqa %xmm0, 112(%rdi)
; SSE2-NEXT:    movdqa %xmm1, 96(%rdi)
; SSE2-NEXT:    movdqa %xmm2, 80(%rdi)
; SSE2-NEXT:    movdqa %xmm3, 64(%rdi)
; SSE2-NEXT:    movdqa %xmm7, 48(%rdi)
; SSE2-NEXT:    movdqa %xmm6, 32(%rdi)
; SSE2-NEXT:    movdqa %xmm5, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm4, (%rdi)
; SSE2-NEXT:    movq %rdi, %rax
; SSE2-NEXT:    retq
;
; AVX1-LABEL: pmaddwd_1024:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX1-NEXT:    vmovdqa 64(%rdi), %ymm2
; AVX1-NEXT:    vmovdqa 96(%rdi), %ymm8
; AVX1-NEXT:    vmovdqa (%rsi), %ymm4
; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm5
; AVX1-NEXT:    vmovdqa 64(%rsi), %ymm6
; AVX1-NEXT:    vmovdqa 96(%rsi), %ymm9
; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
; AVX1-NEXT:    vpmaddwd %xmm3, %xmm7, %xmm3
; AVX1-NEXT:    vpmaddwd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpmaddwd %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpmaddwd %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm4
; AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpmaddwd %xmm9, %xmm8, %xmm4
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT:    retq
;
; AVX2-LABEL: pmaddwd_1024:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm2
; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm3
; AVX2-NEXT:    vpmaddwd (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vpmaddwd 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT:    vpmaddwd 64(%rsi), %ymm2, %ymm2
; AVX2-NEXT:    vpmaddwd 96(%rsi), %ymm3, %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: pmaddwd_1024:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm2
; AVX512F-NEXT:    vmovdqa 96(%rdi), %ymm3
; AVX512F-NEXT:    vpmaddwd 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT:    vpmaddwd (%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpmaddwd 96(%rsi), %ymm3, %ymm1
; AVX512F-NEXT:    vpmaddwd 64(%rsi), %ymm2, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: pmaddwd_1024:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
; AVX512BW-NEXT:    vpmaddwd (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vpmaddwd 64(%rsi), %zmm1, %zmm1
; AVX512BW-NEXT:    retq
  %A = load <64 x i16>, <64 x i16>* %Aptr
  %B = load <64 x i16>, <64 x i16>* %Bptr
  %A_even = shufflevector <64 x i16> %A, <64 x i16> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  %A_odd = shufflevector <64 x i16> %A, <64 x i16> undef, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
  %B_even = shufflevector <64 x i16> %B, <64 x i16> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  %B_odd = shufflevector <64 x i16> %B, <64 x i16> undef, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
  %A_even_ext = sext <32 x i16> %A_even to <32 x i32>
  %B_even_ext = sext <32 x i16> %B_even to <32 x i32>
  %A_odd_ext = sext <32 x i16> %A_odd to <32 x i32>
  %B_odd_ext = sext <32 x i16> %B_odd to <32 x i32>
  %even_mul = mul <32 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <32 x i32> %A_odd_ext, %B_odd_ext
  %add = add <32 x i32> %even_mul, %odd_mul
  ret <32 x i32> %add
}

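; As above, but the operands of the odd multiply are commuted. Multiplication
; is commutative, so pmaddwd should still be selected.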
define <4 x i32> @pmaddwd_commuted_mul(<8 x i16>* %Aptr, <8 x i16>* %Bptr) {
; SSE2-LABEL: pmaddwd_commuted_mul:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pmaddwd (%rsi), %xmm0
; SSE2-NEXT:    retq
;
; AVX-LABEL: pmaddwd_commuted_mul:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <8 x i16>, <8 x i16>* %Aptr
  %B = load <8 x i16>, <8 x i16>* %Bptr
  %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %A_even_ext = sext <4 x i16> %A_even to <4 x i32>
  %B_even_ext = sext <4 x i16> %B_even to <4 x i32>
  %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32>
  %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32>
  %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <4 x i32> %B_odd_ext, %A_odd_ext ; Different order than previous mul
  %add = add <4 x i32> %even_mul, %odd_mul
  ret <4 x i32> %add
}

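; The shuffle indices aren't literally even/odd, but A and B use the same
; masks and every result lane still pairs element 2*k with element 2*k+1,
; so pmaddwd should still be selected.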
define <4 x i32> @pmaddwd_swapped_indices(<8 x i16>* %Aptr, <8 x i16>* %Bptr) {
; SSE2-LABEL: pmaddwd_swapped_indices:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pmaddwd (%rsi), %xmm0
; SSE2-NEXT:    retq
;
; AVX-LABEL: pmaddwd_swapped_indices:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <8 x i16>, <8 x i16>* %Aptr
  %B = load <8 x i16>, <8 x i16>* %Bptr
  %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 6> ; indices aren't all even
  %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 7> ; indices aren't all odd
  %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 6> ; same indices as A
  %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 7> ; same indices as A
  %A_even_ext = sext <4 x i16> %A_even to <4 x i32>
  %B_even_ext = sext <4 x i16> %B_even to <4 x i32>
  %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32>
  %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32>
  %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <4 x i32> %A_odd_ext, %B_odd_ext
  %add = add <4 x i32> %even_mul, %odd_mul
  ret <4 x i32> %add
}

; Negative test where the indices aren't paired properly
define <4 x i32> @pmaddwd_bad_indices(<8 x i16>* %Aptr, <8 x i16>* %Bptr) {
; SSE2-LABEL: pmaddwd_bad_indices:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    movdqa (%rsi), %xmm1
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm0[2,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pmulhw %xmm2, %xmm4
; SSE2-NEXT:    pmullw %xmm2, %xmm3
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pmulhw %xmm1, %xmm2
; SSE2-NEXT:    pmullw %xmm1, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    paddd %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; AVX-LABEL: pmaddwd_bad_indices:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa (%rsi), %xmm1
; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[2,3,4,5,10,11,12,13,12,13,10,11,12,13,14,15]
; AVX-NEXT:    vpmovsxwd %xmm2, %xmm2
; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT:    vpmovsxwd %xmm3, %xmm3
; AVX-NEXT:    vpmulld %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,8,9,14,15,8,9,14,15,12,13,14,15]
; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX-NEXT:    vpmovsxwd %xmm1, %xmm1
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
  %A = load <8 x i16>, <8 x i16>* %Aptr
  %B = load <8 x i16>, <8 x i16>* %Bptr
  %A_even = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
  %A_odd = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
  %B_even = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ; different indices than A
  %B_odd = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> ; different indices than A
  %A_even_ext = sext <4 x i16> %A_even to <4 x i32>
  %B_even_ext = sext <4 x i16> %B_even to <4 x i32>
  %A_odd_ext = sext <4 x i16> %A_odd to <4 x i32>
  %B_odd_ext = sext <4 x i16> %B_odd to <4 x i32>
  %even_mul = mul <4 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <4 x i32> %A_odd_ext, %B_odd_ext
  %add = add <4 x i32> %even_mul, %odd_mul
  ret <4 x i32> %add
}