; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw,avx512dq,prefer-256-bit | FileCheck %s

; This file primarily contains tests for specific places in X86ISelLowering.cpp that needed to be made aware of the legalizer not allowing 512-bit vectors due to prefer-256-bit even though AVX512 is enabled.
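;
; A minimal sketch of the pattern being exercised (illustrative only; @sketch256 is
; hypothetical and is not one of the checked functions below): under
; "required-vector-width"="256" the legalizer splits a <16 x i32> operation into two
; 256-bit (ymm) halves, while "required-vector-width"="512" permits a single 512-bit
; (zmm) operation.
;
;   define void @sketch256(<16 x i32>* %p, <16 x i32>* %q) "required-vector-width"="256" {
;     %x = load <16 x i32>, <16 x i32>* %p   ; legalized as two <8 x i32> ymm loads
;     %y = add <16 x i32> %x, %x             ; lowered to two vpaddd ymm instructions
;     store <16 x i32> %y, <16 x i32>* %q
;     ret void
;   }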

define void @add256(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-vector-width"="256" {
; CHECK-LABEL: add256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vpaddd (%rsi), %ymm0, %ymm0
; CHECK-NEXT:    vpaddd 32(%rsi), %ymm1, %ymm1
; CHECK-NEXT:    vmovdqa %ymm1, 32(%rdx)
; CHECK-NEXT:    vmovdqa %ymm0, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %d = load <16 x i32>, <16 x i32>* %a
  %e = load <16 x i32>, <16 x i32>* %b
  %f = add <16 x i32> %d, %e
  store <16 x i32> %f, <16 x i32>* %c
  ret void
}

define void @add512(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-vector-width"="512" {
; CHECK-LABEL: add512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpaddd (%rsi), %zmm0, %zmm0
; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %d = load <16 x i32>, <16 x i32>* %a
  %e = load <16 x i32>, <16 x i32>* %b
  %f = add <16 x i32> %d, %e
  store <16 x i32> %f, <16 x i32>* %c
  ret void
}

define void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"="256" {
; CHECK-LABEL: avg_v64i8_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rsi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rsi), %ymm1
; CHECK-NEXT:    vpavgb (%rdi), %ymm0, %ymm0
; CHECK-NEXT:    vpavgb 32(%rdi), %ymm1, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, (%rax)
; CHECK-NEXT:    vmovdqu %ymm0, (%rax)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = load <64 x i8>, <64 x i8>* %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <64 x i32> %5, %4
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}


define void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"="512" {
; CHECK-LABEL: avg_v64i8_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm0
; CHECK-NEXT:    vpavgb (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, (%rax)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = load <64 x i8>, <64 x i8>* %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <64 x i32> %5, %4
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}

define void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "required-vector-width"="256" {
; CHECK-LABEL: pmaddwd_32_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vpmaddwd (%rsi), %ymm0, %ymm0
; CHECK-NEXT:    vpmaddwd 32(%rsi), %ymm1, %ymm1
; CHECK-NEXT:    vmovdqa %ymm1, 32(%rdx)
; CHECK-NEXT:    vmovdqa %ymm0, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
   %A = load <32 x i16>, <32 x i16>* %APtr
   %B = load <32 x i16>, <32 x i16>* %BPtr
   %a = sext <32 x i16> %A to <32 x i32>
   %b = sext <32 x i16> %B to <32 x i32>
   %m = mul nsw <32 x i32> %a, %b
   %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
   %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
   %ret = add <16 x i32> %odd, %even
   store <16 x i32> %ret, <16 x i32>* %CPtr
   ret void
}

define void @pmaddwd_32_512(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "required-vector-width"="512" {
; CHECK-LABEL: pmaddwd_32_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpmaddwd (%rsi), %zmm0, %zmm0
; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
   %A = load <32 x i16>, <32 x i16>* %APtr
   %B = load <32 x i16>, <32 x i16>* %BPtr
   %a = sext <32 x i16> %A to <32 x i32>
   %b = sext <32 x i16> %B to <32 x i32>
   %m = mul nsw <32 x i32> %a, %b
   %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
   %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
   %ret = add <16 x i32> %odd, %even
   store <16 x i32> %ret, <16 x i32>* %CPtr
   ret void
}

define void @psubus_64i8_max_256(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "required-vector-width"="256" {
; CHECK-LABEL: psubus_64i8_max_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vpsubusb (%rsi), %ymm0, %ymm0
; CHECK-NEXT:    vpsubusb 32(%rsi), %ymm1, %ymm1
; CHECK-NEXT:    vmovdqa %ymm1, 32(%rdx)
; CHECK-NEXT:    vmovdqa %ymm0, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <64 x i8>, <64 x i8>* %xptr
  %y = load <64 x i8>, <64 x i8>* %yptr
  %cmp = icmp ult <64 x i8> %x, %y
  %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
  %res = sub <64 x i8> %max, %y
  store <64 x i8> %res, <64 x i8>* %zptr
  ret void
}

define void @psubus_64i8_max_512(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "required-vector-width"="512" {
; CHECK-LABEL: psubus_64i8_max_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpsubusb (%rsi), %zmm0, %zmm0
; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <64 x i8>, <64 x i8>* %xptr
  %y = load <64 x i8>, <64 x i8>* %yptr
  %cmp = icmp ult <64 x i8> %x, %y
  %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
  %res = sub <64 x i8> %max, %y
  store <64 x i8> %res, <64 x i8>* %zptr
  ret void
}

define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) "required-vector-width"="256" {
; CHECK-LABEL: _Z9test_charPcS_i_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movl %edx, %eax
; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    xorl %ecx, %ecx
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB8_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm3
; CHECK-NEXT:    vpmovsxbw 16(%rdi,%rcx), %ymm4
; CHECK-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm5
; CHECK-NEXT:    vpmaddwd %ymm3, %ymm5, %ymm3
; CHECK-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
; CHECK-NEXT:    vpmovsxbw 16(%rsi,%rcx), %ymm3
; CHECK-NEXT:    vpmaddwd %ymm4, %ymm3, %ymm3
; CHECK-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
; CHECK-NEXT:    addq $32, %rcx
; CHECK-NEXT:    cmpq %rcx, %rax
; CHECK-NEXT:    jne .LBB8_1
; CHECK-NEXT:  # %bb.2: # %middle.block
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, i8* %0, i64 %index
  %5 = bitcast i8* %4 to <32 x i8>*
  %wide.load = load <32 x i8>, <32 x i8>* %5, align 1
  %6 = sext <32 x i8> %wide.load to <32 x i32>
  %7 = getelementptr inbounds i8, i8* %1, i64 %index
  %8 = bitcast i8* %7 to <32 x i8>*
  %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1
  %9 = sext <32 x i8> %wide.load14 to <32 x i32>
  %10 = mul nsw <32 x i32> %9, %6
  %11 = add nsw <32 x i32> %10, %vec.phi
  %index.next = add i64 %index, 32
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
  %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
  %13 = extractelement <32 x i32> %bin.rdx20, i32 0
  ret i32 %13
}

define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) "required-vector-width"="512" {
; CHECK-LABEL: _Z9test_charPcS_i_512:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movl %edx, %eax
; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    xorl %ecx, %ecx
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB9_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmovsxbw (%rdi,%rcx), %zmm2
; CHECK-NEXT:    vpmovsxbw (%rsi,%rcx), %zmm3
; CHECK-NEXT:    vpmaddwd %zmm2, %zmm3, %zmm2
; CHECK-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
; CHECK-NEXT:    addq $32, %rcx
; CHECK-NEXT:    cmpq %rcx, %rax
; CHECK-NEXT:    jne .LBB9_1
; CHECK-NEXT:  # %bb.2: # %middle.block
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, i8* %0, i64 %index
  %5 = bitcast i8* %4 to <32 x i8>*
  %wide.load = load <32 x i8>, <32 x i8>* %5, align 1
  %6 = sext <32 x i8> %wide.load to <32 x i32>
  %7 = getelementptr inbounds i8, i8* %1, i64 %index
  %8 = bitcast i8* %7 to <32 x i8>*
  %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1
  %9 = sext <32 x i8> %wide.load14 to <32 x i32>
  %10 = mul nsw <32 x i32> %9, %6
  %11 = add nsw <32 x i32> %10, %vec.phi
  %index.next = add i64 %index, 32
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
  %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
  %13 = extractelement <32 x i32> %bin.rdx20, i32 0
  ret i32 %13
}

@a = global [1024 x i8] zeroinitializer, align 16
@b = global [1024 x i8] zeroinitializer, align 16

define i32 @sad_16i8_256() "required-vector-width"="256" {
; CHECK-LABEL: sad_16i8_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    movq $-1024, %rax # imm = 0xFC00
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB10_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu a+1024(%rax), %xmm2
; CHECK-NEXT:    vpsadbw b+1024(%rax), %xmm2, %xmm2
; CHECK-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
; CHECK-NEXT:    addq $4, %rax
; CHECK-NEXT:    jne .LBB10_1
; CHECK-NEXT:  # %bb.2: # %middle.block
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
  %2 = zext <16 x i8> %wide.load to <16 x i32>
  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
  %5 = zext <16 x i8> %wide.load1 to <16 x i32>
  %6 = sub nsw <16 x i32> %2, %5
  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <16 x i32> zeroinitializer, %6
  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
  %10 = add nsw <16 x i32> %9, %vec.phi
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %.lcssa = phi <16 x i32> [ %10, %vector.body ]
  %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf
  %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
  %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
  %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
  %12 = extractelement <16 x i32> %bin.rdx4, i32 0
  ret i32 %12
}

define i32 @sad_16i8_512() "required-vector-width"="512" {
; CHECK-LABEL: sad_16i8_512:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    movq $-1024, %rax # imm = 0xFC00
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB11_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu a+1024(%rax), %xmm1
; CHECK-NEXT:    vpsadbw b+1024(%rax), %xmm1, %xmm1
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    addq $4, %rax
; CHECK-NEXT:    jne .LBB11_1
; CHECK-NEXT:  # %bb.2: # %middle.block
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
  %2 = zext <16 x i8> %wide.load to <16 x i32>
  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
  %5 = zext <16 x i8> %wide.load1 to <16 x i32>
  %6 = sub nsw <16 x i32> %2, %5
  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <16 x i32> zeroinitializer, %6
  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
  %10 = add nsw <16 x i32> %9, %vec.phi
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %.lcssa = phi <16 x i32> [ %10, %vector.body ]
  %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf
  %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
  %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
  %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
  %12 = extractelement <16 x i32> %bin.rdx4, i32 0
  ret i32 %12
}

define void @sbto16f32_256(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="256" {
; CHECK-LABEL: sbto16f32_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    kshiftrw $8, %k0, %k1
; CHECK-NEXT:    vpmovm2d %k1, %ymm0
; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT:    vpmovm2d %k0, %ymm1
; CHECK-NEXT:    vcvtdq2ps %ymm1, %ymm1
; CHECK-NEXT:    vmovaps %ymm1, (%rdi)
; CHECK-NEXT:    vmovaps %ymm0, 32(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = sitofp <16 x i1> %mask to <16 x float>
  store <16 x float> %1, <16 x float>* %res
  ret void
}

define void @sbto16f32_512(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="512" {
; CHECK-LABEL: sbto16f32_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    vpmovm2d %k0, %zmm0
; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm0
; CHECK-NEXT:    vmovaps %zmm0, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = sitofp <16 x i1> %mask to <16 x float>
  store <16 x float> %1, <16 x float>* %res
  ret void
}

define void @sbto16f64_256(<16 x i16> %a, <16 x double>* %res)  "required-vector-width"="256" {
; CHECK-LABEL: sbto16f64_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    kshiftrw $8, %k0, %k1
; CHECK-NEXT:    vpmovm2d %k1, %ymm0
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm1
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
; CHECK-NEXT:    vpmovm2d %k0, %ymm2
; CHECK-NEXT:    vcvtdq2pd %xmm2, %ymm3
; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm2
; CHECK-NEXT:    vcvtdq2pd %xmm2, %ymm2
; CHECK-NEXT:    vmovaps %ymm2, 32(%rdi)
; CHECK-NEXT:    vmovaps %ymm3, (%rdi)
; CHECK-NEXT:    vmovaps %ymm0, 96(%rdi)
; CHECK-NEXT:    vmovaps %ymm1, 64(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = sitofp <16 x i1> %mask to <16 x double>
  store <16 x double> %1, <16 x double>* %res
  ret void
}

define void @sbto16f64_512(<16 x i16> %a, <16 x double>* %res)  "required-vector-width"="512" {
; CHECK-LABEL: sbto16f64_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    vpmovm2d %k0, %zmm0
; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm1
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
; CHECK-NEXT:    vmovaps %zmm0, 64(%rdi)
; CHECK-NEXT:    vmovaps %zmm1, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = sitofp <16 x i1> %mask to <16 x double>
  store <16 x double> %1, <16 x double>* %res
  ret void
}

define void @ubto16f32_256(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="256" {
; CHECK-LABEL: ubto16f32_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    kshiftrw $8, %k0, %k1
; CHECK-NEXT:    vpmovm2d %k1, %ymm0
; CHECK-NEXT:    vpsrld $31, %ymm0, %ymm0
; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT:    vpmovm2d %k0, %ymm1
; CHECK-NEXT:    vpsrld $31, %ymm1, %ymm1
; CHECK-NEXT:    vcvtdq2ps %ymm1, %ymm1
; CHECK-NEXT:    vmovaps %ymm1, (%rdi)
; CHECK-NEXT:    vmovaps %ymm0, 32(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x float>
  store <16 x float> %1, <16 x float>* %res
  ret void
}

define void @ubto16f32_512(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="512" {
; CHECK-LABEL: ubto16f32_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    vpmovm2d %k0, %zmm0
; CHECK-NEXT:    vpsrld $31, %zmm0, %zmm0
; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm0
; CHECK-NEXT:    vmovaps %zmm0, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x float>
  store <16 x float> %1, <16 x float>* %res
  ret void
}

define void @ubto16f64_256(<16 x i16> %a, <16 x double>* %res) "required-vector-width"="256" {
; CHECK-LABEL: ubto16f64_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    kshiftrw $8, %k0, %k1
; CHECK-NEXT:    vpmovm2d %k1, %ymm0
; CHECK-NEXT:    vpsrld $31, %ymm0, %ymm0
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm1
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
; CHECK-NEXT:    vpmovm2d %k0, %ymm2
; CHECK-NEXT:    vpsrld $31, %ymm2, %ymm2
; CHECK-NEXT:    vcvtdq2pd %xmm2, %ymm3
; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm2
; CHECK-NEXT:    vcvtdq2pd %xmm2, %ymm2
; CHECK-NEXT:    vmovaps %ymm2, 32(%rdi)
; CHECK-NEXT:    vmovaps %ymm3, (%rdi)
; CHECK-NEXT:    vmovaps %ymm0, 96(%rdi)
; CHECK-NEXT:    vmovaps %ymm1, 64(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x double>
  store <16 x double> %1, <16 x double>* %res
  ret void
}

define void @ubto16f64_512(<16 x i16> %a, <16 x double>* %res) "required-vector-width"="512" {
; CHECK-LABEL: ubto16f64_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    vpmovm2d %k0, %zmm0
; CHECK-NEXT:    vpsrld $31, %zmm0, %zmm0
; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm1
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
; CHECK-NEXT:    vmovaps %zmm0, 64(%rdi)
; CHECK-NEXT:    vmovaps %zmm1, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x double>
  store <16 x double> %1, <16 x double>* %res
  ret void
}

define <16 x i16> @test_16f32toub_256(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="256" {
; CHECK-LABEL: test_16f32toub_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttps2dq (%rdi), %ymm1
; CHECK-NEXT:    vpmovdw %ymm1, %xmm1
; CHECK-NEXT:    vcvttps2dq 32(%rdi), %ymm2
; CHECK-NEXT:    vpmovdw %ymm2, %xmm2
; CHECK-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; CHECK-NEXT:    vpsllw $15, %ymm1, %ymm1
; CHECK-NEXT:    vpmovw2m %ymm1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = load <16 x float>, <16 x float>* %ptr
  %mask = fptoui <16 x float> %a to <16 x i1>
  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
  ret <16 x i16> %select
}

define <16 x i16> @test_16f32toub_512(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="512" {
; CHECK-LABEL: test_16f32toub_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttps2dq (%rdi), %zmm1
; CHECK-NEXT:    vpslld $31, %zmm1, %zmm1
; CHECK-NEXT:    vpmovd2m %zmm1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = load <16 x float>, <16 x float>* %ptr
  %mask = fptoui <16 x float> %a to <16 x i1>
  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
  ret <16 x i16> %select
}

define <16 x i16> @test_16f32tosb_256(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="256" {
; CHECK-LABEL: test_16f32tosb_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttps2dq (%rdi), %ymm1
; CHECK-NEXT:    vpmovdw %ymm1, %xmm1
; CHECK-NEXT:    vcvttps2dq 32(%rdi), %ymm2
; CHECK-NEXT:    vpmovdw %ymm2, %xmm2
; CHECK-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; CHECK-NEXT:    vpsllw $15, %ymm1, %ymm1
; CHECK-NEXT:    vpmovw2m %ymm1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = load <16 x float>, <16 x float>* %ptr
  %mask = fptosi <16 x float> %a to <16 x i1>
  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
  ret <16 x i16> %select
}

define <16 x i16> @test_16f32tosb_512(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="512" {
; CHECK-LABEL: test_16f32tosb_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttps2dq (%rdi), %zmm1
; CHECK-NEXT:    vpmovd2m %zmm1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = load <16 x float>, <16 x float>* %ptr
  %mask = fptosi <16 x float> %a to <16 x i1>
  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
  ret <16 x i16> %select
}