      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefixes=AVX,AVX1
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
      7 
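; Each function below computes a sum-of-absolute-differences (SAD) reduction
; over the byte arrays @a and @b: the elements are zero-extended to i32,
; subtracted, the absolute value is formed with a compare/select, and the
; results are accumulated and horizontally reduced. The autogenerated checks
; record how each subtarget lowers this idiom (ideally to psadbw/vpsadbw).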
      8 @a = global [1024 x i8] zeroinitializer, align 16
      9 @b = global [1024 x i8] zeroinitializer, align 16
     10 
     11 define i32 @sad_16i8() nounwind {
     12 ; SSE2-LABEL: sad_16i8:
     13 ; SSE2:       # %bb.0: # %entry
     14 ; SSE2-NEXT:    pxor %xmm0, %xmm0
     15 ; SSE2-NEXT:    movq $-1024, %rax # imm = 0xFC00
     16 ; SSE2-NEXT:    pxor %xmm1, %xmm1
     17 ; SSE2-NEXT:    .p2align 4, 0x90
     18 ; SSE2-NEXT:  .LBB0_1: # %vector.body
     19 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
     20 ; SSE2-NEXT:    movdqu a+1024(%rax), %xmm2
     21 ; SSE2-NEXT:    movdqu b+1024(%rax), %xmm3
     22 ; SSE2-NEXT:    psadbw %xmm2, %xmm3
     23 ; SSE2-NEXT:    paddd %xmm3, %xmm1
     24 ; SSE2-NEXT:    addq $4, %rax
     25 ; SSE2-NEXT:    jne .LBB0_1
     26 ; SSE2-NEXT:  # %bb.2: # %middle.block
     27 ; SSE2-NEXT:    paddd %xmm0, %xmm1
     28 ; SSE2-NEXT:    paddd %xmm0, %xmm0
     29 ; SSE2-NEXT:    paddd %xmm1, %xmm0
     30 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
     31 ; SSE2-NEXT:    paddd %xmm0, %xmm1
     32 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
     33 ; SSE2-NEXT:    paddd %xmm1, %xmm0
     34 ; SSE2-NEXT:    movd %xmm0, %eax
     35 ; SSE2-NEXT:    retq
     36 ;
     37 ; AVX1-LABEL: sad_16i8:
     38 ; AVX1:       # %bb.0: # %entry
     39 ; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
     40 ; AVX1-NEXT:    movq $-1024, %rax # imm = 0xFC00
     41 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
     42 ; AVX1-NEXT:    .p2align 4, 0x90
     43 ; AVX1-NEXT:  .LBB0_1: # %vector.body
     44 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
     45 ; AVX1-NEXT:    vmovdqu a+1024(%rax), %xmm2
     46 ; AVX1-NEXT:    vpsadbw b+1024(%rax), %xmm2, %xmm2
     47 ; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm2
     48 ; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
     49 ; AVX1-NEXT:    addq $4, %rax
     50 ; AVX1-NEXT:    jne .LBB0_1
     51 ; AVX1-NEXT:  # %bb.2: # %middle.block
     52 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
     53 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
     54 ; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
     55 ; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
     56 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
     57 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
     58 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
     59 ; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
     60 ; AVX1-NEXT:    vmovd %xmm0, %eax
     61 ; AVX1-NEXT:    vzeroupper
     62 ; AVX1-NEXT:    retq
     63 ;
     64 ; AVX2-LABEL: sad_16i8:
     65 ; AVX2:       # %bb.0: # %entry
     66 ; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
     67 ; AVX2-NEXT:    movq $-1024, %rax # imm = 0xFC00
     68 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
     69 ; AVX2-NEXT:    .p2align 4, 0x90
     70 ; AVX2-NEXT:  .LBB0_1: # %vector.body
     71 ; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
     72 ; AVX2-NEXT:    vmovdqu a+1024(%rax), %xmm2
     73 ; AVX2-NEXT:    vpsadbw b+1024(%rax), %xmm2, %xmm2
     74 ; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
     75 ; AVX2-NEXT:    addq $4, %rax
     76 ; AVX2-NEXT:    jne .LBB0_1
     77 ; AVX2-NEXT:  # %bb.2: # %middle.block
     78 ; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
     79 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
     80 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
     81 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
     82 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
     83 ; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
     84 ; AVX2-NEXT:    vmovd %xmm0, %eax
     85 ; AVX2-NEXT:    vzeroupper
     86 ; AVX2-NEXT:    retq
     87 ;
     88 ; AVX512-LABEL: sad_16i8:
     89 ; AVX512:       # %bb.0: # %entry
     90 ; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
     91 ; AVX512-NEXT:    movq $-1024, %rax # imm = 0xFC00
     92 ; AVX512-NEXT:    .p2align 4, 0x90
     93 ; AVX512-NEXT:  .LBB0_1: # %vector.body
     94 ; AVX512-NEXT:    # =>This Inner Loop Header: Depth=1
     95 ; AVX512-NEXT:    vmovdqu a+1024(%rax), %xmm1
     96 ; AVX512-NEXT:    vpsadbw b+1024(%rax), %xmm1, %xmm1
     97 ; AVX512-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
     98 ; AVX512-NEXT:    addq $4, %rax
     99 ; AVX512-NEXT:    jne .LBB0_1
    100 ; AVX512-NEXT:  # %bb.2: # %middle.block
    101 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
    102 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    103 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
    104 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    105 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    106 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    107 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
    108 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    109 ; AVX512-NEXT:    vmovd %xmm0, %eax
    110 ; AVX512-NEXT:    vzeroupper
    111 ; AVX512-NEXT:    retq
    112 entry:
    113   br label %vector.body
    114 
    115 vector.body:
    116   %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
    117   %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
    118   %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
    119   %1 = bitcast i8* %0 to <16 x i8>*
    120   %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
    121   %2 = zext <16 x i8> %wide.load to <16 x i32>
    122   %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
    123   %4 = bitcast i8* %3 to <16 x i8>*
    124   %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
    125   %5 = zext <16 x i8> %wide.load1 to <16 x i32>
    126   %6 = sub nsw <16 x i32> %2, %5
    127   %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
    128   %8 = sub nsw <16 x i32> zeroinitializer, %6
    129   %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
    130   %10 = add nsw <16 x i32> %9, %vec.phi
    131   %index.next = add i64 %index, 4
    132   %11 = icmp eq i64 %index.next, 1024
    133   br i1 %11, label %middle.block, label %vector.body
    134 
    135 middle.block:
    136   %.lcssa = phi <16 x i32> [ %10, %vector.body ]
    137   %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    138   %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf
    139   %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    140   %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
    141   %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    142   %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
    143   %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    144   %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
    145   %12 = extractelement <16 x i32> %bin.rdx4, i32 0
    146   ret i32 %12
    147 }
    148 
    149 define i32 @sad_32i8() nounwind {
    150 ; SSE2-LABEL: sad_32i8:
    151 ; SSE2:       # %bb.0: # %entry
    152 ; SSE2-NEXT:    pxor %xmm12, %xmm12
    153 ; SSE2-NEXT:    movq $-1024, %rax # imm = 0xFC00
    154 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    155 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    156 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    157 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    158 ; SSE2-NEXT:    pxor %xmm6, %xmm6
    159 ; SSE2-NEXT:    pxor %xmm13, %xmm13
    160 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    161 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    162 ; SSE2-NEXT:    pxor %xmm15, %xmm15
    163 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    164 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    165 ; SSE2-NEXT:    pxor %xmm14, %xmm14
    166 ; SSE2-NEXT:    .p2align 4, 0x90
    167 ; SSE2-NEXT:  .LBB1_1: # %vector.body
    168 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
    169 ; SSE2-NEXT:    movdqa a+1040(%rax), %xmm8
    170 ; SSE2-NEXT:    movdqa a+1024(%rax), %xmm3
    171 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
    172 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
    173 ; SSE2-NEXT:    movdqa %xmm4, %xmm7
    174 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3]
    175 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
    176 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15]
    177 ; SSE2-NEXT:    movdqa %xmm3, %xmm1
    178 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
    179 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
    180 ; SSE2-NEXT:    movdqa %xmm8, %xmm0
    181 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
    182 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
    183 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3]
    184 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
    185 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15]
    186 ; SSE2-NEXT:    movdqa b+1024(%rax), %xmm11
    187 ; SSE2-NEXT:    movdqa %xmm11, %xmm10
    188 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
    189 ; SSE2-NEXT:    movdqa %xmm10, %xmm2
    190 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
    191 ; SSE2-NEXT:    psubd %xmm2, %xmm7
    192 ; SSE2-NEXT:    movdqa b+1040(%rax), %xmm9
    193 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
    194 ; SSE2-NEXT:    psubd %xmm10, %xmm4
    195 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15]
    196 ; SSE2-NEXT:    movdqa %xmm11, %xmm2
    197 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
    198 ; SSE2-NEXT:    psubd %xmm2, %xmm1
    199 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
    200 ; SSE2-NEXT:    psubd %xmm11, %xmm3
    201 ; SSE2-NEXT:    movdqa %xmm6, %xmm10
    202 ; SSE2-NEXT:    movdqa %xmm9, %xmm6
    203 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
    204 ; SSE2-NEXT:    movdqa %xmm6, %xmm2
    205 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
    206 ; SSE2-NEXT:    psubd %xmm2, %xmm5
    207 ; SSE2-NEXT:    movdqa %xmm8, %xmm2
    208 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
    209 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
    210 ; SSE2-NEXT:    psubd %xmm6, %xmm0
    211 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15]
    212 ; SSE2-NEXT:    movdqa %xmm9, %xmm6
    213 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
    214 ; SSE2-NEXT:    psubd %xmm6, %xmm2
    215 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
    216 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
    217 ; SSE2-NEXT:    psubd %xmm9, %xmm8
    218 ; SSE2-NEXT:    movdqa %xmm7, %xmm6
    219 ; SSE2-NEXT:    psrad $31, %xmm6
    220 ; SSE2-NEXT:    paddd %xmm6, %xmm7
    221 ; SSE2-NEXT:    pxor %xmm6, %xmm7
    222 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
    223 ; SSE2-NEXT:    paddd %xmm7, %xmm6
    224 ; SSE2-NEXT:    movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
    225 ; SSE2-NEXT:    movdqa %xmm4, %xmm6
    226 ; SSE2-NEXT:    psrad $31, %xmm6
    227 ; SSE2-NEXT:    paddd %xmm6, %xmm4
    228 ; SSE2-NEXT:    pxor %xmm6, %xmm4
    229 ; SSE2-NEXT:    movdqa %xmm10, %xmm6
    230 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
    231 ; SSE2-NEXT:    paddd %xmm4, %xmm7
    232 ; SSE2-NEXT:    movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
    233 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
    234 ; SSE2-NEXT:    psrad $31, %xmm4
    235 ; SSE2-NEXT:    paddd %xmm4, %xmm1
    236 ; SSE2-NEXT:    pxor %xmm4, %xmm1
    237 ; SSE2-NEXT:    paddd %xmm1, %xmm6
    238 ; SSE2-NEXT:    movdqa %xmm3, %xmm1
    239 ; SSE2-NEXT:    psrad $31, %xmm1
    240 ; SSE2-NEXT:    paddd %xmm1, %xmm3
    241 ; SSE2-NEXT:    pxor %xmm1, %xmm3
    242 ; SSE2-NEXT:    paddd %xmm3, %xmm13
    243 ; SSE2-NEXT:    movdqa %xmm5, %xmm1
    244 ; SSE2-NEXT:    psrad $31, %xmm1
    245 ; SSE2-NEXT:    paddd %xmm1, %xmm5
    246 ; SSE2-NEXT:    pxor %xmm1, %xmm5
    247 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
    248 ; SSE2-NEXT:    paddd %xmm5, %xmm1
    249 ; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
    250 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
    251 ; SSE2-NEXT:    psrad $31, %xmm1
    252 ; SSE2-NEXT:    paddd %xmm1, %xmm0
    253 ; SSE2-NEXT:    pxor %xmm1, %xmm0
    254 ; SSE2-NEXT:    paddd %xmm0, %xmm15
    255 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
    256 ; SSE2-NEXT:    psrad $31, %xmm0
    257 ; SSE2-NEXT:    paddd %xmm0, %xmm2
    258 ; SSE2-NEXT:    pxor %xmm0, %xmm2
    259 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
    260 ; SSE2-NEXT:    paddd %xmm2, %xmm0
    261 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    262 ; SSE2-NEXT:    movdqa %xmm8, %xmm0
    263 ; SSE2-NEXT:    psrad $31, %xmm0
    264 ; SSE2-NEXT:    paddd %xmm0, %xmm8
    265 ; SSE2-NEXT:    pxor %xmm0, %xmm8
    266 ; SSE2-NEXT:    paddd %xmm8, %xmm14
    267 ; SSE2-NEXT:    addq $4, %rax
    268 ; SSE2-NEXT:    jne .LBB1_1
    269 ; SSE2-NEXT:  # %bb.2: # %middle.block
    270 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
    271 ; SSE2-NEXT:    paddd %xmm15, %xmm0
    272 ; SSE2-NEXT:    paddd %xmm14, %xmm13
    273 ; SSE2-NEXT:    paddd %xmm0, %xmm13
    274 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
    275 ; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
    276 ; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload
    277 ; SSE2-NEXT:    paddd %xmm13, %xmm6
    278 ; SSE2-NEXT:    paddd %xmm0, %xmm6
    279 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,0,1]
    280 ; SSE2-NEXT:    paddd %xmm6, %xmm0
    281 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
    282 ; SSE2-NEXT:    paddd %xmm0, %xmm1
    283 ; SSE2-NEXT:    movd %xmm1, %eax
    284 ; SSE2-NEXT:    retq
    285 ;
    286 ; AVX1-LABEL: sad_32i8:
    287 ; AVX1:       # %bb.0: # %entry
    288 ; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
    289 ; AVX1-NEXT:    movq $-1024, %rax # imm = 0xFC00
    290 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    291 ; AVX1-NEXT:    .p2align 4, 0x90
    292 ; AVX1-NEXT:  .LBB1_1: # %vector.body
    293 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
    294 ; AVX1-NEXT:    vmovdqa a+1024(%rax), %ymm2
    295 ; AVX1-NEXT:    vmovdqa b+1024(%rax), %ymm3
    296 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
    297 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
    298 ; AVX1-NEXT:    vpsadbw %xmm4, %xmm5, %xmm4
    299 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
    300 ; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
    301 ; AVX1-NEXT:    vpsadbw %xmm3, %xmm2, %xmm2
    302 ; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
    303 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
    304 ; AVX1-NEXT:    addq $4, %rax
    305 ; AVX1-NEXT:    jne .LBB1_1
    306 ; AVX1-NEXT:  # %bb.2: # %middle.block
    307 ; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
    308 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
    309 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
    310 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
    311 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
    312 ; AVX1-NEXT:    vpaddd %xmm6, %xmm5, %xmm5
    313 ; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
    314 ; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
    315 ; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
    316 ; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
    317 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
    318 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    319 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
    320 ; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
    321 ; AVX1-NEXT:    vmovd %xmm0, %eax
    322 ; AVX1-NEXT:    vzeroupper
    323 ; AVX1-NEXT:    retq
    324 ;
    325 ; AVX2-LABEL: sad_32i8:
    326 ; AVX2:       # %bb.0: # %entry
    327 ; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
    328 ; AVX2-NEXT:    movq $-1024, %rax # imm = 0xFC00
    329 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    330 ; AVX2-NEXT:    .p2align 4, 0x90
    331 ; AVX2-NEXT:  .LBB1_1: # %vector.body
    332 ; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
    333 ; AVX2-NEXT:    vmovdqa a+1024(%rax), %ymm2
    334 ; AVX2-NEXT:    vpsadbw b+1024(%rax), %ymm2, %ymm2
    335 ; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
    336 ; AVX2-NEXT:    addq $4, %rax
    337 ; AVX2-NEXT:    jne .LBB1_1
    338 ; AVX2-NEXT:  # %bb.2: # %middle.block
    339 ; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
    340 ; AVX2-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
    341 ; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
    342 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
    343 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
    344 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    345 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
    346 ; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
    347 ; AVX2-NEXT:    vmovd %xmm0, %eax
    348 ; AVX2-NEXT:    vzeroupper
    349 ; AVX2-NEXT:    retq
    350 ;
    351 ; AVX512-LABEL: sad_32i8:
    352 ; AVX512:       # %bb.0: # %entry
    353 ; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
    354 ; AVX512-NEXT:    movq $-1024, %rax # imm = 0xFC00
    355 ; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    356 ; AVX512-NEXT:    .p2align 4, 0x90
    357 ; AVX512-NEXT:  .LBB1_1: # %vector.body
    358 ; AVX512-NEXT:    # =>This Inner Loop Header: Depth=1
    359 ; AVX512-NEXT:    vmovdqa a+1024(%rax), %ymm2
    360 ; AVX512-NEXT:    vpsadbw b+1024(%rax), %ymm2, %ymm2
    361 ; AVX512-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
    362 ; AVX512-NEXT:    addq $4, %rax
    363 ; AVX512-NEXT:    jne .LBB1_1
    364 ; AVX512-NEXT:  # %bb.2: # %middle.block
    365 ; AVX512-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
    366 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
    367 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    368 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
    369 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    370 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    371 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    372 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
    373 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    374 ; AVX512-NEXT:    vmovd %xmm0, %eax
    375 ; AVX512-NEXT:    vzeroupper
    376 ; AVX512-NEXT:    retq
    377 entry:
    378   br label %vector.body
    379 
    380 vector.body:
    381   %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
    382   %vec.phi = phi <32 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
    383   %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
    384   %1 = bitcast i8* %0 to <32 x i8>*
    385   %wide.load = load <32 x i8>, <32 x i8>* %1, align 32
    386   %2 = zext <32 x i8> %wide.load to <32 x i32>
    387   %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
    388   %4 = bitcast i8* %3 to <32 x i8>*
    389   %wide.load1 = load <32 x i8>, <32 x i8>* %4, align 32
    390   %5 = zext <32 x i8> %wide.load1 to <32 x i32>
    391   %6 = sub nsw <32 x i32> %2, %5
    392   %7 = icmp sgt <32 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
    393   %8 = sub nsw <32 x i32> zeroinitializer, %6
    394   %9 = select <32 x i1> %7, <32 x i32> %6, <32 x i32> %8
    395   %10 = add nsw <32 x i32> %9, %vec.phi
    396   %index.next = add i64 %index, 4
    397   %11 = icmp eq i64 %index.next, 1024
    398   br i1 %11, label %middle.block, label %vector.body
    399 
    400 middle.block:
    401   %.lcssa = phi <32 x i32> [ %10, %vector.body ]
    402   %rdx.shuf = shufflevector <32 x i32> %.lcssa, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    403   %bin.rdx = add <32 x i32> %.lcssa, %rdx.shuf
    404   %rdx.shuf2 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    405   %bin.rdx2 = add <32 x i32> %bin.rdx, %rdx.shuf2
    406   %rdx.shuf3 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    407   %bin.rdx3 = add <32 x i32> %bin.rdx2, %rdx.shuf3
    408   %rdx.shuf4 = shufflevector <32 x i32> %bin.rdx3, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    409   %bin.rdx4 = add <32 x i32> %bin.rdx3, %rdx.shuf4
    410   %rdx.shuf5 = shufflevector <32 x i32> %bin.rdx4, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    411   %bin.rdx5 = add <32 x i32> %bin.rdx4, %rdx.shuf5
    412   %12 = extractelement <32 x i32> %bin.rdx5, i32 0
    413   ret i32 %12
    414 }
    415 
    416 define i32 @sad_avx64i8() nounwind {
    417 ; SSE2-LABEL: sad_avx64i8:
    418 ; SSE2:       # %bb.0: # %entry
    419 ; SSE2-NEXT:    subq $200, %rsp
    420 ; SSE2-NEXT:    pxor %xmm14, %xmm14
    421 ; SSE2-NEXT:    movq $-1024, %rax # imm = 0xFC00
    422 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    423 ; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
    424 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    425 ; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
    426 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    427 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    428 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    429 ; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
    430 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    431 ; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
    432 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    433 ; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
    434 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    435 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    436 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    437 ; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
    438 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    439 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    440 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    441 ; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
    442 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    443 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    444 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    445 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    446 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    447 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    448 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    449 ; SSE2-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
    450 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    451 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    452 ; SSE2-NEXT:    pxor %xmm0, %xmm0
    453 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    454 ; SSE2-NEXT:    .p2align 4, 0x90
    455 ; SSE2-NEXT:  .LBB2_1: # %vector.body
    456 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
    457 ; SSE2-NEXT:    movaps a+1040(%rax), %xmm0
    458 ; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
    459 ; SSE2-NEXT:    movdqa a+1024(%rax), %xmm12
    460 ; SSE2-NEXT:    movdqa a+1056(%rax), %xmm15
    461 ; SSE2-NEXT:    movdqa a+1072(%rax), %xmm4
    462 ; SSE2-NEXT:    movdqa %xmm4, %xmm6
    463 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
    464 ; SSE2-NEXT:    movdqa %xmm6, %xmm1
    465 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
    466 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3]
    467 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
    468 ; SSE2-NEXT:    movdqa %xmm4, %xmm5
    469 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
    470 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
    471 ; SSE2-NEXT:    movdqa %xmm15, %xmm11
    472 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm14[8],xmm11[9],xmm14[9],xmm11[10],xmm14[10],xmm11[11],xmm14[11],xmm11[12],xmm14[12],xmm11[13],xmm14[13],xmm11[14],xmm14[14],xmm11[15],xmm14[15]
    473 ; SSE2-NEXT:    movdqa %xmm11, %xmm8
    474 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
    475 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
    476 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
    477 ; SSE2-NEXT:    movdqa %xmm15, %xmm0
    478 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
    479 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
    480 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
    481 ; SSE2-NEXT:    movdqa %xmm12, %xmm10
    482 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
    483 ; SSE2-NEXT:    movdqa %xmm10, %xmm0
    484 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
    485 ; SSE2-NEXT:    movdqa %xmm0, %xmm9
    486 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
    487 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15]
    488 ; SSE2-NEXT:    movdqa %xmm12, %xmm0
    489 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
    490 ; SSE2-NEXT:    movdqa %xmm0, %xmm13
    491 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
    492 ; SSE2-NEXT:    movdqa b+1072(%rax), %xmm3
    493 ; SSE2-NEXT:    movdqa %xmm3, %xmm7
    494 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15]
    495 ; SSE2-NEXT:    movdqa %xmm7, %xmm0
    496 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
    497 ; SSE2-NEXT:    psubd %xmm0, %xmm1
    498 ; SSE2-NEXT:    movdqa b+1056(%rax), %xmm0
    499 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
    500 ; SSE2-NEXT:    psubd %xmm7, %xmm6
    501 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
    502 ; SSE2-NEXT:    movdqa %xmm3, %xmm7
    503 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
    504 ; SSE2-NEXT:    psubd %xmm7, %xmm5
    505 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
    506 ; SSE2-NEXT:    psubd %xmm3, %xmm4
    507 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
    508 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15]
    509 ; SSE2-NEXT:    movdqa %xmm3, %xmm7
    510 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
    511 ; SSE2-NEXT:    psubd %xmm7, %xmm8
    512 ; SSE2-NEXT:    movdqa b+1024(%rax), %xmm7
    513 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
    514 ; SSE2-NEXT:    psubd %xmm3, %xmm11
    515 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
    516 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
    517 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
    518 ; SSE2-NEXT:    psubd %xmm3, %xmm2
    519 ; SSE2-NEXT:    movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
    520 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
    521 ; SSE2-NEXT:    psubd %xmm0, %xmm15
    522 ; SSE2-NEXT:    movdqa %xmm7, %xmm0
    523 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
    524 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
    525 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
    526 ; SSE2-NEXT:    psubd %xmm3, %xmm9
    527 ; SSE2-NEXT:    movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill
    528 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
    529 ; SSE2-NEXT:    movdqa %xmm2, %xmm9
    530 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
    531 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
    532 ; SSE2-NEXT:    psubd %xmm0, %xmm10
    533 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15]
    534 ; SSE2-NEXT:    movdqa %xmm7, %xmm0
    535 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
    536 ; SSE2-NEXT:    psubd %xmm0, %xmm13
    537 ; SSE2-NEXT:    movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
    538 ; SSE2-NEXT:    movdqa %xmm9, %xmm0
    539 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
    540 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
    541 ; SSE2-NEXT:    psubd %xmm7, %xmm12
    542 ; SSE2-NEXT:    movdqa b+1040(%rax), %xmm13
    543 ; SSE2-NEXT:    movdqa %xmm13, %xmm3
    544 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
    545 ; SSE2-NEXT:    movdqa %xmm3, %xmm7
    546 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
    547 ; SSE2-NEXT:    psubd %xmm7, %xmm0
    548 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
    549 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
    550 ; SSE2-NEXT:    psubd %xmm3, %xmm9
    551 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15]
    552 ; SSE2-NEXT:    movdqa %xmm2, %xmm7
    553 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
    554 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15]
    555 ; SSE2-NEXT:    movdqa %xmm13, %xmm3
    556 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
    557 ; SSE2-NEXT:    psubd %xmm3, %xmm7
    558 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
    559 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7]
    560 ; SSE2-NEXT:    psubd %xmm13, %xmm2
    561 ; SSE2-NEXT:    movdqa %xmm2, %xmm13
    562 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
    563 ; SSE2-NEXT:    psrad $31, %xmm3
    564 ; SSE2-NEXT:    paddd %xmm3, %xmm1
    565 ; SSE2-NEXT:    pxor %xmm3, %xmm1
    566 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
    567 ; SSE2-NEXT:    paddd %xmm1, %xmm3
    568 ; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
    569 ; SSE2-NEXT:    movdqa %xmm6, %xmm1
    570 ; SSE2-NEXT:    psrad $31, %xmm1
    571 ; SSE2-NEXT:    paddd %xmm1, %xmm6
    572 ; SSE2-NEXT:    pxor %xmm1, %xmm6
    573 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
    574 ; SSE2-NEXT:    paddd %xmm6, %xmm1
    575 ; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
    576 ; SSE2-NEXT:    movdqa %xmm5, %xmm1
    577 ; SSE2-NEXT:    psrad $31, %xmm1
    578 ; SSE2-NEXT:    paddd %xmm1, %xmm5
    579 ; SSE2-NEXT:    pxor %xmm1, %xmm5
    580 ; SSE2-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
    581 ; SSE2-NEXT:    paddd %xmm5, %xmm1
    582 ; SSE2-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
    583 ; SSE2-NEXT:    movdqa %xmm4, %xmm1
    584 ; SSE2-NEXT:    psrad $31, %xmm1
    585 ; SSE2-NEXT:    paddd %xmm1, %xmm4
    586 ; SSE2-NEXT:    pxor %xmm1, %xmm4
    587 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
    588 ; SSE2-NEXT:    paddd %xmm4, %xmm1
    589 ; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
    590 ; SSE2-NEXT:    movdqa %xmm8, %xmm1
    591 ; SSE2-NEXT:    psrad $31, %xmm1
    592 ; SSE2-NEXT:    paddd %xmm1, %xmm8
    593 ; SSE2-NEXT:    pxor %xmm1, %xmm8
    594 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
    595 ; SSE2-NEXT:    paddd %xmm8, %xmm1
    596 ; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
    597 ; SSE2-NEXT:    movdqa %xmm11, %xmm1
    598 ; SSE2-NEXT:    psrad $31, %xmm1
    599 ; SSE2-NEXT:    paddd %xmm1, %xmm11
    600 ; SSE2-NEXT:    pxor %xmm1, %xmm11
    601 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
    602 ; SSE2-NEXT:    paddd %xmm11, %xmm1
    603 ; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
    604 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
    605 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
    606 ; SSE2-NEXT:    psrad $31, %xmm1
    607 ; SSE2-NEXT:    paddd %xmm1, %xmm2
    608 ; SSE2-NEXT:    pxor %xmm1, %xmm2
    609 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
    610 ; SSE2-NEXT:    paddd %xmm2, %xmm1
    611 ; SSE2-NEXT:    movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
    612 ; SSE2-NEXT:    movdqa %xmm15, %xmm1
    613 ; SSE2-NEXT:    psrad $31, %xmm1
    614 ; SSE2-NEXT:    paddd %xmm1, %xmm15
    615 ; SSE2-NEXT:    pxor %xmm1, %xmm15
    616 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
    617 ; SSE2-NEXT:    paddd %xmm15, %xmm1
    618 ; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
    619 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
    620 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
    621 ; SSE2-NEXT:    psrad $31, %xmm1
    622 ; SSE2-NEXT:    paddd %xmm1, %xmm2
    623 ; SSE2-NEXT:    pxor %xmm1, %xmm2
    624 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
    625 ; SSE2-NEXT:    paddd %xmm2, %xmm1
    626 ; SSE2-NEXT:    movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
    627 ; SSE2-NEXT:    movdqa %xmm10, %xmm1
    628 ; SSE2-NEXT:    psrad $31, %xmm1
    629 ; SSE2-NEXT:    paddd %xmm1, %xmm10
    630 ; SSE2-NEXT:    pxor %xmm1, %xmm10
    631 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
    632 ; SSE2-NEXT:    paddd %xmm10, %xmm1
    633 ; SSE2-NEXT:    movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
    634 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
    635 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
    636 ; SSE2-NEXT:    psrad $31, %xmm1
    637 ; SSE2-NEXT:    paddd %xmm1, %xmm2
    638 ; SSE2-NEXT:    pxor %xmm1, %xmm2
    639 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
    640 ; SSE2-NEXT:    paddd %xmm2, %xmm1
    641 ; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
    642 ; SSE2-NEXT:    movdqa %xmm12, %xmm1
    643 ; SSE2-NEXT:    psrad $31, %xmm1
    644 ; SSE2-NEXT:    paddd %xmm1, %xmm12
    645 ; SSE2-NEXT:    pxor %xmm1, %xmm12
    646 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
    647 ; SSE2-NEXT:    paddd %xmm12, %xmm1
    648 ; SSE2-NEXT:    movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
    649 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
    650 ; SSE2-NEXT:    psrad $31, %xmm1
    651 ; SSE2-NEXT:    paddd %xmm1, %xmm0
    652 ; SSE2-NEXT:    pxor %xmm1, %xmm0
    653 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
    654 ; SSE2-NEXT:    paddd %xmm0, %xmm1
    655 ; SSE2-NEXT:    movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
    656 ; SSE2-NEXT:    movdqa %xmm9, %xmm0
    657 ; SSE2-NEXT:    psrad $31, %xmm0
    658 ; SSE2-NEXT:    paddd %xmm0, %xmm9
    659 ; SSE2-NEXT:    pxor %xmm0, %xmm9
    660 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
    661 ; SSE2-NEXT:    paddd %xmm9, %xmm0
    662 ; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
    663 ; SSE2-NEXT:    movdqa %xmm7, %xmm0
    664 ; SSE2-NEXT:    psrad $31, %xmm0
    665 ; SSE2-NEXT:    paddd %xmm0, %xmm7
    666 ; SSE2-NEXT:    pxor %xmm0, %xmm7
    667 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
    668 ; SSE2-NEXT:    paddd %xmm7, %xmm0
    669 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    670 ; SSE2-NEXT:    movdqa %xmm13, %xmm1
    671 ; SSE2-NEXT:    movdqa %xmm13, %xmm0
    672 ; SSE2-NEXT:    psrad $31, %xmm0
    673 ; SSE2-NEXT:    paddd %xmm0, %xmm1
    674 ; SSE2-NEXT:    pxor %xmm0, %xmm1
    675 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
    676 ; SSE2-NEXT:    paddd %xmm1, %xmm0
    677 ; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
    678 ; SSE2-NEXT:    addq $4, %rax
    679 ; SSE2-NEXT:    jne .LBB2_1
    680 ; SSE2-NEXT:  # %bb.2: # %middle.block
    681 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
    682 ; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
    683 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
    684 ; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
    685 ; SSE2-NEXT:    paddd %xmm0, %xmm1
    686 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
    687 ; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
    688 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
    689 ; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
    690 ; SSE2-NEXT:    paddd %xmm1, %xmm3
    691 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
    692 ; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
    693 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
    694 ; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload
    695 ; SSE2-NEXT:    paddd %xmm1, %xmm4
    696 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
    697 ; SSE2-NEXT:    paddd {{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
    698 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
    699 ; SSE2-NEXT:    paddd (%rsp), %xmm1 # 16-byte Folded Reload
    700 ; SSE2-NEXT:    paddd %xmm4, %xmm1
    701 ; SSE2-NEXT:    paddd %xmm2, %xmm1
    702 ; SSE2-NEXT:    paddd %xmm3, %xmm1
    703 ; SSE2-NEXT:    paddd %xmm0, %xmm1
    704 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
    705 ; SSE2-NEXT:    paddd %xmm1, %xmm0
    706 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
    707 ; SSE2-NEXT:    paddd %xmm0, %xmm1
    708 ; SSE2-NEXT:    movd %xmm1, %eax
    709 ; SSE2-NEXT:    addq $200, %rsp
    710 ; SSE2-NEXT:    retq
    711 ;
    712 ; AVX1-LABEL: sad_avx64i8:
    713 ; AVX1:       # %bb.0: # %entry
    714 ; AVX1-NEXT:    subq $24, %rsp
    715 ; AVX1-NEXT:    vpxor %xmm14, %xmm14, %xmm14
    716 ; AVX1-NEXT:    movq $-1024, %rax # imm = 0xFC00
    717 ; AVX1-NEXT:    vpxor %xmm15, %xmm15, %xmm15
    718 ; AVX1-NEXT:    vpxor %xmm7, %xmm7, %xmm7
    719 ; AVX1-NEXT:    vpxor %xmm13, %xmm13, %xmm13
    720 ; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
    721 ; AVX1-NEXT:    vpxor %xmm9, %xmm9, %xmm9
    722 ; AVX1-NEXT:    vpxor %xmm10, %xmm10, %xmm10
    723 ; AVX1-NEXT:    vpxor %xmm12, %xmm12, %xmm12
    724 ; AVX1-NEXT:    .p2align 4, 0x90
    725 ; AVX1-NEXT:  .LBB2_1: # %vector.body
    726 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
    727 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    728 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    729 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    730 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    731 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    732 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    733 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    734 ; AVX1-NEXT:    vmovdqa %ymm7, %ymm11
    735 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    736 ; AVX1-NEXT:    vpsubd %xmm7, %xmm0, %xmm0
    737 ; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
    738 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    739 ; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
    740 ; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    741 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    742 ; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
    743 ; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    744 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    745 ; AVX1-NEXT:    vpsubd %xmm0, %xmm3, %xmm0
    746 ; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    747 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    748 ; AVX1-NEXT:    vpsubd %xmm0, %xmm4, %xmm0
    749 ; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    750 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    751 ; AVX1-NEXT:    vpsubd %xmm0, %xmm5, %xmm0
    752 ; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    753 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    754 ; AVX1-NEXT:    vpsubd %xmm0, %xmm6, %xmm0
    755 ; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    756 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    757 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    758 ; AVX1-NEXT:    vpsubd %xmm4, %xmm0, %xmm0
    759 ; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    760 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    761 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    762 ; AVX1-NEXT:    vpsubd %xmm5, %xmm0, %xmm0
    763 ; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
    764 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    765 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    766 ; AVX1-NEXT:    vpsubd %xmm6, %xmm5, %xmm4
    767 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    768 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    769 ; AVX1-NEXT:    vpsubd %xmm6, %xmm5, %xmm3
    770 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    771 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    772 ; AVX1-NEXT:    vpsubd %xmm6, %xmm5, %xmm0
    773 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    774 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    775 ; AVX1-NEXT:    vpsubd %xmm6, %xmm5, %xmm5
    776 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    777 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    778 ; AVX1-NEXT:    vpsubd %xmm7, %xmm6, %xmm6
    779 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    780 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    781 ; AVX1-NEXT:    vpsubd %xmm1, %xmm7, %xmm1
    782 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    783 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    784 ; AVX1-NEXT:    vpsubd %xmm2, %xmm7, %xmm2
    785 ; AVX1-NEXT:    vpabsd %xmm2, %xmm2
    786 ; AVX1-NEXT:    vextractf128 $1, %ymm11, %xmm7
    787 ; AVX1-NEXT:    vpaddd %xmm7, %xmm2, %xmm2
    788 ; AVX1-NEXT:    vpabsd %xmm1, %xmm1
    789 ; AVX1-NEXT:    vpaddd %xmm11, %xmm1, %xmm1
    790 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm7
    791 ; AVX1-NEXT:    vpabsd %xmm6, %xmm1
    792 ; AVX1-NEXT:    vextractf128 $1, %ymm15, %xmm2
    793 ; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
    794 ; AVX1-NEXT:    vpabsd %xmm5, %xmm2
    795 ; AVX1-NEXT:    vpaddd %xmm15, %xmm2, %xmm2
    796 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm15
    797 ; AVX1-NEXT:    vpabsd %xmm0, %xmm1
    798 ; AVX1-NEXT:    vextractf128 $1, %ymm14, %xmm2
    799 ; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
    800 ; AVX1-NEXT:    vpabsd %xmm3, %xmm2
    801 ; AVX1-NEXT:    vpaddd %xmm14, %xmm2, %xmm2
    802 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm14
    803 ; AVX1-NEXT:    vpabsd %xmm4, %xmm1
    804 ; AVX1-NEXT:    vextractf128 $1, %ymm13, %xmm2
    805 ; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
    806 ; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
    807 ; AVX1-NEXT:    vpaddd %xmm13, %xmm0, %xmm0
    808 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm13
    809 ; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
    810 ; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm1
    811 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
    812 ; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
    813 ; AVX1-NEXT:    vpaddd %xmm8, %xmm1, %xmm1
    814 ; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
    815 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm8
    816 ; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm0
    817 ; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
    818 ; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
    819 ; AVX1-NEXT:    vpaddd %xmm9, %xmm1, %xmm1
    820 ; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
    821 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm9
    822 ; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm0
    823 ; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
    824 ; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
    825 ; AVX1-NEXT:    vpaddd %xmm10, %xmm1, %xmm1
    826 ; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
    827 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm10
    828 ; AVX1-NEXT:    vextractf128 $1, %ymm12, %xmm0
    829 ; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
    830 ; AVX1-NEXT:    vpabsd (%rsp), %xmm1 # 16-byte Folded Reload
    831 ; AVX1-NEXT:    vpaddd %xmm12, %xmm1, %xmm1
    832 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm12
    833 ; AVX1-NEXT:    addq $4, %rax
    834 ; AVX1-NEXT:    jne .LBB2_1
    835 ; AVX1-NEXT:  # %bb.2: # %middle.block
    836 ; AVX1-NEXT:    vextractf128 $1, %ymm15, %xmm0
    837 ; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm1
    838 ; AVX1-NEXT:    vextractf128 $1, %ymm13, %xmm2
    839 ; AVX1-NEXT:    vextractf128 $1, %ymm12, %xmm3
    840 ; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
    841 ; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
    842 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
    843 ; AVX1-NEXT:    vextractf128 $1, %ymm14, %xmm1
    844 ; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm2
    845 ; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm3
    846 ; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm4
    847 ; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
    848 ; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
    849 ; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
    850 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
    851 ; AVX1-NEXT:    vpaddd %xmm12, %xmm13, %xmm1
    852 ; AVX1-NEXT:    vpaddd %xmm10, %xmm7, %xmm2
    853 ; AVX1-NEXT:    vpaddd %xmm2, %xmm8, %xmm2
    854 ; AVX1-NEXT:    vpaddd %xmm1, %xmm9, %xmm1
    855 ; AVX1-NEXT:    vpaddd %xmm1, %xmm15, %xmm1
    856 ; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
    857 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
    858 ; AVX1-NEXT:    vpaddd %xmm0, %xmm14, %xmm0
    859 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    860 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
    861 ; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
    862 ; AVX1-NEXT:    vmovd %xmm0, %eax
    863 ; AVX1-NEXT:    addq $24, %rsp
    864 ; AVX1-NEXT:    vzeroupper
    865 ; AVX1-NEXT:    retq
    866 ;
    867 ; AVX2-LABEL: sad_avx64i8:
    868 ; AVX2:       # %bb.0: # %entry
    869 ; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
    870 ; AVX2-NEXT:    movq $-1024, %rax # imm = 0xFC00
    871 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    872 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    873 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
    874 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
    875 ; AVX2-NEXT:    vpxor %xmm6, %xmm6, %xmm6
    876 ; AVX2-NEXT:    vpxor %xmm5, %xmm5, %xmm5
    877 ; AVX2-NEXT:    vpxor %xmm7, %xmm7, %xmm7
    878 ; AVX2-NEXT:    .p2align 4, 0x90
    879 ; AVX2-NEXT:  .LBB2_1: # %vector.body
    880 ; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
    881 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    882 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    883 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    884 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    885 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    886 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    887 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    888 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    889 ; AVX2-NEXT:    vmovdqu %ymm15, -{{[0-9]+}}(%rsp) # 32-byte Spill
    890 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    891 ; AVX2-NEXT:    vpsubd %ymm15, %ymm8, %ymm8
    892 ; AVX2-NEXT:    vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill
    893 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    894 ; AVX2-NEXT:    vpsubd %ymm15, %ymm9, %ymm9
    895 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    896 ; AVX2-NEXT:    vpsubd %ymm15, %ymm10, %ymm10
    897 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    898 ; AVX2-NEXT:    vpsubd %ymm15, %ymm11, %ymm11
    899 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    900 ; AVX2-NEXT:    vpsubd %ymm15, %ymm12, %ymm12
    901 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    902 ; AVX2-NEXT:    vpsubd %ymm15, %ymm13, %ymm13
    903 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    904 ; AVX2-NEXT:    vpsubd %ymm15, %ymm14, %ymm14
    905 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    906 ; AVX2-NEXT:    vmovdqu -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Reload
    907 ; AVX2-NEXT:    vpsubd %ymm15, %ymm8, %ymm15
    908 ; AVX2-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload
    909 ; AVX2-NEXT:    vpaddd %ymm7, %ymm8, %ymm7
    910 ; AVX2-NEXT:    vpabsd %ymm9, %ymm8
    911 ; AVX2-NEXT:    vpaddd %ymm5, %ymm8, %ymm5
    912 ; AVX2-NEXT:    vpabsd %ymm10, %ymm8
    913 ; AVX2-NEXT:    vpaddd %ymm6, %ymm8, %ymm6
    914 ; AVX2-NEXT:    vpabsd %ymm11, %ymm8
    915 ; AVX2-NEXT:    vpaddd %ymm3, %ymm8, %ymm3
    916 ; AVX2-NEXT:    vpabsd %ymm12, %ymm8
    917 ; AVX2-NEXT:    vpaddd %ymm0, %ymm8, %ymm0
    918 ; AVX2-NEXT:    vpabsd %ymm13, %ymm8
    919 ; AVX2-NEXT:    vpaddd %ymm2, %ymm8, %ymm2
    920 ; AVX2-NEXT:    vpabsd %ymm14, %ymm8
    921 ; AVX2-NEXT:    vpaddd %ymm1, %ymm8, %ymm1
    922 ; AVX2-NEXT:    vpabsd %ymm15, %ymm8
    923 ; AVX2-NEXT:    vpaddd %ymm4, %ymm8, %ymm4
    924 ; AVX2-NEXT:    addq $4, %rax
    925 ; AVX2-NEXT:    jne .LBB2_1
    926 ; AVX2-NEXT:  # %bb.2: # %middle.block
    927 ; AVX2-NEXT:    vpaddd %ymm6, %ymm2, %ymm2
    928 ; AVX2-NEXT:    vpaddd %ymm7, %ymm4, %ymm4
    929 ; AVX2-NEXT:    vpaddd %ymm4, %ymm2, %ymm2
    930 ; AVX2-NEXT:    vpaddd %ymm3, %ymm0, %ymm0
    931 ; AVX2-NEXT:    vpaddd %ymm5, %ymm1, %ymm1
    932 ; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
    933 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
    934 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
    935 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
    936 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    937 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
    938 ; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
    939 ; AVX2-NEXT:    vmovd %xmm0, %eax
    940 ; AVX2-NEXT:    vzeroupper
    941 ; AVX2-NEXT:    retq
    942 ;
    943 ; AVX512F-LABEL: sad_avx64i8:
    944 ; AVX512F:       # %bb.0: # %entry
    945 ; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
    946 ; AVX512F-NEXT:    movq $-1024, %rax # imm = 0xFC00
    947 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    948 ; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    949 ; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
    950 ; AVX512F-NEXT:    .p2align 4, 0x90
    951 ; AVX512F-NEXT:  .LBB2_1: # %vector.body
    952 ; AVX512F-NEXT:    # =>This Inner Loop Header: Depth=1
    953 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
    954 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
    955 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
    956 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
    957 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
    958 ; AVX512F-NEXT:    vpsubd %zmm8, %zmm4, %zmm4
    959 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
    960 ; AVX512F-NEXT:    vpsubd %zmm8, %zmm5, %zmm5
    961 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
    962 ; AVX512F-NEXT:    vpsubd %zmm8, %zmm6, %zmm6
    963 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
    964 ; AVX512F-NEXT:    vpsubd %zmm8, %zmm7, %zmm7
    965 ; AVX512F-NEXT:    vpabsd %zmm4, %zmm4
    966 ; AVX512F-NEXT:    vpaddd %zmm0, %zmm4, %zmm0
    967 ; AVX512F-NEXT:    vpabsd %zmm5, %zmm4
    968 ; AVX512F-NEXT:    vpaddd %zmm1, %zmm4, %zmm1
    969 ; AVX512F-NEXT:    vpabsd %zmm6, %zmm4
    970 ; AVX512F-NEXT:    vpaddd %zmm2, %zmm4, %zmm2
    971 ; AVX512F-NEXT:    vpabsd %zmm7, %zmm4
    972 ; AVX512F-NEXT:    vpaddd %zmm3, %zmm4, %zmm3
    973 ; AVX512F-NEXT:    addq $4, %rax
    974 ; AVX512F-NEXT:    jne .LBB2_1
    975 ; AVX512F-NEXT:  # %bb.2: # %middle.block
    976 ; AVX512F-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
    977 ; AVX512F-NEXT:    vpaddd %zmm3, %zmm1, %zmm1
    978 ; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    979 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
    980 ; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    981 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
    982 ; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    983 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    984 ; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    985 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
    986 ; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    987 ; AVX512F-NEXT:    vmovd %xmm0, %eax
    988 ; AVX512F-NEXT:    vzeroupper
    989 ; AVX512F-NEXT:    retq
    990 ;
    991 ; AVX512BW-LABEL: sad_avx64i8:
    992 ; AVX512BW:       # %bb.0: # %entry
    993 ; AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
    994 ; AVX512BW-NEXT:    movq $-1024, %rax # imm = 0xFC00
    995 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    996 ; AVX512BW-NEXT:    .p2align 4, 0x90
    997 ; AVX512BW-NEXT:  .LBB2_1: # %vector.body
    998 ; AVX512BW-NEXT:    # =>This Inner Loop Header: Depth=1
    999 ; AVX512BW-NEXT:    vmovdqa64 a+1024(%rax), %zmm2
   1000 ; AVX512BW-NEXT:    vpsadbw b+1024(%rax), %zmm2, %zmm2
   1001 ; AVX512BW-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
   1002 ; AVX512BW-NEXT:    addq $4, %rax
   1003 ; AVX512BW-NEXT:    jne .LBB2_1
   1004 ; AVX512BW-NEXT:  # %bb.2: # %middle.block
   1005 ; AVX512BW-NEXT:    vpaddd %zmm0, %zmm1, %zmm1
   1006 ; AVX512BW-NEXT:    vpaddd %zmm0, %zmm0, %zmm0
   1007 ; AVX512BW-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
   1008 ; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   1009 ; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
   1010 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1011 ; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
   1012 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1013 ; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
   1014 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   1015 ; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
   1016 ; AVX512BW-NEXT:    vmovd %xmm0, %eax
   1017 ; AVX512BW-NEXT:    vzeroupper
   1018 ; AVX512BW-NEXT:    retq
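; The IR below computes sum(abs(zext(a) - zext(b))) over 64 bytes per iteration,
; spelling abs as an icmp-sgt-with--1 / negate / select sequence and the final
; reduction as a shufflevector+add tree. As the check lines above show, the
; AVX512BW run matches this 64-byte case to vpsadbw on 512-bit vectors, while
; the AVX1/AVX2/AVX512F runs widen to i32 elements and use vpabsd/vpaddd instead.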
   1019 entry:
   1020   br label %vector.body
   1021 
   1022 vector.body:
   1023   %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
   1024   %vec.phi = phi <64 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
   1025   %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
   1026   %1 = bitcast i8* %0 to <64 x i8>*
   1027   %wide.load = load <64 x i8>, <64 x i8>* %1, align 64
   1028   %2 = zext <64 x i8> %wide.load to <64 x i32>
   1029   %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
   1030   %4 = bitcast i8* %3 to <64 x i8>*
   1031   %wide.load1 = load <64 x i8>, <64 x i8>* %4, align 64
   1032   %5 = zext <64 x i8> %wide.load1 to <64 x i32>
   1033   %6 = sub nsw <64 x i32> %2, %5
   1034   %7 = icmp sgt <64 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
   1035   %8 = sub nsw <64 x i32> zeroinitializer, %6
   1036   %9 = select <64 x i1> %7, <64 x i32> %6, <64 x i32> %8
   1037   %10 = add nsw <64 x i32> %9, %vec.phi
   1038   %index.next = add i64 %index, 4
   1039   %11 = icmp eq i64 %index.next, 1024
   1040   br i1 %11, label %middle.block, label %vector.body
   1041 
   1042 middle.block:
   1043   %.lcssa = phi <64 x i32> [ %10, %vector.body ]
   1044   %rdx.shuf = shufflevector <64 x i32> %.lcssa, <64 x i32> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1045   %bin.rdx = add <64 x i32> %.lcssa, %rdx.shuf
   1046   %rdx.shuf2 = shufflevector <64 x i32> %bin.rdx, <64 x i32> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1047   %bin.rdx2 = add <64 x i32> %bin.rdx, %rdx.shuf2
   1048   %rdx.shuf3 = shufflevector <64 x i32> %bin.rdx2, <64 x i32> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1049   %bin.rdx3 = add <64 x i32> %bin.rdx2, %rdx.shuf3
   1050   %rdx.shuf4 = shufflevector <64 x i32> %bin.rdx3, <64 x i32> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1051   %bin.rdx4 = add <64 x i32> %bin.rdx3, %rdx.shuf4
   1052   %rdx.shuf5 = shufflevector <64 x i32> %bin.rdx4, <64 x i32> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1053   %bin.rdx5 = add <64 x i32> %bin.rdx4, %rdx.shuf5
   1054   %rdx.shuf6 = shufflevector <64 x i32> %bin.rdx5, <64 x i32> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1055   %bin.rdx6 = add <64 x i32> %bin.rdx5, %rdx.shuf6
   1056   %12 = extractelement <64 x i32> %bin.rdx6, i32 0
   1057   ret i32 %12
   1058 }
   1059 
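; The 2-element case: each iteration loads only two live bytes, so both inputs
; are masked to the low 16 bits (pand with 0xFFFF on SSE2, vpblendw against a
; zero register on AVX) before the psadbw, leaving just those two byte lanes to
; contribute to the sum.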
   1060 define i32 @sad_2i8() nounwind {
   1061 ; SSE2-LABEL: sad_2i8:
   1062 ; SSE2:       # %bb.0: # %entry
   1063 ; SSE2-NEXT:    pxor %xmm0, %xmm0
   1064 ; SSE2-NEXT:    movq $-1024, %rax # imm = 0xFC00
   1065 ; SSE2-NEXT:    movl $65535, %ecx # imm = 0xFFFF
   1066 ; SSE2-NEXT:    movd %ecx, %xmm1
   1067 ; SSE2-NEXT:    .p2align 4, 0x90
   1068 ; SSE2-NEXT:  .LBB3_1: # %vector.body
   1069 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
   1070 ; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
   1071 ; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
   1072 ; SSE2-NEXT:    pand %xmm1, %xmm3
   1073 ; SSE2-NEXT:    pand %xmm1, %xmm2
   1074 ; SSE2-NEXT:    psadbw %xmm3, %xmm2
   1075 ; SSE2-NEXT:    paddq %xmm2, %xmm0
   1076 ; SSE2-NEXT:    addq $4, %rax
   1077 ; SSE2-NEXT:    jne .LBB3_1
   1078 ; SSE2-NEXT:  # %bb.2: # %middle.block
   1079 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1080 ; SSE2-NEXT:    paddq %xmm0, %xmm1
   1081 ; SSE2-NEXT:    movd %xmm1, %eax
   1082 ; SSE2-NEXT:    retq
   1083 ;
   1084 ; AVX-LABEL: sad_2i8:
   1085 ; AVX:       # %bb.0: # %entry
   1086 ; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
   1087 ; AVX-NEXT:    movq $-1024, %rax # imm = 0xFC00
   1088 ; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1089 ; AVX-NEXT:    .p2align 4, 0x90
   1090 ; AVX-NEXT:  .LBB3_1: # %vector.body
   1091 ; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
   1092 ; AVX-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
   1093 ; AVX-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
   1094 ; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
   1095 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
   1096 ; AVX-NEXT:    vpsadbw %xmm3, %xmm2, %xmm2
   1097 ; AVX-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
   1098 ; AVX-NEXT:    addq $4, %rax
   1099 ; AVX-NEXT:    jne .LBB3_1
   1100 ; AVX-NEXT:  # %bb.2: # %middle.block
   1101 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
   1102 ; AVX-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
   1103 ; AVX-NEXT:    vmovd %xmm0, %eax
   1104 ; AVX-NEXT:    retq
   1105 entry:
   1106   br label %vector.body
   1107 
   1108 vector.body:
   1109   %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
   1110   %vec.phi = phi <2 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
   1111   %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
   1112   %1 = bitcast i8* %0 to <2 x i8>*
   1113   %wide.load = load <2 x i8>, <2 x i8>* %1, align 4
   1114   %2 = zext <2 x i8> %wide.load to <2 x i32>
   1115   %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
   1116   %4 = bitcast i8* %3 to <2 x i8>*
   1117   %wide.load1 = load <2 x i8>, <2 x i8>* %4, align 4
   1118   %5 = zext <2 x i8> %wide.load1 to <2 x i32>
   1119   %6 = sub nsw <2 x i32> %2, %5
   1120   %7 = icmp sgt <2 x i32> %6, <i32 -1, i32 -1>
   1121   %8 = sub nsw <2 x i32> zeroinitializer, %6
   1122   %9 = select <2 x i1> %7, <2 x i32> %6, <2 x i32> %8
   1123   %10 = add nsw <2 x i32> %9, %vec.phi
   1124   %index.next = add i64 %index, 4
   1125   %11 = icmp eq i64 %index.next, 1024
   1126   br i1 %11, label %middle.block, label %vector.body
   1127 
   1128 middle.block:
   1129   %.lcssa = phi <2 x i32> [ %10, %vector.body ]
   1130   %rdx.shuf = shufflevector <2 x i32> %.lcssa, <2 x i32> undef, <2 x i32> <i32 1, i32 undef>
   1131   %bin.rdx = add <2 x i32> %.lcssa, %rdx.shuf
   1132   %12 = extractelement <2 x i32> %bin.rdx, i32 0
   1133   ret i32 %12
   1134 }
   1135 
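; The sad_nonloop_* tests exercise the same zext/sub/select-abs pattern outside
; a loop, with the reduction written as an explicit shufflevector+add tree
; rather than a vector PHI. For the 4-byte case below, the checks expect a
; single psadbw/vpsadbw on movd-loaded inputs, with the result read from the
; low dword.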
   1136 define i32 @sad_nonloop_4i8(<4 x i8>* nocapture readonly %p, i64, <4 x i8>* nocapture readonly %q) local_unnamed_addr #0 {
   1137 ; SSE2-LABEL: sad_nonloop_4i8:
   1138 ; SSE2:       # %bb.0:
   1139 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   1140 ; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1141 ; SSE2-NEXT:    psadbw %xmm0, %xmm1
   1142 ; SSE2-NEXT:    movd %xmm1, %eax
   1143 ; SSE2-NEXT:    retq
   1144 ;
   1145 ; AVX-LABEL: sad_nonloop_4i8:
   1146 ; AVX:       # %bb.0:
   1147 ; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   1148 ; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1149 ; AVX-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
   1150 ; AVX-NEXT:    vmovd %xmm0, %eax
   1151 ; AVX-NEXT:    retq
   1152   %v1 = load <4 x i8>, <4 x i8>* %p, align 1
   1153   %z1 = zext <4 x i8> %v1 to <4 x i32>
   1154   %v2 = load <4 x i8>, <4 x i8>* %q, align 1
   1155   %z2 = zext <4 x i8> %v2 to <4 x i32>
   1156   %sub = sub nsw <4 x i32> %z1, %z2
   1157   %isneg = icmp sgt <4 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1>
   1158   %neg = sub nsw <4 x i32> zeroinitializer, %sub
   1159   %abs = select <4 x i1> %isneg, <4 x i32> %sub, <4 x i32> %neg
   1160   %h2 = shufflevector <4 x i32> %abs, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
   1161   %sum2 = add <4 x i32> %abs, %h2
   1162   %h3 = shufflevector <4 x i32> %sum2, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   1163   %sum3 = add <4 x i32> %sum2, %h3
   1164   %sum = extractelement <4 x i32> %sum3, i32 0
   1165   ret i32 %sum
   1166 }
   1167 
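; 8-byte variant: the inputs come in via movq/vmovq and a single (v)psadbw
; produces the whole sum in the low qword, so no horizontal reduction code is
; needed.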
   1168 define i32 @sad_nonloop_8i8(<8 x i8>* nocapture readonly %p, i64, <8 x i8>* nocapture readonly %q) local_unnamed_addr #0 {
   1169 ; SSE2-LABEL: sad_nonloop_8i8:
   1170 ; SSE2:       # %bb.0:
   1171 ; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
   1172 ; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
   1173 ; SSE2-NEXT:    psadbw %xmm0, %xmm1
   1174 ; SSE2-NEXT:    movd %xmm1, %eax
   1175 ; SSE2-NEXT:    retq
   1176 ;
   1177 ; AVX-LABEL: sad_nonloop_8i8:
   1178 ; AVX:       # %bb.0:
   1179 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
   1180 ; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
   1181 ; AVX-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
   1182 ; AVX-NEXT:    vmovd %xmm0, %eax
   1183 ; AVX-NEXT:    retq
   1184   %v1 = load <8 x i8>, <8 x i8>* %p, align 1
   1185   %z1 = zext <8 x i8> %v1 to <8 x i32>
   1186   %v2 = load <8 x i8>, <8 x i8>* %q, align 1
   1187   %z2 = zext <8 x i8> %v2 to <8 x i32>
   1188   %sub = sub nsw <8 x i32> %z1, %z2
   1189   %isneg = icmp sgt <8 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
   1190   %neg = sub nsw <8 x i32> zeroinitializer, %sub
   1191   %abs = select <8 x i1> %isneg, <8 x i32> %sub, <8 x i32> %neg
   1192   %h1 = shufflevector <8 x i32> %abs, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
   1193   %sum1 = add <8 x i32> %abs, %h1
   1194   %h2 = shufflevector <8 x i32> %sum1, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1195   %sum2 = add <8 x i32> %sum1, %h2
   1196   %h3 = shufflevector <8 x i32> %sum2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1197   %sum3 = add <8 x i32> %sum2, %h3
   1198   %sum = extractelement <8 x i32> %sum3, i32 0
   1199   ret i32 %sum
   1200 }
   1201 
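; 16-byte variant: psadbw yields two partial sums, one per 64-bit half, so the
; checks below expect a pshufd/vpshufd plus paddq/vpaddq to fold the high half
; into the low element before the final movd.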
   1202 define i32 @sad_nonloop_16i8(<16 x i8>* nocapture readonly %p, i64, <16 x i8>* nocapture readonly %q) local_unnamed_addr #0 {
   1203 ; SSE2-LABEL: sad_nonloop_16i8:
   1204 ; SSE2:       # %bb.0:
   1205 ; SSE2-NEXT:    movdqu (%rdi), %xmm0
   1206 ; SSE2-NEXT:    movdqu (%rdx), %xmm1
   1207 ; SSE2-NEXT:    psadbw %xmm0, %xmm1
   1208 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
   1209 ; SSE2-NEXT:    paddq %xmm1, %xmm0
   1210 ; SSE2-NEXT:    movd %xmm0, %eax
   1211 ; SSE2-NEXT:    retq
   1212 ;
   1213 ; AVX-LABEL: sad_nonloop_16i8:
   1214 ; AVX:       # %bb.0:
   1215 ; AVX-NEXT:    vmovdqu (%rdi), %xmm0
   1216 ; AVX-NEXT:    vpsadbw (%rdx), %xmm0, %xmm0
   1217 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1218 ; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
   1219 ; AVX-NEXT:    vmovd %xmm0, %eax
   1220 ; AVX-NEXT:    retq
   1221   %v1 = load <16 x i8>, <16 x i8>* %p, align 1
   1222   %z1 = zext <16 x i8> %v1 to <16 x i32>
   1223   %v2 = load <16 x i8>, <16 x i8>* %q, align 1
   1224   %z2 = zext <16 x i8> %v2 to <16 x i32>
   1225   %sub = sub nsw <16 x i32> %z1, %z2
   1226   %isneg = icmp sgt <16 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
   1227   %neg = sub nsw <16 x i32> zeroinitializer, %sub
   1228   %abs = select <16 x i1> %isneg, <16 x i32> %sub, <16 x i32> %neg
   1229   %h0 = shufflevector <16 x i32> %abs, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1230   %sum0 = add <16 x i32> %abs, %h0
   1231   %h1 = shufflevector <16 x i32> %sum0, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1232   %sum1 = add <16 x i32> %sum0, %h1
   1233   %h2 = shufflevector <16 x i32> %sum1, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1234   %sum2 = add <16 x i32> %sum1, %h2
   1235   %h3 = shufflevector <16 x i32> %sum2, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1236   %sum3 = add <16 x i32> %sum2, %h3
   1237   %sum = extractelement <16 x i32> %sum3, i32 0
   1238   ret i32 %sum
   1239 }
   1240 
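; 32-byte variant: as the checks below show, SSE2 does not form psadbw here and
; instead widens each half to i32, computes abs via psrad/paddd/pxor, and sums
; the pieces; AVX1 splits the 256-bit input into two 128-bit vpsadbw ops, while
; AVX2/AVX512 use a single 256-bit vpsadbw followed by cross-lane adds.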
   1241 define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* nocapture readonly %q) local_unnamed_addr #0 {
   1242 ; SSE2-LABEL: sad_nonloop_32i8:
   1243 ; SSE2:       # %bb.0:
   1244 ; SSE2-NEXT:    movdqu (%rdi), %xmm0
   1245 ; SSE2-NEXT:    movdqu 16(%rdi), %xmm12
   1246 ; SSE2-NEXT:    pxor %xmm1, %xmm1
   1247 ; SSE2-NEXT:    movdqa %xmm12, %xmm8
   1248 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
   1249 ; SSE2-NEXT:    movdqa %xmm8, %xmm10
   1250 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
   1251 ; SSE2-NEXT:    movdqa %xmm0, %xmm9
   1252 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
   1253 ; SSE2-NEXT:    movdqa %xmm9, %xmm11
   1254 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7]
   1255 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15]
   1256 ; SSE2-NEXT:    movdqa %xmm12, %xmm13
   1257 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
   1258 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
   1259 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   1260 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
   1261 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
   1262 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
   1263 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
   1264 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   1265 ; SSE2-NEXT:    movdqu (%rdx), %xmm7
   1266 ; SSE2-NEXT:    movdqu 16(%rdx), %xmm3
   1267 ; SSE2-NEXT:    movdqa %xmm3, %xmm6
   1268 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
   1269 ; SSE2-NEXT:    movdqa %xmm6, %xmm5
   1270 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
   1271 ; SSE2-NEXT:    psubd %xmm5, %xmm10
   1272 ; SSE2-NEXT:    movdqa %xmm7, %xmm2
   1273 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
   1274 ; SSE2-NEXT:    movdqa %xmm2, %xmm5
   1275 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
   1276 ; SSE2-NEXT:    psubd %xmm5, %xmm11
   1277 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
   1278 ; SSE2-NEXT:    movdqa %xmm3, %xmm5
   1279 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
   1280 ; SSE2-NEXT:    psubd %xmm5, %xmm13
   1281 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
   1282 ; SSE2-NEXT:    movdqa %xmm7, %xmm5
   1283 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
   1284 ; SSE2-NEXT:    psubd %xmm5, %xmm4
   1285 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
   1286 ; SSE2-NEXT:    psubd %xmm6, %xmm8
   1287 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
   1288 ; SSE2-NEXT:    psubd %xmm2, %xmm9
   1289 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
   1290 ; SSE2-NEXT:    psubd %xmm3, %xmm12
   1291 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
   1292 ; SSE2-NEXT:    psubd %xmm7, %xmm0
   1293 ; SSE2-NEXT:    movdqa %xmm10, %xmm1
   1294 ; SSE2-NEXT:    psrad $31, %xmm1
   1295 ; SSE2-NEXT:    paddd %xmm1, %xmm10
   1296 ; SSE2-NEXT:    pxor %xmm1, %xmm10
   1297 ; SSE2-NEXT:    movdqa %xmm11, %xmm1
   1298 ; SSE2-NEXT:    psrad $31, %xmm1
   1299 ; SSE2-NEXT:    paddd %xmm1, %xmm11
   1300 ; SSE2-NEXT:    pxor %xmm1, %xmm11
   1301 ; SSE2-NEXT:    movdqa %xmm13, %xmm1
   1302 ; SSE2-NEXT:    psrad $31, %xmm1
   1303 ; SSE2-NEXT:    paddd %xmm1, %xmm13
   1304 ; SSE2-NEXT:    pxor %xmm1, %xmm13
   1305 ; SSE2-NEXT:    movdqa %xmm4, %xmm1
   1306 ; SSE2-NEXT:    psrad $31, %xmm1
   1307 ; SSE2-NEXT:    paddd %xmm1, %xmm4
   1308 ; SSE2-NEXT:    pxor %xmm1, %xmm4
   1309 ; SSE2-NEXT:    paddd %xmm13, %xmm4
   1310 ; SSE2-NEXT:    paddd %xmm10, %xmm4
   1311 ; SSE2-NEXT:    paddd %xmm11, %xmm4
   1312 ; SSE2-NEXT:    movdqa %xmm8, %xmm1
   1313 ; SSE2-NEXT:    psrad $31, %xmm1
   1314 ; SSE2-NEXT:    paddd %xmm1, %xmm8
   1315 ; SSE2-NEXT:    pxor %xmm1, %xmm8
   1316 ; SSE2-NEXT:    movdqa %xmm9, %xmm1
   1317 ; SSE2-NEXT:    psrad $31, %xmm1
   1318 ; SSE2-NEXT:    paddd %xmm1, %xmm9
   1319 ; SSE2-NEXT:    pxor %xmm1, %xmm9
   1320 ; SSE2-NEXT:    movdqa %xmm12, %xmm1
   1321 ; SSE2-NEXT:    psrad $31, %xmm1
   1322 ; SSE2-NEXT:    paddd %xmm1, %xmm12
   1323 ; SSE2-NEXT:    pxor %xmm1, %xmm12
   1324 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
   1325 ; SSE2-NEXT:    psrad $31, %xmm1
   1326 ; SSE2-NEXT:    paddd %xmm1, %xmm0
   1327 ; SSE2-NEXT:    pxor %xmm1, %xmm0
   1328 ; SSE2-NEXT:    paddd %xmm12, %xmm0
   1329 ; SSE2-NEXT:    paddd %xmm8, %xmm0
   1330 ; SSE2-NEXT:    paddd %xmm4, %xmm0
   1331 ; SSE2-NEXT:    paddd %xmm9, %xmm0
   1332 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1333 ; SSE2-NEXT:    paddd %xmm0, %xmm1
   1334 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
   1335 ; SSE2-NEXT:    paddd %xmm1, %xmm0
   1336 ; SSE2-NEXT:    movd %xmm0, %eax
   1337 ; SSE2-NEXT:    retq
   1338 ;
   1339 ; AVX1-LABEL: sad_nonloop_32i8:
   1340 ; AVX1:       # %bb.0:
   1341 ; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
   1342 ; AVX1-NEXT:    vmovdqu (%rdx), %ymm1
   1343 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1344 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1345 ; AVX1-NEXT:    vpsadbw %xmm2, %xmm3, %xmm2
   1346 ; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
   1347 ; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
   1348 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1349 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
   1350 ; AVX1-NEXT:    vmovd %xmm0, %eax
   1351 ; AVX1-NEXT:    vzeroupper
   1352 ; AVX1-NEXT:    retq
   1353 ;
   1354 ; AVX2-LABEL: sad_nonloop_32i8:
   1355 ; AVX2:       # %bb.0:
   1356 ; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
   1357 ; AVX2-NEXT:    vpsadbw (%rdx), %ymm0, %ymm0
   1358 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1359 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
   1360 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1361 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
   1362 ; AVX2-NEXT:    vmovd %xmm0, %eax
   1363 ; AVX2-NEXT:    vzeroupper
   1364 ; AVX2-NEXT:    retq
   1365 ;
   1366 ; AVX512-LABEL: sad_nonloop_32i8:
   1367 ; AVX512:       # %bb.0:
   1368 ; AVX512-NEXT:    vmovdqu (%rdi), %ymm0
   1369 ; AVX512-NEXT:    vpsadbw (%rdx), %ymm0, %ymm0
   1370 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1371 ; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
   1372 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1373 ; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
   1374 ; AVX512-NEXT:    vmovd %xmm0, %eax
   1375 ; AVX512-NEXT:    vzeroupper
   1376 ; AVX512-NEXT:    retq
   1377   %v1 = load <32 x i8>, <32 x i8>* %p, align 1
   1378   %z1 = zext <32 x i8> %v1 to <32 x i32>
   1379   %v2 = load <32 x i8>, <32 x i8>* %q, align 1
   1380   %z2 = zext <32 x i8> %v2 to <32 x i32>
   1381   %sub = sub nsw <32 x i32> %z1, %z2
   1382   %isneg = icmp sgt <32 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
   1383   %neg = sub nsw <32 x i32> zeroinitializer, %sub
   1384   %abs = select <32 x i1> %isneg, <32 x i32> %sub, <32 x i32> %neg
   1385   %h32 = shufflevector <32 x i32> %abs, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1386   %sum32 = add <32 x i32> %abs, %h32
   1387   %h0 = shufflevector <32 x i32> %sum32, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1388   %sum0 = add <32 x i32> %sum32, %h0
   1389   %h1 = shufflevector <32 x i32> %sum0, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1390   %sum1 = add <32 x i32> %sum0, %h1
   1391   %h2 = shufflevector <32 x i32> %sum1, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1392   %sum2 = add <32 x i32> %sum1, %h2
   1393   %h3 = shufflevector <32 x i32> %sum2, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1394   %sum3 = add <32 x i32> %sum2, %h3
   1395   %sum = extractelement <32 x i32> %sum3, i32 0
   1396   ret i32 %sum
   1397 }