; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW

@a = global [1024 x i8] zeroinitializer, align 16
@b = global [1024 x i8] zeroinitializer, align 16

define i32 @sad_16i8() nounwind {
; SSE2-LABEL: sad_16i8:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB0_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqu a+1024(%rax), %xmm2
; SSE2-NEXT: movdqu b+1024(%rax), %xmm3
; SSE2-NEXT: psadbw %xmm2, %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB0_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm0
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: sad_16i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB0_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vmovdqu a+1024(%rax), %xmm2
; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: addq $4, %rax
; AVX1-NEXT: jne .LBB0_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sad_16i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB0_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqu a+1024(%rax), %xmm2
; AVX2-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: addq $4, %rax
; AVX2-NEXT: jne .LBB0_1
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: sad_16i8:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512-NEXT: .p2align 4, 0x90
; AVX512-NEXT: .LBB0_1: # %vector.body
; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512-NEXT: vmovdqu a+1024(%rax), %xmm1
; AVX512-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: addq $4, %rax
; AVX512-NEXT: jne .LBB0_1
; AVX512-NEXT: # %bb.2: # %middle.block
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
  %2 = zext <16 x i8> %wide.load to <16 x i32>
  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
  %5 = zext <16 x i8> %wide.load1 to <16 x i32>
  %6 = sub nsw <16 x i32> %2, %5
  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <16 x i32> zeroinitializer, %6
  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
  %10 = add nsw <16 x i32> %9, %vec.phi
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %.lcssa = phi <16 x i32> [ %10, %vector.body ]
  %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf
  %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
  %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
  %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
  %12 = extractelement <16 x i32> %bin.rdx4, i32 0
  ret i32 %12
}

define i32 @sad_32i8() nounwind {
; SSE2-LABEL: sad_32i8:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm12, %xmm12
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pxor %xmm13, %xmm13
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm15, %xmm15
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm14, %xmm14
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB1_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqa a+1040(%rax), %xmm8
; SSE2-NEXT: movdqa a+1024(%rax), %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
; SSE2-NEXT: movdqa %xmm4, %xmm7
; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15]
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
; SSE2-NEXT: movdqa %xmm8, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15]
; SSE2-NEXT: movdqa b+1024(%rax), %xmm11
; SSE2-NEXT: movdqa %xmm11, %xmm10
; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
; SSE2-NEXT: movdqa %xmm10, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
; SSE2-NEXT: psubd %xmm2, %xmm7
; SSE2-NEXT: movdqa b+1040(%rax), %xmm9
; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
; SSE2-NEXT: psubd %xmm10, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15]
; SSE2-NEXT: movdqa %xmm11, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
; SSE2-NEXT: psubd %xmm2, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
; SSE2-NEXT: psubd %xmm11, %xmm3
; SSE2-NEXT: movdqa %xmm6, %xmm10
; SSE2-NEXT: movdqa %xmm9, %xmm6
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
; SSE2-NEXT: movdqa %xmm6, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
; SSE2-NEXT: psubd %xmm2, %xmm5
; SSE2-NEXT: movdqa %xmm8, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
; SSE2-NEXT: psubd %xmm6, %xmm0
; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15]
; SSE2-NEXT: movdqa %xmm9, %xmm6
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE2-NEXT: psubd %xmm6, %xmm2
; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
; SSE2-NEXT: psubd %xmm9, %xmm8
; SSE2-NEXT: movdqa %xmm7, %xmm6
; SSE2-NEXT: psrad $31, %xmm6
; SSE2-NEXT: paddd %xmm6, %xmm7
; SSE2-NEXT: pxor %xmm6, %xmm7
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
; SSE2-NEXT: paddd %xmm7, %xmm6
; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm4, %xmm6
; SSE2-NEXT: psrad $31, %xmm6
; SSE2-NEXT: paddd %xmm6, %xmm4
; SSE2-NEXT: pxor %xmm6, %xmm4
; SSE2-NEXT: movdqa %xmm10, %xmm6
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
; SSE2-NEXT: paddd %xmm4, %xmm7
; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm6
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm1, %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm13
; SSE2-NEXT: movdqa %xmm5, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm1, %xmm5
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm15
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm8, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm8
; SSE2-NEXT: pxor %xmm0, %xmm8
; SSE2-NEXT: paddd %xmm8, %xmm14
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB1_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm15, %xmm0
; SSE2-NEXT: paddd %xmm14, %xmm13
; SSE2-NEXT: paddd %xmm0, %xmm13
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm13, %xmm6
; SSE2-NEXT: paddd %xmm0, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,0,1]
; SSE2-NEXT: paddd %xmm6, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: sad_32i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB1_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vmovdqa a+1024(%rax), %ymm2
; AVX1-NEXT: vmovdqa b+1024(%rax), %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: addq $4, %rax
; AVX1-NEXT: jne .LBB1_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sad_32i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB1_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqa a+1024(%rax), %ymm2
; AVX2-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: addq $4, %rax
; AVX2-NEXT: jne .LBB1_1
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: sad_32i8:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: .p2align 4, 0x90
; AVX512-NEXT: .LBB1_1: # %vector.body
; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512-NEXT: vmovdqa a+1024(%rax), %ymm2
; AVX512-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512-NEXT: addq $4, %rax
; AVX512-NEXT: jne .LBB1_1
; AVX512-NEXT: # %bb.2: # %middle.block
; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <32 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
  %1 = bitcast i8* %0 to <32 x i8>*
  %wide.load = load <32 x i8>, <32 x i8>* %1, align 32
  %2 = zext <32 x i8> %wide.load to <32 x i32>
  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
  %4 = bitcast i8* %3 to <32 x i8>*
  %wide.load1 = load <32 x i8>, <32 x i8>* %4, align 32
  %5 = zext <32 x i8> %wide.load1 to <32 x i32>
  %6 = sub nsw <32 x i32> %2, %5
  %7 = icmp sgt <32 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <32 x i32> zeroinitializer, %6
  %9 = select <32 x i1> %7, <32 x i32> %6, <32 x i32> %8
  %10 = add nsw <32 x i32> %9, %vec.phi
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %.lcssa = phi <32 x i32> [ %10, %vector.body ]
  %rdx.shuf = shufflevector <32 x i32> %.lcssa, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <32 x i32> %.lcssa, %rdx.shuf
  %rdx.shuf2 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <32 x i32> %bin.rdx, %rdx.shuf2
  %rdx.shuf3 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx3 = add <32 x i32> %bin.rdx2, %rdx.shuf3
  %rdx.shuf4 = shufflevector <32 x i32> %bin.rdx3, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <32 x i32> %bin.rdx3, %rdx.shuf4
  %rdx.shuf5 = shufflevector <32 x i32> %bin.rdx4, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx5 = add <32 x i32> %bin.rdx4, %rdx.shuf5
  %12 = extractelement <32 x i32> %bin.rdx5, i32 0
  ret i32 %12
}

define i32 @sad_avx64i8() nounwind {
; SSE2-LABEL: sad_avx64i8:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: subq $200, %rsp
; SSE2-NEXT: pxor %xmm14, %xmm14
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB2_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movaps a+1040(%rax), %xmm0
; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa a+1024(%rax), %xmm12
; SSE2-NEXT: movdqa a+1056(%rax), %xmm15
; SSE2-NEXT: movdqa a+1072(%rax), %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
; SSE2-NEXT: movdqa %xmm6, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
; SSE2-NEXT: movdqa %xmm15, %xmm11
; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm14[8],xmm11[9],xmm14[9],xmm11[10],xmm14[10],xmm11[11],xmm14[11],xmm11[12],xmm14[12],xmm11[13],xmm14[13],xmm11[14],xmm14[14],xmm11[15],xmm14[15]
; SSE2-NEXT: movdqa %xmm11, %xmm8
; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
; SSE2-NEXT: movdqa %xmm15, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; SSE2-NEXT: movdqa %xmm12, %xmm10
; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
; SSE2-NEXT: movdqa %xmm10, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; SSE2-NEXT: movdqa %xmm0, %xmm9
; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15]
; SSE2-NEXT: movdqa %xmm12, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; SSE2-NEXT: movdqa %xmm0, %xmm13
; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
; SSE2-NEXT: movdqa b+1072(%rax), %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm7
; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15]
; SSE2-NEXT: movdqa %xmm7, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
; SSE2-NEXT: psubd %xmm0, %xmm1
; SSE2-NEXT: movdqa b+1056(%rax), %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
; SSE2-NEXT: psubd %xmm7, %xmm6
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
; SSE2-NEXT: movdqa %xmm3, %xmm7
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
; SSE2-NEXT: psubd %xmm7, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
; SSE2-NEXT: psubd %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15]
; SSE2-NEXT: movdqa %xmm3, %xmm7
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
; SSE2-NEXT: psubd %xmm7, %xmm8
; SSE2-NEXT: movdqa b+1024(%rax), %xmm7
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
; SSE2-NEXT: psubd %xmm3, %xmm11
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
; SSE2-NEXT: psubd %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; SSE2-NEXT: psubd %xmm0, %xmm15
; SSE2-NEXT: movdqa %xmm7, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
; SSE2-NEXT: psubd %xmm3, %xmm9
; SSE2-NEXT: movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm2, %xmm9
; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
; SSE2-NEXT: psubd %xmm0, %xmm10
; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15]
; SSE2-NEXT: movdqa %xmm7, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; SSE2-NEXT: psubd %xmm0, %xmm13
; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm9, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
; SSE2-NEXT: psubd %xmm7, %xmm12
; SSE2-NEXT: movdqa b+1040(%rax), %xmm13
; SSE2-NEXT: movdqa %xmm13, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
; SSE2-NEXT: movdqa %xmm3, %xmm7
; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
; SSE2-NEXT: psubd %xmm7, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
; SSE2-NEXT: psubd %xmm3, %xmm9
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15]
; SSE2-NEXT: movdqa %xmm2, %xmm7
; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15]
; SSE2-NEXT: movdqa %xmm13, %xmm3
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
; SSE2-NEXT: psubd %xmm3, %xmm7
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7]
; SSE2-NEXT: psubd %xmm13, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm13
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm1
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; SSE2-NEXT: paddd %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm6, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm6
; SSE2-NEXT: pxor %xmm1, %xmm6
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm5, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm1, %xmm5
; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm1, %xmm4
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm8, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm8
; SSE2-NEXT: pxor %xmm1, %xmm8
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm8, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm11, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm11
; SSE2-NEXT: pxor %xmm1, %xmm11
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm11, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm15, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm15
; SSE2-NEXT: pxor %xmm1, %xmm15
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm15, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm10, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm10
; SSE2-NEXT: pxor %xmm1, %xmm10
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm10, %xmm1
; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm12, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm12
; SSE2-NEXT: pxor %xmm1, %xmm12
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm12, %xmm1
; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm9, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm9
; SSE2-NEXT: pxor %xmm0, %xmm9
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm9, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm7, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm7
; SSE2-NEXT: pxor %xmm0, %xmm7
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm13, %xmm1
; SSE2-NEXT: movdqa %xmm13, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB2_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm1, %xmm3
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm1, %xmm4
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd (%rsp), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: addq $200, %rsp
; SSE2-NEXT: retq
;
; AVX1-LABEL: sad_avx64i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: subq $24, %rsp
; AVX1-NEXT: vpxor %xmm14, %xmm14, %xmm14
; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX1-NEXT: vpxor %xmm15, %xmm15, %xmm15
; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX1-NEXT: vpxor %xmm13, %xmm13, %xmm13
; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9
; AVX1-NEXT: vpxor %xmm10, %xmm10, %xmm10
; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB2_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vmovdqa %ymm7, %ymm11
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm4
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm3
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm0
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vpabsd %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm11, %xmm7
; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpabsd %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm11, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm7
; AVX1-NEXT: vpabsd %xmm6, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpabsd %xmm5, %xmm2
; AVX1-NEXT: vpaddd %xmm15, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm15
; AVX1-NEXT: vpabsd %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpabsd %xmm3, %xmm2
; AVX1-NEXT: vpaddd %xmm14, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm14
; AVX1-NEXT: vpabsd %xmm4, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: vpaddd %xmm13, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm13
; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: vpaddd %xmm8, %xmm1, %xmm1
; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm8
; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: vpaddd %xmm9, %xmm1, %xmm1
; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9
; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: vpaddd %xmm10, %xmm1, %xmm1
; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm10
; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpabsd (%rsp), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: vpaddd %xmm12, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm12
; AVX1-NEXT: addq $4, %rax
; AVX1-NEXT: jne .LBB2_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddd %xmm12, %xmm13, %xmm1
; AVX1-NEXT: vpaddd %xmm10, %xmm7, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm8, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm9, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm15, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm14, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: addq $24, %rsp
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sad_avx64i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX2-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB2_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vmovdqu %ymm15, -{{[0-9]+}}(%rsp) # 32-byte Spill
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm8
; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpsubd %ymm15, %ymm11, %ymm11
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Reload
; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm15
; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload
; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7
; AVX2-NEXT: vpabsd %ymm9, %ymm8
; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5
; AVX2-NEXT: vpabsd %ymm10, %ymm8
; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6
; AVX2-NEXT: vpabsd %ymm11, %ymm8
; AVX2-NEXT: vpaddd %ymm3, %ymm8, %ymm3
; AVX2-NEXT: vpabsd %ymm12, %ymm8
; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm0
; AVX2-NEXT: vpabsd %ymm13, %ymm8
; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2
; AVX2-NEXT: vpabsd %ymm14, %ymm8
; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1
; AVX2-NEXT: vpabsd %ymm15, %ymm8
; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4
; AVX2-NEXT: addq $4, %rax
; AVX2-NEXT: jne .LBB2_1
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm7, %ymm4, %ymm4
; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_avx64i8:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: .p2align 4, 0x90
; AVX512F-NEXT: .LBB2_1: # %vector.body
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpsubd %zmm8, %zmm4, %zmm4
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpsubd %zmm8, %zmm5, %zmm5
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpsubd %zmm8, %zmm6, %zmm6
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpsubd %zmm8, %zmm7, %zmm7
; AVX512F-NEXT: vpabsd %zmm4, %zmm4
; AVX512F-NEXT: vpaddd %zmm0, %zmm4, %zmm0
; AVX512F-NEXT: vpabsd %zmm5, %zmm4
; AVX512F-NEXT: vpaddd %zmm1, %zmm4, %zmm1
AVX512F-NEXT: vpaddd %zmm1, %zmm4, %zmm1 969 ; AVX512F-NEXT: vpabsd %zmm6, %zmm4 970 ; AVX512F-NEXT: vpaddd %zmm2, %zmm4, %zmm2 971 ; AVX512F-NEXT: vpabsd %zmm7, %zmm4 972 ; AVX512F-NEXT: vpaddd %zmm3, %zmm4, %zmm3 973 ; AVX512F-NEXT: addq $4, %rax 974 ; AVX512F-NEXT: jne .LBB2_1 975 ; AVX512F-NEXT: # %bb.2: # %middle.block 976 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 977 ; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1 978 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 979 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 980 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 981 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 982 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 983 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 984 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 985 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 986 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 987 ; AVX512F-NEXT: vmovd %xmm0, %eax 988 ; AVX512F-NEXT: vzeroupper 989 ; AVX512F-NEXT: retq 990 ; 991 ; AVX512BW-LABEL: sad_avx64i8: 992 ; AVX512BW: # %bb.0: # %entry 993 ; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0 994 ; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00 995 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 996 ; AVX512BW-NEXT: .p2align 4, 0x90 997 ; AVX512BW-NEXT: .LBB2_1: # %vector.body 998 ; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 999 ; AVX512BW-NEXT: vmovdqa64 a+1024(%rax), %zmm2 1000 ; AVX512BW-NEXT: vpsadbw b+1024(%rax), %zmm2, %zmm2 1001 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1 1002 ; AVX512BW-NEXT: addq $4, %rax 1003 ; AVX512BW-NEXT: jne .LBB2_1 1004 ; AVX512BW-NEXT: # %bb.2: # %middle.block 1005 ; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm1 1006 ; AVX512BW-NEXT: vpaddd %zmm0, %zmm0, %zmm0 1007 ; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0 1008 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1009 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1010 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 1011 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1012 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1013 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1014 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1015 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 1016 ; AVX512BW-NEXT: vmovd %xmm0, %eax 1017 ; AVX512BW-NEXT: vzeroupper 1018 ; AVX512BW-NEXT: retq 1019 entry: 1020 br label %vector.body 1021 1022 vector.body: 1023 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] 1024 %vec.phi = phi <64 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ] 1025 %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index 1026 %1 = bitcast i8* %0 to <64 x i8>* 1027 %wide.load = load <64 x i8>, <64 x i8>* %1, align 64 1028 %2 = zext <64 x i8> %wide.load to <64 x i32> 1029 %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index 1030 %4 = bitcast i8* %3 to <64 x i8>* 1031 %wide.load1 = load <64 x i8>, <64 x i8>* %4, align 64 1032 %5 = zext <64 x i8> %wide.load1 to <64 x i32> 1033 %6 = sub nsw <64 x i32> %2, %5 1034 %7 = icmp sgt <64 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> 1035 %8 = sub nsw <64 x i32> 
zeroinitializer, %6 1036 %9 = select <64 x i1> %7, <64 x i32> %6, <64 x i32> %8 1037 %10 = add nsw <64 x i32> %9, %vec.phi 1038 %index.next = add i64 %index, 4 1039 %11 = icmp eq i64 %index.next, 1024 1040 br i1 %11, label %middle.block, label %vector.body 1041 1042 middle.block: 1043 %.lcssa = phi <64 x i32> [ %10, %vector.body ] 1044 %rdx.shuf = shufflevector <64 x i32> %.lcssa, <64 x i32> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1045 %bin.rdx = add <64 x i32> %.lcssa, %rdx.shuf 1046 %rdx.shuf2 = shufflevector <64 x i32> %bin.rdx, <64 x i32> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1047 %bin.rdx2 = add <64 x i32> %bin.rdx, %rdx.shuf2 1048 %rdx.shuf3 = shufflevector <64 x i32> %bin.rdx2, <64 x i32> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1049 %bin.rdx3 = add <64 x i32> %bin.rdx2, %rdx.shuf3 1050 %rdx.shuf4 = shufflevector <64 x i32> %bin.rdx3, <64 x i32> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1051 %bin.rdx4 = add <64 x i32> %bin.rdx3, %rdx.shuf4 1052 %rdx.shuf5 = shufflevector <64 x i32> 
%bin.rdx4, <64 x i32> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1053 %bin.rdx5 = add <64 x i32> %bin.rdx4, %rdx.shuf5 1054 %rdx.shuf6 = shufflevector <64 x i32> %bin.rdx5, <64 x i32> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1055 %bin.rdx6 = add <64 x i32> %bin.rdx5, %rdx.shuf6 1056 %12 = extractelement <64 x i32> %bin.rdx6, i32 0 1057 ret i32 %12 1058 } 1059 1060 define i32 @sad_2i8() nounwind { 1061 ; SSE2-LABEL: sad_2i8: 1062 ; SSE2: # %bb.0: # %entry 1063 ; SSE2-NEXT: pxor %xmm0, %xmm0 1064 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 1065 ; SSE2-NEXT: movl $65535, %ecx # imm = 0xFFFF 1066 ; SSE2-NEXT: movd %ecx, %xmm1 1067 ; SSE2-NEXT: .p2align 4, 0x90 1068 ; SSE2-NEXT: .LBB3_1: # %vector.body 1069 ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 1070 ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 1071 ; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero 1072 ; SSE2-NEXT: pand %xmm1, %xmm3 1073 ; SSE2-NEXT: pand %xmm1, %xmm2 1074 ; SSE2-NEXT: psadbw %xmm3, %xmm2 1075 ; SSE2-NEXT: paddq %xmm2, %xmm0 1076 ; SSE2-NEXT: addq $4, %rax 1077 ; SSE2-NEXT: jne .LBB3_1 1078 ; SSE2-NEXT: # %bb.2: # %middle.block 1079 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1080 ; SSE2-NEXT: paddq %xmm0, %xmm1 1081 ; SSE2-NEXT: movd %xmm1, %eax 1082 ; SSE2-NEXT: retq 1083 ; 1084 ; AVX-LABEL: sad_2i8: 1085 ; AVX: # %bb.0: # %entry 1086 ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 1087 ; AVX-NEXT: movq $-1024, %rax # imm = 0xFC00 1088 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 1089 ; AVX-NEXT: .p2align 4, 0x90 1090 ; AVX-NEXT: .LBB3_1: # %vector.body 1091 ; AVX-NEXT: # =>This Inner Loop Header: Depth=1 1092 ; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero 1093 ; AVX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero 1094 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] 1095 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] 1096 ; AVX-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 1097 ; AVX-NEXT: vpaddq %xmm1, %xmm2, %xmm1 1098 ; AVX-NEXT: addq $4, %rax 1099 ; AVX-NEXT: jne .LBB3_1 1100 ; AVX-NEXT: # %bb.2: # %middle.block 1101 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] 1102 ; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 1103 ; AVX-NEXT: vmovd %xmm0, %eax 
; AVX-NEXT: retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <2 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
  %1 = bitcast i8* %0 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %1, align 4
  %2 = zext <2 x i8> %wide.load to <2 x i32>
  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
  %4 = bitcast i8* %3 to <2 x i8>*
  %wide.load1 = load <2 x i8>, <2 x i8>* %4, align 4
  %5 = zext <2 x i8> %wide.load1 to <2 x i32>
  %6 = sub nsw <2 x i32> %2, %5
  %7 = icmp sgt <2 x i32> %6, <i32 -1, i32 -1>
  %8 = sub nsw <2 x i32> zeroinitializer, %6
  %9 = select <2 x i1> %7, <2 x i32> %6, <2 x i32> %8
  %10 = add nsw <2 x i32> %9, %vec.phi
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %.lcssa = phi <2 x i32> [ %10, %vector.body ]
  %rdx.shuf = shufflevector <2 x i32> %.lcssa, <2 x i32> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx = add <2 x i32> %.lcssa, %rdx.shuf
  %12 = extractelement <2 x i32> %bin.rdx, i32 0
  ret i32 %12
}

define i32 @sad_nonloop_4i8(<4 x i8>* nocapture readonly %p, i64, <4 x i8>* nocapture readonly %q) local_unnamed_addr #0 {
; SSE2-LABEL: sad_nonloop_4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: psadbw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: sad_nonloop_4i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
  %v1 = load <4 x i8>, <4 x i8>* %p, align 1
  %z1 = zext <4 x i8> %v1 to <4 x i32>
  %v2 = load <4 x i8>, <4 x i8>* %q, align 1
  %z2 = zext <4 x i8> %v2 to <4 x i32>
  %sub = sub nsw <4 x i32> %z1, %z2
  %isneg = icmp sgt <4 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1>
  %neg = sub nsw <4 x i32> zeroinitializer, %sub
  %abs = select <4 x i1> %isneg, <4 x i32> %sub, <4 x i32> %neg
  %h2 = shufflevector <4 x i32> %abs, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %sum2 = add <4 x i32> %abs, %h2
  %h3 = shufflevector <4 x i32> %sum2, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %sum3 = add <4 x i32> %sum2, %h3
  %sum = extractelement <4 x i32> %sum3, i32 0
  ret i32 %sum
}

define i32 @sad_nonloop_8i8(<8 x i8>* nocapture readonly %p, i64, <8 x i8>* nocapture readonly %q) local_unnamed_addr #0 {
; SSE2-LABEL: sad_nonloop_8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: psadbw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: sad_nonloop_8i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
  %v1 = load <8 x i8>, <8 x i8>* %p, align 1
  %z1 = zext <8 x i8> %v1 to <8 x i32>
  %v2 = load <8 x i8>, <8 x i8>* %q, align 1
  %z2 = zext <8 x i8> %v2 to <8 x i32>
  %sub = sub nsw <8 x i32> %z1, %z2
  %isneg = icmp sgt <8 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %neg = sub nsw <8 x i32> zeroinitializer, %sub
  %abs = select <8 x i1> %isneg, <8 x i32> %sub, <8 x i32> %neg
  %h1 = shufflevector <8 x i32> %abs, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum1 = add <8 x i32> %abs, %h1
  %h2 = shufflevector <8 x i32> %sum1, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum2 = add <8 x i32> %sum1, %h2
  %h3 = shufflevector <8 x i32> %sum2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum3 = add <8 x i32> %sum2, %h3
  %sum = extractelement <8 x i32> %sum3, i32 0
  ret i32 %sum
}

define i32 @sad_nonloop_16i8(<16 x i8>* nocapture readonly %p, i64, <16 x i8>* nocapture readonly %q) local_unnamed_addr #0 {
; SSE2-LABEL: sad_nonloop_16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movdqu (%rdx), %xmm1
; SSE2-NEXT: psadbw %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: sad_nonloop_16i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
  %v1 = load <16 x i8>, <16 x i8>* %p, align 1
  %z1 = zext <16 x i8> %v1 to <16 x i32>
  %v2 = load <16 x i8>, <16 x i8>* %q, align 1
  %z2 = zext <16 x i8> %v2 to <16 x i32>
  %sub = sub nsw <16 x i32> %z1, %z2
  %isneg = icmp sgt <16 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %neg = sub nsw <16 x i32> zeroinitializer, %sub
  %abs = select <16 x i1> %isneg, <16 x i32> %sub, <16 x i32> %neg
  %h0 = shufflevector <16 x i32> %abs, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum0 = add <16 x i32> %abs, %h0
  %h1 = shufflevector <16 x i32> %sum0, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum1 = add <16 x i32> %sum0, %h1
  %h2 = shufflevector <16 x i32> %sum1, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum2 = add <16 x i32> %sum1, %h2
  %h3 = shufflevector <16 x i32> %sum2, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum3 = add <16 x i32> %sum2, %h3
  %sum = extractelement <16 x i32> %sum3, i32 0
  ret i32 %sum
}

define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* nocapture
readonly %q) local_unnamed_addr #0 { 1242 ; SSE2-LABEL: sad_nonloop_32i8: 1243 ; SSE2: # %bb.0: 1244 ; SSE2-NEXT: movdqu (%rdi), %xmm0 1245 ; SSE2-NEXT: movdqu 16(%rdi), %xmm12 1246 ; SSE2-NEXT: pxor %xmm1, %xmm1 1247 ; SSE2-NEXT: movdqa %xmm12, %xmm8 1248 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] 1249 ; SSE2-NEXT: movdqa %xmm8, %xmm10 1250 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] 1251 ; SSE2-NEXT: movdqa %xmm0, %xmm9 1252 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] 1253 ; SSE2-NEXT: movdqa %xmm9, %xmm11 1254 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] 1255 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] 1256 ; SSE2-NEXT: movdqa %xmm12, %xmm13 1257 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] 1258 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 1259 ; SSE2-NEXT: movdqa %xmm0, %xmm4 1260 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 1261 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] 1262 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] 1263 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] 1264 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1265 ; SSE2-NEXT: movdqu (%rdx), %xmm7 1266 ; SSE2-NEXT: movdqu 16(%rdx), %xmm3 1267 ; SSE2-NEXT: movdqa %xmm3, %xmm6 1268 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] 1269 ; SSE2-NEXT: movdqa %xmm6, %xmm5 1270 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] 1271 ; SSE2-NEXT: psubd %xmm5, %xmm10 1272 ; SSE2-NEXT: movdqa %xmm7, %xmm2 1273 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 1274 ; SSE2-NEXT: movdqa %xmm2, %xmm5 1275 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] 1276 ; SSE2-NEXT: psubd %xmm5, %xmm11 1277 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] 1278 ; SSE2-NEXT: movdqa %xmm3, %xmm5 1279 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] 1280 ; SSE2-NEXT: psubd %xmm5, %xmm13 1281 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] 1282 ; SSE2-NEXT: movdqa %xmm7, %xmm5 1283 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = 
xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] 1284 ; SSE2-NEXT: psubd %xmm5, %xmm4 1285 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] 1286 ; SSE2-NEXT: psubd %xmm6, %xmm8 1287 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 1288 ; SSE2-NEXT: psubd %xmm2, %xmm9 1289 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 1290 ; SSE2-NEXT: psubd %xmm3, %xmm12 1291 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] 1292 ; SSE2-NEXT: psubd %xmm7, %xmm0 1293 ; SSE2-NEXT: movdqa %xmm10, %xmm1 1294 ; SSE2-NEXT: psrad $31, %xmm1 1295 ; SSE2-NEXT: paddd %xmm1, %xmm10 1296 ; SSE2-NEXT: pxor %xmm1, %xmm10 1297 ; SSE2-NEXT: movdqa %xmm11, %xmm1 1298 ; SSE2-NEXT: psrad $31, %xmm1 1299 ; SSE2-NEXT: paddd %xmm1, %xmm11 1300 ; SSE2-NEXT: pxor %xmm1, %xmm11 1301 ; SSE2-NEXT: movdqa %xmm13, %xmm1 1302 ; SSE2-NEXT: psrad $31, %xmm1 1303 ; SSE2-NEXT: paddd %xmm1, %xmm13 1304 ; SSE2-NEXT: pxor %xmm1, %xmm13 1305 ; SSE2-NEXT: movdqa %xmm4, %xmm1 1306 ; SSE2-NEXT: psrad $31, %xmm1 1307 ; SSE2-NEXT: paddd %xmm1, %xmm4 1308 ; SSE2-NEXT: pxor %xmm1, %xmm4 1309 ; SSE2-NEXT: paddd %xmm13, %xmm4 1310 ; SSE2-NEXT: paddd %xmm10, %xmm4 1311 ; SSE2-NEXT: paddd %xmm11, %xmm4 1312 ; SSE2-NEXT: movdqa %xmm8, %xmm1 1313 ; SSE2-NEXT: psrad $31, %xmm1 1314 ; SSE2-NEXT: paddd %xmm1, %xmm8 1315 ; SSE2-NEXT: pxor %xmm1, %xmm8 1316 ; SSE2-NEXT: movdqa %xmm9, %xmm1 1317 ; SSE2-NEXT: psrad $31, %xmm1 1318 ; SSE2-NEXT: paddd %xmm1, %xmm9 1319 ; SSE2-NEXT: pxor %xmm1, %xmm9 1320 ; SSE2-NEXT: movdqa %xmm12, %xmm1 1321 ; SSE2-NEXT: psrad $31, %xmm1 1322 ; SSE2-NEXT: paddd %xmm1, %xmm12 1323 ; SSE2-NEXT: pxor %xmm1, %xmm12 1324 ; SSE2-NEXT: movdqa %xmm0, %xmm1 1325 ; SSE2-NEXT: psrad $31, %xmm1 1326 ; SSE2-NEXT: paddd %xmm1, %xmm0 1327 ; SSE2-NEXT: pxor %xmm1, %xmm0 1328 ; SSE2-NEXT: paddd %xmm12, %xmm0 1329 ; SSE2-NEXT: paddd %xmm8, %xmm0 1330 ; SSE2-NEXT: paddd %xmm4, %xmm0 1331 ; SSE2-NEXT: paddd %xmm9, %xmm0 1332 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1333 ; SSE2-NEXT: paddd %xmm0, %xmm1 1334 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] 1335 ; SSE2-NEXT: paddd %xmm1, %xmm0 1336 ; SSE2-NEXT: movd %xmm0, %eax 1337 ; SSE2-NEXT: retq 1338 ; 1339 ; AVX1-LABEL: sad_nonloop_32i8: 1340 ; AVX1: # %bb.0: 1341 ; AVX1-NEXT: vmovdqu (%rdi), %ymm0 1342 ; AVX1-NEXT: vmovdqu (%rdx), %ymm1 1343 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1344 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1345 ; AVX1-NEXT: vpsadbw %xmm2, %xmm3, %xmm2 1346 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 1347 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 1348 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1349 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 1350 ; AVX1-NEXT: vmovd %xmm0, %eax 1351 ; AVX1-NEXT: vzeroupper 1352 ; AVX1-NEXT: retq 1353 ; 1354 ; AVX2-LABEL: sad_nonloop_32i8: 1355 ; AVX2: # %bb.0: 1356 ; AVX2-NEXT: vmovdqu (%rdi), %ymm0 1357 ; AVX2-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 1358 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1359 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 1360 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1361 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 1362 ; AVX2-NEXT: vmovd %xmm0, %eax 1363 ; AVX2-NEXT: vzeroupper 1364 ; AVX2-NEXT: retq 1365 ; 1366 ; AVX512-LABEL: sad_nonloop_32i8: 1367 ; AVX512: # %bb.0: 1368 ; AVX512-NEXT: vmovdqu (%rdi), %ymm0 1369 ; AVX512-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 1370 ; AVX512-NEXT: 
vextracti128 $1, %ymm0, %xmm1 1371 ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 1372 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1373 ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 1374 ; AVX512-NEXT: vmovd %xmm0, %eax 1375 ; AVX512-NEXT: vzeroupper 1376 ; AVX512-NEXT: retq 1377 %v1 = load <32 x i8>, <32 x i8>* %p, align 1 1378 %z1 = zext <32 x i8> %v1 to <32 x i32> 1379 %v2 = load <32 x i8>, <32 x i8>* %q, align 1 1380 %z2 = zext <32 x i8> %v2 to <32 x i32> 1381 %sub = sub nsw <32 x i32> %z1, %z2 1382 %isneg = icmp sgt <32 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> 1383 %neg = sub nsw <32 x i32> zeroinitializer, %sub 1384 %abs = select <32 x i1> %isneg, <32 x i32> %sub, <32 x i32> %neg 1385 %h32 = shufflevector <32 x i32> %abs, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1386 %sum32 = add <32 x i32> %abs, %h32 1387 %h0 = shufflevector <32 x i32> %sum32, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1388 %sum0 = add <32 x i32> %sum32, %h0 1389 %h1 = shufflevector <32 x i32> %sum0, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1390 %sum1 = add <32 x i32> %sum0, %h1 1391 %h2 = shufflevector <32 x i32> %sum1, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1392 %sum2 = add <32 x i32> %sum1, %h2 1393 %h3 = shufflevector <32 x i32> %sum2, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1394 %sum3 = add <32 x i32> %sum2, %h3 1395 %sum = extractelement <32 x i32> %sum3, i32 0 1396 ret i32 %sum 1397 } 1398
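; Note (descriptive summary of the tests above, not additional test input): each
; test builds |a - b| from the same idiom (sub nsw, icmp sgt against -1, select of
; the difference or its negation) and then reduces the <N x i32> result with
; log2(N) shufflevector+add steps and an extractelement. The assertions show the
; X86 backend matching this to (v)psadbw plus a horizontal reduction of the
; 64-bit partial sums where profitable (e.g. sad_2i8, the smaller sad_nonloop_*
; tests, and sad_avx64i8 with AVX512BW), and falling back to widened
; subtract/abs/add sequences otherwise (e.g. sad_avx64i8 with AVX2 or AVX512F,
; and sad_nonloop_32i8 with SSE2).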