; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX

; Every function below loads eight bytes from %cur and eight bytes from %ref,
; zero-extends them, subtracts, takes the absolute value through an
; icmp + select pair, and then sums the eight lanes with a shuffle/add
; reduction. The functions differ only in how the compare/select is spelled
; and in where the widening to 64 bits happens. In each case the expected
; output is a single psadbw (vpsadbw for the AVX-level runs).
;
; All AVX-level runs share the one AVX prefix because the expected assembly is
; identical for them; prefixes that have no check lines are not declared.

; abs spelled as: select (diff > -1), diff, 0 - diff
define i32 @sad8_32bit_icmp_sge(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #0 {
; SSE2-LABEL: sad8_32bit_icmp_sge:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; AVX-LABEL: sad8_32bit_icmp_sge:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
entry:
  %idx.ext = zext i32 %stride to i64
  br label %for.body

for.body:                                         ; preds = %entry
  %0 = bitcast i8* %cur to <8 x i8>*
  %1 = load <8 x i8>, <8 x i8>* %0, align 1
  %2 = zext <8 x i8> %1 to <8 x i32>
  %3 = bitcast i8* %ref to <8 x i8>*
  %4 = load <8 x i8>, <8 x i8>* %3, align 1
  %5 = zext <8 x i8> %4 to <8 x i32>
  %6 = sub nsw <8 x i32> %2, %5
  %7 = icmp sgt <8 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <8 x i32> zeroinitializer, %6
  %9 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> %8
  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %9, %rdx.shuf
  %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
  %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
  %10 = extractelement <8 x i32> %bin.rdx232, i32 0
  ret i32 %10
}

; abs spelled as: select (diff > 0), diff, 0 - diff
define i32 @sad8_32bit_icmp_sgt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #1 {
; SSE2-LABEL: sad8_32bit_icmp_sgt:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; AVX-LABEL: sad8_32bit_icmp_sgt:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
entry:
  %idx.ext = zext i32 %stride to i64
  br label %for.body

for.body:                                         ; preds = %entry
  %0 = bitcast i8* %cur to <8 x i8>*
  %1 = load <8 x i8>, <8 x i8>* %0, align 1
  %2 = zext <8 x i8> %1 to <8 x i32>
  %3 = bitcast i8* %ref to <8 x i8>*
  %4 = load <8 x i8>, <8 x i8>* %3, align 1
  %5 = zext <8 x i8> %4 to <8 x i32>
  %6 = sub nsw <8 x i32> %2, %5
  %7 = icmp sgt <8 x i32> %6, zeroinitializer
  %8 = sub nsw <8 x i32> zeroinitializer, %6
  %9 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> %8
  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %9, %rdx.shuf
  %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
  %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
  %10 = extractelement <8 x i32> %bin.rdx232, i32 0
  ret i32 %10
}

; abs spelled as: select (diff < 1), 0 - diff, diff
define i32 @sad8_32bit_icmp_sle(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #2 {
; SSE2-LABEL: sad8_32bit_icmp_sle:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; AVX-LABEL: sad8_32bit_icmp_sle:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
entry:
  %idx.ext = zext i32 %stride to i64
  br label %for.body

for.body:                                         ; preds = %entry
  %0 = bitcast i8* %cur to <8 x i8>*
  %1 = load <8 x i8>, <8 x i8>* %0, align 1
  %2 = zext <8 x i8> %1 to <8 x i32>
  %3 = bitcast i8* %ref to <8 x i8>*
  %4 = load <8 x i8>, <8 x i8>* %3, align 1
  %5 = zext <8 x i8> %4 to <8 x i32>
  %6 = sub nsw <8 x i32> %2, %5
  %7 = icmp slt <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = sub nsw <8 x i32> zeroinitializer, %6
  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %9, %rdx.shuf
  %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
  %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
  %10 = extractelement <8 x i32> %bin.rdx232, i32 0
  ret i32 %10
}

; abs spelled as: select (diff < 0), 0 - diff, diff
define i32 @sad8_32bit_icmp_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #3 {
; SSE2-LABEL: sad8_32bit_icmp_slt:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; AVX-LABEL: sad8_32bit_icmp_slt:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
entry:
  %idx.ext = zext i32 %stride to i64
  br label %for.body

for.body:                                         ; preds = %entry
  %0 = bitcast i8* %cur to <8 x i8>*
  %1 = load <8 x i8>, <8 x i8>* %0, align 1
  %2 = zext <8 x i8> %1 to <8 x i32>
  %3 = bitcast i8* %ref to <8 x i8>*
  %4 = load <8 x i8>, <8 x i8>* %3, align 1
  %5 = zext <8 x i8> %4 to <8 x i32>
  %6 = sub nsw <8 x i32> %2, %5
  %7 = icmp slt <8 x i32> %6, zeroinitializer
  %8 = sub nsw <8 x i32> zeroinitializer, %6
  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %9, %rdx.shuf
  %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
  %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
  %10 = extractelement <8 x i32> %bin.rdx232, i32 0
  ret i32 %10
}

; 32-bit abs (slt form), then sign-extended to 64 bits before the reduction.
define i64 @sad8_64bit_icmp_sext_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
; SSE2-LABEL: sad8_64bit_icmp_sext_slt:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, %rax
; SSE2-NEXT:    retq
;
; AVX-LABEL: sad8_64bit_icmp_sext_slt:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
entry:
  br label %for.body

for.body:                                         ; preds = %entry
  %0 = bitcast i8* %cur to <8 x i8>*
  %1 = load <8 x i8>, <8 x i8>* %0, align 1
  %2 = zext <8 x i8> %1 to <8 x i32>
  %3 = bitcast i8* %ref to <8 x i8>*
  %4 = load <8 x i8>, <8 x i8>* %3, align 1
  %5 = zext <8 x i8> %4 to <8 x i32>
  %6 = sub nsw <8 x i32> %2, %5
  %7 = icmp slt <8 x i32> %6, zeroinitializer
  %8 = sub nsw <8 x i32> zeroinitializer, %6
  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
  %10 = sext <8 x i32> %9 to <8 x i64>
  %rdx.shuf = shufflevector <8 x i64> %10, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i64> %rdx.shuf, %10
  %rdx.shuf236 = shufflevector <8 x i64> %bin.rdx, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx237 = add <8 x i64> %bin.rdx, %rdx.shuf236
  %rdx.shuf238 = shufflevector <8 x i64> %bin.rdx237, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx239 = add <8 x i64> %bin.rdx237, %rdx.shuf238
  %11 = extractelement <8 x i64> %bin.rdx239, i32 0
  ret i64 %11
}

; 32-bit abs (slt form), then zero-extended to 64 bits before the reduction.
define i64 @sad8_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
; SSE2-LABEL: sad8_64bit_icmp_zext_slt:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, %rax
; SSE2-NEXT:    retq
;
; AVX-LABEL: sad8_64bit_icmp_zext_slt:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
entry:
  br label %for.body

for.body:                                         ; preds = %entry
  %0 = bitcast i8* %cur to <8 x i8>*
  %1 = load <8 x i8>, <8 x i8>* %0, align 1
  %2 = zext <8 x i8> %1 to <8 x i32>
  %3 = bitcast i8* %ref to <8 x i8>*
  %4 = load <8 x i8>, <8 x i8>* %3, align 1
  %5 = zext <8 x i8> %4 to <8 x i32>
  %6 = sub nsw <8 x i32> %2, %5
  %7 = icmp slt <8 x i32> %6, zeroinitializer
  %8 = sub nsw <8 x i32> zeroinitializer, %6
  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
  %10 = zext <8 x i32> %9 to <8 x i64>
  %rdx.shuf = shufflevector <8 x i64> %10, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i64> %rdx.shuf, %10
  %rdx.shuf236 = shufflevector <8 x i64> %bin.rdx, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx237 = add <8 x i64> %bin.rdx, %rdx.shuf236
  %rdx.shuf238 = shufflevector <8 x i64> %bin.rdx237, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx239 = add <8 x i64> %bin.rdx237, %rdx.shuf238
  %11 = extractelement <8 x i64> %bin.rdx239, i32 0
  ret i64 %11
}

; Bytes are zero-extended straight to 64 bits; the subtract, abs (slt form) and
; reduction are all done on <8 x i64>.
define i64 @sad8_early_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
; SSE2-LABEL: sad8_early_64bit_icmp_zext_slt:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, %rax
; SSE2-NEXT:    retq
;
; AVX-LABEL: sad8_early_64bit_icmp_zext_slt:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
entry:
  br label %for.body

for.body:                                         ; preds = %entry
  %0 = bitcast i8* %cur to <8 x i8>*
  %1 = load <8 x i8>, <8 x i8>* %0, align 1
  %2 = zext <8 x i8> %1 to <8 x i64>
  %3 = bitcast i8* %ref to <8 x i8>*
  %4 = load <8 x i8>, <8 x i8>* %3, align 1
  %5 = zext <8 x i8> %4 to <8 x i64>
  %6 = sub nsw <8 x i64> %2, %5
  %7 = icmp slt <8 x i64> %6, zeroinitializer
  %8 = sub nsw <8 x i64> zeroinitializer, %6
  %9 = select <8 x i1> %7, <8 x i64> %8, <8 x i64> %6
  %rdx.shuf = shufflevector <8 x i64> %9, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i64> %rdx.shuf, %9
  %rdx.shuf236 = shufflevector <8 x i64> %bin.rdx, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx237 = add <8 x i64> %bin.rdx, %rdx.shuf236
  %rdx.shuf238 = shufflevector <8 x i64> %bin.rdx237, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx239 = add <8 x i64> %bin.rdx237, %rdx.shuf238
  %10 = extractelement <8 x i64> %bin.rdx239, i32 0
  ret i64 %10
}