; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2 | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2,-slow-unaligned-mem-16 | FileCheck %s --check-prefix=SSE2FAST
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

; https://llvm.org/bugs/show_bug.cgi?id=27100
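;
; The memset value in the first group of tests is the constant byte 42 (0x2A).
; When unaligned 16-byte accesses are slow (plain SSE/SSE2), the expansion
; uses 8-byte GPR stores of the splat constant 0x2A2A2A2A2A2A2A2A; with
; -slow-unaligned-mem-16 disabled or with AVX, it uses unaligned vector
; stores instead.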

define void @memset_16_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_16_nonzero_bytes:
; SSE:       # BB#0:
; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT:    movq %rax, 8(%rdi)
; SSE-NEXT:    movq %rax, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_16_nonzero_bytes:
; SSE2FAST:       # BB#0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX-LABEL: memset_16_nonzero_bytes:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT:    vmovups %xmm0, (%rdi)
; AVX-NEXT:    retq
;
  %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 16, i64 -1)
  ret void
}

define void @memset_32_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_32_nonzero_bytes:
; SSE:       # BB#0:
; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT:    movq %rax, 24(%rdi)
; SSE-NEXT:    movq %rax, 16(%rdi)
; SSE-NEXT:    movq %rax, 8(%rdi)
; SSE-NEXT:    movq %rax, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_32_nonzero_bytes:
; SSE2FAST:       # BB#0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX-LABEL: memset_32_nonzero_bytes:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT:    vmovups %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
  %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 32, i64 -1)
  ret void
}

define void @memset_64_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_64_nonzero_bytes:
; SSE:       # BB#0:
; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT:    movq %rax, 56(%rdi)
; SSE-NEXT:    movq %rax, 48(%rdi)
; SSE-NEXT:    movq %rax, 40(%rdi)
; SSE-NEXT:    movq %rax, 32(%rdi)
; SSE-NEXT:    movq %rax, 24(%rdi)
; SSE-NEXT:    movq %rax, 16(%rdi)
; SSE-NEXT:    movq %rax, 8(%rdi)
; SSE-NEXT:    movq %rax, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_64_nonzero_bytes:
; SSE2FAST:       # BB#0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX-LABEL: memset_64_nonzero_bytes:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX-NEXT:    vmovups %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
  %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 64, i64 -1)
  ret void
}

define void @memset_128_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_128_nonzero_bytes:
; SSE:       # BB#0:
; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT:    movq %rax, 120(%rdi)
; SSE-NEXT:    movq %rax, 112(%rdi)
; SSE-NEXT:    movq %rax, 104(%rdi)
; SSE-NEXT:    movq %rax, 96(%rdi)
; SSE-NEXT:    movq %rax, 88(%rdi)
; SSE-NEXT:    movq %rax, 80(%rdi)
; SSE-NEXT:    movq %rax, 72(%rdi)
; SSE-NEXT:    movq %rax, 64(%rdi)
; SSE-NEXT:    movq %rax, 56(%rdi)
; SSE-NEXT:    movq %rax, 48(%rdi)
; SSE-NEXT:    movq %rax, 40(%rdi)
; SSE-NEXT:    movq %rax, 32(%rdi)
; SSE-NEXT:    movq %rax, 24(%rdi)
; SSE-NEXT:    movq %rax, 16(%rdi)
; SSE-NEXT:    movq %rax, 8(%rdi)
; SSE-NEXT:    movq %rax, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_128_nonzero_bytes:
; SSE2FAST:       # BB#0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, 112(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 96(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 80(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 64(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX-LABEL: memset_128_nonzero_bytes:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX-NEXT:    vmovups %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
  %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 128, i64 -1)
  ret void
}
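
; With only 8-byte stores available, 256 bytes would take 32 stores, so the
; plain-SSE configuration gives up on inline expansion and calls the memset
; library function; the vector configurations still expand inline.
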
define void @memset_256_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_256_nonzero_bytes:
; SSE:       # BB#0:
; SSE-NEXT:    pushq %rax
; SSE-NEXT:    .Ltmp0:
; SSE-NEXT:    .cfi_def_cfa_offset 16
; SSE-NEXT:    movl $42, %esi
; SSE-NEXT:    movl $256, %edx # imm = 0x100
; SSE-NEXT:    callq memset
; SSE-NEXT:    popq %rax
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_256_nonzero_bytes:
; SSE2FAST:       # BB#0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, 240(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 224(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 208(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 192(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 176(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 160(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 144(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 128(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 112(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 96(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 80(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 64(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX-LABEL: memset_256_nonzero_bytes:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT:    vmovups %ymm0, 224(%rdi)
; AVX-NEXT:    vmovups %ymm0, 192(%rdi)
; AVX-NEXT:    vmovups %ymm0, 160(%rdi)
; AVX-NEXT:    vmovups %ymm0, 128(%rdi)
; AVX-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX-NEXT:    vmovups %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
  %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 256, i64 -1)
  ret void
}

declare i8* @__memset_chk(i8*, i32, i64, i64)

; Repeat with a non-constant value for the stores.
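; The variable byte must be splatted first: scalar code multiplies the
; zero-extended byte by 0x0101010101010101, SSE2 splats with
; punpcklbw+pshuflw+pshufd, AVX1 uses vpshufb with a zero mask, and AVX2
; uses vpbroadcastb.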

define void @memset_16_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_16_nonconst_bytes:
; SSE:       # BB#0:
; SSE-NEXT:    movzbl %sil, %eax
; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT:    imulq %rax, %rcx
; SSE-NEXT:    movq %rcx, 8(%rdi)
; SSE-NEXT:    movq %rcx, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_16_nonconst_bytes:
; SSE2FAST:       # BB#0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_16_nonconst_bytes:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_16_nonconst_bytes:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
  tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 16, i32 1, i1 false)
  ret void
}

define void @memset_32_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_32_nonconst_bytes:
; SSE:       # BB#0:
; SSE-NEXT:    movzbl %sil, %eax
; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT:    imulq %rax, %rcx
; SSE-NEXT:    movq %rcx, 24(%rdi)
; SSE-NEXT:    movq %rcx, 16(%rdi)
; SSE-NEXT:    movq %rcx, 8(%rdi)
; SSE-NEXT:    movq %rcx, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_32_nonconst_bytes:
; SSE2FAST:       # BB#0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_32_nonconst_bytes:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_32_nonconst_bytes:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
  tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 32, i32 1, i1 false)
  ret void
}

define void @memset_64_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_64_nonconst_bytes:
; SSE:       # BB#0:
; SSE-NEXT:    movzbl %sil, %eax
; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT:    imulq %rax, %rcx
; SSE-NEXT:    movq %rcx, 56(%rdi)
; SSE-NEXT:    movq %rcx, 48(%rdi)
; SSE-NEXT:    movq %rcx, 40(%rdi)
; SSE-NEXT:    movq %rcx, 32(%rdi)
; SSE-NEXT:    movq %rcx, 24(%rdi)
; SSE-NEXT:    movq %rcx, 16(%rdi)
; SSE-NEXT:    movq %rcx, 8(%rdi)
; SSE-NEXT:    movq %rcx, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_64_nonconst_bytes:
; SSE2FAST:       # BB#0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2FAST-NEXT:    movdqu %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_64_nonconst_bytes:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_64_nonconst_bytes:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
  tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 64, i32 1, i1 false)
  ret void
}

define void @memset_128_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_128_nonconst_bytes:
; SSE:       # BB#0:
; SSE-NEXT:    movzbl %sil, %eax
; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT:    imulq %rax, %rcx
; SSE-NEXT:    movq %rcx, 120(%rdi)
; SSE-NEXT:    movq %rcx, 112(%rdi)
; SSE-NEXT:    movq %rcx, 104(%rdi)
; SSE-NEXT:    movq %rcx, 96(%rdi)
; SSE-NEXT:    movq %rcx, 88(%rdi)
; SSE-NEXT:    movq %rcx, 80(%rdi)
; SSE-NEXT:    movq %rcx, 72(%rdi)
; SSE-NEXT:    movq %rcx, 64(%rdi)
; SSE-NEXT:    movq %rcx, 56(%rdi)
; SSE-NEXT:    movq %rcx, 48(%rdi)
; SSE-NEXT:    movq %rcx, 40(%rdi)
; SSE-NEXT:    movq %rcx, 32(%rdi)
; SSE-NEXT:    movq %rcx, 24(%rdi)
; SSE-NEXT:    movq %rcx, 16(%rdi)
; SSE-NEXT:    movq %rcx, 8(%rdi)
; SSE-NEXT:    movq %rcx, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_128_nonconst_bytes:
; SSE2FAST:       # BB#0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2FAST-NEXT:    movdqu %xmm0, 112(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 96(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 80(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 64(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_128_nonconst_bytes:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_128_nonconst_bytes:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, 96(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 64(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
  tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 128, i32 1, i1 false)
  ret void
}
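
; With a variable byte and 256 bytes, plain SSE tail-calls memset directly:
; the pointer and value are already in %rdi and %esi, so only the size has
; to be materialized in %edx.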

define void @memset_256_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_256_nonconst_bytes:
; SSE:       # BB#0:
; SSE-NEXT:    movl $256, %edx # imm = 0x100
; SSE-NEXT:    jmp memset # TAILCALL
;
; SSE2FAST-LABEL: memset_256_nonconst_bytes:
; SSE2FAST:       # BB#0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2FAST-NEXT:    movdqu %xmm0, 240(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 224(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 208(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 192(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 176(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 160(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 144(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 128(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 112(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 96(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 80(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 64(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_256_nonconst_bytes:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, 224(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 192(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 160(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 128(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_256_nonconst_bytes:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, 224(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 192(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 160(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 128(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 96(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 64(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
  tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 256, i32 1, i1 false)
  ret void
}

declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1