; (code-browser navigation residue, kept as a comment: Home | History | Annotate | Download | only in X86)
      1 ; NOTE: Assertions have been autogenerated by update_test_checks.py
      2 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse | FileCheck %s --check-prefix=SSE
      3 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2 | FileCheck %s --check-prefix=SSE
      4 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2,-slow-unaligned-mem-16 | FileCheck %s --check-prefix=SSE2FAST
      5 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
      6 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
      7 
      8 ; https://llvm.org/bugs/show_bug.cgi?id=27100
      9 
; 16-byte memset of the constant byte 42, reaching the backend through
; __memset_chk with object size -1 (unknown, so the _chk call lowers to a
; plain memset).  Baseline SSE expands to two 64-bit stores of the splatted
; immediate 0x2A2A...2A; when unaligned 16-byte memory is fast (SSE2FAST)
; or on AVX, a single unaligned xmm store of a splat constant is used.
     10 define void @memset_16_nonzero_bytes(i8* %x) {
     11 ; SSE-LABEL: memset_16_nonzero_bytes:
     12 ; SSE:       # BB#0:
     13 ; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
     14 ; SSE-NEXT:    movq %rax, 8(%rdi)
     15 ; SSE-NEXT:    movq %rax, (%rdi)
     16 ; SSE-NEXT:    retq
     17 ;
     18 ; SSE2FAST-LABEL: memset_16_nonzero_bytes:
     19 ; SSE2FAST:       # BB#0:
     20 ; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
     21 ; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
     22 ; SSE2FAST-NEXT:    retq
     23 ;
     24 ; AVX-LABEL: memset_16_nonzero_bytes:
     25 ; AVX:       # BB#0:
     26 ; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
     27 ; AVX-NEXT:    vmovups %xmm0, (%rdi)
     28 ; AVX-NEXT:    retq
     29 ;
     30   %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 16, i64 -1)
     31   ret void
     32 }
     33 
; 32-byte constant memset: four 64-bit scalar stores on baseline SSE, two
; unaligned xmm stores for SSE2FAST, and a single unaligned ymm store
; (followed by vzeroupper before returning) on AVX.
     34 define void @memset_32_nonzero_bytes(i8* %x) {
     35 ; SSE-LABEL: memset_32_nonzero_bytes:
     36 ; SSE:       # BB#0:
     37 ; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
     38 ; SSE-NEXT:    movq %rax, 24(%rdi)
     39 ; SSE-NEXT:    movq %rax, 16(%rdi)
     40 ; SSE-NEXT:    movq %rax, 8(%rdi)
     41 ; SSE-NEXT:    movq %rax, (%rdi)
     42 ; SSE-NEXT:    retq
     43 ;
     44 ; SSE2FAST-LABEL: memset_32_nonzero_bytes:
     45 ; SSE2FAST:       # BB#0:
     46 ; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
     47 ; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
     48 ; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
     49 ; SSE2FAST-NEXT:    retq
     50 ;
     51 ; AVX-LABEL: memset_32_nonzero_bytes:
     52 ; AVX:       # BB#0:
     53 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
     54 ; AVX-NEXT:    vmovups %ymm0, (%rdi)
     55 ; AVX-NEXT:    vzeroupper
     56 ; AVX-NEXT:    retq
     57 ;
     58   %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 32, i64 -1)
     59   ret void
     60 }
     61 
; 64-byte constant memset: eight scalar stores / four xmm stores / two ymm
; stores depending on subtarget; still fully expanded inline everywhere.
     62 define void @memset_64_nonzero_bytes(i8* %x) {
     63 ; SSE-LABEL: memset_64_nonzero_bytes:
     64 ; SSE:       # BB#0:
     65 ; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
     66 ; SSE-NEXT:    movq %rax, 56(%rdi)
     67 ; SSE-NEXT:    movq %rax, 48(%rdi)
     68 ; SSE-NEXT:    movq %rax, 40(%rdi)
     69 ; SSE-NEXT:    movq %rax, 32(%rdi)
     70 ; SSE-NEXT:    movq %rax, 24(%rdi)
     71 ; SSE-NEXT:    movq %rax, 16(%rdi)
     72 ; SSE-NEXT:    movq %rax, 8(%rdi)
     73 ; SSE-NEXT:    movq %rax, (%rdi)
     74 ; SSE-NEXT:    retq
     75 ;
     76 ; SSE2FAST-LABEL: memset_64_nonzero_bytes:
     77 ; SSE2FAST:       # BB#0:
     78 ; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
     79 ; SSE2FAST-NEXT:    movups %xmm0, 48(%rdi)
     80 ; SSE2FAST-NEXT:    movups %xmm0, 32(%rdi)
     81 ; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
     82 ; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
     83 ; SSE2FAST-NEXT:    retq
     84 ;
     85 ; AVX-LABEL: memset_64_nonzero_bytes:
     86 ; AVX:       # BB#0:
     87 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
     88 ; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
     89 ; AVX-NEXT:    vmovups %ymm0, (%rdi)
     90 ; AVX-NEXT:    vzeroupper
     91 ; AVX-NEXT:    retq
     92 ;
     93   %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 64, i64 -1)
     94   ret void
     95 }
     96 
; 128-byte constant memset: sixteen scalar stores / eight xmm stores / four
; ymm stores; this size is still below the call-out threshold on all three
; configurations, so no libcall is emitted.
     97 define void @memset_128_nonzero_bytes(i8* %x) {
     98 ; SSE-LABEL: memset_128_nonzero_bytes:
     99 ; SSE:       # BB#0:
    100 ; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
    101 ; SSE-NEXT:    movq %rax, 120(%rdi)
    102 ; SSE-NEXT:    movq %rax, 112(%rdi)
    103 ; SSE-NEXT:    movq %rax, 104(%rdi)
    104 ; SSE-NEXT:    movq %rax, 96(%rdi)
    105 ; SSE-NEXT:    movq %rax, 88(%rdi)
    106 ; SSE-NEXT:    movq %rax, 80(%rdi)
    107 ; SSE-NEXT:    movq %rax, 72(%rdi)
    108 ; SSE-NEXT:    movq %rax, 64(%rdi)
    109 ; SSE-NEXT:    movq %rax, 56(%rdi)
    110 ; SSE-NEXT:    movq %rax, 48(%rdi)
    111 ; SSE-NEXT:    movq %rax, 40(%rdi)
    112 ; SSE-NEXT:    movq %rax, 32(%rdi)
    113 ; SSE-NEXT:    movq %rax, 24(%rdi)
    114 ; SSE-NEXT:    movq %rax, 16(%rdi)
    115 ; SSE-NEXT:    movq %rax, 8(%rdi)
    116 ; SSE-NEXT:    movq %rax, (%rdi)
    117 ; SSE-NEXT:    retq
    118 ;
    119 ; SSE2FAST-LABEL: memset_128_nonzero_bytes:
    120 ; SSE2FAST:       # BB#0:
    121 ; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
    122 ; SSE2FAST-NEXT:    movups %xmm0, 112(%rdi)
    123 ; SSE2FAST-NEXT:    movups %xmm0, 96(%rdi)
    124 ; SSE2FAST-NEXT:    movups %xmm0, 80(%rdi)
    125 ; SSE2FAST-NEXT:    movups %xmm0, 64(%rdi)
    126 ; SSE2FAST-NEXT:    movups %xmm0, 48(%rdi)
    127 ; SSE2FAST-NEXT:    movups %xmm0, 32(%rdi)
    128 ; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
    129 ; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
    130 ; SSE2FAST-NEXT:    retq
    131 ;
    132 ; AVX-LABEL: memset_128_nonzero_bytes:
    133 ; AVX:       # BB#0:
    134 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
    135 ; AVX-NEXT:    vmovups %ymm0, 96(%rdi)
    136 ; AVX-NEXT:    vmovups %ymm0, 64(%rdi)
    137 ; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
    138 ; AVX-NEXT:    vmovups %ymm0, (%rdi)
    139 ; AVX-NEXT:    vzeroupper
    140 ; AVX-NEXT:    retq
    141 ;
    142   %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 128, i64 -1)
    143   ret void
    144 }
    145 
; 256-byte constant memset: past the inline-expansion threshold for baseline
; SSE, which emits an actual call to memset (with a stack-alignment push/pop
; and CFI bookkeeping); SSE2FAST and AVX still expand inline with 16 xmm /
; 8 ymm unaligned stores respectively.
    146 define void @memset_256_nonzero_bytes(i8* %x) {
    147 ; SSE-LABEL: memset_256_nonzero_bytes:
    148 ; SSE:       # BB#0:
    149 ; SSE-NEXT:    pushq %rax
    150 ; SSE-NEXT:  .Ltmp0:
    151 ; SSE-NEXT:    .cfi_def_cfa_offset 16
    152 ; SSE-NEXT:    movl $42, %esi
    153 ; SSE-NEXT:    movl $256, %edx # imm = 0x100
    154 ; SSE-NEXT:    callq memset
    155 ; SSE-NEXT:    popq %rax
    156 ; SSE-NEXT:    retq
    157 ;
    158 ; SSE2FAST-LABEL: memset_256_nonzero_bytes:
    159 ; SSE2FAST:       # BB#0:
    160 ; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
    161 ; SSE2FAST-NEXT:    movups %xmm0, 240(%rdi)
    162 ; SSE2FAST-NEXT:    movups %xmm0, 224(%rdi)
    163 ; SSE2FAST-NEXT:    movups %xmm0, 208(%rdi)
    164 ; SSE2FAST-NEXT:    movups %xmm0, 192(%rdi)
    165 ; SSE2FAST-NEXT:    movups %xmm0, 176(%rdi)
    166 ; SSE2FAST-NEXT:    movups %xmm0, 160(%rdi)
    167 ; SSE2FAST-NEXT:    movups %xmm0, 144(%rdi)
    168 ; SSE2FAST-NEXT:    movups %xmm0, 128(%rdi)
    169 ; SSE2FAST-NEXT:    movups %xmm0, 112(%rdi)
    170 ; SSE2FAST-NEXT:    movups %xmm0, 96(%rdi)
    171 ; SSE2FAST-NEXT:    movups %xmm0, 80(%rdi)
    172 ; SSE2FAST-NEXT:    movups %xmm0, 64(%rdi)
    173 ; SSE2FAST-NEXT:    movups %xmm0, 48(%rdi)
    174 ; SSE2FAST-NEXT:    movups %xmm0, 32(%rdi)
    175 ; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
    176 ; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
    177 ; SSE2FAST-NEXT:    retq
    178 ;
    179 ; AVX-LABEL: memset_256_nonzero_bytes:
    180 ; AVX:       # BB#0:
    181 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
    182 ; AVX-NEXT:    vmovups %ymm0, 224(%rdi)
    183 ; AVX-NEXT:    vmovups %ymm0, 192(%rdi)
    184 ; AVX-NEXT:    vmovups %ymm0, 160(%rdi)
    185 ; AVX-NEXT:    vmovups %ymm0, 128(%rdi)
    186 ; AVX-NEXT:    vmovups %ymm0, 96(%rdi)
    187 ; AVX-NEXT:    vmovups %ymm0, 64(%rdi)
    188 ; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
    189 ; AVX-NEXT:    vmovups %ymm0, (%rdi)
    190 ; AVX-NEXT:    vzeroupper
    191 ; AVX-NEXT:    retq
    192 ;
    193   %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 256, i64 -1)
    194   ret void
    195 }
    196 
    197 declare i8* @__memset_chk(i8*, i32, i64, i64)
    198 
    199 ; Repeat with a non-constant value for the stores.
    200 
; Same 16-byte case but with a variable fill byte, via the llvm.memset
; intrinsic.  The splat of the byte is now computed at run time: baseline
; SSE multiplies the zero-extended byte by 0x0101010101010101 in a GPR;
; SSE2FAST splats in an xmm via punpcklbw + pshuflw + pshufd; AVX1
; broadcasts with vpshufb against an all-zero shuffle mask; AVX2 uses the
; dedicated vpbroadcastb.
    201 define void @memset_16_nonconst_bytes(i8* %x, i8 %c) {
    202 ; SSE-LABEL: memset_16_nonconst_bytes:
    203 ; SSE:       # BB#0:
    204 ; SSE-NEXT:    movzbl %sil, %eax
    205 ; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
    206 ; SSE-NEXT:    imulq %rax, %rcx
    207 ; SSE-NEXT:    movq %rcx, 8(%rdi)
    208 ; SSE-NEXT:    movq %rcx, (%rdi)
    209 ; SSE-NEXT:    retq
    210 ;
    211 ; SSE2FAST-LABEL: memset_16_nonconst_bytes:
    212 ; SSE2FAST:       # BB#0:
    213 ; SSE2FAST-NEXT:    movd %esi, %xmm0
    214 ; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
    215 ; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
    216 ; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
    217 ; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
    218 ; SSE2FAST-NEXT:    retq
    219 ;
    220 ; AVX1-LABEL: memset_16_nonconst_bytes:
    221 ; AVX1:       # BB#0:
    222 ; AVX1-NEXT:    vmovd %esi, %xmm0
    223 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    224 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
    225 ; AVX1-NEXT:    vmovdqu %xmm0, (%rdi)
    226 ; AVX1-NEXT:    retq
    227 ;
    228 ; AVX2-LABEL: memset_16_nonconst_bytes:
    229 ; AVX2:       # BB#0:
    230 ; AVX2-NEXT:    vmovd %esi, %xmm0
    231 ; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
    232 ; AVX2-NEXT:    vmovdqu %xmm0, (%rdi)
    233 ; AVX2-NEXT:    retq
    234 ;
    235   tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 16, i32 1, i1 false)
    236   ret void
    237 }
    238 
; 32-byte variable-byte memset.  Note AVX1 has no cross-lane byte broadcast,
; so it builds the ymm splat with vpshufb + vinsertf128, while AVX2 gets a
; single vpbroadcastb to ymm.
    239 define void @memset_32_nonconst_bytes(i8* %x, i8 %c) {
    240 ; SSE-LABEL: memset_32_nonconst_bytes:
    241 ; SSE:       # BB#0:
    242 ; SSE-NEXT:    movzbl %sil, %eax
    243 ; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
    244 ; SSE-NEXT:    imulq %rax, %rcx
    245 ; SSE-NEXT:    movq %rcx, 24(%rdi)
    246 ; SSE-NEXT:    movq %rcx, 16(%rdi)
    247 ; SSE-NEXT:    movq %rcx, 8(%rdi)
    248 ; SSE-NEXT:    movq %rcx, (%rdi)
    249 ; SSE-NEXT:    retq
    250 ;
    251 ; SSE2FAST-LABEL: memset_32_nonconst_bytes:
    252 ; SSE2FAST:       # BB#0:
    253 ; SSE2FAST-NEXT:    movd %esi, %xmm0
    254 ; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
    255 ; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
    256 ; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
    257 ; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
    258 ; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
    259 ; SSE2FAST-NEXT:    retq
    260 ;
    261 ; AVX1-LABEL: memset_32_nonconst_bytes:
    262 ; AVX1:       # BB#0:
    263 ; AVX1-NEXT:    vmovd %esi, %xmm0
    264 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    265 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
    266 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    267 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
    268 ; AVX1-NEXT:    vzeroupper
    269 ; AVX1-NEXT:    retq
    270 ;
    271 ; AVX2-LABEL: memset_32_nonconst_bytes:
    272 ; AVX2:       # BB#0:
    273 ; AVX2-NEXT:    vmovd %esi, %xmm0
    274 ; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
    275 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
    276 ; AVX2-NEXT:    vzeroupper
    277 ; AVX2-NEXT:    retq
    278 ;
    279   tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 32, i32 1, i1 false)
    280   ret void
    281 }
    282 
; 64-byte variable-byte memset: one run-time splat, then eight scalar /
; four xmm / two ymm unaligned stores depending on subtarget.
    283 define void @memset_64_nonconst_bytes(i8* %x, i8 %c) {
    284 ; SSE-LABEL: memset_64_nonconst_bytes:
    285 ; SSE:       # BB#0:
    286 ; SSE-NEXT:    movzbl %sil, %eax
    287 ; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
    288 ; SSE-NEXT:    imulq %rax, %rcx
    289 ; SSE-NEXT:    movq %rcx, 56(%rdi)
    290 ; SSE-NEXT:    movq %rcx, 48(%rdi)
    291 ; SSE-NEXT:    movq %rcx, 40(%rdi)
    292 ; SSE-NEXT:    movq %rcx, 32(%rdi)
    293 ; SSE-NEXT:    movq %rcx, 24(%rdi)
    294 ; SSE-NEXT:    movq %rcx, 16(%rdi)
    295 ; SSE-NEXT:    movq %rcx, 8(%rdi)
    296 ; SSE-NEXT:    movq %rcx, (%rdi)
    297 ; SSE-NEXT:    retq
    298 ;
    299 ; SSE2FAST-LABEL: memset_64_nonconst_bytes:
    300 ; SSE2FAST:       # BB#0:
    301 ; SSE2FAST-NEXT:    movd %esi, %xmm0
    302 ; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
    303 ; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
    304 ; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
    305 ; SSE2FAST-NEXT:    movdqu %xmm0, 48(%rdi)
    306 ; SSE2FAST-NEXT:    movdqu %xmm0, 32(%rdi)
    307 ; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
    308 ; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
    309 ; SSE2FAST-NEXT:    retq
    310 ;
    311 ; AVX1-LABEL: memset_64_nonconst_bytes:
    312 ; AVX1:       # BB#0:
    313 ; AVX1-NEXT:    vmovd %esi, %xmm0
    314 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    315 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
    316 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    317 ; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
    318 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
    319 ; AVX1-NEXT:    vzeroupper
    320 ; AVX1-NEXT:    retq
    321 ;
    322 ; AVX2-LABEL: memset_64_nonconst_bytes:
    323 ; AVX2:       # BB#0:
    324 ; AVX2-NEXT:    vmovd %esi, %xmm0
    325 ; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
    326 ; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
    327 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
    328 ; AVX2-NEXT:    vzeroupper
    329 ; AVX2-NEXT:    retq
    330 ;
    331   tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 64, i32 1, i1 false)
    332   ret void
    333 }
    334 
; 128-byte variable-byte memset: sixteen scalar / eight xmm / four ymm
; stores after the run-time splat; still no libcall at this size.
    335 define void @memset_128_nonconst_bytes(i8* %x, i8 %c) {
    336 ; SSE-LABEL: memset_128_nonconst_bytes:
    337 ; SSE:       # BB#0:
    338 ; SSE-NEXT:    movzbl %sil, %eax
    339 ; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
    340 ; SSE-NEXT:    imulq %rax, %rcx
    341 ; SSE-NEXT:    movq %rcx, 120(%rdi)
    342 ; SSE-NEXT:    movq %rcx, 112(%rdi)
    343 ; SSE-NEXT:    movq %rcx, 104(%rdi)
    344 ; SSE-NEXT:    movq %rcx, 96(%rdi)
    345 ; SSE-NEXT:    movq %rcx, 88(%rdi)
    346 ; SSE-NEXT:    movq %rcx, 80(%rdi)
    347 ; SSE-NEXT:    movq %rcx, 72(%rdi)
    348 ; SSE-NEXT:    movq %rcx, 64(%rdi)
    349 ; SSE-NEXT:    movq %rcx, 56(%rdi)
    350 ; SSE-NEXT:    movq %rcx, 48(%rdi)
    351 ; SSE-NEXT:    movq %rcx, 40(%rdi)
    352 ; SSE-NEXT:    movq %rcx, 32(%rdi)
    353 ; SSE-NEXT:    movq %rcx, 24(%rdi)
    354 ; SSE-NEXT:    movq %rcx, 16(%rdi)
    355 ; SSE-NEXT:    movq %rcx, 8(%rdi)
    356 ; SSE-NEXT:    movq %rcx, (%rdi)
    357 ; SSE-NEXT:    retq
    358 ;
    359 ; SSE2FAST-LABEL: memset_128_nonconst_bytes:
    360 ; SSE2FAST:       # BB#0:
    361 ; SSE2FAST-NEXT:    movd %esi, %xmm0
    362 ; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
    363 ; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
    364 ; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
    365 ; SSE2FAST-NEXT:    movdqu %xmm0, 112(%rdi)
    366 ; SSE2FAST-NEXT:    movdqu %xmm0, 96(%rdi)
    367 ; SSE2FAST-NEXT:    movdqu %xmm0, 80(%rdi)
    368 ; SSE2FAST-NEXT:    movdqu %xmm0, 64(%rdi)
    369 ; SSE2FAST-NEXT:    movdqu %xmm0, 48(%rdi)
    370 ; SSE2FAST-NEXT:    movdqu %xmm0, 32(%rdi)
    371 ; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
    372 ; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
    373 ; SSE2FAST-NEXT:    retq
    374 ;
    375 ; AVX1-LABEL: memset_128_nonconst_bytes:
    376 ; AVX1:       # BB#0:
    377 ; AVX1-NEXT:    vmovd %esi, %xmm0
    378 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    379 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
    380 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    381 ; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
    382 ; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
    383 ; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
    384 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
    385 ; AVX1-NEXT:    vzeroupper
    386 ; AVX1-NEXT:    retq
    387 ;
    388 ; AVX2-LABEL: memset_128_nonconst_bytes:
    389 ; AVX2:       # BB#0:
    390 ; AVX2-NEXT:    vmovd %esi, %xmm0
    391 ; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
    392 ; AVX2-NEXT:    vmovdqu %ymm0, 96(%rdi)
    393 ; AVX2-NEXT:    vmovdqu %ymm0, 64(%rdi)
    394 ; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
    395 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
    396 ; AVX2-NEXT:    vzeroupper
    397 ; AVX2-NEXT:    retq
    398 ;
    399   tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 128, i32 1, i1 false)
    400   ret void
    401 }
    402 
; 256-byte variable-byte memset: baseline SSE gives up on inline expansion
; and tail-calls memset with a jmp (the fill byte is already in esi and the
; pointer in rdi, so only the length needs materializing); SSE2FAST and AVX
; still expand inline exactly as in the 128-byte case, with twice the stores.
    403 define void @memset_256_nonconst_bytes(i8* %x, i8 %c) {
    404 ; SSE-LABEL: memset_256_nonconst_bytes:
    405 ; SSE:       # BB#0:
    406 ; SSE-NEXT:    movl $256, %edx # imm = 0x100
    407 ; SSE-NEXT:    jmp memset # TAILCALL
    408 ;
    409 ; SSE2FAST-LABEL: memset_256_nonconst_bytes:
    410 ; SSE2FAST:       # BB#0:
    411 ; SSE2FAST-NEXT:    movd %esi, %xmm0
    412 ; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
    413 ; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
    414 ; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
    415 ; SSE2FAST-NEXT:    movdqu %xmm0, 240(%rdi)
    416 ; SSE2FAST-NEXT:    movdqu %xmm0, 224(%rdi)
    417 ; SSE2FAST-NEXT:    movdqu %xmm0, 208(%rdi)
    418 ; SSE2FAST-NEXT:    movdqu %xmm0, 192(%rdi)
    419 ; SSE2FAST-NEXT:    movdqu %xmm0, 176(%rdi)
    420 ; SSE2FAST-NEXT:    movdqu %xmm0, 160(%rdi)
    421 ; SSE2FAST-NEXT:    movdqu %xmm0, 144(%rdi)
    422 ; SSE2FAST-NEXT:    movdqu %xmm0, 128(%rdi)
    423 ; SSE2FAST-NEXT:    movdqu %xmm0, 112(%rdi)
    424 ; SSE2FAST-NEXT:    movdqu %xmm0, 96(%rdi)
    425 ; SSE2FAST-NEXT:    movdqu %xmm0, 80(%rdi)
    426 ; SSE2FAST-NEXT:    movdqu %xmm0, 64(%rdi)
    427 ; SSE2FAST-NEXT:    movdqu %xmm0, 48(%rdi)
    428 ; SSE2FAST-NEXT:    movdqu %xmm0, 32(%rdi)
    429 ; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
    430 ; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
    431 ; SSE2FAST-NEXT:    retq
    432 ;
    433 ; AVX1-LABEL: memset_256_nonconst_bytes:
    434 ; AVX1:       # BB#0:
    435 ; AVX1-NEXT:    vmovd %esi, %xmm0
    436 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    437 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
    438 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    439 ; AVX1-NEXT:    vmovups %ymm0, 224(%rdi)
    440 ; AVX1-NEXT:    vmovups %ymm0, 192(%rdi)
    441 ; AVX1-NEXT:    vmovups %ymm0, 160(%rdi)
    442 ; AVX1-NEXT:    vmovups %ymm0, 128(%rdi)
    443 ; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
    444 ; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
    445 ; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
    446 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
    447 ; AVX1-NEXT:    vzeroupper
    448 ; AVX1-NEXT:    retq
    449 ;
    450 ; AVX2-LABEL: memset_256_nonconst_bytes:
    451 ; AVX2:       # BB#0:
    452 ; AVX2-NEXT:    vmovd %esi, %xmm0
    453 ; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
    454 ; AVX2-NEXT:    vmovdqu %ymm0, 224(%rdi)
    455 ; AVX2-NEXT:    vmovdqu %ymm0, 192(%rdi)
    456 ; AVX2-NEXT:    vmovdqu %ymm0, 160(%rdi)
    457 ; AVX2-NEXT:    vmovdqu %ymm0, 128(%rdi)
    458 ; AVX2-NEXT:    vmovdqu %ymm0, 96(%rdi)
    459 ; AVX2-NEXT:    vmovdqu %ymm0, 64(%rdi)
    460 ; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
    461 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
    462 ; AVX2-NEXT:    vzeroupper
    463 ; AVX2-NEXT:    retq
    464 ;
    465   tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 256, i32 1, i1 false)
    466   ret void
    467 }
    468 
    469 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1
    470 
    471