Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse | FileCheck %s --check-prefix=SSE
      3 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2 | FileCheck %s --check-prefix=SSE
      4 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2,-slow-unaligned-mem-16 | FileCheck %s --check-prefix=SSE2FAST
      5 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
      6 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
      7 
      8 ; https://llvm.org/bugs/show_bug.cgi?id=27100
      9 
     10 define void @memset_16_nonzero_bytes(i8* %x) {
; Constant 16-byte memset of value 42. __memset_chk with object size -1
; (unknown) folds to a plain memset and is expanded inline: two 8-byte stores
; of the byte-splatted pattern (0x2A2A2A2A2A2A2A2A) when unaligned 16-byte
; stores are slow (plain SSE), or a single unaligned vector store otherwise
; (SSE2 with fast unaligned memory, AVX).
     11 ; SSE-LABEL: memset_16_nonzero_bytes:
     12 ; SSE:       # %bb.0:
     13 ; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
     14 ; SSE-NEXT:    movq %rax, 8(%rdi)
     15 ; SSE-NEXT:    movq %rax, (%rdi)
     16 ; SSE-NEXT:    retq
     17 ;
     18 ; SSE2FAST-LABEL: memset_16_nonzero_bytes:
     19 ; SSE2FAST:       # %bb.0:
     20 ; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
     21 ; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
     22 ; SSE2FAST-NEXT:    retq
     23 ;
     24 ; AVX-LABEL: memset_16_nonzero_bytes:
     25 ; AVX:       # %bb.0:
     26 ; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
     27 ; AVX-NEXT:    vmovups %xmm0, (%rdi)
     28 ; AVX-NEXT:    retq
     29   %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 16, i64 -1)
     30   ret void
     31 }
     32 
     33 define void @memset_32_nonzero_bytes(i8* %x) {
; Constant 32-byte memset of value 42. Inline expansion: four 8-byte scalar
; stores (slow-unaligned SSE), two unaligned 16-byte stores (SSE2FAST), or a
; single unaligned 32-byte ymm store followed by vzeroupper (AVX).
     34 ; SSE-LABEL: memset_32_nonzero_bytes:
     35 ; SSE:       # %bb.0:
     36 ; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
     37 ; SSE-NEXT:    movq %rax, 24(%rdi)
     38 ; SSE-NEXT:    movq %rax, 16(%rdi)
     39 ; SSE-NEXT:    movq %rax, 8(%rdi)
     40 ; SSE-NEXT:    movq %rax, (%rdi)
     41 ; SSE-NEXT:    retq
     42 ;
     43 ; SSE2FAST-LABEL: memset_32_nonzero_bytes:
     44 ; SSE2FAST:       # %bb.0:
     45 ; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
     46 ; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
     47 ; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
     48 ; SSE2FAST-NEXT:    retq
     49 ;
     50 ; AVX-LABEL: memset_32_nonzero_bytes:
     51 ; AVX:       # %bb.0:
     52 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
     53 ; AVX-NEXT:    vmovups %ymm0, (%rdi)
     54 ; AVX-NEXT:    vzeroupper
     55 ; AVX-NEXT:    retq
     56   %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 32, i64 -1)
     57   ret void
     58 }
     59 
     60 define void @memset_64_nonzero_bytes(i8* %x) {
; Constant 64-byte memset of value 42. Inline expansion scales with the
; widest fast store: eight movq (SSE), four movups xmm (SSE2FAST), or two
; vmovups ymm plus vzeroupper (AVX).
     61 ; SSE-LABEL: memset_64_nonzero_bytes:
     62 ; SSE:       # %bb.0:
     63 ; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
     64 ; SSE-NEXT:    movq %rax, 56(%rdi)
     65 ; SSE-NEXT:    movq %rax, 48(%rdi)
     66 ; SSE-NEXT:    movq %rax, 40(%rdi)
     67 ; SSE-NEXT:    movq %rax, 32(%rdi)
     68 ; SSE-NEXT:    movq %rax, 24(%rdi)
     69 ; SSE-NEXT:    movq %rax, 16(%rdi)
     70 ; SSE-NEXT:    movq %rax, 8(%rdi)
     71 ; SSE-NEXT:    movq %rax, (%rdi)
     72 ; SSE-NEXT:    retq
     73 ;
     74 ; SSE2FAST-LABEL: memset_64_nonzero_bytes:
     75 ; SSE2FAST:       # %bb.0:
     76 ; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
     77 ; SSE2FAST-NEXT:    movups %xmm0, 48(%rdi)
     78 ; SSE2FAST-NEXT:    movups %xmm0, 32(%rdi)
     79 ; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
     80 ; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
     81 ; SSE2FAST-NEXT:    retq
     82 ;
     83 ; AVX-LABEL: memset_64_nonzero_bytes:
     84 ; AVX:       # %bb.0:
     85 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
     86 ; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
     87 ; AVX-NEXT:    vmovups %ymm0, (%rdi)
     88 ; AVX-NEXT:    vzeroupper
     89 ; AVX-NEXT:    retq
     90   %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 64, i64 -1)
     91   ret void
     92 }
     93 
     94 define void @memset_128_nonzero_bytes(i8* %x) {
; Constant 128-byte memset of value 42. Still fully inlined on every
; subtarget: sixteen movq (SSE), eight movups xmm (SSE2FAST), or four
; vmovups ymm plus vzeroupper (AVX).
     95 ; SSE-LABEL: memset_128_nonzero_bytes:
     96 ; SSE:       # %bb.0:
     97 ; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
     98 ; SSE-NEXT:    movq %rax, 120(%rdi)
     99 ; SSE-NEXT:    movq %rax, 112(%rdi)
    100 ; SSE-NEXT:    movq %rax, 104(%rdi)
    101 ; SSE-NEXT:    movq %rax, 96(%rdi)
    102 ; SSE-NEXT:    movq %rax, 88(%rdi)
    103 ; SSE-NEXT:    movq %rax, 80(%rdi)
    104 ; SSE-NEXT:    movq %rax, 72(%rdi)
    105 ; SSE-NEXT:    movq %rax, 64(%rdi)
    106 ; SSE-NEXT:    movq %rax, 56(%rdi)
    107 ; SSE-NEXT:    movq %rax, 48(%rdi)
    108 ; SSE-NEXT:    movq %rax, 40(%rdi)
    109 ; SSE-NEXT:    movq %rax, 32(%rdi)
    110 ; SSE-NEXT:    movq %rax, 24(%rdi)
    111 ; SSE-NEXT:    movq %rax, 16(%rdi)
    112 ; SSE-NEXT:    movq %rax, 8(%rdi)
    113 ; SSE-NEXT:    movq %rax, (%rdi)
    114 ; SSE-NEXT:    retq
    115 ;
    116 ; SSE2FAST-LABEL: memset_128_nonzero_bytes:
    117 ; SSE2FAST:       # %bb.0:
    118 ; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
    119 ; SSE2FAST-NEXT:    movups %xmm0, 112(%rdi)
    120 ; SSE2FAST-NEXT:    movups %xmm0, 96(%rdi)
    121 ; SSE2FAST-NEXT:    movups %xmm0, 80(%rdi)
    122 ; SSE2FAST-NEXT:    movups %xmm0, 64(%rdi)
    123 ; SSE2FAST-NEXT:    movups %xmm0, 48(%rdi)
    124 ; SSE2FAST-NEXT:    movups %xmm0, 32(%rdi)
    125 ; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
    126 ; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
    127 ; SSE2FAST-NEXT:    retq
    128 ;
    129 ; AVX-LABEL: memset_128_nonzero_bytes:
    130 ; AVX:       # %bb.0:
    131 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
    132 ; AVX-NEXT:    vmovups %ymm0, 96(%rdi)
    133 ; AVX-NEXT:    vmovups %ymm0, 64(%rdi)
    134 ; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
    135 ; AVX-NEXT:    vmovups %ymm0, (%rdi)
    136 ; AVX-NEXT:    vzeroupper
    137 ; AVX-NEXT:    retq
    138   %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 128, i64 -1)
    139   ret void
    140 }
    141 
    142 define void @memset_256_nonzero_bytes(i8* %x) {
; Constant 256-byte memset of value 42. This size exceeds the scalar-store
; inlining threshold on plain SSE, which instead emits a library call to
; memset (with a push/pop to keep the stack 16-byte aligned at the call).
; Vector subtargets still inline: sixteen movups xmm (SSE2FAST) or eight
; vmovups ymm plus vzeroupper (AVX).
    143 ; SSE-LABEL: memset_256_nonzero_bytes:
    144 ; SSE:       # %bb.0:
    145 ; SSE-NEXT:    pushq %rax
    146 ; SSE-NEXT:    .cfi_def_cfa_offset 16
    147 ; SSE-NEXT:    movl $42, %esi
    148 ; SSE-NEXT:    movl $256, %edx # imm = 0x100
    149 ; SSE-NEXT:    callq memset
    150 ; SSE-NEXT:    popq %rax
    151 ; SSE-NEXT:    .cfi_def_cfa_offset 8
    152 ; SSE-NEXT:    retq
    153 ;
    154 ; SSE2FAST-LABEL: memset_256_nonzero_bytes:
    155 ; SSE2FAST:       # %bb.0:
    156 ; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
    157 ; SSE2FAST-NEXT:    movups %xmm0, 240(%rdi)
    158 ; SSE2FAST-NEXT:    movups %xmm0, 224(%rdi)
    159 ; SSE2FAST-NEXT:    movups %xmm0, 208(%rdi)
    160 ; SSE2FAST-NEXT:    movups %xmm0, 192(%rdi)
    161 ; SSE2FAST-NEXT:    movups %xmm0, 176(%rdi)
    162 ; SSE2FAST-NEXT:    movups %xmm0, 160(%rdi)
    163 ; SSE2FAST-NEXT:    movups %xmm0, 144(%rdi)
    164 ; SSE2FAST-NEXT:    movups %xmm0, 128(%rdi)
    165 ; SSE2FAST-NEXT:    movups %xmm0, 112(%rdi)
    166 ; SSE2FAST-NEXT:    movups %xmm0, 96(%rdi)
    167 ; SSE2FAST-NEXT:    movups %xmm0, 80(%rdi)
    168 ; SSE2FAST-NEXT:    movups %xmm0, 64(%rdi)
    169 ; SSE2FAST-NEXT:    movups %xmm0, 48(%rdi)
    170 ; SSE2FAST-NEXT:    movups %xmm0, 32(%rdi)
    171 ; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
    172 ; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
    173 ; SSE2FAST-NEXT:    retq
    174 ;
    175 ; AVX-LABEL: memset_256_nonzero_bytes:
    176 ; AVX:       # %bb.0:
    177 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
    178 ; AVX-NEXT:    vmovups %ymm0, 224(%rdi)
    179 ; AVX-NEXT:    vmovups %ymm0, 192(%rdi)
    180 ; AVX-NEXT:    vmovups %ymm0, 160(%rdi)
    181 ; AVX-NEXT:    vmovups %ymm0, 128(%rdi)
    182 ; AVX-NEXT:    vmovups %ymm0, 96(%rdi)
    183 ; AVX-NEXT:    vmovups %ymm0, 64(%rdi)
    184 ; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
    185 ; AVX-NEXT:    vmovups %ymm0, (%rdi)
    186 ; AVX-NEXT:    vzeroupper
    187 ; AVX-NEXT:    retq
    188   %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 256, i64 -1)
    189   ret void
    190 }
    191 
    192 declare i8* @__memset_chk(i8*, i32, i64, i64)
    193 
    194 ; Repeat with a non-constant value for the stores.
    195 
    196 define void @memset_16_nonconst_bytes(i8* %x, i8 %c) {
; 16-byte memset with a runtime (non-constant) byte value %c. The value is
; splatted per subtarget: scalar path multiplies the zero-extended byte by
; 0x0101010101010101 (SSE); SSE2FAST broadcasts in xmm via
; punpcklbw/pshuflw/pshufd; AVX1 uses vpshufb with a zero shuffle mask;
; AVX2 uses vpbroadcastb.
    197 ; SSE-LABEL: memset_16_nonconst_bytes:
    198 ; SSE:       # %bb.0:
    199 ; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
    200 ; SSE-NEXT:    movzbl %sil, %eax
    201 ; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
    202 ; SSE-NEXT:    imulq %rax, %rcx
    203 ; SSE-NEXT:    movq %rcx, 8(%rdi)
    204 ; SSE-NEXT:    movq %rcx, (%rdi)
    205 ; SSE-NEXT:    retq
    206 ;
    207 ; SSE2FAST-LABEL: memset_16_nonconst_bytes:
    208 ; SSE2FAST:       # %bb.0:
    209 ; SSE2FAST-NEXT:    movd %esi, %xmm0
    210 ; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
    211 ; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
    212 ; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
    213 ; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
    214 ; SSE2FAST-NEXT:    retq
    215 ;
    216 ; AVX1-LABEL: memset_16_nonconst_bytes:
    217 ; AVX1:       # %bb.0:
    218 ; AVX1-NEXT:    vmovd %esi, %xmm0
    219 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    220 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
    221 ; AVX1-NEXT:    vmovdqu %xmm0, (%rdi)
    222 ; AVX1-NEXT:    retq
    223 ;
    224 ; AVX2-LABEL: memset_16_nonconst_bytes:
    225 ; AVX2:       # %bb.0:
    226 ; AVX2-NEXT:    vmovd %esi, %xmm0
    227 ; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
    228 ; AVX2-NEXT:    vmovdqu %xmm0, (%rdi)
    229 ; AVX2-NEXT:    retq
    230   tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 16, i1 false)
    231   ret void
    232 }
    233 
    234 define void @memset_32_nonconst_bytes(i8* %x, i8 %c) {
; 32-byte memset with a runtime byte value. Same splat strategies as the
; 16-byte case, with twice the stores; AVX1 additionally widens the xmm
; splat to ymm with vinsertf128, while AVX2 broadcasts directly to ymm.
    235 ; SSE-LABEL: memset_32_nonconst_bytes:
    236 ; SSE:       # %bb.0:
    237 ; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
    238 ; SSE-NEXT:    movzbl %sil, %eax
    239 ; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
    240 ; SSE-NEXT:    imulq %rax, %rcx
    241 ; SSE-NEXT:    movq %rcx, 24(%rdi)
    242 ; SSE-NEXT:    movq %rcx, 16(%rdi)
    243 ; SSE-NEXT:    movq %rcx, 8(%rdi)
    244 ; SSE-NEXT:    movq %rcx, (%rdi)
    245 ; SSE-NEXT:    retq
    246 ;
    247 ; SSE2FAST-LABEL: memset_32_nonconst_bytes:
    248 ; SSE2FAST:       # %bb.0:
    249 ; SSE2FAST-NEXT:    movd %esi, %xmm0
    250 ; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
    251 ; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
    252 ; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
    253 ; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
    254 ; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
    255 ; SSE2FAST-NEXT:    retq
    256 ;
    257 ; AVX1-LABEL: memset_32_nonconst_bytes:
    258 ; AVX1:       # %bb.0:
    259 ; AVX1-NEXT:    vmovd %esi, %xmm0
    260 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    261 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
    262 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    263 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
    264 ; AVX1-NEXT:    vzeroupper
    265 ; AVX1-NEXT:    retq
    266 ;
    267 ; AVX2-LABEL: memset_32_nonconst_bytes:
    268 ; AVX2:       # %bb.0:
    269 ; AVX2-NEXT:    vmovd %esi, %xmm0
    270 ; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
    271 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
    272 ; AVX2-NEXT:    vzeroupper
    273 ; AVX2-NEXT:    retq
    274   tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 32, i1 false)
    275   ret void
    276 }
    277 
    278 define void @memset_64_nonconst_bytes(i8* %x, i8 %c) {
; 64-byte memset with a runtime byte value: eight movq (SSE), four movdqu
; xmm (SSE2FAST), or two 32-byte ymm stores (AVX1 via vinsertf128 widening,
; AVX2 via vpbroadcastb), each followed by vzeroupper.
    279 ; SSE-LABEL: memset_64_nonconst_bytes:
    280 ; SSE:       # %bb.0:
    281 ; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
    282 ; SSE-NEXT:    movzbl %sil, %eax
    283 ; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
    284 ; SSE-NEXT:    imulq %rax, %rcx
    285 ; SSE-NEXT:    movq %rcx, 56(%rdi)
    286 ; SSE-NEXT:    movq %rcx, 48(%rdi)
    287 ; SSE-NEXT:    movq %rcx, 40(%rdi)
    288 ; SSE-NEXT:    movq %rcx, 32(%rdi)
    289 ; SSE-NEXT:    movq %rcx, 24(%rdi)
    290 ; SSE-NEXT:    movq %rcx, 16(%rdi)
    291 ; SSE-NEXT:    movq %rcx, 8(%rdi)
    292 ; SSE-NEXT:    movq %rcx, (%rdi)
    293 ; SSE-NEXT:    retq
    294 ;
    295 ; SSE2FAST-LABEL: memset_64_nonconst_bytes:
    296 ; SSE2FAST:       # %bb.0:
    297 ; SSE2FAST-NEXT:    movd %esi, %xmm0
    298 ; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
    299 ; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
    300 ; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
    301 ; SSE2FAST-NEXT:    movdqu %xmm0, 48(%rdi)
    302 ; SSE2FAST-NEXT:    movdqu %xmm0, 32(%rdi)
    303 ; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
    304 ; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
    305 ; SSE2FAST-NEXT:    retq
    306 ;
    307 ; AVX1-LABEL: memset_64_nonconst_bytes:
    308 ; AVX1:       # %bb.0:
    309 ; AVX1-NEXT:    vmovd %esi, %xmm0
    310 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    311 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
    312 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    313 ; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
    314 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
    315 ; AVX1-NEXT:    vzeroupper
    316 ; AVX1-NEXT:    retq
    317 ;
    318 ; AVX2-LABEL: memset_64_nonconst_bytes:
    319 ; AVX2:       # %bb.0:
    320 ; AVX2-NEXT:    vmovd %esi, %xmm0
    321 ; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
    322 ; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
    323 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
    324 ; AVX2-NEXT:    vzeroupper
    325 ; AVX2-NEXT:    retq
    326   tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 64, i1 false)
    327   ret void
    328 }
    329 
    330 define void @memset_128_nonconst_bytes(i8* %x, i8 %c) {
; 128-byte memset with a runtime byte value. Still fully inlined: sixteen
; movq (SSE), eight movdqu xmm (SSE2FAST), or four ymm stores (AVX1/AVX2).
    331 ; SSE-LABEL: memset_128_nonconst_bytes:
    332 ; SSE:       # %bb.0:
    333 ; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
    334 ; SSE-NEXT:    movzbl %sil, %eax
    335 ; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
    336 ; SSE-NEXT:    imulq %rax, %rcx
    337 ; SSE-NEXT:    movq %rcx, 120(%rdi)
    338 ; SSE-NEXT:    movq %rcx, 112(%rdi)
    339 ; SSE-NEXT:    movq %rcx, 104(%rdi)
    340 ; SSE-NEXT:    movq %rcx, 96(%rdi)
    341 ; SSE-NEXT:    movq %rcx, 88(%rdi)
    342 ; SSE-NEXT:    movq %rcx, 80(%rdi)
    343 ; SSE-NEXT:    movq %rcx, 72(%rdi)
    344 ; SSE-NEXT:    movq %rcx, 64(%rdi)
    345 ; SSE-NEXT:    movq %rcx, 56(%rdi)
    346 ; SSE-NEXT:    movq %rcx, 48(%rdi)
    347 ; SSE-NEXT:    movq %rcx, 40(%rdi)
    348 ; SSE-NEXT:    movq %rcx, 32(%rdi)
    349 ; SSE-NEXT:    movq %rcx, 24(%rdi)
    350 ; SSE-NEXT:    movq %rcx, 16(%rdi)
    351 ; SSE-NEXT:    movq %rcx, 8(%rdi)
    352 ; SSE-NEXT:    movq %rcx, (%rdi)
    353 ; SSE-NEXT:    retq
    354 ;
    355 ; SSE2FAST-LABEL: memset_128_nonconst_bytes:
    356 ; SSE2FAST:       # %bb.0:
    357 ; SSE2FAST-NEXT:    movd %esi, %xmm0
    358 ; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
    359 ; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
    360 ; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
    361 ; SSE2FAST-NEXT:    movdqu %xmm0, 112(%rdi)
    362 ; SSE2FAST-NEXT:    movdqu %xmm0, 96(%rdi)
    363 ; SSE2FAST-NEXT:    movdqu %xmm0, 80(%rdi)
    364 ; SSE2FAST-NEXT:    movdqu %xmm0, 64(%rdi)
    365 ; SSE2FAST-NEXT:    movdqu %xmm0, 48(%rdi)
    366 ; SSE2FAST-NEXT:    movdqu %xmm0, 32(%rdi)
    367 ; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
    368 ; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
    369 ; SSE2FAST-NEXT:    retq
    370 ;
    371 ; AVX1-LABEL: memset_128_nonconst_bytes:
    372 ; AVX1:       # %bb.0:
    373 ; AVX1-NEXT:    vmovd %esi, %xmm0
    374 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    375 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
    376 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    377 ; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
    378 ; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
    379 ; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
    380 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
    381 ; AVX1-NEXT:    vzeroupper
    382 ; AVX1-NEXT:    retq
    383 ;
    384 ; AVX2-LABEL: memset_128_nonconst_bytes:
    385 ; AVX2:       # %bb.0:
    386 ; AVX2-NEXT:    vmovd %esi, %xmm0
    387 ; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
    388 ; AVX2-NEXT:    vmovdqu %ymm0, 96(%rdi)
    389 ; AVX2-NEXT:    vmovdqu %ymm0, 64(%rdi)
    390 ; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
    391 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
    392 ; AVX2-NEXT:    vzeroupper
    393 ; AVX2-NEXT:    retq
    394   tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 128, i1 false)
    395   ret void
    396 }
    397 
    398 define void @memset_256_nonconst_bytes(i8* %x, i8 %c) {
; 256-byte memset with a runtime byte value. On plain SSE this exceeds the
; inlining threshold and becomes a tail-call jump to memset (size loaded in
; edx; ptr and value already in place per the SysV ABI). Vector subtargets
; still inline with sixteen xmm or eight ymm unaligned stores.
    399 ; SSE-LABEL: memset_256_nonconst_bytes:
    400 ; SSE:       # %bb.0:
    401 ; SSE-NEXT:    movl $256, %edx # imm = 0x100
    402 ; SSE-NEXT:    jmp memset # TAILCALL
    403 ;
    404 ; SSE2FAST-LABEL: memset_256_nonconst_bytes:
    405 ; SSE2FAST:       # %bb.0:
    406 ; SSE2FAST-NEXT:    movd %esi, %xmm0
    407 ; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
    408 ; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
    409 ; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
    410 ; SSE2FAST-NEXT:    movdqu %xmm0, 240(%rdi)
    411 ; SSE2FAST-NEXT:    movdqu %xmm0, 224(%rdi)
    412 ; SSE2FAST-NEXT:    movdqu %xmm0, 208(%rdi)
    413 ; SSE2FAST-NEXT:    movdqu %xmm0, 192(%rdi)
    414 ; SSE2FAST-NEXT:    movdqu %xmm0, 176(%rdi)
    415 ; SSE2FAST-NEXT:    movdqu %xmm0, 160(%rdi)
    416 ; SSE2FAST-NEXT:    movdqu %xmm0, 144(%rdi)
    417 ; SSE2FAST-NEXT:    movdqu %xmm0, 128(%rdi)
    418 ; SSE2FAST-NEXT:    movdqu %xmm0, 112(%rdi)
    419 ; SSE2FAST-NEXT:    movdqu %xmm0, 96(%rdi)
    420 ; SSE2FAST-NEXT:    movdqu %xmm0, 80(%rdi)
    421 ; SSE2FAST-NEXT:    movdqu %xmm0, 64(%rdi)
    422 ; SSE2FAST-NEXT:    movdqu %xmm0, 48(%rdi)
    423 ; SSE2FAST-NEXT:    movdqu %xmm0, 32(%rdi)
    424 ; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
    425 ; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
    426 ; SSE2FAST-NEXT:    retq
    427 ;
    428 ; AVX1-LABEL: memset_256_nonconst_bytes:
    429 ; AVX1:       # %bb.0:
    430 ; AVX1-NEXT:    vmovd %esi, %xmm0
    431 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    432 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
    433 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    434 ; AVX1-NEXT:    vmovups %ymm0, 224(%rdi)
    435 ; AVX1-NEXT:    vmovups %ymm0, 192(%rdi)
    436 ; AVX1-NEXT:    vmovups %ymm0, 160(%rdi)
    437 ; AVX1-NEXT:    vmovups %ymm0, 128(%rdi)
    438 ; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
    439 ; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
    440 ; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
    441 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
    442 ; AVX1-NEXT:    vzeroupper
    443 ; AVX1-NEXT:    retq
    444 ;
    445 ; AVX2-LABEL: memset_256_nonconst_bytes:
    446 ; AVX2:       # %bb.0:
    447 ; AVX2-NEXT:    vmovd %esi, %xmm0
    448 ; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
    449 ; AVX2-NEXT:    vmovdqu %ymm0, 224(%rdi)
    450 ; AVX2-NEXT:    vmovdqu %ymm0, 192(%rdi)
    451 ; AVX2-NEXT:    vmovdqu %ymm0, 160(%rdi)
    452 ; AVX2-NEXT:    vmovdqu %ymm0, 128(%rdi)
    453 ; AVX2-NEXT:    vmovdqu %ymm0, 96(%rdi)
    454 ; AVX2-NEXT:    vmovdqu %ymm0, 64(%rdi)
    455 ; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
    456 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
    457 ; AVX2-NEXT:    vzeroupper
    458 ; AVX2-NEXT:    retq
    459   tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 256, i1 false)
    460   ret void
    461 }
    462 
    463 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) #1
    464 
    465