; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64

define void @big_nonzero_16_bytes(i32* nocapture %a) {
; X32-LABEL: big_nonzero_16_bytes:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [1,2,3,4]
; X32-NEXT:    vmovups %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: big_nonzero_16_bytes:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [1,2,3,4]
; X64-NEXT:    vmovups %xmm0, (%rdi)
; X64-NEXT:    retq
  %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 1
  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 2
  %arrayidx3 = getelementptr inbounds i32, i32* %a, i64 3

  store i32 1, i32* %a, align 4
  store i32 2, i32* %arrayidx1, align 4
  store i32 3, i32* %arrayidx2, align 4
  store i32 4, i32* %arrayidx3, align 4
  ret void
}

; TODO: We assumed that two 64-bit stores were better than 1 vector load and 1 vector store.
; But if the 64-bit constants can't be represented as sign-extended 32-bit constants, then
; it takes extra instructions to do this in scalar.

define void @big_nonzero_16_bytes_big64bit_constants(i64* nocapture %a) {
; X32-LABEL: big_nonzero_16_bytes_big64bit_constants:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [1,1,1,3]
; X32-NEXT:    vmovups %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: big_nonzero_16_bytes_big64bit_constants:
; X64:       # %bb.0:
; X64-NEXT:    movabsq $4294967297, %rax # imm = 0x100000001
; X64-NEXT:    movq %rax, (%rdi)
; X64-NEXT:    movabsq $12884901889, %rax # imm = 0x300000001
; X64-NEXT:    movq %rax, 8(%rdi)
; X64-NEXT:    retq
  %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 1

  store i64 4294967297, i64* %a
  store i64 12884901889, i64* %arrayidx1
  ret void
}

; Splats may be an opportunity to use a broadcast op.

define void @big_nonzero_32_bytes_splat(i32* nocapture %a) {
; X32-LABEL: big_nonzero_32_bytes_splat:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
; X32-NEXT:    vmovups %ymm0, (%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: big_nonzero_32_bytes_splat:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
; X64-NEXT:    vmovups %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 1
  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 2
  %arrayidx3 = getelementptr inbounds i32, i32* %a, i64 3
  %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 4
  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 5
  %arrayidx6 = getelementptr inbounds i32, i32* %a, i64 6
  %arrayidx7 = getelementptr inbounds i32, i32* %a, i64 7

  store i32 42, i32* %a, align 4
  store i32 42, i32* %arrayidx1, align 4
  store i32 42, i32* %arrayidx2, align 4
  store i32 42, i32* %arrayidx3, align 4
  store i32 42, i32* %arrayidx4, align 4
  store i32 42, i32* %arrayidx5, align 4
  store i32 42, i32* %arrayidx6, align 4
  store i32 42, i32* %arrayidx7, align 4
  ret void
}

; Verify that we choose the best-sized store(s) for each chunk.
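; Per the checks below: on x86-64 the 63 bytes become one 32-byte vector store, three 8-byte
; immediate stores, and then 4-, 2-, and 1-byte stores; 32-bit x86 instead uses a 32-byte and a
; 16-byte vector store followed by 4-, 2-, and 1-byte scalar stores.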

define void @big_nonzero_63_bytes(i8* nocapture %a) {
; X32-LABEL: big_nonzero_63_bytes:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [1,0,2,0,3,0,4,0]
; X32-NEXT:    vmovups %ymm0, (%eax)
; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [5,0,6,0]
; X32-NEXT:    vmovups %xmm0, 32(%eax)
; X32-NEXT:    movl $0, 52(%eax)
; X32-NEXT:    movl $7, 48(%eax)
; X32-NEXT:    movl $8, 56(%eax)
; X32-NEXT:    movw $9, 60(%eax)
; X32-NEXT:    movb $10, 62(%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: big_nonzero_63_bytes:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [1,2,3,4]
; X64-NEXT:    vmovups %ymm0, (%rdi)
; X64-NEXT:    movq $5, 32(%rdi)
; X64-NEXT:    movq $6, 40(%rdi)
; X64-NEXT:    movq $7, 48(%rdi)
; X64-NEXT:    movl $8, 56(%rdi)
; X64-NEXT:    movw $9, 60(%rdi)
; X64-NEXT:    movb $10, 62(%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %a8 = bitcast i8* %a to i64*
  %arrayidx8 = getelementptr inbounds i64, i64* %a8, i64 1
  %arrayidx16 = getelementptr inbounds i64, i64* %a8, i64 2
  %arrayidx24 = getelementptr inbounds i64, i64* %a8, i64 3
  %arrayidx32 = getelementptr inbounds i64, i64* %a8, i64 4
  %arrayidx40 = getelementptr inbounds i64, i64* %a8, i64 5
  %arrayidx48 = getelementptr inbounds i64, i64* %a8, i64 6
  %a4 = bitcast i8* %a to i32*
  %arrayidx56 = getelementptr inbounds i32, i32* %a4, i64 14
  %a2 = bitcast i8* %a to i16*
  %arrayidx60 = getelementptr inbounds i16, i16* %a2, i64 30
  %arrayidx62 = getelementptr inbounds i8, i8* %a, i64 62

  store i64 1, i64* %a8
  store i64 2, i64* %arrayidx8
  store i64 3, i64* %arrayidx16
  store i64 4, i64* %arrayidx24
  store i64 5, i64* %arrayidx32
  store i64 6, i64* %arrayidx40
  store i64 7, i64* %arrayidx48
  store i32 8, i32* %arrayidx56
  store i16 9, i16* %arrayidx60
  store i8 10, i8* %arrayidx62
  ret void
}