; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s --check-prefix PTX
; RUN: opt < %s -S -nvptx-lower-aggr-copies | FileCheck %s --check-prefix IR

; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to
; llvm.mem* intrinsics get lowered to loops.

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "nvptx64-unknown-unknown"

declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) #1
declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) #1
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) #1

define i8* @memcpy_caller(i8* %dst, i8* %src, i64 %n) #0 {
entry:
  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %n, i1 false)
  ret i8* %dst

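; Check that a memcpy with a runtime size is lowered to a byte-wise copy loop
; guarded by a test that the length is non-zero.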
; IR-LABEL: @memcpy_caller
; IR: entry:
; IR: [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
; IR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion

; IR: loop-memcpy-expansion:
; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
; IR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
; IR: store i8 [[Load]], i8* [[DstGep]]
; IR: [[IndexInc]] = add i64 %loop-index, 1
; IR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
; IR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion

; IR-LABEL: post-loop-memcpy-expansion:
; IR: ret i8* %dst

; PTX-LABEL: .visible .func (.param .b64 func_retval0) memcpy_caller
; PTX: LBB[[LABEL:[_0-9]+]]:
; PTX: ld.u8 %rs[[REG:[0-9]+]]
; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[REG]]
; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
; PTX: @%p[[PRED]] bra LBB[[LABEL]]

}

define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 {
entry:
  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %n, i1 true)
  ret i8* %dst

; IR-LABEL: @memcpy_volatile_caller
; IR: entry:
; IR: [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
; IR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion

; IR: loop-memcpy-expansion:
; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
; IR: [[Load:%[0-9]+]] = load volatile i8, i8* [[SrcGep]]
; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
; IR: store volatile i8 [[Load]], i8* [[DstGep]]
; IR: [[IndexInc]] = add i64 %loop-index, 1
; IR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
; IR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion

; IR-LABEL: post-loop-memcpy-expansion:
; IR: ret i8* %dst

; PTX-LABEL: .visible .func (.param .b64 func_retval0) memcpy_volatile_caller
; PTX: LBB[[LABEL:[_0-9]+]]:
; PTX: ld.volatile.u8 %rs[[REG:[0-9]+]]
; PTX: st.volatile.u8 [%rd{{[0-9]+}}], %rs[[REG]]
; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
; PTX: @%p[[PRED]] bra LBB[[LABEL]]
}

define i8* @memcpy_casting_caller(i32* %dst, i32* %src, i64 %n) #0 {
entry:
  %0 = bitcast i32* %dst to i8*
  %1 = bitcast i32* %src to i8*
  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 %n, i1 false)
  ret i8* %0

; Check that casts in calls to memcpy are handled properly
; IR-LABEL: @memcpy_casting_caller
; IR: [[DSTCAST:%[0-9]+]] = bitcast i32* %dst to i8*
; IR: [[SRCCAST:%[0-9]+]] = bitcast i32* %src to i8*
; IR: getelementptr inbounds i8, i8* [[SRCCAST]]
; IR: getelementptr inbounds i8, i8* [[DSTCAST]]
}

define i8* @memcpy_known_size(i8* %dst, i8* %src) {
entry:
  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 144, i1 false)
  ret i8* %dst

; Check that calls with a compile-time constant size are handled correctly
; IR-LABEL: @memcpy_known_size
; IR: entry:
; IR: br label %load-store-loop
; IR: load-store-loop:
; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ]
; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
; IR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
; IR: store i8 [[Load]], i8* [[DstGep]]
; IR: [[IndexInc]] = add i64 %loop-index, 1
; IR: [[Cond:%[0-9]+]] = icmp ult i64 [[IndexInc]], 144
; IR: br i1 [[Cond]], label %load-store-loop, label %memcpy-split
}

define i8* @memset_caller(i8* %dst, i32 %c, i64 %n) #0 {
entry:
  %0 = trunc i32 %c to i8
  tail call void @llvm.memset.p0i8.i64(i8* %dst, i8 %0, i64 %n, i1 false)
  ret i8* %dst

; IR-LABEL: @memset_caller
; IR: [[VAL:%[0-9]+]] = trunc i32 %c to i8
; IR: [[CMPREG:%[0-9]+]] = icmp eq i64 0, %n
; IR: br i1 [[CMPREG]], label %split, label %loadstoreloop
; IR: loadstoreloop:
; IR: [[STOREPTR:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64
; IR-NEXT: store i8 [[VAL]], i8* [[STOREPTR]]

; PTX-LABEL: .visible .func (.param .b64 func_retval0) memset_caller(
; PTX: ld.param.u32 %r[[C:[0-9]+]]
; PTX: cvt.u16.u32 %rs[[REG:[0-9]+]], %r[[C]];
; PTX: LBB[[LABEL:[_0-9]+]]:
; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[REG]]
; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
; PTX: @%p[[PRED]] bra LBB[[LABEL]]
}

define i8* @volatile_memset_caller(i8* %dst, i32 %c, i64 %n) #0 {
entry:
  %0 = trunc i32 %c to i8
  tail call void @llvm.memset.p0i8.i64(i8* %dst, i8 %0, i64 %n, i1 true)
  ret i8* %dst

; IR-LABEL: @volatile_memset_caller
; IR: [[VAL:%[0-9]+]] = trunc i32 %c to i8
; IR: loadstoreloop:
; IR: [[STOREPTR:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64
; IR-NEXT: store volatile i8 [[VAL]], i8* [[STOREPTR]]
}

define i8* @memmove_caller(i8* %dst, i8* %src, i64 %n) #0 {
entry:
  tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %n, i1 false)
  ret i8* %dst

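; Check that memmove is lowered to a backward byte-copy loop and a forward
; byte-copy loop, selected by comparing %src with %dst.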
; IR-LABEL: @memmove_caller
; IR: icmp ult i8* %src, %dst
; IR: [[PHIVAL:%[0-9a-zA-Z_]+]] = phi i64
; IR-NEXT: %index_ptr = sub i64 [[PHIVAL]], 1
; IR: [[FWDPHIVAL:%[0-9a-zA-Z_]+]] = phi i64
; IR: {{%[0-9a-zA-Z_]+}} = add i64 [[FWDPHIVAL]], 1

; PTX-LABEL: .visible .func (.param .b64 func_retval0) memmove_caller(
; PTX: ld.param.u64 %rd[[N:[0-9]+]]
; PTX-DAG: setp.eq.s64 %p[[NEQ0:[0-9]+]], %rd[[N]], 0
; PTX-DAG: setp.ge.u64 %p[[SRC_GT_THAN_DST:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
; PTX-NEXT: @%p[[SRC_GT_THAN_DST]] bra LBB[[FORWARD_BB:[0-9_]+]]
; -- this is the backwards copying BB
; PTX: @%p[[NEQ0]] bra LBB[[EXIT:[0-9_]+]]
; PTX: add.s64 %rd{{[0-9]}}, %rd{{[0-9]}}, -1
; PTX: ld.u8 %rs[[ELEMENT:[0-9]+]]
; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT]]
; -- this is the forwards copying BB
; PTX: LBB[[FORWARD_BB]]:
; PTX: @%p[[NEQ0]] bra LBB[[EXIT]]
; PTX: ld.u8 %rs[[ELEMENT2:[0-9]+]]
; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT2]]
; PTX: add.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, 1
; -- exit block
; PTX: LBB[[EXIT]]:
; PTX-NEXT: st.param.b64 [func_retval0
; PTX-NEXT: ret
}