; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s

; Exercises NVPTX lowering of the llvm.ctlz intrinsic for i16, i32 and i64
; operands. Every scenario comes as a pair of functions: the plain one passes
; is_zero_undef = false to the intrinsic, and its "_2" twin passes
; is_zero_undef = true. Both members of a pair are expected to produce the same
; instruction sequence.

target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

declare i16 @llvm.ctlz.i16(i16, i1) readnone
declare i32 @llvm.ctlz.i32(i32, i1) readnone
declare i64 @llvm.ctlz.i64(i64, i1) readnone

; There should be no difference between llvm.ctlz.i32(%a, true) and
; llvm.ctlz.i32(%a, false), as ptx's clz is fully defined for a zero input
; (clz.b32 of 0 yields 32, the operand width), which is exactly what the
; is_zero_undef = false form of the intrinsic requires.

; CHECK-LABEL: myctlz(
define i32 @myctlz(i32 %a) {
; CHECK: ld.param.
; CHECK-NEXT: clz.b32
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %val = call i32 @llvm.ctlz.i32(i32 %a, i1 false) readnone
  ret i32 %val
}
; CHECK-LABEL: myctlz_2(
define i32 @myctlz_2(i32 %a) {
; CHECK: ld.param.
; CHECK-NEXT: clz.b32
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
  ret i32 %val
}

; PTX's clz.b64 returns a 32-bit value, but LLVM's intrinsic returns a 64-bit
; value, so here we have to zero-extend it.
; CHECK-LABEL: myctlz64(
define i64 @myctlz64(i64 %a) {
; CHECK: ld.param.
; CHECK-NEXT: clz.b64
; CHECK-NEXT: cvt.u64.u32
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
  ret i64 %val
}
; CHECK-LABEL: myctlz64_2(
define i64 @myctlz64_2(i64 %a) {
; CHECK: ld.param.
; CHECK-NEXT: clz.b64
; CHECK-NEXT: cvt.u64.u32
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
  ret i64 %val
}

; Here we truncate the 64-bit value of LLVM's ctlz intrinsic to 32 bits, the
; natural return width of ptx's clz.b64 instruction.  No conversions should be
; necessary in the PTX.
; CHECK-LABEL: myctlz64_as_32(
define i32 @myctlz64_as_32(i64 %a) {
; CHECK: ld.param.
; CHECK-NEXT: clz.b64
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
  %trunc = trunc i64 %val to i32
  ret i32 %trunc
}
; Same as above, but with is_zero_undef = true so that this variant is not a
; mere duplicate of myctlz64_as_32.
; CHECK-LABEL: myctlz64_as_32_2(
define i32 @myctlz64_as_32_2(i64 %a) {
; CHECK: ld.param.
; CHECK-NEXT: clz.b64
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
  %trunc = trunc i64 %val to i32
  ret i32 %trunc
}

; ctlz.i16 is implemented by extending the input to i32, computing the result,
; and then truncating the result back down to i16.  But the NVPTX ABI
; zero-extends i16 return values to i32, so the final truncation doesn't appear
; in this function.
; CHECK-LABEL: myctlz_ret16(
define i16 @myctlz_ret16(i16 %a) {
; CHECK: ld.param.
; CHECK-NEXT: cvt.u32.u16
; CHECK-NEXT: clz.b32
; CHECK-NEXT: sub.
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
  ret i16 %val
}
; CHECK-LABEL: myctlz_ret16_2(
define i16 @myctlz_ret16_2(i16 %a) {
; CHECK: ld.param.
; CHECK-NEXT: cvt.u32.u16
; CHECK-NEXT: clz.b32
; CHECK-NEXT: sub.
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
  ret i16 %val
}

; Here we store the result of ctlz.i16 into an i16 pointer, so the trunc should
; remain.
; CHECK-LABEL: myctlz_store16(
define void @myctlz_store16(i16 %a, i16* %b) {
; CHECK: ld.param.
; CHECK-NEXT: cvt.u32.u16
; CHECK-NEXT: clz.b32
; CHECK-DAG: cvt.u16.u32
; CHECK-DAG: sub.
; CHECK: st.{{[a-z]}}16
; CHECK: ret;
  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
  store i16 %val, i16* %b
  ret void
}
; Same as above, but with is_zero_undef = true so that this variant is not a
; mere duplicate of myctlz_store16.
; CHECK-LABEL: myctlz_store16_2(
define void @myctlz_store16_2(i16 %a, i16* %b) {
; CHECK: ld.param.
; CHECK-NEXT: cvt.u32.u16
; CHECK-NEXT: clz.b32
; CHECK-DAG: cvt.u16.u32
; CHECK-DAG: sub.
; CHECK: st.{{[a-z]}}16
; CHECK: ret;
  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
  store i16 %val, i16* %b
  ret void
}