; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI %s

; GCN-LABEL: {{^}}s_sext_i1_to_i32:
; GCN: v_cndmask_b32_e64
; GCN: s_endpgm
define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}test_s_sext_i32_to_i64:
; GCN: s_ashr_i32
; GCN: s_endpgm
define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
entry:
  %mul = mul i32 %a, %b
  %add = add i32 %mul, %c
  %sext = sext i32 %add to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}s_sext_i1_to_i64:
; GCN: v_cndmask_b32_e64 v[[LOREG:[0-9]+]], 0, -1, vcc
; GCN: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}}
; GCN: s_endpgm
define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}s_sext_i32_to_i64:
; GCN: s_ashr_i32
; GCN: s_endpgm
define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
  %sext = sext i32 %a to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}v_sext_i32_to_i64:
; GCN: v_ashr
; GCN: s_endpgm
define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %val = load i32, i32 addrspace(1)* %in, align 4
  %sext = sext i32 %val to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}s_sext_i16_to_i64:
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100000
define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
  %sext = sext i16 %a to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}s_sext_i1_to_i16:
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
; GCN-NEXT: buffer_store_short [[RESULT]]
define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

; The purpose of this test is to make sure the i16 = sign_extend i1 node
; makes it all the way through the legalizer/optimizer to make sure
; we select this correctly. In s_sext_i1_to_i16, the sign_extend node
; is optimized to a select very early.
; GCN-LABEL: {{^}}s_sext_i1_to_i16_with_and:
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
; GCN-NEXT: buffer_store_short [[RESULT]]
define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
  %cmp0 = icmp eq i32 %a, %b
  %cmp1 = icmp eq i32 %c, %d
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_sext_i1_to_i16_with_and:
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
; GCN-NEXT: buffer_store_short [[RESULT]]
define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %cmp0 = icmp eq i32 %a, %tid
  %cmp1 = icmp eq i32 %b, %c
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}s_sext_v4i8_to_v4i32:
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN-DAG: s_bfe_i32 [[EXT2:s[0-9]+]], [[VAL]], 0x80010
; GCN-DAG: s_ashr_i32 [[EXT3:s[0-9]+]], [[VAL]], 24
; SI-DAG: s_bfe_i32 [[EXT1:s[0-9]+]], [[VAL]], 0x80008
; GCN-DAG: s_sext_i32_i8 [[EXT0:s[0-9]+]], [[VAL]]

; FIXME: We end up with a v_bfe instruction, because the i16 srl
; gets selected to a v_lshrrev_b16 instruction, so the input to
; the bfe is a vector register. To fix this we need to be able to
; optimize:
; t29: i16 = truncate t10
; t55: i16 = srl t29, Constant:i32<8>
; t63: i32 = any_extend t55
; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8

; VI-DAG: v_bfe_i32 [[VEXT1:v[0-9]+]], v{{[0-9]+}}, 0, 8

; GCN-DAG: v_mov_b32_e32 [[VEXT0:v[0-9]+]], [[EXT0]]
; SI-DAG: v_mov_b32_e32 [[VEXT1:v[0-9]+]], [[EXT1]]
; GCN-DAG: v_mov_b32_e32 [[VEXT2:v[0-9]+]], [[EXT2]]
; GCN-DAG: v_mov_b32_e32 [[VEXT3:v[0-9]+]], [[EXT3]]

; GCN-DAG: buffer_store_dword [[VEXT0]]
; GCN-DAG: buffer_store_dword [[VEXT1]]
; GCN-DAG: buffer_store_dword [[VEXT2]]
; GCN-DAG: buffer_store_dword [[VEXT3]]

; GCN: s_endpgm
define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_sext_v4i8_to_v4i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; FIXME: need to optimize same sequence as above test to avoid
; this shift.
; VI-DAG: v_lshrrev_b16_e32 [[SH16:v[0-9]+]], 8, [[VAL]]
; GCN-DAG: v_ashrrev_i32_e32 [[EXT3:v[0-9]+]], 24, [[VAL]]
; VI-DAG: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8
; VI-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8
; VI-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[SH16]], 0, 8

; SI-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8
; SI-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[VAL]], 8, 8
; SI: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8

; GCN: buffer_store_dword [[EXT0]]
; GCN: buffer_store_dword [[EXT1]]
; GCN: buffer_store_dword [[EXT2]]
; GCN: buffer_store_dword [[EXT3]]
define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %a = load i32, i32 addrspace(1)* %in
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

; FIXME: s_bfe_i64, same on SI and VI
; GCN-LABEL: {{^}}s_sext_v4i16_to_v4i32:
; SI-DAG: s_ashr_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 48
; SI-DAG: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16

; VI: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; VI: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16

; GCN-DAG: s_sext_i32_i16
; GCN-DAG: s_sext_i32_i16
; GCN: s_endpgm
define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_sext_v4i16_to_v4i32:
; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; GCN: s_endpgm
define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
  %a = load i64, i64 addrspace(1)* %in
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #1 = { nounwind readnone }