; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX89,GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GFX89,GCN %s

; Tests instruction selection for `sub <2 x i16>` on AMDGPU: GFX9 should use
; the packed v_pk_sub_i16, while VI (tonga) lacks packed math and must split
; into per-half 16-bit ops (often SDWA) or scalarize uniform values.

; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_v2i16:
; GFX89: {{flat|global}}_load_dword
; GFX89: {{flat|global}}_load_dword

; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}s_test_sub_v2i16:
; GFX9: s_load_dword [[VAL0:s[0-9]+]]
; GFX9: s_load_dword [[VAL1:s[0-9]+]]
; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]]
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[VAL0]], [[VVAL1]]

; VI: s_sub_i32
; VI: s_sub_i32
define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 {
  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
  %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; x - x folds to zero, so only a zero store should remain.
; GCN-LABEL: {{^}}s_test_sub_self_v2i16:
; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]]
; GCN: buffer_store_dword [[ZERO]]
define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0) #1 {
  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
  %add = sub <2 x i16> %a, %a
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; FIXME: VI should not scalarize arg access.
; GCN-LABEL: {{^}}s_test_sub_v2i16_kernarg:
; GCN: s_load_dword s
; GCN: s_load_dword s

; GFX9: v_pk_sub_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}

; VI: s_sub_i32
; VI: s_sub_i32
; VI: s_lshl_b32
; VI: s_and_b32
define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; Subtracting a constant: GFX9 keeps the packed sub with a packed constant;
; VI canonicalizes to adds of the negated halves (0xff85 = -123, 0xfe38 = -456).
; GCN-LABEL: {{^}}v_test_sub_v2i16_constant:
; GFX89-DAG: {{flat|global}}_load_dword

; GFX9-DAG: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]

; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xfffffe38
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}}
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI: v_or_b32
define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 123, i16 456>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_v2i16_neg_constant:
; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0xfc21fcb3{{$}}
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]

; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3df
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x34d, v{{[0-9]+}}
define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 -845, i16 -991>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_neg1:
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, -1 op_sel_hi:[1,0]{{$}}

; VI-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; VI-DAG: flat_load_dword [[LOAD:v[0-9]+]]
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD]], [[ONE]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD]]
; VI: v_or_b32_e32
define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 -1, i16 -1>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_lo_zero_hi:
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, 32{{$}}

; VI: flat_load_dword [[LOAD:v[0-9]+]]
; VI-DAG: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, [[LOAD]]
; VI-DAG: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffffe0, [[LOAD]]
; VI: v_or_b32_e32 v{{[0-9]+}}, [[ADD]], [[AND]]
define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 32, i16 0>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; The high element gives fp
; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_fp_split:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 1.0
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]

; VI-NOT: v_subrev_i16
; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0xffffc080
; VI: flat_load_dword
; VI: v_add_u16_sdwa [[ADD:v[0-9]+]], v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NOT: v_subrev_i16
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 0, i16 16256>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_v2i16_zext_to_v2i32:
; GFX9: global_load_dword [[A:v[0-9]+]]
; GFX9: global_load_dword [[B:v[0-9]+]]

; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]]
; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}

; VI: flat_load_dword v[[A:[0-9]+]]
; VI: flat_load_dword v[[B:[0-9]+]]

; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], v[[A]], v[[B]]
; VI-NEXT: v_sub_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = zext <2 x i16> %add to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_v2i16_zext_to_v2i64:
; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GFX9: global_load_dword [[A:v[0-9]+]]
; GFX9: global_load_dword [[B:v[0-9]+]]

; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]]
; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
; GFX9: buffer_store_dwordx4

; VI: flat_load_dword [[A:v[0-9]+]]
; VI: flat_load_dword [[B:v[0-9]+]]
; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], [[A]], [[B]]
; VI: v_sub_u16_sdwa v[[ADD_HI:[0-9]+]], [[A]], [[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: buffer_store_dwordx4
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = zext <2 x i16> %add to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_v2i16_sext_to_v2i32:
; GFX9: global_load_dword [[A:v[0-9]+]]
; GFX9: global_load_dword [[B:v[0-9]+]]

; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]]
; GFX9-DAG: v_bfe_i32 v[[ELT0:[0-9]+]], [[ADD]], 0, 16
; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}

; VI: flat_load_dword
; VI: flat_load_dword
; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

; VI: buffer_store_dwordx2
define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = sext <2 x i16> %add to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_v2i16_sext_to_v2i64:
; GCN: {{flat|global}}_load_dword
; GCN: {{flat|global}}_load_dword

; GFX9: v_pk_sub_i16
; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}

; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_sub_u16_e32

; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = sext <2 x i16> %add to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }