; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s

; Test expansion of scalar selects on vectors.
; Evergreen not enabled since it seems to be having problems with doubles.
;
; Every kernel below computes a single i1 condition from an integer compare
; and uses it to select between two whole vectors. The checks mostly count
; how many v_cndmask_b32 instructions the selection is expanded into.
;
; Naming convention used by the tests in this file:
;   v_*  - the vector operands are loaded through addrspace(1) pointers.
;   s_* / unprefixed - the vector operands are passed directly as kernel
;          arguments.
;
; Prefixes in use (see the invocations above): GCN is shared by all three
; runs, SI is tahiti only, VI is tonga only, GFX9 is gfx900 only, and GFX89
; is shared by tonga and gfx900.

;===----------------------------------------------------------------------===;
; i8 element vectors
;===----------------------------------------------------------------------===;

; <2 x i8> operands loaded from memory (16 bits of data in total).
; GCN-LABEL: {{^}}v_select_v2i8:
; SI: v_cndmask_b32
; SI-NOT: cndmask

; GFX9: v_cndmask_b32
; GFX9-NOT: cndmask

; This is worse when i16 is legal and packed is not because
; SelectionDAGBuilder for some reason changes the select type.
; VI: v_cndmask_b32
; VI: v_cndmask_b32
define amdgpu_kernel void @v_select_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %a.ptr, <2 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <2 x i8>, <2 x i8> addrspace(1)* %a.ptr, align 2
  %b = load <2 x i8>, <2 x i8> addrspace(1)* %b.ptr, align 2
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i8> %a, <2 x i8> %b
  store <2 x i8> %select, <2 x i8> addrspace(1)* %out, align 2
  ret void
}

; <4 x i8> operands loaded from memory (32 bits): exactly one cndmask.
; GCN-LABEL: {{^}}v_select_v4i8:
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %a.ptr, <4 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <4 x i8>, <4 x i8> addrspace(1)* %a.ptr
  %b = load <4 x i8>, <4 x i8> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
  store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
  ret void
}

; <8 x i8> operands loaded from memory (64 bits): exactly two cndmasks.
; GCN-LABEL: {{^}}v_select_v8i8:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %a.ptr, <8 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <8 x i8>, <8 x i8> addrspace(1)* %a.ptr
  %b = load <8 x i8>, <8 x i8> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x i8> %a, <8 x i8> %b
  store <8 x i8> %select, <8 x i8> addrspace(1)* %out, align 4
  ret void
}

; <16 x i8> operands loaded from memory (128 bits): exactly four cndmasks.
; GCN-LABEL: {{^}}v_select_v16i8:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %a.ptr, <16 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <16 x i8>, <16 x i8> addrspace(1)* %a.ptr
  %b = load <16 x i8>, <16 x i8> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <16 x i8> %a, <16 x i8> %b
  store <16 x i8> %select, <16 x i8> addrspace(1)* %out, align 4
  ret void
}

; <4 x i8> operands passed as kernel arguments; the condition is derived
; from an i8 argument rather than an i32 one.
; GCN-LABEL: {{^}}select_v4i8:
; GCN: v_cndmask_b32
; GCN-NOT: cndmask
define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) #0 {
  %cmp = icmp eq i8 %c, 0
  %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
  store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
  ret void
}

;===----------------------------------------------------------------------===;
; i16 element vectors
;===----------------------------------------------------------------------===;

; <2 x i16> operands passed as kernel arguments: a single cndmask is
; expected on every subtarget, and each prefix forbids any further one.
; GCN-LABEL: {{^}}select_v2i16:
; GFX89: s_load_dword
; GFX89: s_load_dword
; GFX89: s_load_dword
; GFX89: v_cndmask_b32
; GFX89-NOT: v_cndmask_b32

; SI: v_cndmask_b32_e32
; SI-NOT: v_cndmask_b32
define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; <2 x i16> operands loaded from memory: two dword loads, then one cndmask.
; GCN-LABEL: {{^}}v_select_v2i16:
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: v_cndmask_b32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.ptr
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; <3 x i16> operands loaded from memory (odd element count): two cndmasks.
; GCN-LABEL: {{^}}v_select_v3i16:
; SI: v_cndmask_b32_e32
; SI: cndmask
; SI-NOT: cndmask

; GFX89: v_cndmask_b32_e32
; GFX89: cndmask
; GFX89-NOT: cndmask
define amdgpu_kernel void @v_select_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.ptr
  %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <3 x i16> %a, <3 x i16> %b
  store <3 x i16> %select, <3 x i16> addrspace(1)* %out, align 4
  ret void
}

; <4 x i16> operands loaded from memory (64 bits): exactly two cndmasks.
; GCN-LABEL: {{^}}v_select_v4i16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %a.ptr, <4 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %a.ptr
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
  store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4
  ret void
}

; <8 x i16> operands loaded from memory (128 bits): exactly four cndmasks.
; GCN-LABEL: {{^}}v_select_v8i16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %a.ptr, <8 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <8 x i16>, <8 x i16> addrspace(1)* %a.ptr
  %b = load <8 x i16>, <8 x i16> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x i16> %a, <8 x i16> %b
  store <8 x i16> %select, <8 x i16> addrspace(1)* %out, align 4
  ret void
}

;===----------------------------------------------------------------------===;
; i32 element vectors
;===----------------------------------------------------------------------===;

; FIXME: Expansion with bitwise operations may be better if doing a
; vector select with SGPR inputs.

; <2 x i32> operands passed as kernel arguments: one cndmask per element,
; followed by a single 64-bit store.
; GCN-LABEL: {{^}}s_select_v2i32:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
  store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; <4 x i32> operands passed as kernel arguments: one cndmask per element,
; followed by a single 128-bit store.
; GCN-LABEL: {{^}}s_select_v4i32:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b
  store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; Select between a loaded <4 x i32> and zeroinitializer using an unsigned
; "less than 32" compare; the checks require the zero to appear as the
; inline constant 0 operand of each cndmask.
; GCN-LABEL: {{^}}v_select_v4i32:
; GCN: buffer_load_dwordx4
; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 {
bb:
  %tmp2 = icmp ult i32 %cond, 32
  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in
  %tmp3 = select i1 %tmp2, <4 x i32> %val, <4 x i32> zeroinitializer
  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; <8 x i32> operands passed as kernel arguments: eight cndmasks.
; GCN-LABEL: {{^}}select_v8i32:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
  store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16
  ret void
}

;===----------------------------------------------------------------------===;
; f32 element vectors
;===----------------------------------------------------------------------===;

; <2 x float> operands passed as kernel arguments. The checks capture the
; SGPR pairs holding %a and %b and require each half to be copied into a
; VGPR before the two cndmasks. The two alternatives in the offset patterns
; allow for different kernel-argument offset encodings between subtargets.
; GCN-LABEL: {{^}}s_select_v2f32:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[ALO:[0-9]+]]:[[AHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; GCN-DAG: s_load_dwordx2 s{{\[}}[[BLO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}

; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]]
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
; GCN-DAG: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}

; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
; GCN-DAG: v_cndmask_b32_e32
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x float> %a, <2 x float> %b
  store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16
  ret void
}

; <4 x float> operands passed as kernel arguments: two 128-bit scalar
; loads, one compare into vcc, four cndmasks and one 128-bit store.
; GCN-LABEL: {{^}}s_select_v4f32:
; GCN: s_load_dwordx4
; GCN: s_load_dwordx4
; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}

; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32

; GCN: buffer_store_dwordx4
define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x float> %a, <4 x float> %b
  store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; Floating-point counterpart of v_select_v4i32: select between a loaded
; <4 x float> and zeroinitializer using an unsigned "less than 32" compare.
; GCN-LABEL: {{^}}v_select_v4f32:
; GCN: buffer_load_dwordx4
; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 {
bb:
  %tmp2 = icmp ult i32 %cond, 32
  %val = load <4 x float>, <4 x float> addrspace(1)* %in
  %tmp3 = select i1 %tmp2, <4 x float> %val, <4 x float> zeroinitializer
  store <4 x float> %tmp3, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; <8 x float> operands passed as kernel arguments: eight cndmasks.
; GCN-LABEL: {{^}}select_v8f32:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x float> %a, <8 x float> %b
  store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16
  ret void
}

;===----------------------------------------------------------------------===;
; f64 element vectors (two 32-bit cndmasks per element)
;===----------------------------------------------------------------------===;

; <2 x double> operands passed as kernel arguments: four cndmasks.
; GCN-LABEL: {{^}}select_v2f64:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x double> %a, <2 x double> %b
  store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16
  ret void
}

; <4 x double> operands passed as kernel arguments: eight cndmasks.
; GCN-LABEL: {{^}}select_v4f64:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x double> %a, <4 x double> %b
  store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16
  ret void
}

; <8 x double> operands passed as kernel arguments: sixteen cndmasks.
; GCN-LABEL: {{^}}select_v8f64:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x double> %a, <8 x double> %b
  store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16
  ret void
}

;===----------------------------------------------------------------------===;
; f16 element vectors
;===----------------------------------------------------------------------===;

; <2 x half> operands loaded from memory (32 bits): exactly one cndmask.
; GCN-LABEL: {{^}}v_select_v2f16:
; GCN: v_cndmask_b32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %a.ptr, <2 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <2 x half>, <2 x half> addrspace(1)* %a.ptr
  %b = load <2 x half>, <2 x half> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x half> %a, <2 x half> %b
  store <2 x half> %select, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; <3 x half> operands loaded from memory (odd element count): exactly two
; cndmasks.
; GCN-LABEL: {{^}}v_select_v3f16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %a.ptr, <3 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <3 x half>, <3 x half> addrspace(1)* %a.ptr
  %b = load <3 x half>, <3 x half> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <3 x half> %a, <3 x half> %b
  store <3 x half> %select, <3 x half> addrspace(1)* %out, align 4
  ret void
}

; <4 x half> operands loaded from memory (64 bits): exactly two cndmasks.
; GCN-LABEL: {{^}}v_select_v4f16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %a.ptr, <4 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <4 x half>, <4 x half> addrspace(1)* %a.ptr
  %b = load <4 x half>, <4 x half> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x half> %a, <4 x half> %b
  store <4 x half> %select, <4 x half> addrspace(1)* %out, align 4
  ret void
}

; NOTE(review): this intrinsic is declared but not called by any kernel in
; this file; it is kept so the module contents stay unchanged.
; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.x() #1

; #0 is attached to every kernel above; #1 only to the declaration.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }