; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

; Instruction-selection tests for 'select' of half / <2 x half> driven by an
; 'fcmp olt' condition.  On SI (no native f16 compare) the CHECK lines expect
; operands promoted via v_cvt_f32_f16 and compared as f32; on VI the compare
; and cndmask stay in f16.  The *_imm_* variants check folding of the 0.5
; (0xH3800) immediate into each operand position — when the immediate lands in
; a position the instruction cannot encode, the compare is commuted
; (v_cmp_gt) or the select condition is inverted (v_cmp_nlt).

; GCN-LABEL: {{^}}select_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI-DAG: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Immediate in the first compare operand: 0.5 folds directly (lt keeps order).
; GCN-LABEL: {{^}}select_f16_imm_a:
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI-DAG: v_cmp_lt_f32_e32 vcc, 0.5, v[[B_F32]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_cmp_lt_f16_e32 vcc, 0.5, v[[B_F16]]
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_a(
    half addrspace(1)* %r,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half 0xH3800, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Immediate in the second compare operand: compare is commuted to v_cmp_gt so
; the immediate can sit in the first (src0) slot.
; GCN-LABEL: {{^}}select_f16_imm_b:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI-DAG: v_cmp_gt_f32_e32 vcc, 0.5, v[[A_F32]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]

; VI: v_cmp_gt_f16_e32 vcc, 0.5, v[[A_F16]]
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, 0xH3800
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Immediate as the true value of the select: the condition is inverted
; (v_cmp_nlt) so the constant becomes the false operand of v_cndmask.  On SI
; 0.5 is an inline constant; on VI the f16 pattern 0x3800 needs a v_mov first.
; GCN-LABEL: {{^}}select_f16_imm_c:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[D_F32]], vcc
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]

; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x3800{{$}}
; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[C_F16]], v[[D_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_c(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half 0xH3800, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Immediate as the false value of the select: no inversion needed, the
; constant is already in v_cndmask's first source slot.
; GCN-LABEL: {{^}}select_f16_imm_d:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_mov_b32_e32 v[[D_F16:[0-9]+]], 0x3800{{$}}
; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_d(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half 0xH3800
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Vector case: the <2 x half> select is scalarized into two compare+cndmask
; pairs (with f32 promotion around them on SI).
; GCN-LABEL: {{^}}select_v2f16:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cmp_lt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cmp_lt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32

; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32_e32
; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32_e32

; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; Vector constant <0.5, 0.75> as the first compare operand; only the 0.5 lane
; is an inline constant, hence the mixed lt/gt expectations per lane.
; GCN-LABEL: {{^}}select_v2f16_imm_a:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32

; SI: v_cmp_gt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cmp_lt_f32_e32 vcc, 0.5
; SI: v_cndmask_b32_e32

; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32_e32
; VI: v_cmp_gt_f16_e32
; VI: v_cndmask_b32_e32

; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_a(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; Vector constant as the second compare operand (compares commuted per lane).
; GCN-LABEL: {{^}}select_v2f16_imm_b:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32

; SI: v_cmp_lt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cmp_gt_f32_e32 vcc, 0.5
; SI: v_cndmask_b32_e32

; VI: v_cmp_gt_f16_e32
; VI: v_cndmask_b32_e32
; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32_e32

; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_b(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900>
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; Vector constant as the select's true value (inverted nlt compares per lane).
; GCN-LABEL: {{^}}select_v2f16_imm_c:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32

; SI: v_cmp_nlt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cmp_nlt_f32_e32
; SI-DAG: v_cndmask_b32_e32

; VI: v_cmp_nlt_f16_e32
; VI: v_cndmask_b32_e32

; VI: v_cmp_nlt_f16_e32
; VI: v_cndmask_b32_e32

; SI-DAG: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_c(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; Vector constant as the select's false value (no inversion needed).
; GCN-LABEL: {{^}}select_v2f16_imm_d:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32

; SI: v_cmp_lt_f32_e32
; SI: v_cndmask_b32
; SI: v_cmp_lt_f32_e32
; SI: v_cndmask_b32

; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32
; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32

; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_d(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}