Home | History | Annotate | Download | only in common
      1 /*
      2  * Copyright  2016 Bas Nieuwenhuizen
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21  * IN THE SOFTWARE.
     22  */
     23 
     24 #include "ac_nir_to_llvm.h"
     25 #include "ac_llvm_build.h"
     26 #include "ac_llvm_util.h"
     27 #include "ac_binary.h"
     28 #include "sid.h"
     29 #include "nir/nir.h"
     30 #include "../vulkan/radv_descriptor_set.h"
     31 #include "util/bitscan.h"
     32 #include <llvm-c/Transforms/Scalar.h>
     33 #include "ac_shader_abi.h"
     34 #include "ac_shader_info.h"
     35 #include "ac_shader_util.h"
     36 #include "ac_exp_param.h"
     37 
/* Calling-convention IDs passed to LLVMSetFunctionCallConv().
 * NOTE(review): the numeric values presumably mirror LLVM's
 * CallingConv::AMDGPU_* IDs — confirm against the LLVM version in use.
 */
enum radeon_llvm_calling_convention {
	RADEON_LLVM_AMDGPU_VS = 87,
	RADEON_LLVM_AMDGPU_GS = 88,
	RADEON_LLVM_AMDGPU_PS = 89,
	RADEON_LLVM_AMDGPU_CS = 90,
	RADEON_LLVM_AMDGPU_HS = 93,
};
     45 
/* AMDGPU LLVM address spaces used when building pointer types
 * (see const_array() below). */
#define CONST_ADDR_SPACE 2
#define LOCAL_ADDR_SPACE 3

/* One slot per varying up to VARYING_SLOT_VAR31; each slot stores
 * 4 channels in the inputs[]/outputs[] arrays below. */
#define RADEON_LLVM_MAX_INPUTS (VARYING_SLOT_VAR31 + 1)
#define RADEON_LLVM_MAX_OUTPUTS (VARYING_SLOT_VAR31 + 1)
     51 
     52 struct nir_to_llvm_context;
     53 
/* State for translating one NIR shader into LLVM IR. Holds the maps from
 * NIR objects to the LLVM values built for them.
 */
struct ac_nir_context {
	struct ac_llvm_context ac;   /* shared LLVM builder/type helpers */
	struct ac_shader_abi *abi;   /* stage ABI values/callbacks */

	gl_shader_stage stage;

	/* NIR object -> LLVMValueRef maps. */
	struct hash_table *defs;
	struct hash_table *phis;
	struct hash_table *vars;

	LLVMValueRef main_function;
	/* Branch targets for the innermost NIR loop being translated. */
	LLVMBasicBlockRef continue_block;
	LLVMBasicBlockRef break_block;

	/* 4 channels per varying slot. */
	LLVMValueRef outputs[RADEON_LLVM_MAX_OUTPUTS * 4];

	/* NIR local registers — presumably 4 channels per register,
	 * matching radeon_llvm_reg_index_soa(); TODO confirm at the
	 * allocation site. */
	int num_locals;
	LLVMValueRef *locals;

	struct nir_to_llvm_context *nctx; /* TODO get rid of this */
};
     75 
/* RADV-specific translation state wrapping ac_nir_context: compiler
 * options, the shader-variant info being filled in, and the LLVM values
 * of the user SGPR / system-value arguments declared by
 * create_function().
 */
struct nir_to_llvm_context {
	struct ac_llvm_context ac;
	const struct ac_nir_compiler_options *options;
	struct ac_shader_variant_info *shader_info;
	struct ac_shader_abi abi;
	struct ac_nir_context *nir;

	unsigned max_workgroup_size;
	LLVMContextRef context;
	LLVMModuleRef module;
	LLVMBuilderRef builder;
	LLVMValueRef main_function;

	struct hash_table *defs;
	struct hash_table *phis;

	/* Global user SGPRs (descriptor sets, push constants, view index). */
	LLVMValueRef descriptor_sets[AC_UD_MAX_SETS];
	LLVMValueRef ring_offsets;
	LLVMValueRef push_constants;
	LLVMValueRef view_index;

	/* Compute-shader system values. */
	LLVMValueRef num_work_groups;
	LLVMValueRef workgroup_ids[3];
	LLVMValueRef local_invocation_ids;
	LLVMValueRef tg_size;

	/* Vertex-shader inputs. */
	LLVMValueRef vertex_buffers;
	LLVMValueRef rel_auto_id;
	LLVMValueRef vs_prim_id;
	LLVMValueRef ls_out_layout;
	LLVMValueRef es2gs_offset;

	/* Tessellation layout/offset SGPRs and TES inputs. */
	LLVMValueRef tcs_offchip_layout;
	LLVMValueRef tcs_out_offsets;
	LLVMValueRef tcs_out_layout;
	LLVMValueRef tcs_in_layout;
	LLVMValueRef oc_lds;
	LLVMValueRef merged_wave_info;
	LLVMValueRef tess_factor_offset;
	LLVMValueRef tes_rel_patch_id;
	LLVMValueRef tes_u;
	LLVMValueRef tes_v;

	/* Geometry-shader inputs and ring state. */
	LLVMValueRef gsvs_ring_stride;
	LLVMValueRef gsvs_num_entries;
	LLVMValueRef gs2vs_offset;
	LLVMValueRef gs_wave_id;
	LLVMValueRef gs_vtx_offset[6];

	LLVMValueRef esgs_ring;
	LLVMValueRef gsvs_ring;
	LLVMValueRef hs_ring_tess_offchip;
	LLVMValueRef hs_ring_tess_factor;

	/* Fragment-shader inputs. */
	LLVMValueRef prim_mask;
	LLVMValueRef sample_pos_offset;
	LLVMValueRef persp_sample, persp_center, persp_centroid;
	LLVMValueRef linear_sample, linear_center, linear_centroid;

	gl_shader_stage stage;

	/* 4 channels per varying slot. */
	LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4];

	uint64_t input_mask;
	uint64_t output_mask;
	uint8_t num_output_clips;
	uint8_t num_output_culls;

	bool is_gs_copy_shader;
	LLVMValueRef gs_next_vertex;
	unsigned gs_max_out_vertices;

	unsigned tes_primitive_mode;
	uint64_t tess_outputs_written;
	uint64_t tess_patch_outputs_written;

	uint32_t tcs_patch_outputs_read;
	uint64_t tcs_outputs_read;
};
    154 
/* Recover the enclosing nir_to_llvm_context from the ac_shader_abi
 * embedded in it (ABI callbacks only receive the abi pointer). */
static inline struct nir_to_llvm_context *
nir_to_llvm_context_from_abi(struct ac_shader_abi *abi)
{
	struct nir_to_llvm_context *ctx = NULL; /* type sample for container_of */
	return container_of(abi, ctx, abi);
}
    161 
    162 static LLVMTypeRef
    163 nir2llvmtype(struct ac_nir_context *ctx,
    164 	     const struct glsl_type *type)
    165 {
    166 	switch (glsl_get_base_type(glsl_without_array(type))) {
    167 	case GLSL_TYPE_UINT:
    168 	case GLSL_TYPE_INT:
    169 		return ctx->ac.i32;
    170 	case GLSL_TYPE_UINT64:
    171 	case GLSL_TYPE_INT64:
    172 		return ctx->ac.i64;
    173 	case GLSL_TYPE_DOUBLE:
    174 		return ctx->ac.f64;
    175 	case GLSL_TYPE_FLOAT:
    176 		return ctx->ac.f32;
    177 	default:
    178 		assert(!"Unsupported type in nir2llvmtype()");
    179 		break;
    180 	}
    181 	return 0;
    182 }
    183 
    184 static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx,
    185 				     const nir_deref_var *deref,
    186 				     enum ac_descriptor_type desc_type,
    187 				     const nir_tex_instr *instr,
    188 				     bool image, bool write);
    189 
    190 static unsigned radeon_llvm_reg_index_soa(unsigned index, unsigned chan)
    191 {
    192 	return (index * 4) + chan;
    193 }
    194 
/* Map a varying slot to a compact per-slot index used for tessellation/GS
 * I/O layout. Patch outputs (tess levels, PATCH0..) get their own index
 * space, checked first; per-vertex outputs use a second space in which
 * index 3 stays reserved for the second half of the clip/cull distances.
 */
static unsigned shader_io_get_unique_index(gl_varying_slot slot)
{
	/* handle patch indices separate */
	if (slot == VARYING_SLOT_TESS_LEVEL_OUTER)
		return 0;
	if (slot == VARYING_SLOT_TESS_LEVEL_INNER)
		return 1;
	if (slot >= VARYING_SLOT_PATCH0 && slot <= VARYING_SLOT_TESS_MAX)
		return 2 + (slot - VARYING_SLOT_PATCH0);

	if (slot == VARYING_SLOT_POS)
		return 0;
	if (slot == VARYING_SLOT_PSIZ)
		return 1;
	if (slot == VARYING_SLOT_CLIP_DIST0)
		return 2;
	/* 3 is reserved for clip dist as well */
	if (slot >= VARYING_SLOT_VAR0 && slot <= VARYING_SLOT_VAR31)
		return 4 + (slot - VARYING_SLOT_VAR0);
	unreachable("illegal slot in get unique index\n");
}
    216 
    217 static void set_llvm_calling_convention(LLVMValueRef func,
    218                                         gl_shader_stage stage)
    219 {
    220 	enum radeon_llvm_calling_convention calling_conv;
    221 
    222 	switch (stage) {
    223 	case MESA_SHADER_VERTEX:
    224 	case MESA_SHADER_TESS_EVAL:
    225 		calling_conv = RADEON_LLVM_AMDGPU_VS;
    226 		break;
    227 	case MESA_SHADER_GEOMETRY:
    228 		calling_conv = RADEON_LLVM_AMDGPU_GS;
    229 		break;
    230 	case MESA_SHADER_TESS_CTRL:
    231 		calling_conv = HAVE_LLVM >= 0x0500 ? RADEON_LLVM_AMDGPU_HS : RADEON_LLVM_AMDGPU_VS;
    232 		break;
    233 	case MESA_SHADER_FRAGMENT:
    234 		calling_conv = RADEON_LLVM_AMDGPU_PS;
    235 		break;
    236 	case MESA_SHADER_COMPUTE:
    237 		calling_conv = RADEON_LLVM_AMDGPU_CS;
    238 		break;
    239 	default:
    240 		unreachable("Unhandle shader type");
    241 	}
    242 
    243 	LLVMSetFunctionCallConv(func, calling_conv);
    244 }
    245 
/* Hard limit on the number of arguments of the shader "main" function. */
#define MAX_ARGS 23
/* Collects the argument list of the shader "main" function before the
 * function is created: the LLVM type of each argument, where its value
 * should be stored once the function exists (see assign_arguments()),
 * and SGPR/VGPR accounting.
 */
struct arg_info {
	LLVMTypeRef types[MAX_ARGS];
	LLVMValueRef *assign[MAX_ARGS]; /* destination per argument; may be NULL */
	unsigned array_params_mask;     /* bit i set: argument i is an array pointer (byval) */
	uint8_t count;                  /* total arguments declared */
	uint8_t sgpr_count;             /* number of SGPR arguments */
	uint8_t num_sgprs_used;         /* SGPR dwords consumed */
	uint8_t num_vgprs_used;         /* VGPR dwords consumed */
};

/* Register file an argument is passed in: scalar (uniform across the
 * wave) or vector (per-lane) registers. */
enum ac_arg_regfile {
	ARG_SGPR,
	ARG_VGPR,
};
    261 
    262 static void
    263 add_arg(struct arg_info *info, enum ac_arg_regfile regfile, LLVMTypeRef type,
    264 	LLVMValueRef *param_ptr)
    265 {
    266 	assert(info->count < MAX_ARGS);
    267 
    268 	info->assign[info->count] = param_ptr;
    269 	info->types[info->count] = type;
    270 	info->count++;
    271 
    272 	if (regfile == ARG_SGPR) {
    273 		info->num_sgprs_used += ac_get_type_size(type) / 4;
    274 		info->sgpr_count++;
    275 	} else {
    276 		assert(regfile == ARG_VGPR);
    277 		info->num_vgprs_used += ac_get_type_size(type) / 4;
    278 	}
    279 }
    280 
    281 static inline void
    282 add_array_arg(struct arg_info *info, LLVMTypeRef type, LLVMValueRef *param_ptr)
    283 {
    284 	info->array_params_mask |= (1 << info->count);
    285 	add_arg(info, ARG_SGPR, type, param_ptr);
    286 }
    287 
    288 static void assign_arguments(LLVMValueRef main_function,
    289 			     struct arg_info *info)
    290 {
    291 	unsigned i;
    292 	for (i = 0; i < info->count; i++) {
    293 		if (info->assign[i])
    294 			*info->assign[i] = LLVMGetParam(main_function, i);
    295 	}
    296 }
    297 
/* Create the shader "main" function from the collected argument list,
 * position the builder at its entry block, and apply per-argument and
 * per-function attributes.
 *
 * return_types/num_return_elems: if non-zero, the function returns a
 * packed struct of these types; otherwise void.
 * max_workgroup_size: if non-zero, emitted as the
 * "amdgpu-max-work-group-size" target attribute.
 * unsafe_math: enables the fast-math target attributes below.
 */
static LLVMValueRef
create_llvm_function(LLVMContextRef ctx, LLVMModuleRef module,
                     LLVMBuilderRef builder, LLVMTypeRef *return_types,
                     unsigned num_return_elems,
		     struct arg_info *args,
		     unsigned max_workgroup_size,
		     bool unsafe_math)
{
	LLVMTypeRef main_function_type, ret_type;
	LLVMBasicBlockRef main_function_body;

	if (num_return_elems)
		ret_type = LLVMStructTypeInContext(ctx, return_types,
		                                   num_return_elems, true);
	else
		ret_type = LLVMVoidTypeInContext(ctx);

	/* Setup the function */
	main_function_type =
	    LLVMFunctionType(ret_type, args->types, args->count, 0);
	LLVMValueRef main_function =
	    LLVMAddFunction(module, "main", main_function_type);
	main_function_body =
	    LLVMAppendBasicBlockInContext(ctx, main_function, "main_body");
	LLVMPositionBuilderAtEnd(builder, main_function_body);

	LLVMSetFunctionCallConv(main_function, RADEON_LLVM_AMDGPU_CS);
	/* SGPR arguments: array pointers are byval + fully dereferenceable,
	 * everything else is marked inreg so it lands in scalar registers.
	 * Attribute indices are 1-based (0 is the return value). */
	for (unsigned i = 0; i < args->sgpr_count; ++i) {
		if (args->array_params_mask & (1 << i)) {
			LLVMValueRef P = LLVMGetParam(main_function, i);
			ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_BYVAL);
			ac_add_attr_dereferenceable(P, UINT64_MAX);
		}
		else {
			ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_INREG);
		}
	}

	if (max_workgroup_size) {
		ac_llvm_add_target_dep_function_attr(main_function,
						     "amdgpu-max-work-group-size",
						     max_workgroup_size);
	}
	if (unsafe_math) {
		/* These were copied from some LLVM test. */
		LLVMAddTargetDependentFunctionAttr(main_function,
						   "less-precise-fpmad",
						   "true");
		LLVMAddTargetDependentFunctionAttr(main_function,
						   "no-infs-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(main_function,
						   "no-nans-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(main_function,
						   "unsafe-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(main_function,
					   "no-signed-zeros-fp-math",
					   "true");
	}
	return main_function;
}
    361 
    362 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
    363 {
    364 	return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
    365 	                       CONST_ADDR_SPACE);
    366 }
    367 
    368 static int get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
    369 {
    370 	if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
    371 		type = LLVMGetElementType(type);
    372 
    373 	if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
    374 		return LLVMGetIntTypeWidth(type);
    375 
    376 	if (type == ctx->f16)
    377 		return 16;
    378 	if (type == ctx->f32)
    379 		return 32;
    380 	if (type == ctx->f64)
    381 		return 64;
    382 
    383 	unreachable("Unhandled type kind in get_elem_bits");
    384 }
    385 
    386 static LLVMValueRef unpack_param(struct ac_llvm_context *ctx,
    387 				 LLVMValueRef param, unsigned rshift,
    388 				 unsigned bitwidth)
    389 {
    390 	LLVMValueRef value = param;
    391 	if (rshift)
    392 		value = LLVMBuildLShr(ctx->builder, value,
    393 				      LLVMConstInt(ctx->i32, rshift, false), "");
    394 
    395 	if (rshift + bitwidth < 32) {
    396 		unsigned mask = (1 << bitwidth) - 1;
    397 		value = LLVMBuildAnd(ctx->builder, value,
    398 				     LLVMConstInt(ctx->i32, mask, false), "");
    399 	}
    400 	return value;
    401 }
    402 
    403 static LLVMValueRef get_rel_patch_id(struct nir_to_llvm_context *ctx)
    404 {
    405 	switch (ctx->stage) {
    406 	case MESA_SHADER_TESS_CTRL:
    407 		return unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 0, 8);
    408 	case MESA_SHADER_TESS_EVAL:
    409 		return ctx->tes_rel_patch_id;
    410 		break;
    411 	default:
    412 		unreachable("Illegal stage");
    413 	}
    414 }
    415 
    416 /* Tessellation shaders pass outputs to the next shader using LDS.
    417  *
    418  * LS outputs = TCS inputs
    419  * TCS outputs = TES inputs
    420  *
    421  * The LDS layout is:
    422  * - TCS inputs for patch 0
    423  * - TCS inputs for patch 1
    424  * - TCS inputs for patch 2		= get_tcs_in_current_patch_offset (if RelPatchID==2)
    425  * - ...
    426  * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
    427  * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
    428  * - TCS outputs for patch 1
    429  * - Per-patch TCS outputs for patch 1
    430  * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
    431  * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
    432  * - ...
    433  *
    434  * All three shaders VS(LS), TCS, TES share the same LDS space.
    435  */
    436 static LLVMValueRef
    437 get_tcs_in_patch_stride(struct nir_to_llvm_context *ctx)
    438 {
    439 	if (ctx->stage == MESA_SHADER_VERTEX)
    440 		return unpack_param(&ctx->ac, ctx->ls_out_layout, 0, 13);
    441 	else if (ctx->stage == MESA_SHADER_TESS_CTRL)
    442 		return unpack_param(&ctx->ac, ctx->tcs_in_layout, 0, 13);
    443 	else {
    444 		assert(0);
    445 		return NULL;
    446 	}
    447 }
    448 
/* LDS stride of one patch's TCS outputs, packed in bits [12:0] of
 * tcs_out_layout. */
static LLVMValueRef
get_tcs_out_patch_stride(struct nir_to_llvm_context *ctx)
{
	return unpack_param(&ctx->ac, ctx->tcs_out_layout, 0, 13);
}
    454 
    455 static LLVMValueRef
    456 get_tcs_out_patch0_offset(struct nir_to_llvm_context *ctx)
    457 {
    458 	return LLVMBuildMul(ctx->builder,
    459 			    unpack_param(&ctx->ac, ctx->tcs_out_offsets, 0, 16),
    460 			    LLVMConstInt(ctx->ac.i32, 4, false), "");
    461 }
    462 
    463 static LLVMValueRef
    464 get_tcs_out_patch0_patch_data_offset(struct nir_to_llvm_context *ctx)
    465 {
    466 	return LLVMBuildMul(ctx->builder,
    467 			    unpack_param(&ctx->ac, ctx->tcs_out_offsets, 16, 16),
    468 			    LLVMConstInt(ctx->ac.i32, 4, false), "");
    469 }
    470 
    471 static LLVMValueRef
    472 get_tcs_in_current_patch_offset(struct nir_to_llvm_context *ctx)
    473 {
    474 	LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
    475 	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
    476 
    477 	return LLVMBuildMul(ctx->builder, patch_stride, rel_patch_id, "");
    478 }
    479 
    480 static LLVMValueRef
    481 get_tcs_out_current_patch_offset(struct nir_to_llvm_context *ctx)
    482 {
    483 	LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
    484 	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
    485 	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
    486 
    487 	return LLVMBuildAdd(ctx->builder, patch0_offset,
    488 			    LLVMBuildMul(ctx->builder, patch_stride,
    489 					 rel_patch_id, ""),
    490 			    "");
    491 }
    492 
    493 static LLVMValueRef
    494 get_tcs_out_current_patch_data_offset(struct nir_to_llvm_context *ctx)
    495 {
    496 	LLVMValueRef patch0_patch_data_offset =
    497 		get_tcs_out_patch0_patch_data_offset(ctx);
    498 	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
    499 	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
    500 
    501 	return LLVMBuildAdd(ctx->builder, patch0_patch_data_offset,
    502 			    LLVMBuildMul(ctx->builder, patch_stride,
    503 					 rel_patch_id, ""),
    504 			    "");
    505 }
    506 
    507 static void
    508 set_loc(struct ac_userdata_info *ud_info, uint8_t *sgpr_idx, uint8_t num_sgprs,
    509 	uint32_t indirect_offset)
    510 {
    511 	ud_info->sgpr_idx = *sgpr_idx;
    512 	ud_info->num_sgprs = num_sgprs;
    513 	ud_info->indirect = indirect_offset > 0;
    514 	ud_info->indirect_offset = indirect_offset;
    515 	*sgpr_idx += num_sgprs;
    516 }
    517 
    518 static void
    519 set_loc_shader(struct nir_to_llvm_context *ctx, int idx, uint8_t *sgpr_idx,
    520 	       uint8_t num_sgprs)
    521 {
    522 	struct ac_userdata_info *ud_info =
    523 		&ctx->shader_info->user_sgprs_locs.shader_data[idx];
    524 	assert(ud_info);
    525 
    526 	set_loc(ud_info, sgpr_idx, num_sgprs, 0);
    527 }
    528 
    529 static void
    530 set_loc_desc(struct nir_to_llvm_context *ctx, int idx,  uint8_t *sgpr_idx,
    531 	     uint32_t indirect_offset)
    532 {
    533 	struct ac_userdata_info *ud_info =
    534 		&ctx->shader_info->user_sgprs_locs.descriptor_sets[idx];
    535 	assert(ud_info);
    536 
    537 	set_loc(ud_info, sgpr_idx, 2, indirect_offset);
    538 }
    539 
/* Result of allocate_user_sgprs(): how many user SGPRs the stage needs
 * and whether descriptor sets must be reached through one indirect
 * pointer because they don't all fit. */
struct user_sgpr_info {
	bool need_ring_offsets;            /* reserve 2 SGPRs for the ring-offsets pointer */
	uint8_t sgpr_count;                /* total user SGPRs required */
	bool indirect_all_descriptor_sets; /* sets accessed via a single indirect pointer */
};
    545 
    546 static bool needs_view_index_sgpr(struct nir_to_llvm_context *ctx,
    547 				  gl_shader_stage stage)
    548 {
    549 	switch (stage) {
    550 	case MESA_SHADER_VERTEX:
    551 		if (ctx->shader_info->info.needs_multiview_view_index ||
    552 		    (!ctx->options->key.vs.as_es && !ctx->options->key.vs.as_ls && ctx->options->key.has_multiview_view_index))
    553 			return true;
    554 		break;
    555 	case MESA_SHADER_TESS_EVAL:
    556 		if (ctx->shader_info->info.needs_multiview_view_index || (!ctx->options->key.tes.as_es && ctx->options->key.has_multiview_view_index))
    557 			return true;
    558 	case MESA_SHADER_GEOMETRY:
    559 	case MESA_SHADER_TESS_CTRL:
    560 		if (ctx->shader_info->info.needs_multiview_view_index)
    561 			return true;
    562 	default:
    563 		break;
    564 	}
    565 	return false;
    566 }
    567 
/* Count the user SGPRs this stage needs (rings, per-stage system values,
 * view index, push constants, descriptor sets) and decide whether the
 * descriptor sets fit directly or must be reached through one indirect
 * pointer (indirect_all_descriptor_sets).
 */
static void allocate_user_sgprs(struct nir_to_llvm_context *ctx,
				gl_shader_stage stage,
				bool needs_view_index,
				struct user_sgpr_info *user_sgpr_info)
{
	memset(user_sgpr_info, 0, sizeof(struct user_sgpr_info));

	/* until we sort out scratch/global buffers always assign ring offsets for gs/vs/es */
	if (stage == MESA_SHADER_GEOMETRY ||
	    stage == MESA_SHADER_VERTEX ||
	    stage == MESA_SHADER_TESS_CTRL ||
	    stage == MESA_SHADER_TESS_EVAL ||
	    ctx->is_gs_copy_shader)
		user_sgpr_info->need_ring_offsets = true;

	if (stage == MESA_SHADER_FRAGMENT &&
	    ctx->shader_info->info.ps.needs_sample_positions)
		user_sgpr_info->need_ring_offsets = true;

	/* 2 user sgprs will nearly always be allocated for scratch/rings */
	if (ctx->options->supports_spill || user_sgpr_info->need_ring_offsets) {
		user_sgpr_info->sgpr_count += 2;
	}

	/* Per-stage fixed user data (counts must match the add_arg calls in
	 * create_function()). */
	/* FIXME: fix the number of user sgprs for merged shaders on GFX9 */
	switch (stage) {
	case MESA_SHADER_COMPUTE:
		if (ctx->shader_info->info.cs.uses_grid_size)
			user_sgpr_info->sgpr_count += 3;
		break;
	case MESA_SHADER_FRAGMENT:
		user_sgpr_info->sgpr_count += ctx->shader_info->info.ps.needs_sample_positions;
		break;
	case MESA_SHADER_VERTEX:
		if (!ctx->is_gs_copy_shader) {
			user_sgpr_info->sgpr_count += ctx->shader_info->info.vs.has_vertex_buffers ? 2 : 0;
			/* base vertex + start instance (+ draw id) */
			if (ctx->shader_info->info.vs.needs_draw_id) {
				user_sgpr_info->sgpr_count += 3;
			} else {
				user_sgpr_info->sgpr_count += 2;
			}
		}
		if (ctx->options->key.vs.as_ls)
			user_sgpr_info->sgpr_count++; /* ls_out_layout */
		break;
	case MESA_SHADER_TESS_CTRL:
		user_sgpr_info->sgpr_count += 4;
		break;
	case MESA_SHADER_TESS_EVAL:
		user_sgpr_info->sgpr_count += 1;
		break;
	case MESA_SHADER_GEOMETRY:
		user_sgpr_info->sgpr_count += 2;
		break;
	default:
		break;
	}

	if (needs_view_index)
		user_sgpr_info->sgpr_count++;

	if (ctx->shader_info->info.loads_push_constants)
		user_sgpr_info->sgpr_count += 2;

	/* Each descriptor set costs a 2-SGPR pointer; if they don't all fit
	 * in the remaining budget, switch to one indirect pointer instead.
	 * NOTE(review): if sgpr_count ever exceeded available_sgprs the
	 * subtraction below would wrap (unsigned) and skip the indirect
	 * path — presumably the fixed counts above can't reach the limit;
	 * verify for GFX9 merged shaders (see FIXME). */
	uint32_t available_sgprs = ctx->options->chip_class >= GFX9 ? 32 : 16;
	uint32_t remaining_sgprs = available_sgprs - user_sgpr_info->sgpr_count;

	if (remaining_sgprs / 2 < util_bitcount(ctx->shader_info->info.desc_set_used_mask)) {
		user_sgpr_info->sgpr_count += 2;
		user_sgpr_info->indirect_all_descriptor_sets = true;
	} else {
		user_sgpr_info->sgpr_count += util_bitcount(ctx->shader_info->info.desc_set_used_mask) * 2;
	}
}
    642 
    643 static void
    644 declare_global_input_sgprs(struct nir_to_llvm_context *ctx,
    645 			   gl_shader_stage stage,
    646 			   bool has_previous_stage,
    647 			   gl_shader_stage previous_stage,
    648 			   const struct user_sgpr_info *user_sgpr_info,
    649 			   struct arg_info *args,
    650 			   LLVMValueRef *desc_sets)
    651 {
    652 	LLVMTypeRef type = const_array(ctx->ac.i8, 1024 * 1024);
    653 	unsigned num_sets = ctx->options->layout ?
    654 			    ctx->options->layout->num_sets : 0;
    655 	unsigned stage_mask = 1 << stage;
    656 
    657 	if (has_previous_stage)
    658 		stage_mask |= 1 << previous_stage;
    659 
    660 	/* 1 for each descriptor set */
    661 	if (!user_sgpr_info->indirect_all_descriptor_sets) {
    662 		for (unsigned i = 0; i < num_sets; ++i) {
    663 			if (ctx->options->layout->set[i].layout->shader_stages & stage_mask) {
    664 				add_array_arg(args, type,
    665 					      &ctx->descriptor_sets[i]);
    666 			}
    667 		}
    668 	} else {
    669 		add_array_arg(args, const_array(type, 32), desc_sets);
    670 	}
    671 
    672 	if (ctx->shader_info->info.loads_push_constants) {
    673 		/* 1 for push constants and dynamic descriptors */
    674 		add_array_arg(args, type, &ctx->push_constants);
    675 	}
    676 }
    677 
    678 static void
    679 declare_vs_specific_input_sgprs(struct nir_to_llvm_context *ctx,
    680 				gl_shader_stage stage,
    681 				bool has_previous_stage,
    682 				gl_shader_stage previous_stage,
    683 				struct arg_info *args)
    684 {
    685 	if (!ctx->is_gs_copy_shader &&
    686 	    (stage == MESA_SHADER_VERTEX ||
    687 	     (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
    688 		if (ctx->shader_info->info.vs.has_vertex_buffers) {
    689 			add_arg(args, ARG_SGPR, const_array(ctx->ac.v4i32, 16),
    690 				&ctx->vertex_buffers);
    691 		}
    692 		add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.base_vertex);
    693 		add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.start_instance);
    694 		if (ctx->shader_info->info.vs.needs_draw_id) {
    695 			add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.draw_id);
    696 		}
    697 	}
    698 }
    699 
    700 static void
    701 declare_vs_input_vgprs(struct nir_to_llvm_context *ctx, struct arg_info *args)
    702 {
    703 	add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.vertex_id);
    704 	if (!ctx->is_gs_copy_shader) {
    705 		if (ctx->options->key.vs.as_ls) {
    706 			add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->rel_auto_id);
    707 			add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
    708 		} else {
    709 			add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
    710 			add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id);
    711 		}
    712 		add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
    713 	}
    714 }
    715 
/* Declare the TES input VGPRs: tess coordinates (u, v), relative patch
 * id and patch id. */
static void
declare_tes_input_vgprs(struct nir_to_llvm_context *ctx, struct arg_info *args)
{
	add_arg(args, ARG_VGPR, ctx->ac.f32, &ctx->tes_u);
	add_arg(args, ARG_VGPR, ctx->ac.f32, &ctx->tes_v);
	add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->tes_rel_patch_id);
	add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.tes_patch_id);
}
    724 
    725 static void
    726 set_global_input_locs(struct nir_to_llvm_context *ctx, gl_shader_stage stage,
    727 		      bool has_previous_stage, gl_shader_stage previous_stage,
    728 		      const struct user_sgpr_info *user_sgpr_info,
    729 		      LLVMValueRef desc_sets, uint8_t *user_sgpr_idx)
    730 {
    731 	unsigned num_sets = ctx->options->layout ?
    732 			    ctx->options->layout->num_sets : 0;
    733 	unsigned stage_mask = 1 << stage;
    734 
    735 	if (has_previous_stage)
    736 		stage_mask |= 1 << previous_stage;
    737 
    738 	if (!user_sgpr_info->indirect_all_descriptor_sets) {
    739 		for (unsigned i = 0; i < num_sets; ++i) {
    740 			if (ctx->options->layout->set[i].layout->shader_stages & stage_mask) {
    741 				set_loc_desc(ctx, i, user_sgpr_idx, 0);
    742 			} else
    743 				ctx->descriptor_sets[i] = NULL;
    744 		}
    745 	} else {
    746 		set_loc_shader(ctx, AC_UD_INDIRECT_DESCRIPTOR_SETS,
    747 			       user_sgpr_idx, 2);
    748 
    749 		for (unsigned i = 0; i < num_sets; ++i) {
    750 			if (ctx->options->layout->set[i].layout->shader_stages & stage_mask) {
    751 				set_loc_desc(ctx, i, user_sgpr_idx, i * 8);
    752 				ctx->descriptor_sets[i] =
    753 					ac_build_load_to_sgpr(&ctx->ac,
    754 							      desc_sets,
    755 							      LLVMConstInt(ctx->ac.i32, i, false));
    756 
    757 			} else
    758 				ctx->descriptor_sets[i] = NULL;
    759 		}
    760 		ctx->shader_info->need_indirect_descriptor_sets = true;
    761 	}
    762 
    763 	if (ctx->shader_info->info.loads_push_constants) {
    764 		set_loc_shader(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx, 2);
    765 	}
    766 }
    767 
    768 static void
    769 set_vs_specific_input_locs(struct nir_to_llvm_context *ctx,
    770 			   gl_shader_stage stage, bool has_previous_stage,
    771 			   gl_shader_stage previous_stage,
    772 			   uint8_t *user_sgpr_idx)
    773 {
    774 	if (!ctx->is_gs_copy_shader &&
    775 	    (stage == MESA_SHADER_VERTEX ||
    776 	     (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
    777 		if (ctx->shader_info->info.vs.has_vertex_buffers) {
    778 			set_loc_shader(ctx, AC_UD_VS_VERTEX_BUFFERS,
    779 				       user_sgpr_idx, 2);
    780 		}
    781 
    782 		unsigned vs_num = 2;
    783 		if (ctx->shader_info->info.vs.needs_draw_id)
    784 			vs_num++;
    785 
    786 		set_loc_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE,
    787 			       user_sgpr_idx, vs_num);
    788 	}
    789 }
    790 
    791 static void create_function(struct nir_to_llvm_context *ctx,
    792                             gl_shader_stage stage,
    793                             bool has_previous_stage,
    794                             gl_shader_stage previous_stage)
    795 {
    796 	uint8_t user_sgpr_idx;
    797 	struct user_sgpr_info user_sgpr_info;
    798 	struct arg_info args = {};
    799 	LLVMValueRef desc_sets;
    800 	bool needs_view_index = needs_view_index_sgpr(ctx, stage);
    801 	allocate_user_sgprs(ctx, stage, needs_view_index, &user_sgpr_info);
    802 
    803 	if (user_sgpr_info.need_ring_offsets && !ctx->options->supports_spill) {
    804 		add_arg(&args, ARG_SGPR, const_array(ctx->ac.v4i32, 16),
    805 			&ctx->ring_offsets);
    806 	}
    807 
    808 	switch (stage) {
    809 	case MESA_SHADER_COMPUTE:
    810 		declare_global_input_sgprs(ctx, stage, has_previous_stage,
    811 					   previous_stage, &user_sgpr_info,
    812 					   &args, &desc_sets);
    813 
    814 		if (ctx->shader_info->info.cs.uses_grid_size) {
    815 			add_arg(&args, ARG_SGPR, ctx->ac.v3i32,
    816 				&ctx->num_work_groups);
    817 		}
    818 
    819 		for (int i = 0; i < 3; i++) {
    820 			ctx->workgroup_ids[i] = NULL;
    821 			if (ctx->shader_info->info.cs.uses_block_id[i]) {
    822 				add_arg(&args, ARG_SGPR, ctx->ac.i32,
    823 					&ctx->workgroup_ids[i]);
    824 			}
    825 		}
    826 
    827 		if (ctx->shader_info->info.cs.uses_local_invocation_idx)
    828 			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->tg_size);
    829 		add_arg(&args, ARG_VGPR, ctx->ac.v3i32,
    830 			&ctx->local_invocation_ids);
    831 		break;
    832 	case MESA_SHADER_VERTEX:
    833 		declare_global_input_sgprs(ctx, stage, has_previous_stage,
    834 					   previous_stage, &user_sgpr_info,
    835 					   &args, &desc_sets);
    836 		declare_vs_specific_input_sgprs(ctx, stage, has_previous_stage,
    837 						previous_stage, &args);
    838 
    839 		if (needs_view_index)
    840 			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->view_index);
    841 		if (ctx->options->key.vs.as_es)
    842 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
    843 				&ctx->es2gs_offset);
    844 		else if (ctx->options->key.vs.as_ls)
    845 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
    846 				&ctx->ls_out_layout);
    847 
    848 		declare_vs_input_vgprs(ctx, &args);
    849 		break;
    850 	case MESA_SHADER_TESS_CTRL:
    851 		if (has_previous_stage) {
    852 			// First 6 system regs
    853 			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
    854 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
    855 				&ctx->merged_wave_info);
    856 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
    857 				&ctx->tess_factor_offset);
    858 
    859 			add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // scratch offset
    860 			add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown
    861 			add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown
    862 
    863 			declare_global_input_sgprs(ctx, stage,
    864 						   has_previous_stage,
    865 						   previous_stage,
    866 						   &user_sgpr_info, &args,
    867 						   &desc_sets);
    868 			declare_vs_specific_input_sgprs(ctx, stage,
    869 							has_previous_stage,
    870 							previous_stage, &args);
    871 
    872 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
    873 				&ctx->ls_out_layout);
    874 
    875 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
    876 				&ctx->tcs_offchip_layout);
    877 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
    878 				&ctx->tcs_out_offsets);
    879 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
    880 				&ctx->tcs_out_layout);
    881 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
    882 				&ctx->tcs_in_layout);
    883 			if (needs_view_index)
    884 				add_arg(&args, ARG_SGPR, ctx->ac.i32,
    885 					&ctx->view_index);
    886 
    887 			add_arg(&args, ARG_VGPR, ctx->ac.i32,
    888 				&ctx->abi.tcs_patch_id);
    889 			add_arg(&args, ARG_VGPR, ctx->ac.i32,
    890 				&ctx->abi.tcs_rel_ids);
    891 
    892 			declare_vs_input_vgprs(ctx, &args);
    893 		} else {
    894 			declare_global_input_sgprs(ctx, stage,
    895 						   has_previous_stage,
    896 						   previous_stage,
    897 						   &user_sgpr_info, &args,
    898 						   &desc_sets);
    899 
    900 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
    901 				&ctx->tcs_offchip_layout);
    902 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
    903 				&ctx->tcs_out_offsets);
    904 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
    905 				&ctx->tcs_out_layout);
    906 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
    907 				&ctx->tcs_in_layout);
    908 			if (needs_view_index)
    909 				add_arg(&args, ARG_SGPR, ctx->ac.i32,
    910 					&ctx->view_index);
    911 
    912 			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
    913 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
    914 				&ctx->tess_factor_offset);
    915 			add_arg(&args, ARG_VGPR, ctx->ac.i32,
    916 				&ctx->abi.tcs_patch_id);
    917 			add_arg(&args, ARG_VGPR, ctx->ac.i32,
    918 				&ctx->abi.tcs_rel_ids);
    919 		}
    920 		break;
    921 	case MESA_SHADER_TESS_EVAL:
    922 		declare_global_input_sgprs(ctx, stage, has_previous_stage,
    923 					   previous_stage, &user_sgpr_info,
    924 					   &args, &desc_sets);
    925 
    926 		add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->tcs_offchip_layout);
    927 		if (needs_view_index)
    928 			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->view_index);
    929 
    930 		if (ctx->options->key.tes.as_es) {
    931 			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
    932 			add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL);
    933 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
    934 				&ctx->es2gs_offset);
    935 		} else {
    936 			add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL);
    937 			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
    938 		}
    939 		declare_tes_input_vgprs(ctx, &args);
    940 		break;
    941 	case MESA_SHADER_GEOMETRY:
    942 		if (has_previous_stage) {
    943 			// First 6 system regs
    944 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
    945 				&ctx->gs2vs_offset);
    946 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
    947 				&ctx->merged_wave_info);
    948 			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
    949 
    950 			add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // scratch offset
    951 			add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown
    952 			add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown
    953 
    954 			declare_global_input_sgprs(ctx, stage,
    955 						   has_previous_stage,
    956 						   previous_stage,
    957 						   &user_sgpr_info, &args,
    958 						   &desc_sets);
    959 
    960 			if (previous_stage == MESA_SHADER_TESS_EVAL) {
    961 				add_arg(&args, ARG_SGPR, ctx->ac.i32,
    962 					&ctx->tcs_offchip_layout);
    963 			} else {
    964 				declare_vs_specific_input_sgprs(ctx, stage,
    965 								has_previous_stage,
    966 								previous_stage,
    967 								&args);
    968 			}
    969 
    970 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
    971 				&ctx->gsvs_ring_stride);
    972 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
    973 				&ctx->gsvs_num_entries);
    974 			if (needs_view_index)
    975 				add_arg(&args, ARG_SGPR, ctx->ac.i32,
    976 					&ctx->view_index);
    977 
    978 			add_arg(&args, ARG_VGPR, ctx->ac.i32,
    979 				&ctx->gs_vtx_offset[0]);
    980 			add_arg(&args, ARG_VGPR, ctx->ac.i32,
    981 				&ctx->gs_vtx_offset[2]);
    982 			add_arg(&args, ARG_VGPR, ctx->ac.i32,
    983 				&ctx->abi.gs_prim_id);
    984 			add_arg(&args, ARG_VGPR, ctx->ac.i32,
    985 				&ctx->abi.gs_invocation_id);
    986 			add_arg(&args, ARG_VGPR, ctx->ac.i32,
    987 				&ctx->gs_vtx_offset[4]);
    988 
    989 			if (previous_stage == MESA_SHADER_VERTEX) {
    990 				declare_vs_input_vgprs(ctx, &args);
    991 			} else {
    992 				declare_tes_input_vgprs(ctx, &args);
    993 			}
    994 		} else {
    995 			declare_global_input_sgprs(ctx, stage,
    996 						   has_previous_stage,
    997 						   previous_stage,
    998 						   &user_sgpr_info, &args,
    999 						   &desc_sets);
   1000 
   1001 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
   1002 				&ctx->gsvs_ring_stride);
   1003 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
   1004 				&ctx->gsvs_num_entries);
   1005 			if (needs_view_index)
   1006 				add_arg(&args, ARG_SGPR, ctx->ac.i32,
   1007 					&ctx->view_index);
   1008 
   1009 			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->gs2vs_offset);
   1010 			add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->gs_wave_id);
   1011 			add_arg(&args, ARG_VGPR, ctx->ac.i32,
   1012 				&ctx->gs_vtx_offset[0]);
   1013 			add_arg(&args, ARG_VGPR, ctx->ac.i32,
   1014 				&ctx->gs_vtx_offset[1]);
   1015 			add_arg(&args, ARG_VGPR, ctx->ac.i32,
   1016 				&ctx->abi.gs_prim_id);
   1017 			add_arg(&args, ARG_VGPR, ctx->ac.i32,
   1018 				&ctx->gs_vtx_offset[2]);
   1019 			add_arg(&args, ARG_VGPR, ctx->ac.i32,
   1020 				&ctx->gs_vtx_offset[3]);
   1021 			add_arg(&args, ARG_VGPR, ctx->ac.i32,
   1022 				&ctx->gs_vtx_offset[4]);
   1023 			add_arg(&args, ARG_VGPR, ctx->ac.i32,
   1024 				&ctx->gs_vtx_offset[5]);
   1025 			add_arg(&args, ARG_VGPR, ctx->ac.i32,
   1026 				&ctx->abi.gs_invocation_id);
   1027 		}
   1028 		break;
   1029 	case MESA_SHADER_FRAGMENT:
   1030 		declare_global_input_sgprs(ctx, stage, has_previous_stage,
   1031 					   previous_stage, &user_sgpr_info,
   1032 					   &args, &desc_sets);
   1033 
   1034 		if (ctx->shader_info->info.ps.needs_sample_positions)
   1035 			add_arg(&args, ARG_SGPR, ctx->ac.i32,
   1036 				&ctx->sample_pos_offset);
   1037 
   1038 		add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->prim_mask);
   1039 		add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_sample);
   1040 		add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_center);
   1041 		add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_centroid);
   1042 		add_arg(&args, ARG_VGPR, ctx->ac.v3i32, NULL); /* persp pull model */
   1043 		add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->linear_sample);
   1044 		add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->linear_center);
   1045 		add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->linear_centroid);
   1046 		add_arg(&args, ARG_VGPR, ctx->ac.f32, NULL);  /* line stipple tex */
   1047 		add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[0]);
   1048 		add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[1]);
   1049 		add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[2]);
   1050 		add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[3]);
   1051 		add_arg(&args, ARG_VGPR, ctx->ac.i32, &ctx->abi.front_face);
   1052 		add_arg(&args, ARG_VGPR, ctx->ac.i32, &ctx->abi.ancillary);
   1053 		add_arg(&args, ARG_VGPR, ctx->ac.i32, &ctx->abi.sample_coverage);
   1054 		add_arg(&args, ARG_VGPR, ctx->ac.i32, NULL);  /* fixed pt */
   1055 		break;
   1056 	default:
   1057 		unreachable("Shader stage not implemented");
   1058 	}
   1059 
   1060 	ctx->main_function = create_llvm_function(
   1061 	    ctx->context, ctx->module, ctx->builder, NULL, 0, &args,
   1062 	    ctx->max_workgroup_size,
   1063 	    ctx->options->unsafe_math);
   1064 	set_llvm_calling_convention(ctx->main_function, stage);
   1065 
   1066 
   1067 	ctx->shader_info->num_input_vgprs = 0;
   1068 	ctx->shader_info->num_input_sgprs = ctx->options->supports_spill ? 2 : 0;
   1069 
   1070 	ctx->shader_info->num_input_sgprs += args.num_sgprs_used;
   1071 
   1072 	if (ctx->stage != MESA_SHADER_FRAGMENT)
   1073 		ctx->shader_info->num_input_vgprs = args.num_vgprs_used;
   1074 
   1075 	assign_arguments(ctx->main_function, &args);
   1076 
   1077 	user_sgpr_idx = 0;
   1078 
   1079 	if (ctx->options->supports_spill || user_sgpr_info.need_ring_offsets) {
   1080 		set_loc_shader(ctx, AC_UD_SCRATCH_RING_OFFSETS,
   1081 			       &user_sgpr_idx, 2);
   1082 		if (ctx->options->supports_spill) {
   1083 			ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr",
   1084 							       LLVMPointerType(ctx->ac.i8, CONST_ADDR_SPACE),
   1085 							       NULL, 0, AC_FUNC_ATTR_READNONE);
   1086 			ctx->ring_offsets = LLVMBuildBitCast(ctx->builder, ctx->ring_offsets,
   1087 							     const_array(ctx->ac.v4i32, 16), "");
   1088 		}
   1089 	}
   1090 
   1091 	/* For merged shaders the user SGPRs start at 8, with 8 system SGPRs in front (including
   1092 	 * the rw_buffers at s0/s1. With user SGPR0 = s8, lets restart the count from 0 */
   1093 	if (has_previous_stage)
   1094 		user_sgpr_idx = 0;
   1095 
   1096 	set_global_input_locs(ctx, stage, has_previous_stage, previous_stage,
   1097 			      &user_sgpr_info, desc_sets, &user_sgpr_idx);
   1098 
   1099 	switch (stage) {
   1100 	case MESA_SHADER_COMPUTE:
   1101 		if (ctx->shader_info->info.cs.uses_grid_size) {
   1102 			set_loc_shader(ctx, AC_UD_CS_GRID_SIZE,
   1103 				       &user_sgpr_idx, 3);
   1104 		}
   1105 		break;
   1106 	case MESA_SHADER_VERTEX:
   1107 		set_vs_specific_input_locs(ctx, stage, has_previous_stage,
   1108 					   previous_stage, &user_sgpr_idx);
   1109 		if (ctx->view_index)
   1110 			set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
   1111 		if (ctx->options->key.vs.as_ls) {
   1112 			set_loc_shader(ctx, AC_UD_VS_LS_TCS_IN_LAYOUT,
   1113 				       &user_sgpr_idx, 1);
   1114 		}
   1115 		if (ctx->options->key.vs.as_ls)
   1116 			ac_declare_lds_as_pointer(&ctx->ac);
   1117 		break;
   1118 	case MESA_SHADER_TESS_CTRL:
   1119 		set_vs_specific_input_locs(ctx, stage, has_previous_stage,
   1120 					   previous_stage, &user_sgpr_idx);
   1121 		if (has_previous_stage)
   1122 			set_loc_shader(ctx, AC_UD_VS_LS_TCS_IN_LAYOUT,
   1123 				       &user_sgpr_idx, 1);
   1124 		set_loc_shader(ctx, AC_UD_TCS_OFFCHIP_LAYOUT, &user_sgpr_idx, 4);
   1125 		if (ctx->view_index)
   1126 			set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
   1127 		ac_declare_lds_as_pointer(&ctx->ac);
   1128 		break;
   1129 	case MESA_SHADER_TESS_EVAL:
   1130 		set_loc_shader(ctx, AC_UD_TES_OFFCHIP_LAYOUT, &user_sgpr_idx, 1);
   1131 		if (ctx->view_index)
   1132 			set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
   1133 		break;
   1134 	case MESA_SHADER_GEOMETRY:
   1135 		if (has_previous_stage) {
   1136 			if (previous_stage == MESA_SHADER_VERTEX)
   1137 				set_vs_specific_input_locs(ctx, stage,
   1138 							   has_previous_stage,
   1139 							   previous_stage,
   1140 							   &user_sgpr_idx);
   1141 			else
   1142 				set_loc_shader(ctx, AC_UD_TES_OFFCHIP_LAYOUT,
   1143 					       &user_sgpr_idx, 1);
   1144 		}
   1145 		set_loc_shader(ctx, AC_UD_GS_VS_RING_STRIDE_ENTRIES,
   1146 			       &user_sgpr_idx, 2);
   1147 		if (ctx->view_index)
   1148 			set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
   1149 		if (has_previous_stage)
   1150 			ac_declare_lds_as_pointer(&ctx->ac);
   1151 		break;
   1152 	case MESA_SHADER_FRAGMENT:
   1153 		if (ctx->shader_info->info.ps.needs_sample_positions) {
   1154 			set_loc_shader(ctx, AC_UD_PS_SAMPLE_POS_OFFSET,
   1155 				       &user_sgpr_idx, 1);
   1156 		}
   1157 		break;
   1158 	default:
   1159 		unreachable("Shader stage not implemented");
   1160 	}
   1161 
   1162 	ctx->shader_info->num_user_sgprs = user_sgpr_idx;
   1163 }
   1164 
   1165 static LLVMValueRef trim_vector(struct ac_llvm_context *ctx,
   1166                                 LLVMValueRef value, unsigned count)
   1167 {
   1168 	unsigned num_components = ac_get_llvm_num_components(value);
   1169 	if (count == num_components)
   1170 		return value;
   1171 
   1172 	LLVMValueRef masks[] = {
   1173 	    LLVMConstInt(ctx->i32, 0, false), LLVMConstInt(ctx->i32, 1, false),
   1174 	    LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false)};
   1175 
   1176 	if (count == 1)
   1177 		return LLVMBuildExtractElement(ctx->builder, value, masks[0],
   1178 		                               "");
   1179 
   1180 	LLVMValueRef swizzle = LLVMConstVector(masks, count);
   1181 	return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
   1182 }
   1183 
   1184 static void
   1185 build_store_values_extended(struct ac_llvm_context *ac,
   1186 			     LLVMValueRef *values,
   1187 			     unsigned value_count,
   1188 			     unsigned value_stride,
   1189 			     LLVMValueRef vec)
   1190 {
   1191 	LLVMBuilderRef builder = ac->builder;
   1192 	unsigned i;
   1193 
   1194 	for (i = 0; i < value_count; i++) {
   1195 		LLVMValueRef ptr = values[i * value_stride];
   1196 		LLVMValueRef index = LLVMConstInt(ac->i32, i, false);
   1197 		LLVMValueRef value = LLVMBuildExtractElement(builder, vec, index, "");
   1198 		LLVMBuildStore(builder, value, ptr);
   1199 	}
   1200 }
   1201 
   1202 static LLVMTypeRef get_def_type(struct ac_nir_context *ctx,
   1203                                 const nir_ssa_def *def)
   1204 {
   1205 	LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, def->bit_size);
   1206 	if (def->num_components > 1) {
   1207 		type = LLVMVectorType(type, def->num_components);
   1208 	}
   1209 	return type;
   1210 }
   1211 
   1212 static LLVMValueRef get_src(struct ac_nir_context *nir, nir_src src)
   1213 {
   1214 	assert(src.is_ssa);
   1215 	struct hash_entry *entry = _mesa_hash_table_search(nir->defs, src.ssa);
   1216 	return (LLVMValueRef)entry->data;
   1217 }
   1218 
   1219 
   1220 static LLVMBasicBlockRef get_block(struct ac_nir_context *nir,
   1221                                    const struct nir_block *b)
   1222 {
   1223 	struct hash_entry *entry = _mesa_hash_table_search(nir->defs, b);
   1224 	return (LLVMBasicBlockRef)entry->data;
   1225 }
   1226 
static LLVMValueRef get_alu_src(struct ac_nir_context *ctx,
                                nir_alu_src src,
                                unsigned num_components)
{
	/* Fetch an ALU source and apply its NIR swizzle, returning a value
	 * with exactly num_components components. */
	LLVMValueRef value = get_src(ctx, src.src);
	bool need_swizzle = false;

	assert(value);
	LLVMTypeRef type = LLVMTypeOf(value);
	unsigned src_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind
	                              ? LLVMGetVectorSize(type)
	                              : 1;

	/* An identity swizzle of matching width needs no shuffle at all. */
	for (unsigned i = 0; i < num_components; ++i) {
		assert(src.swizzle[i] < src_components);
		if (src.swizzle[i] != i)
			need_swizzle = true;
	}

	if (need_swizzle || num_components != src_components) {
		LLVMValueRef masks[] = {
		    LLVMConstInt(ctx->ac.i32, src.swizzle[0], false),
		    LLVMConstInt(ctx->ac.i32, src.swizzle[1], false),
		    LLVMConstInt(ctx->ac.i32, src.swizzle[2], false),
		    LLVMConstInt(ctx->ac.i32, src.swizzle[3], false)};

		if (src_components > 1 && num_components == 1) {
			/* Single-component result: extract the one swizzled lane. */
			value = LLVMBuildExtractElement(ctx->ac.builder, value,
			                                masks[0], "");
		} else if (src_components == 1 && num_components > 1) {
			/* Scalar source: replicate it into the wider vector. */
			LLVMValueRef values[] = {value, value, value, value};
			value = ac_build_gather_values(&ctx->ac, values, num_components);
		} else {
			/* General case: shuffle the lanes into place. */
			LLVMValueRef swizzle = LLVMConstVector(masks, num_components);
			value = LLVMBuildShuffleVector(ctx->ac.builder, value, value,
		                                       swizzle, "");
		}
	}
	/* Source modifiers are expected to have been lowered earlier. */
	assert(!src.negate);
	assert(!src.abs);
	return value;
}
   1269 
   1270 static LLVMValueRef emit_int_cmp(struct ac_llvm_context *ctx,
   1271                                  LLVMIntPredicate pred, LLVMValueRef src0,
   1272                                  LLVMValueRef src1)
   1273 {
   1274 	LLVMValueRef result = LLVMBuildICmp(ctx->builder, pred, src0, src1, "");
   1275 	return LLVMBuildSelect(ctx->builder, result,
   1276 	                       LLVMConstInt(ctx->i32, 0xFFFFFFFF, false),
   1277 	                       ctx->i32_0, "");
   1278 }
   1279 
   1280 static LLVMValueRef emit_float_cmp(struct ac_llvm_context *ctx,
   1281                                    LLVMRealPredicate pred, LLVMValueRef src0,
   1282                                    LLVMValueRef src1)
   1283 {
   1284 	LLVMValueRef result;
   1285 	src0 = ac_to_float(ctx, src0);
   1286 	src1 = ac_to_float(ctx, src1);
   1287 	result = LLVMBuildFCmp(ctx->builder, pred, src0, src1, "");
   1288 	return LLVMBuildSelect(ctx->builder, result,
   1289 	                       LLVMConstInt(ctx->i32, 0xFFFFFFFF, false),
   1290 			       ctx->i32_0, "");
   1291 }
   1292 
   1293 static LLVMValueRef emit_intrin_1f_param(struct ac_llvm_context *ctx,
   1294 					 const char *intrin,
   1295 					 LLVMTypeRef result_type,
   1296 					 LLVMValueRef src0)
   1297 {
   1298 	char name[64];
   1299 	LLVMValueRef params[] = {
   1300 		ac_to_float(ctx, src0),
   1301 	};
   1302 
   1303 	MAYBE_UNUSED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin,
   1304 						 get_elem_bits(ctx, result_type));
   1305 	assert(length < sizeof(name));
   1306 	return ac_build_intrinsic(ctx, name, result_type, params, 1, AC_FUNC_ATTR_READNONE);
   1307 }
   1308 
   1309 static LLVMValueRef emit_intrin_2f_param(struct ac_llvm_context *ctx,
   1310 				       const char *intrin,
   1311 				       LLVMTypeRef result_type,
   1312 				       LLVMValueRef src0, LLVMValueRef src1)
   1313 {
   1314 	char name[64];
   1315 	LLVMValueRef params[] = {
   1316 		ac_to_float(ctx, src0),
   1317 		ac_to_float(ctx, src1),
   1318 	};
   1319 
   1320 	MAYBE_UNUSED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin,
   1321 						 get_elem_bits(ctx, result_type));
   1322 	assert(length < sizeof(name));
   1323 	return ac_build_intrinsic(ctx, name, result_type, params, 2, AC_FUNC_ATTR_READNONE);
   1324 }
   1325 
   1326 static LLVMValueRef emit_intrin_3f_param(struct ac_llvm_context *ctx,
   1327 					 const char *intrin,
   1328 					 LLVMTypeRef result_type,
   1329 					 LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2)
   1330 {
   1331 	char name[64];
   1332 	LLVMValueRef params[] = {
   1333 		ac_to_float(ctx, src0),
   1334 		ac_to_float(ctx, src1),
   1335 		ac_to_float(ctx, src2),
   1336 	};
   1337 
   1338 	MAYBE_UNUSED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin,
   1339 						 get_elem_bits(ctx, result_type));
   1340 	assert(length < sizeof(name));
   1341 	return ac_build_intrinsic(ctx, name, result_type, params, 3, AC_FUNC_ATTR_READNONE);
   1342 }
   1343 
   1344 static LLVMValueRef emit_bcsel(struct ac_llvm_context *ctx,
   1345 			       LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2)
   1346 {
   1347 	LLVMValueRef v = LLVMBuildICmp(ctx->builder, LLVMIntNE, src0,
   1348 				       ctx->i32_0, "");
   1349 	return LLVMBuildSelect(ctx->builder, v, ac_to_integer(ctx, src1),
   1350 			       ac_to_integer(ctx, src2), "");
   1351 }
   1352 
   1353 static LLVMValueRef emit_minmax_int(struct ac_llvm_context *ctx,
   1354 				    LLVMIntPredicate pred,
   1355 				    LLVMValueRef src0, LLVMValueRef src1)
   1356 {
   1357 	return LLVMBuildSelect(ctx->builder,
   1358 			       LLVMBuildICmp(ctx->builder, pred, src0, src1, ""),
   1359 			       src0,
   1360 			       src1, "");
   1361 
   1362 }
   1363 static LLVMValueRef emit_iabs(struct ac_llvm_context *ctx,
   1364 			      LLVMValueRef src0)
   1365 {
   1366 	return emit_minmax_int(ctx, LLVMIntSGT, src0,
   1367 			       LLVMBuildNeg(ctx->builder, src0, ""));
   1368 }
   1369 
   1370 static LLVMValueRef emit_fsign(struct ac_llvm_context *ctx,
   1371 			       LLVMValueRef src0,
   1372 			       unsigned bitsize)
   1373 {
   1374 	LLVMValueRef cmp, val, zero, one;
   1375 	LLVMTypeRef type;
   1376 
   1377 	if (bitsize == 32) {
   1378 		type = ctx->f32;
   1379 		zero = ctx->f32_0;
   1380 		one = ctx->f32_1;
   1381 	} else {
   1382 		type = ctx->f64;
   1383 		zero = ctx->f64_0;
   1384 		one = ctx->f64_1;
   1385 	}
   1386 
   1387 	cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, "");
   1388 	val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
   1389 	cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, "");
   1390 	val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(type, -1.0), "");
   1391 	return val;
   1392 }
   1393 
   1394 static LLVMValueRef emit_isign(struct ac_llvm_context *ctx,
   1395 			       LLVMValueRef src0, unsigned bitsize)
   1396 {
   1397 	LLVMValueRef cmp, val, zero, one;
   1398 	LLVMTypeRef type;
   1399 
   1400 	if (bitsize == 32) {
   1401 		type = ctx->i32;
   1402 		zero = ctx->i32_0;
   1403 		one = ctx->i32_1;
   1404 	} else {
   1405 		type = ctx->i64;
   1406 		zero = ctx->i64_0;
   1407 		one = ctx->i64_1;
   1408 	}
   1409 
   1410 	cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, "");
   1411 	val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
   1412 	cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, "");
   1413 	val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(type, -1, true), "");
   1414 	return val;
   1415 }
   1416 
   1417 static LLVMValueRef emit_ffract(struct ac_llvm_context *ctx,
   1418 				LLVMValueRef src0)
   1419 {
   1420 	const char *intr = "llvm.floor.f32";
   1421 	LLVMValueRef fsrc0 = ac_to_float(ctx, src0);
   1422 	LLVMValueRef params[] = {
   1423 		fsrc0,
   1424 	};
   1425 	LLVMValueRef floor = ac_build_intrinsic(ctx, intr,
   1426 						ctx->f32, params, 1,
   1427 						AC_FUNC_ATTR_READNONE);
   1428 	return LLVMBuildFSub(ctx->builder, fsrc0, floor, "");
   1429 }
   1430 
   1431 static LLVMValueRef emit_uint_carry(struct ac_llvm_context *ctx,
   1432 				    const char *intrin,
   1433 				    LLVMValueRef src0, LLVMValueRef src1)
   1434 {
   1435 	LLVMTypeRef ret_type;
   1436 	LLVMTypeRef types[] = { ctx->i32, ctx->i1 };
   1437 	LLVMValueRef res;
   1438 	LLVMValueRef params[] = { src0, src1 };
   1439 	ret_type = LLVMStructTypeInContext(ctx->context, types,
   1440 					   2, true);
   1441 
   1442 	res = ac_build_intrinsic(ctx, intrin, ret_type,
   1443 				 params, 2, AC_FUNC_ATTR_READNONE);
   1444 
   1445 	res = LLVMBuildExtractValue(ctx->builder, res, 1, "");
   1446 	res = LLVMBuildZExt(ctx->builder, res, ctx->i32, "");
   1447 	return res;
   1448 }
   1449 
   1450 static LLVMValueRef emit_b2f(struct ac_llvm_context *ctx,
   1451 			     LLVMValueRef src0)
   1452 {
   1453 	return LLVMBuildAnd(ctx->builder, src0, LLVMBuildBitCast(ctx->builder, LLVMConstReal(ctx->f32, 1.0), ctx->i32, ""), "");
   1454 }
   1455 
   1456 static LLVMValueRef emit_f2b(struct ac_llvm_context *ctx,
   1457 			     LLVMValueRef src0)
   1458 {
   1459 	src0 = ac_to_float(ctx, src0);
   1460 	return LLVMBuildSExt(ctx->builder,
   1461 			     LLVMBuildFCmp(ctx->builder, LLVMRealUNE, src0, ctx->f32_0, ""),
   1462 			     ctx->i32, "");
   1463 }
   1464 
   1465 static LLVMValueRef emit_b2i(struct ac_llvm_context *ctx,
   1466 			     LLVMValueRef src0,
   1467 			     unsigned bitsize)
   1468 {
   1469 	LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0, ctx->i32_1, "");
   1470 
   1471 	if (bitsize == 32)
   1472 		return result;
   1473 
   1474 	return LLVMBuildZExt(ctx->builder, result, ctx->i64, "");
   1475 }
   1476 
   1477 static LLVMValueRef emit_i2b(struct ac_llvm_context *ctx,
   1478 			     LLVMValueRef src0)
   1479 {
   1480 	return LLVMBuildSExt(ctx->builder,
   1481 			     LLVMBuildICmp(ctx->builder, LLVMIntNE, src0, ctx->i32_0, ""),
   1482 			     ctx->i32, "");
   1483 }
   1484 
static LLVMValueRef emit_f2f16(struct nir_to_llvm_context *ctx,
			       LLVMValueRef src0)
{
	/* Quantize an f32 to f16 precision, flushing f16 denormals to zero,
	 * and return the result widened back to f32.  The denormal check is
	 * chip-specific: VI+ uses the amdgcn.class.f16 intrinsic, older
	 * chips compare magnitudes against the smallest normal half. */
	LLVMValueRef result;
	LLVMValueRef cond = NULL;

	src0 = ac_to_float(&ctx->ac, src0);
	result = LLVMBuildFPTrunc(ctx->builder, src0, ctx->ac.f16, "");

	if (ctx->options->chip_class >= VI) {
		LLVMValueRef args[2];
		/* Check if the result is a denormal - and flush to 0 if so. */
		args[0] = result;
		args[1] = LLVMConstInt(ctx->ac.i32, N_SUBNORMAL | P_SUBNORMAL, false);
		cond = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f16", ctx->ac.i1, args, 2, AC_FUNC_ATTR_READNONE);
	}

	/* need to convert back up to f32 */
	result = LLVMBuildFPExt(ctx->builder, result, ctx->ac.f32, "");

	if (ctx->options->chip_class >= VI)
		result = LLVMBuildSelect(ctx->builder, cond, ctx->ac.f32_0, result, "");
	else {
		/* for SI/CIK */
		/* 0x38800000 is smallest half float value (2^-14) in 32-bit float,
		 * so compare the result and flush to 0 if it's smaller.
		 */
		LLVMValueRef temp, cond2;
		/* cond = |result| < 2^-14, cond2 = |result| != 0: only
		 * non-zero denormals are flushed, zero is kept as-is. */
		temp = emit_intrin_1f_param(&ctx->ac, "llvm.fabs",
					    ctx->ac.f32, result);
		cond = LLVMBuildFCmp(ctx->builder, LLVMRealUGT,
				     LLVMBuildBitCast(ctx->builder, LLVMConstInt(ctx->ac.i32, 0x38800000, false), ctx->ac.f32, ""),
				     temp, "");
		cond2 = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
				      temp, ctx->ac.f32_0, "");
		cond = LLVMBuildAnd(ctx->builder, cond, cond2, "");
		result = LLVMBuildSelect(ctx->builder, cond, ctx->ac.f32_0, result, "");
	}
	return result;
}
   1525 
   1526 static LLVMValueRef emit_umul_high(struct ac_llvm_context *ctx,
   1527 				   LLVMValueRef src0, LLVMValueRef src1)
   1528 {
   1529 	LLVMValueRef dst64, result;
   1530 	src0 = LLVMBuildZExt(ctx->builder, src0, ctx->i64, "");
   1531 	src1 = LLVMBuildZExt(ctx->builder, src1, ctx->i64, "");
   1532 
   1533 	dst64 = LLVMBuildMul(ctx->builder, src0, src1, "");
   1534 	dst64 = LLVMBuildLShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), "");
   1535 	result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, "");
   1536 	return result;
   1537 }
   1538 
   1539 static LLVMValueRef emit_imul_high(struct ac_llvm_context *ctx,
   1540 				   LLVMValueRef src0, LLVMValueRef src1)
   1541 {
   1542 	LLVMValueRef dst64, result;
   1543 	src0 = LLVMBuildSExt(ctx->builder, src0, ctx->i64, "");
   1544 	src1 = LLVMBuildSExt(ctx->builder, src1, ctx->i64, "");
   1545 
   1546 	dst64 = LLVMBuildMul(ctx->builder, src0, src1, "");
   1547 	dst64 = LLVMBuildAShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), "");
   1548 	result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, "");
   1549 	return result;
   1550 }
   1551 
   1552 static LLVMValueRef emit_bitfield_extract(struct ac_llvm_context *ctx,
   1553 					  bool is_signed,
   1554 					  const LLVMValueRef srcs[3])
   1555 {
   1556 	LLVMValueRef result;
   1557 	LLVMValueRef icond = LLVMBuildICmp(ctx->builder, LLVMIntEQ, srcs[2], LLVMConstInt(ctx->i32, 32, false), "");
   1558 
   1559 	result = ac_build_bfe(ctx, srcs[0], srcs[1], srcs[2], is_signed);
   1560 	result = LLVMBuildSelect(ctx->builder, icond, srcs[0], result, "");
   1561 	return result;
   1562 }
   1563 
static LLVMValueRef emit_bitfield_insert(struct ac_llvm_context *ctx,
					 LLVMValueRef src0, LLVMValueRef src1,
					 LLVMValueRef src2, LLVMValueRef src3)
{
	/* bitfield_insert(base=src0, insert=src1, offset=src2, bits=src3):
	 * bfi_args[0] = mask of the destination field ((1 << bits) - 1) << offset
	 * bfi_args[1] = insert value shifted into position
	 * bfi_args[2] = base value supplying the bits outside the field */
	LLVMValueRef bfi_args[3], result;

	bfi_args[0] = LLVMBuildShl(ctx->builder,
				   LLVMBuildSub(ctx->builder,
						LLVMBuildShl(ctx->builder,
							     ctx->i32_1,
							     src3, ""),
						ctx->i32_1, ""),
				   src2, "");
	bfi_args[1] = LLVMBuildShl(ctx->builder, src1, src2, "");
	bfi_args[2] = src0;

	/* bits == 32 is special-cased below: the shift-based mask does not
	 * cover it, so the insert value replaces the whole word. */
	LLVMValueRef icond = LLVMBuildICmp(ctx->builder, LLVMIntEQ, src3, LLVMConstInt(ctx->i32, 32, false), "");

	/* Calculate:
	 *   (arg0 & arg1) | (~arg0 & arg2) = arg2 ^ (arg0 & (arg1 ^ arg2)
	 * Use the right-hand side, which the LLVM backend can convert to V_BFI.
	 */
	result = LLVMBuildXor(ctx->builder, bfi_args[2],
			      LLVMBuildAnd(ctx->builder, bfi_args[0],
					   LLVMBuildXor(ctx->builder, bfi_args[1], bfi_args[2], ""), ""), "");

	result = LLVMBuildSelect(ctx->builder, icond, src1, result, "");
	return result;
}
   1593 
   1594 static LLVMValueRef emit_pack_half_2x16(struct ac_llvm_context *ctx,
   1595 					LLVMValueRef src0)
   1596 {
   1597 	LLVMValueRef comp[2];
   1598 
   1599 	src0 = ac_to_float(ctx, src0);
   1600 	comp[0] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_0, "");
   1601 	comp[1] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_1, "");
   1602 
   1603 	return ac_build_cvt_pkrtz_f16(ctx, comp);
   1604 }
   1605 
   1606 static LLVMValueRef emit_unpack_half_2x16(struct ac_llvm_context *ctx,
   1607 					  LLVMValueRef src0)
   1608 {
   1609 	LLVMValueRef const16 = LLVMConstInt(ctx->i32, 16, false);
   1610 	LLVMValueRef temps[2], result, val;
   1611 	int i;
   1612 
   1613 	for (i = 0; i < 2; i++) {
   1614 		val = i == 1 ? LLVMBuildLShr(ctx->builder, src0, const16, "") : src0;
   1615 		val = LLVMBuildTrunc(ctx->builder, val, ctx->i16, "");
   1616 		val = LLVMBuildBitCast(ctx->builder, val, ctx->f16, "");
   1617 		temps[i] = LLVMBuildFPExt(ctx->builder, val, ctx->f32, "");
   1618 	}
   1619 
   1620 	result = LLVMBuildInsertElement(ctx->builder, LLVMGetUndef(ctx->v2f32), temps[0],
   1621 					ctx->i32_0, "");
   1622 	result = LLVMBuildInsertElement(ctx->builder, result, temps[1],
   1623 					ctx->i32_1, "");
   1624 	return result;
   1625 }
   1626 
   1627 static LLVMValueRef emit_ddxy(struct ac_nir_context *ctx,
   1628 			      nir_op op,
   1629 			      LLVMValueRef src0)
   1630 {
   1631 	unsigned mask;
   1632 	int idx;
   1633 	LLVMValueRef result;
   1634 
   1635 	if (op == nir_op_fddx_fine || op == nir_op_fddx)
   1636 		mask = AC_TID_MASK_LEFT;
   1637 	else if (op == nir_op_fddy_fine || op == nir_op_fddy)
   1638 		mask = AC_TID_MASK_TOP;
   1639 	else
   1640 		mask = AC_TID_MASK_TOP_LEFT;
   1641 
   1642 	/* for DDX we want to next X pixel, DDY next Y pixel. */
   1643 	if (op == nir_op_fddx_fine ||
   1644 	    op == nir_op_fddx_coarse ||
   1645 	    op == nir_op_fddx)
   1646 		idx = 1;
   1647 	else
   1648 		idx = 2;
   1649 
   1650 	result = ac_build_ddxy(&ctx->ac, mask, idx, src0);
   1651 	return result;
   1652 }
   1653 
   1654 /*
   1655  * this takes an I,J coordinate pair,
   1656  * and works out the X and Y derivatives.
   1657  * it returns DDX(I), DDX(J), DDY(I), DDY(J).
   1658  */
   1659 static LLVMValueRef emit_ddxy_interp(
   1660 	struct ac_nir_context *ctx,
   1661 	LLVMValueRef interp_ij)
   1662 {
   1663 	LLVMValueRef result[4], a;
   1664 	unsigned i;
   1665 
   1666 	for (i = 0; i < 2; i++) {
   1667 		a = LLVMBuildExtractElement(ctx->ac.builder, interp_ij,
   1668 					    LLVMConstInt(ctx->ac.i32, i, false), "");
   1669 		result[i] = emit_ddxy(ctx, nir_op_fddx, a);
   1670 		result[2+i] = emit_ddxy(ctx, nir_op_fddy, a);
   1671 	}
   1672 	return ac_build_gather_values(&ctx->ac, result, 4);
   1673 }
   1674 
/* Translate one NIR ALU instruction into LLVM IR.
 *
 * NIR SSA defs are kept as integer-typed LLVM values in ctx->defs;
 * ac_to_float()/ac_to_integer() bitcast operands to the representation
 * each opcode needs, and the result is cast back to integer before
 * being recorded in the hash table at the end.
 */
static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
{
	LLVMValueRef src[4], result = NULL;
	unsigned num_components = instr->dest.dest.ssa.num_components;
	unsigned src_components;
	LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.dest.ssa);

	assert(nir_op_infos[instr->op].num_inputs <= ARRAY_SIZE(src));
	/* How many components each source must supply: vecN constructors
	 * consume one scalar per source, pack/unpack have fixed widths,
	 * everything else matches the destination width. */
	switch (instr->op) {
	case nir_op_vec2:
	case nir_op_vec3:
	case nir_op_vec4:
		src_components = 1;
		break;
	case nir_op_pack_half_2x16:
		src_components = 2;
		break;
	case nir_op_unpack_half_2x16:
		src_components = 1;
		break;
	default:
		src_components = num_components;
		break;
	}
	for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
		src[i] = get_alu_src(ctx, instr->src[i], src_components);

	/* Emit the LLVM equivalent of the opcode. */
	switch (instr->op) {
	case nir_op_fmov:
	case nir_op_imov:
		result = src[0];
		break;
	case nir_op_fneg:
	        src[0] = ac_to_float(&ctx->ac, src[0]);
		result = LLVMBuildFNeg(ctx->ac.builder, src[0], "");
		break;
	case nir_op_ineg:
		result = LLVMBuildNeg(ctx->ac.builder, src[0], "");
		break;
	case nir_op_inot:
		result = LLVMBuildNot(ctx->ac.builder, src[0], "");
		break;
	case nir_op_iadd:
		result = LLVMBuildAdd(ctx->ac.builder, src[0], src[1], "");
		break;
	case nir_op_fadd:
		src[0] = ac_to_float(&ctx->ac, src[0]);
		src[1] = ac_to_float(&ctx->ac, src[1]);
		result = LLVMBuildFAdd(ctx->ac.builder, src[0], src[1], "");
		break;
	case nir_op_fsub:
		src[0] = ac_to_float(&ctx->ac, src[0]);
		src[1] = ac_to_float(&ctx->ac, src[1]);
		result = LLVMBuildFSub(ctx->ac.builder, src[0], src[1], "");
		break;
	case nir_op_isub:
		result = LLVMBuildSub(ctx->ac.builder, src[0], src[1], "");
		break;
	case nir_op_imul:
		result = LLVMBuildMul(ctx->ac.builder, src[0], src[1], "");
		break;
	case nir_op_imod:
		result = LLVMBuildSRem(ctx->ac.builder, src[0], src[1], "");
		break;
	case nir_op_umod:
		result = LLVMBuildURem(ctx->ac.builder, src[0], src[1], "");
		break;
	case nir_op_fmod:
		/* fmod(x, y) expanded as x - y * floor(x / y). */
		src[0] = ac_to_float(&ctx->ac, src[0]);
		src[1] = ac_to_float(&ctx->ac, src[1]);
		result = ac_build_fdiv(&ctx->ac, src[0], src[1]);
		result = emit_intrin_1f_param(&ctx->ac, "llvm.floor",
		                              ac_to_float_type(&ctx->ac, def_type), result);
		result = LLVMBuildFMul(ctx->ac.builder, src[1] , result, "");
		result = LLVMBuildFSub(ctx->ac.builder, src[0], result, "");
		break;
	case nir_op_frem:
		src[0] = ac_to_float(&ctx->ac, src[0]);
		src[1] = ac_to_float(&ctx->ac, src[1]);
		result = LLVMBuildFRem(ctx->ac.builder, src[0], src[1], "");
		break;
	case nir_op_irem:
		result = LLVMBuildSRem(ctx->ac.builder, src[0], src[1], "");
		break;
	case nir_op_idiv:
		result = LLVMBuildSDiv(ctx->ac.builder, src[0], src[1], "");
		break;
	case nir_op_udiv:
		result = LLVMBuildUDiv(ctx->ac.builder, src[0], src[1], "");
		break;
	case nir_op_fmul:
		src[0] = ac_to_float(&ctx->ac, src[0]);
		src[1] = ac_to_float(&ctx->ac, src[1]);
		result = LLVMBuildFMul(ctx->ac.builder, src[0], src[1], "");
		break;
	case nir_op_fdiv:
		src[0] = ac_to_float(&ctx->ac, src[0]);
		src[1] = ac_to_float(&ctx->ac, src[1]);
		result = ac_build_fdiv(&ctx->ac, src[0], src[1]);
		break;
	case nir_op_frcp:
		/* 1.0 / x, with the constant 1.0 matching the dest bit size. */
		src[0] = ac_to_float(&ctx->ac, src[0]);
		result = ac_build_fdiv(&ctx->ac, instr->dest.dest.ssa.bit_size == 32 ? ctx->ac.f32_1 : ctx->ac.f64_1,
				       src[0]);
		break;
	case nir_op_iand:
		result = LLVMBuildAnd(ctx->ac.builder, src[0], src[1], "");
		break;
	case nir_op_ior:
		result = LLVMBuildOr(ctx->ac.builder, src[0], src[1], "");
		break;
	case nir_op_ixor:
		result = LLVMBuildXor(ctx->ac.builder, src[0], src[1], "");
		break;
	/* Shifts: the shift amount may be narrower than the value, so it is
	 * zero-extended to the value's type first. */
	case nir_op_ishl:
		result = LLVMBuildShl(ctx->ac.builder, src[0],
				      LLVMBuildZExt(ctx->ac.builder, src[1],
						    LLVMTypeOf(src[0]), ""),
				      "");
		break;
	case nir_op_ishr:
		result = LLVMBuildAShr(ctx->ac.builder, src[0],
				       LLVMBuildZExt(ctx->ac.builder, src[1],
						     LLVMTypeOf(src[0]), ""),
				       "");
		break;
	case nir_op_ushr:
		result = LLVMBuildLShr(ctx->ac.builder, src[0],
				       LLVMBuildZExt(ctx->ac.builder, src[1],
						     LLVMTypeOf(src[0]), ""),
				       "");
		break;
	case nir_op_ilt:
		result = emit_int_cmp(&ctx->ac, LLVMIntSLT, src[0], src[1]);
		break;
	case nir_op_ine:
		result = emit_int_cmp(&ctx->ac, LLVMIntNE, src[0], src[1]);
		break;
	case nir_op_ieq:
		result = emit_int_cmp(&ctx->ac, LLVMIntEQ, src[0], src[1]);
		break;
	case nir_op_ige:
		result = emit_int_cmp(&ctx->ac, LLVMIntSGE, src[0], src[1]);
		break;
	case nir_op_ult:
		result = emit_int_cmp(&ctx->ac, LLVMIntULT, src[0], src[1]);
		break;
	case nir_op_uge:
		result = emit_int_cmp(&ctx->ac, LLVMIntUGE, src[0], src[1]);
		break;
	/* Float compares: ordered (O*) except fne, which must be unordered
	 * so that comparing with NaN yields true. */
	case nir_op_feq:
		result = emit_float_cmp(&ctx->ac, LLVMRealOEQ, src[0], src[1]);
		break;
	case nir_op_fne:
		result = emit_float_cmp(&ctx->ac, LLVMRealUNE, src[0], src[1]);
		break;
	case nir_op_flt:
		result = emit_float_cmp(&ctx->ac, LLVMRealOLT, src[0], src[1]);
		break;
	case nir_op_fge:
		result = emit_float_cmp(&ctx->ac, LLVMRealOGE, src[0], src[1]);
		break;
	case nir_op_fabs:
		result = emit_intrin_1f_param(&ctx->ac, "llvm.fabs",
		                              ac_to_float_type(&ctx->ac, def_type), src[0]);
		break;
	case nir_op_iabs:
		result = emit_iabs(&ctx->ac, src[0]);
		break;
	case nir_op_imax:
		result = emit_minmax_int(&ctx->ac, LLVMIntSGT, src[0], src[1]);
		break;
	case nir_op_imin:
		result = emit_minmax_int(&ctx->ac, LLVMIntSLT, src[0], src[1]);
		break;
	case nir_op_umax:
		result = emit_minmax_int(&ctx->ac, LLVMIntUGT, src[0], src[1]);
		break;
	case nir_op_umin:
		result = emit_minmax_int(&ctx->ac, LLVMIntULT, src[0], src[1]);
		break;
	case nir_op_isign:
		result = emit_isign(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size);
		break;
	case nir_op_fsign:
		src[0] = ac_to_float(&ctx->ac, src[0]);
		result = emit_fsign(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size);
		break;
	case nir_op_ffloor:
		result = emit_intrin_1f_param(&ctx->ac, "llvm.floor",
		                              ac_to_float_type(&ctx->ac, def_type), src[0]);
		break;
	case nir_op_ftrunc:
		result = emit_intrin_1f_param(&ctx->ac, "llvm.trunc",
		                              ac_to_float_type(&ctx->ac, def_type), src[0]);
		break;
	case nir_op_fceil:
		result = emit_intrin_1f_param(&ctx->ac, "llvm.ceil",
		                              ac_to_float_type(&ctx->ac, def_type), src[0]);
		break;
	case nir_op_fround_even:
		result = emit_intrin_1f_param(&ctx->ac, "llvm.rint",
		                              ac_to_float_type(&ctx->ac, def_type),src[0]);
		break;
	case nir_op_ffract:
		result = emit_ffract(&ctx->ac, src[0]);
		break;
	case nir_op_fsin:
		result = emit_intrin_1f_param(&ctx->ac, "llvm.sin",
		                              ac_to_float_type(&ctx->ac, def_type), src[0]);
		break;
	case nir_op_fcos:
		result = emit_intrin_1f_param(&ctx->ac, "llvm.cos",
		                              ac_to_float_type(&ctx->ac, def_type), src[0]);
		break;
	case nir_op_fsqrt:
		result = emit_intrin_1f_param(&ctx->ac, "llvm.sqrt",
		                              ac_to_float_type(&ctx->ac, def_type), src[0]);
		break;
	case nir_op_fexp2:
		result = emit_intrin_1f_param(&ctx->ac, "llvm.exp2",
		                              ac_to_float_type(&ctx->ac, def_type), src[0]);
		break;
	case nir_op_flog2:
		result = emit_intrin_1f_param(&ctx->ac, "llvm.log2",
		                              ac_to_float_type(&ctx->ac, def_type), src[0]);
		break;
	case nir_op_frsq:
		/* rsq lowered as 1.0 / sqrt(x). */
		result = emit_intrin_1f_param(&ctx->ac, "llvm.sqrt",
		                              ac_to_float_type(&ctx->ac, def_type), src[0]);
		result = ac_build_fdiv(&ctx->ac, instr->dest.dest.ssa.bit_size == 32 ? ctx->ac.f32_1 : ctx->ac.f64_1,
				       result);
		break;
	case nir_op_fpow:
		result = emit_intrin_2f_param(&ctx->ac, "llvm.pow",
		                              ac_to_float_type(&ctx->ac, def_type), src[0], src[1]);
		break;
	case nir_op_fmax:
		/* The 32-bit result is additionally run through
		 * llvm.canonicalize — NOTE(review): presumably to normalize
		 * NaN/denormal results of maxnum on this hardware; confirm. */
		result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum",
		                              ac_to_float_type(&ctx->ac, def_type), src[0], src[1]);
		if (instr->dest.dest.ssa.bit_size == 32)
			result = emit_intrin_1f_param(&ctx->ac, "llvm.canonicalize",
						      ac_to_float_type(&ctx->ac, def_type),
						      result);
		break;
	case nir_op_fmin:
		result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum",
		                              ac_to_float_type(&ctx->ac, def_type), src[0], src[1]);
		if (instr->dest.dest.ssa.bit_size == 32)
			result = emit_intrin_1f_param(&ctx->ac, "llvm.canonicalize",
						      ac_to_float_type(&ctx->ac, def_type),
						      result);
		break;
	case nir_op_ffma:
		result = emit_intrin_3f_param(&ctx->ac, "llvm.fmuladd",
		                              ac_to_float_type(&ctx->ac, def_type), src[0], src[1], src[2]);
		break;
	case nir_op_ibitfield_extract:
		result = emit_bitfield_extract(&ctx->ac, true, src);
		break;
	case nir_op_ubitfield_extract:
		result = emit_bitfield_extract(&ctx->ac, false, src);
		break;
	case nir_op_bitfield_insert:
		result = emit_bitfield_insert(&ctx->ac, src[0], src[1], src[2], src[3]);
		break;
	/* These two use intrinsic names hardcoded to i32, so they only
	 * handle 32-bit operands. */
	case nir_op_bitfield_reverse:
		result = ac_build_intrinsic(&ctx->ac, "llvm.bitreverse.i32", ctx->ac.i32, src, 1, AC_FUNC_ATTR_READNONE);
		break;
	case nir_op_bit_count:
		result = ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i32", ctx->ac.i32, src, 1, AC_FUNC_ATTR_READNONE);
		break;
	case nir_op_vec2:
	case nir_op_vec3:
	case nir_op_vec4:
		/* Gather the scalar sources into a vector; cast each to the
		 * canonical integer representation first. */
		for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
			src[i] = ac_to_integer(&ctx->ac, src[i]);
		result = ac_build_gather_values(&ctx->ac, src, num_components);
		break;
	case nir_op_f2i32:
	case nir_op_f2i64:
		src[0] = ac_to_float(&ctx->ac, src[0]);
		result = LLVMBuildFPToSI(ctx->ac.builder, src[0], def_type, "");
		break;
	case nir_op_f2u32:
	case nir_op_f2u64:
		src[0] = ac_to_float(&ctx->ac, src[0]);
		result = LLVMBuildFPToUI(ctx->ac.builder, src[0], def_type, "");
		break;
	case nir_op_i2f32:
	case nir_op_i2f64:
		src[0] = ac_to_integer(&ctx->ac, src[0]);
		result = LLVMBuildSIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
		break;
	case nir_op_u2f32:
	case nir_op_u2f64:
		src[0] = ac_to_integer(&ctx->ac, src[0]);
		result = LLVMBuildUIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
		break;
	case nir_op_f2f64:
		src[0] = ac_to_float(&ctx->ac, src[0]);
		result = LLVMBuildFPExt(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
		break;
	case nir_op_f2f32:
		result = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
		break;
	/* Integer resizes: extend when the source is narrower than the
	 * destination, truncate otherwise. */
	case nir_op_u2u32:
	case nir_op_u2u64:
		src[0] = ac_to_integer(&ctx->ac, src[0]);
		if (get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < get_elem_bits(&ctx->ac, def_type))
			result = LLVMBuildZExt(ctx->ac.builder, src[0], def_type, "");
		else
			result = LLVMBuildTrunc(ctx->ac.builder, src[0], def_type, "");
		break;
	case nir_op_i2i32:
	case nir_op_i2i64:
		src[0] = ac_to_integer(&ctx->ac, src[0]);
		if (get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < get_elem_bits(&ctx->ac, def_type))
			result = LLVMBuildSExt(ctx->ac.builder, src[0], def_type, "")
;
		else
			result = LLVMBuildTrunc(ctx->ac.builder, src[0], def_type, "");
		break;
	case nir_op_bcsel:
		result = emit_bcsel(&ctx->ac, src[0], src[1], src[2]);
		break;
	case nir_op_find_lsb:
		src[0] = ac_to_integer(&ctx->ac, src[0]);
		result = ac_find_lsb(&ctx->ac, ctx->ac.i32, src[0]);
		break;
	case nir_op_ufind_msb:
		src[0] = ac_to_integer(&ctx->ac, src[0]);
		result = ac_build_umsb(&ctx->ac, src[0], ctx->ac.i32);
		break;
	case nir_op_ifind_msb:
		src[0] = ac_to_integer(&ctx->ac, src[0]);
		result = ac_build_imsb(&ctx->ac, src[0], ctx->ac.i32);
		break;
	case nir_op_uadd_carry:
		src[0] = ac_to_integer(&ctx->ac, src[0]);
		src[1] = ac_to_integer(&ctx->ac, src[1]);
		result = emit_uint_carry(&ctx->ac, "llvm.uadd.with.overflow.i32", src[0], src[1]);
		break;
	case nir_op_usub_borrow:
		src[0] = ac_to_integer(&ctx->ac, src[0]);
		src[1] = ac_to_integer(&ctx->ac, src[1]);
		result = emit_uint_carry(&ctx->ac, "llvm.usub.with.overflow.i32", src[0], src[1]);
		break;
	case nir_op_b2f:
		result = emit_b2f(&ctx->ac, src[0]);
		break;
	case nir_op_f2b:
		result = emit_f2b(&ctx->ac, src[0]);
		break;
	case nir_op_b2i:
		result = emit_b2i(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size);
		break;
	case nir_op_i2b:
		src[0] = ac_to_integer(&ctx->ac, src[0]);
		result = emit_i2b(&ctx->ac, src[0]);
		break;
	case nir_op_fquantize2f16:
		result = emit_f2f16(ctx->nctx, src[0]);
		break;
	case nir_op_umul_high:
		src[0] = ac_to_integer(&ctx->ac, src[0]);
		src[1] = ac_to_integer(&ctx->ac, src[1]);
		result = emit_umul_high(&ctx->ac, src[0], src[1]);
		break;
	case nir_op_imul_high:
		src[0] = ac_to_integer(&ctx->ac, src[0]);
		src[1] = ac_to_integer(&ctx->ac, src[1]);
		result = emit_imul_high(&ctx->ac, src[0], src[1]);
		break;
	case nir_op_pack_half_2x16:
		result = emit_pack_half_2x16(&ctx->ac, src[0]);
		break;
	case nir_op_unpack_half_2x16:
		result = emit_unpack_half_2x16(&ctx->ac, src[0]);
		break;
	case nir_op_fddx:
	case nir_op_fddy:
	case nir_op_fddx_fine:
	case nir_op_fddy_fine:
	case nir_op_fddx_coarse:
	case nir_op_fddy_coarse:
		result = emit_ddxy(ctx, instr->op, src[0]);
		break;

	/* Low dword of a 64-bit value. */
	case nir_op_unpack_64_2x32_split_x: {
		assert(instr->src[0].src.ssa->num_components == 1);
		LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0],
						    ctx->ac.v2i32,
						    "");
		result = LLVMBuildExtractElement(ctx->ac.builder, tmp,
						 ctx->ac.i32_0, "");
		break;
	}

	/* High dword of a 64-bit value. */
	case nir_op_unpack_64_2x32_split_y: {
		assert(instr->src[0].src.ssa->num_components == 1);
		LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0],
						    ctx->ac.v2i32,
						    "");
		result = LLVMBuildExtractElement(ctx->ac.builder, tmp,
						 ctx->ac.i32_1, "");
		break;
	}

	/* Combine two i32s into an i64 via a v2i32 bitcast. */
	case nir_op_pack_64_2x32_split: {
		LLVMValueRef tmp = LLVMGetUndef(ctx->ac.v2i32);
		tmp = LLVMBuildInsertElement(ctx->ac.builder, tmp,
					     src[0], ctx->ac.i32_0, "");
		tmp = LLVMBuildInsertElement(ctx->ac.builder, tmp,
					     src[1], ctx->ac.i32_1, "");
		result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i64, "");
		break;
	}

	default:
		fprintf(stderr, "Unknown NIR alu instr: ");
		nir_print_instr(&instr->instr, stderr);
		fprintf(stderr, "\n");
		abort();
	}

	if (result) {
		assert(instr->dest.dest.is_ssa);
		/* Canonicalize to an integer value before storing the def. */
		result = ac_to_integer(&ctx->ac, result);
		_mesa_hash_table_insert(ctx->defs, &instr->dest.dest.ssa,
		                        result);
	}
}
   2107 
   2108 static void visit_load_const(struct ac_nir_context *ctx,
   2109                              const nir_load_const_instr *instr)
   2110 {
   2111 	LLVMValueRef values[4], value = NULL;
   2112 	LLVMTypeRef element_type =
   2113 	    LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size);
   2114 
   2115 	for (unsigned i = 0; i < instr->def.num_components; ++i) {
   2116 		switch (instr->def.bit_size) {
   2117 		case 32:
   2118 			values[i] = LLVMConstInt(element_type,
   2119 			                         instr->value.u32[i], false);
   2120 			break;
   2121 		case 64:
   2122 			values[i] = LLVMConstInt(element_type,
   2123 			                         instr->value.u64[i], false);
   2124 			break;
   2125 		default:
   2126 			fprintf(stderr,
   2127 			        "unsupported nir load_const bit_size: %d\n",
   2128 			        instr->def.bit_size);
   2129 			abort();
   2130 		}
   2131 	}
   2132 	if (instr->def.num_components > 1) {
   2133 		value = LLVMConstVector(values, instr->def.num_components);
   2134 	} else
   2135 		value = values[0];
   2136 
   2137 	_mesa_hash_table_insert(ctx->defs, &instr->def, value);
   2138 }
   2139 
   2140 static LLVMValueRef cast_ptr(struct nir_to_llvm_context *ctx, LLVMValueRef ptr,
   2141                              LLVMTypeRef type)
   2142 {
   2143 	int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
   2144 	return LLVMBuildBitCast(ctx->builder, ptr,
   2145 	                        LLVMPointerType(type, addr_space), "");
   2146 }
   2147 
   2148 static LLVMValueRef
   2149 get_buffer_size(struct ac_nir_context *ctx, LLVMValueRef descriptor, bool in_elements)
   2150 {
   2151 	LLVMValueRef size =
   2152 		LLVMBuildExtractElement(ctx->ac.builder, descriptor,
   2153 					LLVMConstInt(ctx->ac.i32, 2, false), "");
   2154 
   2155 	/* VI only */
   2156 	if (ctx->ac.chip_class == VI && in_elements) {
   2157 		/* On VI, the descriptor contains the size in bytes,
   2158 		 * but TXQ must return the size in elements.
   2159 		 * The stride is always non-zero for resources using TXQ.
   2160 		 */
   2161 		LLVMValueRef stride =
   2162 			LLVMBuildExtractElement(ctx->ac.builder, descriptor,
   2163 						ctx->ac.i32_1, "");
   2164 		stride = LLVMBuildLShr(ctx->ac.builder, stride,
   2165 				       LLVMConstInt(ctx->ac.i32, 16, false), "");
   2166 		stride = LLVMBuildAnd(ctx->ac.builder, stride,
   2167 				      LLVMConstInt(ctx->ac.i32, 0x3fff, false), "");
   2168 
   2169 		size = LLVMBuildUDiv(ctx->ac.builder, size, stride, "");
   2170 	}
   2171 	return size;
   2172 }
   2173 
   2174 /**
   2175  * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
   2176  * intrinsic names).
   2177  */
   2178 static void build_int_type_name(
   2179 	LLVMTypeRef type,
   2180 	char *buf, unsigned bufsize)
   2181 {
   2182 	assert(bufsize >= 6);
   2183 
   2184 	if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
   2185 		snprintf(buf, bufsize, "v%ui32",
   2186 			 LLVMGetVectorSize(type));
   2187 	else
   2188 		strcpy(buf, "i32");
   2189 }
   2190 
/* Lower integer textureGather (tg4) for SI-VI hardware.
 *
 * Gather4 on these chips does not handle integer formats correctly
 * (caller applies this only for GLSL_TYPE_INT/UINT on chip_class <= VI).
 * The workaround shifts the gather coordinates by minus half a texel —
 * computed from a resinfo size query — and, for 8_8_8_8 integer cube
 * maps, additionally patches the descriptor's NUM_FORMAT to a SCALED
 * variant and converts the float results back to integers afterwards.
 */
static LLVMValueRef radv_lower_gather4_integer(struct ac_llvm_context *ctx,
					       struct ac_image_args *args,
					       const nir_tex_instr *instr)
{
	enum glsl_base_type stype = glsl_get_sampler_result_type(instr->texture->var->type);
	LLVMValueRef coord = args->addr;
	LLVMValueRef half_texel[2];
	LLVMValueRef compare_cube_wa = NULL;
	LLVMValueRef result;
	int c;
	/* X/Y coordinates come after the optional offset and compare values
	 * in the address vector. */
	unsigned coord_vgpr_index = (unsigned)args->offset + (unsigned)args->compare;

	//TODO Rect
	{
		/* Query the texture size so we can express half a texel in
		 * normalized coordinates: half_texel = -0.5 / size. */
		struct ac_image_args txq_args = { 0 };

		txq_args.da = instr->is_array || instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
		txq_args.opcode = ac_image_get_resinfo;
		txq_args.dmask = 0xf;
		txq_args.addr = ctx->i32_0;
		txq_args.resource = args->resource;
		LLVMValueRef size = ac_build_image_opcode(ctx, &txq_args);

		for (c = 0; c < 2; c++) {
			half_texel[c] = LLVMBuildExtractElement(ctx->builder, size,
								LLVMConstInt(ctx->i32, c, false), "");
			half_texel[c] = LLVMBuildUIToFP(ctx->builder, half_texel[c], ctx->f32, "");
			half_texel[c] = ac_build_fdiv(ctx, ctx->f32_1, half_texel[c]);
			half_texel[c] = LLVMBuildFMul(ctx->builder, half_texel[c],
						      LLVMConstReal(ctx->f32, -0.5), "");
		}
	}

	/* Keep the unmodified coordinates around for the cube case below. */
	LLVMValueRef orig_coords = args->addr;

	/* Shift the X and Y gather coordinates by -0.5 texel. */
	for (c = 0; c < 2; c++) {
		LLVMValueRef tmp;
		LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0);
		tmp = LLVMBuildExtractElement(ctx->builder, coord, index, "");
		tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->f32, "");
		tmp = LLVMBuildFAdd(ctx->builder, tmp, half_texel[c], "");
		tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->i32, "");
		coord = LLVMBuildInsertElement(ctx->builder, coord, tmp, index, "");
	}


	/*
	 * Apparently cube has an issue with integer types that the workaround
	 * above doesn't solve, so this tests if the format is 8_8_8_8 and an
	 * integer type, and if so does an alternate workaround by sampling
	 * using a scaled type and converting.
	 * This is taken from amdgpu-pro shaders.
	 */
	/* NOTE this produces some ugly code compared to amdgpu-pro,
	 * LLVM ends up dumping SGPRs into VGPRs to deal with the compare/select,
	 * and then reads them back. -pro generates two selects,
	 * one s_cmp for the descriptor rewriting
	 * one v_cmp for the coordinate and result changes.
	 */
	if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
		LLVMValueRef tmp, tmp2;

		/* workaround 8/8/8/8 uint/sint cube gather bug */
		/* first detect it then change to a scaled read and f2i */
		tmp = LLVMBuildExtractElement(ctx->builder, args->resource, ctx->i32_1, "");
		tmp2 = tmp;

		/* extract the DATA_FORMAT */
		tmp = ac_build_bfe(ctx, tmp, LLVMConstInt(ctx->i32, 20, false),
				   LLVMConstInt(ctx->i32, 6, false), false);

		/* is the DATA_FORMAT == 8_8_8_8 */
		compare_cube_wa = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tmp, LLVMConstInt(ctx->i32, V_008F14_IMG_DATA_FORMAT_8_8_8_8, false), "");

		if (stype == GLSL_TYPE_UINT)
			/* Create a NUM FORMAT - 0x2 or 0x4 - USCALED or UINT */
			tmp = LLVMBuildSelect(ctx->builder, compare_cube_wa, LLVMConstInt(ctx->i32, 0x8000000, false),
					      LLVMConstInt(ctx->i32, 0x10000000, false), "");
		else
			/* Create a NUM FORMAT - 0x3 or 0x5 - SSCALED or SINT */
			tmp = LLVMBuildSelect(ctx->builder, compare_cube_wa, LLVMConstInt(ctx->i32, 0xc000000, false),
					      LLVMConstInt(ctx->i32, 0x14000000, false), "");

		/* replace the NUM FORMAT in the descriptor */
		tmp2 = LLVMBuildAnd(ctx->builder, tmp2, LLVMConstInt(ctx->i32, C_008F14_NUM_FORMAT_GFX6, false), "");
		tmp2 = LLVMBuildOr(ctx->builder, tmp2, tmp, "");

		args->resource = LLVMBuildInsertElement(ctx->builder, args->resource, tmp2, ctx->i32_1, "");

		/* don't modify the coordinates for this case */
		coord = LLVMBuildSelect(ctx->builder, compare_cube_wa, orig_coords, coord, "");
	}
	args->addr = coord;
	result = ac_build_image_opcode(ctx, args);

	if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
		LLVMValueRef tmp, tmp2;

		/* if the cube workaround is in place, f2i the result. */
		for (c = 0; c < 4; c++) {
			tmp = LLVMBuildExtractElement(ctx->builder, result, LLVMConstInt(ctx->i32, c, false), "");
			if (stype == GLSL_TYPE_UINT)
				tmp2 = LLVMBuildFPToUI(ctx->builder, tmp, ctx->i32, "");
			else
				tmp2 = LLVMBuildFPToSI(ctx->builder, tmp, ctx->i32, "");
			tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->i32, "");
			tmp2 = LLVMBuildBitCast(ctx->builder, tmp2, ctx->i32, "");
			/* Select converted vs raw per-lane, then return as f32
			 * bits to match the result vector's element type. */
			tmp = LLVMBuildSelect(ctx->builder, compare_cube_wa, tmp2, tmp, "");
			tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->f32, "");
			result = LLVMBuildInsertElement(ctx->builder, result, tmp, LLVMConstInt(ctx->i32, c, false), "");
		}
	}
	return result;
}
   2304 
/* Finalize and emit the image operation for a NIR texture instruction.
 *
 * Buffer "textures" become plain buffer-format loads; everything else
 * maps nir_texop_* to an ac_image_* opcode plus modifier flags in
 * \p args (compare/bias/lod/deriv/level_zero), with a pre-GFX9 lowering
 * path for integer textureGather results.
 */
static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx,
					const nir_tex_instr *instr,
					bool lod_is_zero,
					struct ac_image_args *args)
{
	if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
		/* NOTE(review): gfx9_stride_size_workaround presumably guards
		 * a GFX9 descriptor stride/size quirk — confirm in abi docs. */
		if (ctx->abi->gfx9_stride_size_workaround) {
			return ac_build_buffer_load_format_gfx9_safe(&ctx->ac,
			                                             args->resource,
			                                             args->addr,
			                                             ctx->ac.i32_0,
			                                             true);
		} else {
			return ac_build_buffer_load_format(&ctx->ac,
			                                   args->resource,
			                                   args->addr,
			                                   ctx->ac.i32_0,
			                                   true);
		}
	}

	args->opcode = ac_image_sample;
	args->compare = instr->is_shadow;

	switch (instr->op) {
	case nir_texop_txf:
	case nir_texop_txf_ms:
	case nir_texop_samples_identical:
		/* Texel fetches are image loads; MSAA and explicit LOD 0 use
		 * the non-mip variant. */
		args->opcode = lod_is_zero ||
			       instr->sampler_dim == GLSL_SAMPLER_DIM_MS ?
					ac_image_load : ac_image_load_mip;
		args->compare = false;
		args->offset = false;
		break;
	case nir_texop_txb:
		args->bias = true;
		break;
	case nir_texop_txl:
		if (lod_is_zero)
			args->level_zero = true;
		else
			args->lod = true;
		break;
	case nir_texop_txs:
	case nir_texop_query_levels:
		args->opcode = ac_image_get_resinfo;
		break;
	case nir_texop_tex:
		/* Implicit derivatives only exist in fragment shaders; other
		 * stages sample at LOD 0. */
		if (ctx->stage != MESA_SHADER_FRAGMENT)
			args->level_zero = true;
		break;
	case nir_texop_txd:
		args->deriv = true;
		break;
	case nir_texop_tg4:
		args->opcode = ac_image_gather4;
		args->level_zero = true;
		break;
	case nir_texop_lod:
		args->opcode = ac_image_get_lod;
		args->compare = false;
		args->offset = false;
		break;
	default:
		break;
	}

	/* Integer gather4 is broken before GFX9; lower it. */
	if (instr->op == nir_texop_tg4 && ctx->ac.chip_class <= VI) {
		enum glsl_base_type stype = glsl_get_sampler_result_type(instr->texture->var->type);
		if (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT) {
			return radv_lower_gather4_integer(&ctx->ac, args, instr);
		}
	}
	return ac_build_image_opcode(&ctx->ac, args);
}
   2380 
/* Handle the vulkan_resource_index intrinsic: compute a uniform pointer
 * to the descriptor selected by (descriptor set, binding, array index).
 *
 * The pointer is offset = binding_offset + index * binding_stride into
 * the set's descriptor buffer; dynamic uniform/storage buffers instead
 * live in the push-constant buffer, as 16-byte descriptors placed after
 * the push-constant data.
 */
static LLVMValueRef visit_vulkan_resource_index(struct nir_to_llvm_context *ctx,
                                                nir_intrinsic_instr *instr)
{
	LLVMValueRef index = get_src(ctx->nir, instr->src[0]);
	unsigned desc_set = nir_intrinsic_desc_set(instr);
	unsigned binding = nir_intrinsic_binding(instr);
	LLVMValueRef desc_ptr = ctx->descriptor_sets[desc_set];
	struct radv_pipeline_layout *pipeline_layout = ctx->options->layout;
	struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
	unsigned base_offset = layout->binding[binding].offset;
	LLVMValueRef offset, stride;

	if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
	    layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
		/* Dynamic descriptors: 16 bytes each, stored in the
		 * push-constant buffer after the push-constant data. */
		unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start +
			layout->binding[binding].dynamic_offset_offset;
		desc_ptr = ctx->push_constants;
		base_offset = pipeline_layout->push_constant_size + 16 * idx;
		stride = LLVMConstInt(ctx->ac.i32, 16, false);
	} else
		stride = LLVMConstInt(ctx->ac.i32, layout->binding[binding].size, false);

	/* offset = base_offset + index * stride */
	offset = LLVMConstInt(ctx->ac.i32, base_offset, false);
	index = LLVMBuildMul(ctx->builder, index, stride, "");
	offset = LLVMBuildAdd(ctx->builder, offset, index, "");

	desc_ptr = ac_build_gep0(&ctx->ac, desc_ptr, offset);
	desc_ptr = cast_ptr(ctx, desc_ptr, ctx->ac.v4i32);
	/* Mark the load source as uniform (wave-invariant). */
	LLVMSetMetadata(desc_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);

	return desc_ptr;
}
   2413 
   2414 static LLVMValueRef visit_vulkan_resource_reindex(struct nir_to_llvm_context *ctx,
   2415                                                   nir_intrinsic_instr *instr)
   2416 {
   2417 	LLVMValueRef ptr = get_src(ctx->nir, instr->src[0]);
   2418 	LLVMValueRef index = get_src(ctx->nir, instr->src[1]);
   2419 
   2420 	LLVMValueRef result = LLVMBuildGEP(ctx->builder, ptr, &index, 1, "");
   2421 	LLVMSetMetadata(result, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
   2422 	return result;
   2423 }
   2424 
   2425 static LLVMValueRef visit_load_push_constant(struct nir_to_llvm_context *ctx,
   2426                                              nir_intrinsic_instr *instr)
   2427 {
   2428 	LLVMValueRef ptr, addr;
   2429 
   2430 	addr = LLVMConstInt(ctx->ac.i32, nir_intrinsic_base(instr), 0);
   2431 	addr = LLVMBuildAdd(ctx->builder, addr, get_src(ctx->nir, instr->src[0]), "");
   2432 
   2433 	ptr = ac_build_gep0(&ctx->ac, ctx->push_constants, addr);
   2434 	ptr = cast_ptr(ctx, ptr, get_def_type(ctx->nir, &instr->dest.ssa));
   2435 
   2436 	return LLVMBuildLoad(ctx->builder, ptr, "");
   2437 }
   2438 
   2439 static LLVMValueRef visit_get_buffer_size(struct ac_nir_context *ctx,
   2440                                           const nir_intrinsic_instr *instr)
   2441 {
   2442 	LLVMValueRef index = get_src(ctx, instr->src[0]);
   2443 
   2444 	return get_buffer_size(ctx, ctx->abi->load_ssbo(ctx->abi, index, false), false);
   2445 }
   2446 
   2447 static uint32_t widen_mask(uint32_t mask, unsigned multiplier)
   2448 {
   2449 	uint32_t new_mask = 0;
   2450 	for(unsigned i = 0; i < 32 && (1u << i) <= mask; ++i)
   2451 		if (mask & (1u << i))
   2452 			new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
   2453 	return new_mask;
   2454 }
   2455 
   2456 static LLVMValueRef extract_vector_range(struct ac_llvm_context *ctx, LLVMValueRef src,
   2457                                          unsigned start, unsigned count)
   2458 {
   2459 	LLVMTypeRef type = LLVMTypeOf(src);
   2460 
   2461 	if (LLVMGetTypeKind(type) != LLVMVectorTypeKind) {
   2462 		assert(start == 0);
   2463 		assert(count == 1);
   2464 		return src;
   2465 	}
   2466 
   2467 	unsigned src_elements = LLVMGetVectorSize(type);
   2468 	assert(start < src_elements);
   2469 	assert(start + count <= src_elements);
   2470 
   2471 	if (start == 0 && count == src_elements)
   2472 		return src;
   2473 
   2474 	if (count == 1)
   2475 		return LLVMBuildExtractElement(ctx->builder, src, LLVMConstInt(ctx->i32, start, false), "");
   2476 
   2477 	assert(count <= 8);
   2478 	LLVMValueRef indices[8];
   2479 	for (unsigned i = 0; i < count; ++i)
   2480 		indices[i] = LLVMConstInt(ctx->i32, start + i, false);
   2481 
   2482 	LLVMValueRef swizzle = LLVMConstVector(indices, count);
   2483 	return LLVMBuildShuffleVector(ctx->builder, src, src, swizzle, "");
   2484 }
   2485 
/* Handle nir_intrinsic_store_ssbo via llvm.amdgcn.buffer.store.
 *
 * The NIR writemask is per source component; 64-bit components are
 * expanded to pairs of 32-bit dwords and the mask widened to match.
 * Consecutive runs of enabled dwords are then emitted as 1-, 2- or
 * 4-dword stores.
 */
static void visit_store_ssbo(struct ac_nir_context *ctx,
                             nir_intrinsic_instr *instr)
{
	const char *store_name;
	LLVMValueRef src_data = get_src(ctx, instr->src[0]);
	LLVMTypeRef data_type = ctx->ac.f32;
	/* Dwords per source component: 1 for 32-bit, 2 for 64-bit data. */
	int elem_size_mult = get_elem_bits(&ctx->ac, LLVMTypeOf(src_data)) / 32;
	int components_32bit = elem_size_mult * instr->num_components;
	unsigned writemask = nir_intrinsic_write_mask(instr);
	LLVMValueRef base_data, base_offset;
	LLVMValueRef params[6];

	/* Fixed intrinsic operands; params[0] (data) and params[3]
	 * (offset) are filled in per emitted store below. */
	params[1] = ctx->abi->load_ssbo(ctx->abi,
				        get_src(ctx, instr->src[1]), true);
	params[2] = ctx->ac.i32_0; /* vindex */
	params[4] = ctx->ac.i1false;  /* glc */
	params[5] = ctx->ac.i1false;  /* slc */

	if (components_32bit > 1)
		data_type = LLVMVectorType(ctx->ac.f32, components_32bit);

	/* Convert the per-component mask into a per-dword mask. */
	writemask = widen_mask(writemask, elem_size_mult);

	/* Reinterpret the source as a vector of 32-bit floats so the
	 * buffer.store.*f32 intrinsics can consume any slice of it. */
	base_data = ac_to_float(&ctx->ac, src_data);
	base_data = trim_vector(&ctx->ac, base_data, instr->num_components);
	base_data = LLVMBuildBitCast(ctx->ac.builder, base_data,
				     data_type, "");
	base_offset = get_src(ctx, instr->src[2]);      /* voffset */
	while (writemask) {
		int start, count;
		LLVMValueRef data;
		LLVMValueRef offset;

		u_bit_scan_consecutive_range(&writemask, &start, &count);

		/* Due to an LLVM limitation, split 3-element writes
		 * into a 2-element and a 1-element write. */
		if (count == 3) {
			writemask |= 1 << (start + 2);
			count = 2;
		}

		/* Runs wider than 4 dwords are emitted 4 at a time; the
		 * remainder is pushed back into the mask for the next
		 * iteration. */
		if (count > 4) {
			writemask |= ((1u << (count - 4)) - 1u) << (start + 4);
			count = 4;
		}

		if (count == 4) {
			store_name = "llvm.amdgcn.buffer.store.v4f32";
		} else if (count == 2) {
			store_name = "llvm.amdgcn.buffer.store.v2f32";

		} else {
			assert(count == 1);
			store_name = "llvm.amdgcn.buffer.store.f32";
		}
		data = extract_vector_range(&ctx->ac, base_data, start, count);

		/* Byte offset of this run: 4 bytes per dword. */
		offset = base_offset;
		if (start != 0) {
			offset = LLVMBuildAdd(ctx->ac.builder, offset, LLVMConstInt(ctx->ac.i32, start * 4, false), "");
		}
		params[0] = data;
		params[3] = offset;
		ac_build_intrinsic(&ctx->ac, store_name,
				   ctx->ac.voidt, params, 6, 0);
	}
}
   2554 
/* Handle the nir_intrinsic_ssbo_atomic_* family via
 * llvm.amdgcn.buffer.atomic.* and return the pre-operation 32-bit
 * value.
 *
 * Operand order expected by the intrinsics:
 *   [compare value (cmpswap only),] data, resource, vindex, voffset, slc
 */
static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx,
                                      const nir_intrinsic_instr *instr)
{
	const char *name;
	LLVMValueRef params[6];
	int arg_count = 0;

	/* cmpswap takes the comparison value (src[3]) first. */
	if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) {
		params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[3]), 0);
	}
	params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); /* data */
	params[arg_count++] = ctx->abi->load_ssbo(ctx->abi,
						 get_src(ctx, instr->src[0]),
						 true);
	params[arg_count++] = ctx->ac.i32_0; /* vindex */
	params[arg_count++] = get_src(ctx, instr->src[1]);      /* voffset */
	params[arg_count++] = LLVMConstInt(ctx->ac.i1, 0, false);  /* slc */

	/* Map the NIR atomic op to the matching amdgcn intrinsic name. */
	switch (instr->intrinsic) {
	case nir_intrinsic_ssbo_atomic_add:
		name = "llvm.amdgcn.buffer.atomic.add";
		break;
	case nir_intrinsic_ssbo_atomic_imin:
		name = "llvm.amdgcn.buffer.atomic.smin";
		break;
	case nir_intrinsic_ssbo_atomic_umin:
		name = "llvm.amdgcn.buffer.atomic.umin";
		break;
	case nir_intrinsic_ssbo_atomic_imax:
		name = "llvm.amdgcn.buffer.atomic.smax";
		break;
	case nir_intrinsic_ssbo_atomic_umax:
		name = "llvm.amdgcn.buffer.atomic.umax";
		break;
	case nir_intrinsic_ssbo_atomic_and:
		name = "llvm.amdgcn.buffer.atomic.and";
		break;
	case nir_intrinsic_ssbo_atomic_or:
		name = "llvm.amdgcn.buffer.atomic.or";
		break;
	case nir_intrinsic_ssbo_atomic_xor:
		name = "llvm.amdgcn.buffer.atomic.xor";
		break;
	case nir_intrinsic_ssbo_atomic_exchange:
		name = "llvm.amdgcn.buffer.atomic.swap";
		break;
	case nir_intrinsic_ssbo_atomic_comp_swap:
		name = "llvm.amdgcn.buffer.atomic.cmpswap";
		break;
	default:
		abort();
	}

	return ac_build_intrinsic(&ctx->ac, name, ctx->ac.i32, params, arg_count, 0);
}
   2610 
/* Handle nir_intrinsic_load_ssbo.
 *
 * Up to 8 dwords (4 components, doubled for 64-bit destinations) are
 * loaded in groups of at most 4 dwords each: results[0] holds the
 * first group, results[1] the second.  A final shufflevector stitches
 * the groups together and trims 3-dword loads, which are issued with
 * the v4f32 intrinsic.
 */
static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx,
                                      const nir_intrinsic_instr *instr)
{
	LLVMValueRef results[2];
	int load_components;
	int num_components = instr->num_components;
	/* 64-bit values are loaded as pairs of 32-bit dwords. */
	if (instr->dest.ssa.bit_size == 64)
		num_components *= 2;

	for (int i = 0; i < num_components; i += load_components) {
		load_components = MIN2(num_components - i, 4);
		const char *load_name;
		LLVMTypeRef data_type = ctx->ac.f32;
		/* Byte offset of this group: 4 bytes per dword, added to
		 * the intrinsic's base offset (src[1]). */
		LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i * 4, false);
		offset = LLVMBuildAdd(ctx->ac.builder, get_src(ctx, instr->src[1]), offset, "");

		/* 3-dword loads use the 4-dword intrinsic; the extra lane
		 * is dropped by the trailing shuffle. */
		if (load_components == 3)
			data_type = LLVMVectorType(ctx->ac.f32, 4);
		else if (load_components > 1)
			data_type = LLVMVectorType(ctx->ac.f32, load_components);

		if (load_components >= 3)
			load_name = "llvm.amdgcn.buffer.load.v4f32";
		else if (load_components == 2)
			load_name = "llvm.amdgcn.buffer.load.v2f32";
		else if (load_components == 1)
			load_name = "llvm.amdgcn.buffer.load.f32";
		else
			unreachable("unhandled number of components");

		LLVMValueRef params[] = {
			ctx->abi->load_ssbo(ctx->abi,
					    get_src(ctx, instr->src[0]),
					    false),
			ctx->ac.i32_0, /* vindex */
			offset,
			ctx->ac.i1false, /* glc */
			ctx->ac.i1false, /* slc */
		};

		results[i > 0 ? 1 : 0] = ac_build_intrinsic(&ctx->ac, load_name, data_type, params, 5, 0);
	}

	assume(results[0]);
	LLVMValueRef ret = results[0];
	/* Merge the two groups (num_components > 4) or trim the padded
	 * 3-dword load (num_components == 3). */
	if (num_components > 4 || num_components == 3) {
		LLVMValueRef masks[] = {
		        LLVMConstInt(ctx->ac.i32, 0, false), LLVMConstInt(ctx->ac.i32, 1, false),
		        LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false),
			LLVMConstInt(ctx->ac.i32, 4, false), LLVMConstInt(ctx->ac.i32, 5, false),
		        LLVMConstInt(ctx->ac.i32, 6, false), LLVMConstInt(ctx->ac.i32, 7, false)
		};

		LLVMValueRef swizzle = LLVMConstVector(masks, num_components);
		ret = LLVMBuildShuffleVector(ctx->ac.builder, results[0],
					     results[num_components > 4 ? 1 : 0], swizzle, "");
	}

	return LLVMBuildBitCast(ctx->ac.builder, ret,
	                        get_def_type(ctx, &instr->dest.ssa), "");
}
   2672 
   2673 static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx,
   2674                                           const nir_intrinsic_instr *instr)
   2675 {
   2676 	LLVMValueRef ret;
   2677 	LLVMValueRef rsrc = get_src(ctx, instr->src[0]);
   2678 	LLVMValueRef offset = get_src(ctx, instr->src[1]);
   2679 	int num_components = instr->num_components;
   2680 
   2681 	if (ctx->abi->load_ubo)
   2682 		rsrc = ctx->abi->load_ubo(ctx->abi, rsrc);
   2683 
   2684 	if (instr->dest.ssa.bit_size == 64)
   2685 		num_components *= 2;
   2686 
   2687 	ret = ac_build_buffer_load(&ctx->ac, rsrc, num_components, NULL, offset,
   2688 				   NULL, 0, false, false, true, true);
   2689 	ret = trim_vector(&ctx->ac, ret, num_components);
   2690 	return LLVMBuildBitCast(ctx->ac.builder, ret,
   2691 	                        get_def_type(ctx, &instr->dest.ssa), "");
   2692 }
   2693 
/* Decompose a NIR variable dereference chain into offsets.
 *
 * Outputs:
 *   *const_out  - constant part of the offset, in attribute slots
 *                 (units of glsl_count_attribute_slots()).
 *   *indir_out  - dynamic part of the offset as an i32 value, or NULL
 *                 if the access is entirely direct.
 *   *vertex_index_out / *vertex_index_ref - when non-NULL, the leading
 *                 array index (the per-vertex index of arrayed I/O) is
 *                 returned here instead of being folded into the
 *                 offsets, as a constant and/or an i32 value.
 *
 * "compact" variables (e.g. lowered clip distances) are special-cased:
 * their single array index is returned directly as the constant offset.
 */
static void
get_deref_offset(struct ac_nir_context *ctx, nir_deref_var *deref,
		 bool vs_in, unsigned *vertex_index_out,
		 LLVMValueRef *vertex_index_ref,
		 unsigned *const_out, LLVMValueRef *indir_out)
{
	unsigned const_offset = 0;
	nir_deref *tail = &deref->deref;
	LLVMValueRef offset = NULL;

	/* Peel off the leading per-vertex array index if requested. */
	if (vertex_index_out != NULL || vertex_index_ref != NULL) {
		tail = tail->child;
		nir_deref_array *deref_array = nir_deref_as_array(tail);
		if (vertex_index_out)
			*vertex_index_out = deref_array->base_offset;

		if (vertex_index_ref) {
			LLVMValueRef vtx = LLVMConstInt(ctx->ac.i32, deref_array->base_offset, false);
			if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
				vtx = LLVMBuildAdd(ctx->ac.builder, vtx, get_src(ctx, deref_array->indirect), "");
			}
			*vertex_index_ref = vtx;
		}
	}

	if (deref->var->data.compact) {
		assert(tail->child->deref_type == nir_deref_type_array);
		assert(glsl_type_is_scalar(glsl_without_array(deref->var->type)));
		nir_deref_array *deref_array = nir_deref_as_array(tail->child);
		/* We always lower indirect dereferences for "compact" array vars. */
		assert(deref_array->deref_array_type == nir_deref_array_type_direct);

		const_offset = deref_array->base_offset;
		goto out;
	}

	/* Walk the remaining chain, accumulating constant offsets and
	 * building the sum of any indirect offsets. */
	while (tail->child != NULL) {
		const struct glsl_type *parent_type = tail->type;
		tail = tail->child;

		if (tail->deref_type == nir_deref_type_array) {
			nir_deref_array *deref_array = nir_deref_as_array(tail);
			LLVMValueRef index, stride, local_offset;
			unsigned size = glsl_count_attribute_slots(tail->type, vs_in);

			const_offset += size * deref_array->base_offset;
			if (deref_array->deref_array_type == nir_deref_array_type_direct)
				continue;

			assert(deref_array->deref_array_type == nir_deref_array_type_indirect);
			index = get_src(ctx, deref_array->indirect);
			stride = LLVMConstInt(ctx->ac.i32, size, 0);
			local_offset = LLVMBuildMul(ctx->ac.builder, stride, index, "");

			if (offset)
				offset = LLVMBuildAdd(ctx->ac.builder, offset, local_offset, "");
			else
				offset = local_offset;
		} else if (tail->deref_type == nir_deref_type_struct) {
			nir_deref_struct *deref_struct = nir_deref_as_struct(tail);

			/* Struct member offset = sum of the slot counts of
			 * all preceding fields. */
			for (unsigned i = 0; i < deref_struct->index; i++) {
				const struct glsl_type *ft = glsl_get_struct_field(parent_type, i);
				const_offset += glsl_count_attribute_slots(ft, vs_in);
			}
		} else
			unreachable("unsupported deref type");

	}
out:
	/* NOTE(review): the constant part is folded into the indirect
	 * offset only when both are non-zero; *const_out always carries
	 * the constant part regardless — callers appear to use one or
	 * the other depending on whether *indir_out is NULL. */
	if (const_offset && offset)
		offset = LLVMBuildAdd(ctx->ac.builder, offset,
				      LLVMConstInt(ctx->ac.i32, const_offset, 0),
				      "");

	*const_out = const_offset;
	*indir_out = offset;
}
   2772 
   2773 
   2774 /* The offchip buffer layout for TCS->TES is
   2775  *
   2776  * - attribute 0 of patch 0 vertex 0
   2777  * - attribute 0 of patch 0 vertex 1
   2778  * - attribute 0 of patch 0 vertex 2
   2779  *   ...
   2780  * - attribute 0 of patch 1 vertex 0
   2781  * - attribute 0 of patch 1 vertex 1
   2782  *   ...
   2783  * - attribute 1 of patch 0 vertex 0
   2784  * - attribute 1 of patch 0 vertex 1
   2785  *   ...
   2786  * - per patch attribute 0 of patch 0
   2787  * - per patch attribute 0 of patch 1
   2788  *   ...
   2789  *
   2790  * Note that every attribute has 4 components.
   2791  */
/* Compute the byte address of an attribute in the TCS->TES offchip
 * buffer (layout described in the comment above).  `vertex_index` is
 * NULL for per-patch attributes.
 */
static LLVMValueRef get_tcs_tes_buffer_address(struct nir_to_llvm_context *ctx,
                                               LLVMValueRef vertex_index,
                                               LLVMValueRef param_index)
{
	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
	LLVMValueRef param_stride, constant16;
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	/* tcs_offchip_layout packs: bits [0,9) = number of patches,
	 * bits [9,15) = vertices per patch. */
	vertices_per_patch = unpack_param(&ctx->ac, ctx->tcs_offchip_layout, 9, 6);
	num_patches = unpack_param(&ctx->ac, ctx->tcs_offchip_layout, 0, 9);
	total_vertices = LLVMBuildMul(ctx->builder, vertices_per_patch,
	                              num_patches, "");

	/* Every attribute is a vec4 = 16 bytes. */
	constant16 = LLVMConstInt(ctx->ac.i32, 16, false);
	if (vertex_index) {
		/* Per-vertex attribute: address in units of attributes is
		 * (patch * vertices_per_patch + vertex), and consecutive
		 * params are total_vertices apart. */
		base_addr = LLVMBuildMul(ctx->builder, rel_patch_id,
		                         vertices_per_patch, "");

		base_addr = LLVMBuildAdd(ctx->builder, base_addr,
		                         vertex_index, "");

		param_stride = total_vertices;
	} else {
		/* Per-patch attribute: one entry per patch. */
		base_addr = rel_patch_id;
		param_stride = num_patches;
	}

	base_addr = LLVMBuildAdd(ctx->builder, base_addr,
	                         LLVMBuildMul(ctx->builder, param_index,
	                                      param_stride, ""), "");

	base_addr = LLVMBuildMul(ctx->builder, base_addr, constant16, "");

	/* Per-patch attributes live after all per-vertex data; the start
	 * of that region is packed into tcs_offchip_layout bits [16,32). */
	if (!vertex_index) {
		LLVMValueRef patch_data_offset =
		           unpack_param(&ctx->ac, ctx->tcs_offchip_layout, 16, 16);

		base_addr = LLVMBuildAdd(ctx->builder, base_addr,
		                         patch_data_offset, "");
	}
	return base_addr;
}
   2834 
   2835 static LLVMValueRef get_tcs_tes_buffer_address_params(struct nir_to_llvm_context *ctx,
   2836 						      unsigned param,
   2837 						      unsigned const_index,
   2838 						      bool is_compact,
   2839 						      LLVMValueRef vertex_index,
   2840 						      LLVMValueRef indir_index)
   2841 {
   2842 	LLVMValueRef param_index;
   2843 
   2844 	if (indir_index)
   2845 		param_index = LLVMBuildAdd(ctx->builder, LLVMConstInt(ctx->ac.i32, param, false),
   2846 					   indir_index, "");
   2847 	else {
   2848 		if (const_index && !is_compact)
   2849 			param += const_index;
   2850 		param_index = LLVMConstInt(ctx->ac.i32, param, false);
   2851 	}
   2852 	return get_tcs_tes_buffer_address(ctx, vertex_index, param_index);
   2853 }
   2854 
   2855 static void
   2856 mark_tess_output(struct nir_to_llvm_context *ctx,
   2857 		 bool is_patch, uint32_t param, int num_slots)
   2858 
   2859 {
   2860 	uint64_t slot_mask = (1ull << num_slots) - 1;
   2861 	if (is_patch) {
   2862 		ctx->tess_patch_outputs_written |= (slot_mask << param);
   2863 	} else
   2864 		ctx->tess_outputs_written |= (slot_mask<< param);
   2865 }
   2866 
   2867 static LLVMValueRef
   2868 get_dw_address(struct nir_to_llvm_context *ctx,
   2869 	       LLVMValueRef dw_addr,
   2870 	       unsigned param,
   2871 	       unsigned const_index,
   2872 	       bool compact_const_index,
   2873 	       LLVMValueRef vertex_index,
   2874 	       LLVMValueRef stride,
   2875 	       LLVMValueRef indir_index)
   2876 
   2877 {
   2878 
   2879 	if (vertex_index) {
   2880 		dw_addr = LLVMBuildAdd(ctx->builder, dw_addr,
   2881 				       LLVMBuildMul(ctx->builder,
   2882 						    vertex_index,
   2883 						    stride, ""), "");
   2884 	}
   2885 
   2886 	if (indir_index)
   2887 		dw_addr = LLVMBuildAdd(ctx->builder, dw_addr,
   2888 				       LLVMBuildMul(ctx->builder, indir_index,
   2889 						    LLVMConstInt(ctx->ac.i32, 4, false), ""), "");
   2890 	else if (const_index && !compact_const_index)
   2891 		dw_addr = LLVMBuildAdd(ctx->builder, dw_addr,
   2892 				       LLVMConstInt(ctx->ac.i32, const_index * 4, false), "");
   2893 
   2894 	dw_addr = LLVMBuildAdd(ctx->builder, dw_addr,
   2895 			       LLVMConstInt(ctx->ac.i32, param * 4, false), "");
   2896 
   2897 	if (const_index && compact_const_index)
   2898 		dw_addr = LLVMBuildAdd(ctx->builder, dw_addr,
   2899 				       LLVMConstInt(ctx->ac.i32, const_index, false), "");
   2900 	return dw_addr;
   2901 }
   2902 
/* ABI callback: load TCS inputs or outputs from LDS.
 *
 * Inputs are read from the current input patch; outputs come from the
 * current output patch (per-vertex) or from the patch-data area
 * (per-patch), which has no per-vertex stride.
 */
static LLVMValueRef
load_tcs_varyings(struct ac_shader_abi *abi,
		  LLVMValueRef vertex_index,
		  LLVMValueRef indir_index,
		  unsigned const_index,
		  unsigned location,
		  unsigned driver_location,
		  unsigned component,
		  unsigned num_components,
		  bool is_patch,
		  bool is_compact,
		  bool load_input)
{
	struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
	LLVMValueRef dw_addr, stride;
	LLVMValueRef value[4], result;
	unsigned param = shader_io_get_unique_index(location);

	if (load_input) {
		/* Input patch: per-vertex stride packed in tcs_in_layout. */
		stride = unpack_param(&ctx->ac, ctx->tcs_in_layout, 13, 8);
		dw_addr = get_tcs_in_current_patch_offset(ctx);
	} else {
		if (!is_patch) {
			stride = unpack_param(&ctx->ac, ctx->tcs_out_layout, 13, 8);
			dw_addr = get_tcs_out_current_patch_offset(ctx);
		} else {
			/* Per-patch data: no vertex stride. */
			dw_addr = get_tcs_out_current_patch_data_offset(ctx);
			stride = NULL;
		}
	}

	dw_addr = get_dw_address(ctx, dw_addr, param, const_index, is_compact, vertex_index, stride,
				 indir_index);

	/* Load channels [0, component + num_components) consecutively;
	 * the gather below selects the window starting at `component`.
	 * NOTE(review): value[] has 4 entries, so this assumes
	 * component + num_components <= 4 — confirm against callers. */
	for (unsigned i = 0; i < num_components + component; i++) {
		value[i] = ac_lds_load(&ctx->ac, dw_addr);
		dw_addr = LLVMBuildAdd(ctx->builder, dw_addr,
				       ctx->ac.i32_1, "");
	}
	result = ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
	return result;
}
   2945 
/* ABI callback: store a TCS output.
 *
 * Outputs are written to the offchip buffer for the TES to read, and
 * additionally to LDS when this TCS reads the output back (or when it
 * is a tess factor, which the tess-factor epilogue reads from LDS).
 */
static void
store_tcs_output(struct ac_shader_abi *abi,
		 const nir_variable *var,
		 LLVMValueRef vertex_index,
		 LLVMValueRef param_index,
		 unsigned const_index,
		 LLVMValueRef src,
		 unsigned writemask)
{
	struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
	const unsigned location = var->data.location;
	const unsigned component = var->data.location_frac;
	const bool is_patch = var->data.patch;
	const bool is_compact = var->data.compact;
	const unsigned count = glsl_count_attribute_slots(var->type, false);
	LLVMValueRef dw_addr;
	LLVMValueRef stride = NULL;
	LLVMValueRef buf_addr = NULL;
	unsigned param;
	bool store_lds = true;

	/* Only mirror to LDS if this shader reads the output back. */
	if (is_patch) {
		if (!(ctx->tcs_patch_outputs_read & (1U << (location - VARYING_SLOT_PATCH0))))
			store_lds = false;
	} else {
		if (!(ctx->tcs_outputs_read & (1ULL << location)))
			store_lds = false;
	}

	param = shader_io_get_unique_index(location);
	/* Compact clip distances past the first vec4 address the next
	 * attribute slot. */
	if (location == VARYING_SLOT_CLIP_DIST0 &&
	    is_compact && const_index > 3) {
		const_index -= 3;
		param++;
	}

	if (!is_patch) {
		stride = unpack_param(&ctx->ac, ctx->tcs_out_layout, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
	}

	/* With a dynamic param index any of the variable's slots may be
	 * written; mark them all.  Otherwise mark just the one slot. */
	if (param_index)
		mark_tess_output(ctx, is_patch, param, count);
	else
		mark_tess_output(ctx, is_patch, param, 1);

	dw_addr = get_dw_address(ctx, dw_addr, param, const_index, is_compact, vertex_index, stride,
				 param_index);
	buf_addr = get_tcs_tes_buffer_address_params(ctx, param, const_index, is_compact,
						     vertex_index, param_index);

	bool is_tess_factor = false;
	if (location == VARYING_SLOT_TESS_LEVEL_INNER ||
	    location == VARYING_SLOT_TESS_LEVEL_OUTER)
		is_tess_factor = true;

	/* Compact accesses address dwords within the slot. */
	unsigned base = is_compact ? const_index : 0;
	for (unsigned chan = 0; chan < 8; chan++) {
		if (!(writemask & (1 << chan)))
			continue;
		/* The writemask is already shifted by `component`; the
		 * source vector is not. */
		LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);

		if (store_lds || is_tess_factor) {
			LLVMValueRef dw_addr_chan =
				LLVMBuildAdd(ctx->builder, dw_addr,
				                           LLVMConstInt(ctx->ac.i32, chan, false), "");
			ac_lds_store(&ctx->ac, dw_addr_chan, value);
		}

		/* Partial writes go to the offchip buffer one dword at a
		 * time; a full vec4 write is done in one store below.
		 * Tess factors are not stored to the offchip buffer here. */
		if (!is_tess_factor && writemask != 0xF)
			ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, value, 1,
						    buf_addr, ctx->oc_lds,
						    4 * (base + chan), 1, 0, true, false);
	}

	if (writemask == 0xF) {
		ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, src, 4,
					    buf_addr, ctx->oc_lds,
					    (base * 4), 1, 0, true, false);
	}
}
   3029 
   3030 static LLVMValueRef
   3031 load_tes_input(struct ac_shader_abi *abi,
   3032 	       LLVMValueRef vertex_index,
   3033 	       LLVMValueRef param_index,
   3034 	       unsigned const_index,
   3035 	       unsigned location,
   3036 	       unsigned driver_location,
   3037 	       unsigned component,
   3038 	       unsigned num_components,
   3039 	       bool is_patch,
   3040 	       bool is_compact,
   3041 	       bool load_input)
   3042 {
   3043 	struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
   3044 	LLVMValueRef buf_addr;
   3045 	LLVMValueRef result;
   3046 	unsigned param = shader_io_get_unique_index(location);
   3047 
   3048 	if (location == VARYING_SLOT_CLIP_DIST0 && is_compact && const_index > 3) {
   3049 		const_index -= 3;
   3050 		param++;
   3051 	}
   3052 
   3053 	buf_addr = get_tcs_tes_buffer_address_params(ctx, param, const_index,
   3054 						     is_compact, vertex_index, param_index);
   3055 
   3056 	LLVMValueRef comp_offset = LLVMConstInt(ctx->ac.i32, component * 4, false);
   3057 	buf_addr = LLVMBuildAdd(ctx->builder, buf_addr, comp_offset, "");
   3058 
   3059 	result = ac_build_buffer_load(&ctx->ac, ctx->hs_ring_tess_offchip, num_components, NULL,
   3060 				      buf_addr, ctx->oc_lds, is_compact ? (4 * const_index) : 0, 1, 0, true, false);
   3061 	result = trim_vector(&ctx->ac, result, num_components);
   3062 	return result;
   3063 }
   3064 
/* ABI callback: load a GS input attribute for one input vertex.
 *
 * On GFX9 the ES stage's outputs are in LDS (merged ES/GS), addressed
 * by the per-vertex dword offset; on earlier chips they are fetched
 * from the ESGS ring buffer with the legacy SI buffer-load intrinsic.
 */
static LLVMValueRef
load_gs_input(struct ac_shader_abi *abi,
	      unsigned location,
	      unsigned driver_location,
	      unsigned component,
	      unsigned num_components,
	      unsigned vertex_index,
	      unsigned const_index,
	      LLVMTypeRef type)
{
	struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
	LLVMValueRef vtx_offset;
	LLVMValueRef args[9];
	unsigned param, vtx_offset_param;
	LLVMValueRef value[4], result;

	vtx_offset_param = vertex_index;
	assert(vtx_offset_param < 6);
	/* Pre-GFX9 ring addressing: scale the per-vertex offset by 4.
	 * (Unused on the GFX9 LDS path below.) */
	vtx_offset = LLVMBuildMul(ctx->builder, ctx->gs_vtx_offset[vtx_offset_param],
				  LLVMConstInt(ctx->ac.i32, 4, false), "");

	param = shader_io_get_unique_index(location);

	/* Attributes occupy 4 dwords per slot; load one dword per
	 * requested channel. */
	for (unsigned i = component; i < num_components + component; i++) {
		if (ctx->ac.chip_class >= GFX9) {
			LLVMValueRef dw_addr = ctx->gs_vtx_offset[vtx_offset_param];
			dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
			                       LLVMConstInt(ctx->ac.i32, param * 4 + i + const_index, 0), "");
			value[i] = ac_lds_load(&ctx->ac, dw_addr);
		} else {
			args[0] = ctx->esgs_ring;
			args[1] = vtx_offset;
			args[2] = LLVMConstInt(ctx->ac.i32, (param * 4 + i + const_index) * 256, false);
			args[3] = ctx->ac.i32_0;
			args[4] = ctx->ac.i32_1; /* OFFEN */
			args[5] = ctx->ac.i32_0; /* IDXEN */
			args[6] = ctx->ac.i32_1; /* GLC */
			args[7] = ctx->ac.i32_0; /* SLC */
			args[8] = ctx->ac.i32_0; /* TFE */

			value[i] = ac_build_intrinsic(&ctx->ac, "llvm.SI.buffer.load.dword.i32.i32",
			                              ctx->ac.i32, args, 9,
			                              AC_FUNC_ATTR_READONLY |
			                              AC_FUNC_ATTR_LEGACY);
		}
	}
	result = ac_build_varying_gather_values(&ctx->ac, value, num_components, component);

	return result;
}
   3115 
   3116 static LLVMValueRef
   3117 build_gep_for_deref(struct ac_nir_context *ctx,
   3118 		    nir_deref_var *deref)
   3119 {
   3120 	struct hash_entry *entry = _mesa_hash_table_search(ctx->vars, deref->var);
   3121 	assert(entry->data);
   3122 	LLVMValueRef val = entry->data;
   3123 	nir_deref *tail = deref->deref.child;
   3124 	while (tail != NULL) {
   3125 		LLVMValueRef offset;
   3126 		switch (tail->deref_type) {
   3127 		case nir_deref_type_array: {
   3128 			nir_deref_array *array = nir_deref_as_array(tail);
   3129 			offset = LLVMConstInt(ctx->ac.i32, array->base_offset, 0);
   3130 			if (array->deref_array_type ==
   3131 			    nir_deref_array_type_indirect) {
   3132 				offset = LLVMBuildAdd(ctx->ac.builder, offset,
   3133 						      get_src(ctx,
   3134 							      array->indirect),
   3135 						      "");
   3136 			}
   3137 			break;
   3138 		}
   3139 		case nir_deref_type_struct: {
   3140 			nir_deref_struct *deref_struct =
   3141 				nir_deref_as_struct(tail);
   3142 			offset = LLVMConstInt(ctx->ac.i32,
   3143 					      deref_struct->index, 0);
   3144 			break;
   3145 		}
   3146 		default:
   3147 			unreachable("bad deref type");
   3148 		}
   3149 		val = ac_build_gep0(&ctx->ac, val, offset);
   3150 		tail = tail->child;
   3151 	}
   3152 	return val;
   3153 }
   3154 
   3155 static LLVMValueRef load_tess_varyings(struct ac_nir_context *ctx,
   3156 				       nir_intrinsic_instr *instr,
   3157 				       bool load_inputs)
   3158 {
   3159 	LLVMValueRef result;
   3160 	LLVMValueRef vertex_index = NULL;
   3161 	LLVMValueRef indir_index = NULL;
   3162 	unsigned const_index = 0;
   3163 	unsigned location = instr->variables[0]->var->data.location;
   3164 	unsigned driver_location = instr->variables[0]->var->data.driver_location;
   3165 	const bool is_patch =  instr->variables[0]->var->data.patch;
   3166 	const bool is_compact = instr->variables[0]->var->data.compact;
   3167 
   3168 	get_deref_offset(ctx, instr->variables[0],
   3169 			 false, NULL, is_patch ? NULL : &vertex_index,
   3170 			 &const_index, &indir_index);
   3171 
   3172 	result = ctx->abi->load_tess_varyings(ctx->abi, vertex_index, indir_index,
   3173 					      const_index, location, driver_location,
   3174 					      instr->variables[0]->var->data.location_frac,
   3175 					      instr->num_components,
   3176 					      is_patch, is_compact, load_inputs);
   3177 	return LLVMBuildBitCast(ctx->ac.builder, result, get_def_type(ctx, &instr->dest.ssa), "");
   3178 }
   3179 
/* Lower a NIR load_var intrinsic.
 *
 * Dispatches on the variable mode: shader inputs (with per-stage special
 * cases for tessellation and geometry), function-local temporaries,
 * LDS-backed shared variables, and shader outputs.  The gathered channels
 * are bitcast to the destination SSA type before returning.
 */
static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
				   nir_intrinsic_instr *instr)
{
	LLVMValueRef values[8];
	int idx = instr->variables[0]->var->data.driver_location;
	int ve = instr->dest.ssa.num_components;
	unsigned comp = instr->variables[0]->var->data.location_frac;
	LLVMValueRef indir_index;
	LLVMValueRef ret;
	unsigned const_index;
	/* Compact variables pack one scalar per slot instead of a vec4. */
	unsigned stride = instr->variables[0]->var->data.compact ? 1 : 4;
	bool vs_in = ctx->stage == MESA_SHADER_VERTEX &&
	             instr->variables[0]->var->data.mode == nir_var_shader_in;
	get_deref_offset(ctx, instr->variables[0], vs_in, NULL, NULL,
				      &const_index, &indir_index);

	/* 64-bit components occupy two 32-bit channels each. */
	if (instr->dest.ssa.bit_size == 64)
		ve *= 2;

	switch (instr->variables[0]->var->data.mode) {
	case nir_var_shader_in:
		/* Tess stages fetch their inputs through the varying loader. */
		if (ctx->stage == MESA_SHADER_TESS_CTRL ||
		    ctx->stage == MESA_SHADER_TESS_EVAL) {
			return load_tess_varyings(ctx, instr, true);
		}

		if (ctx->stage == MESA_SHADER_GEOMETRY) {
				LLVMValueRef indir_index;
				unsigned const_index, vertex_index;
				/* GS inputs are addressed per incoming vertex,
				 * so re-derive the offset with a vertex index. */
				get_deref_offset(ctx, instr->variables[0],
						 false, &vertex_index, NULL,
						 &const_index, &indir_index);
			return ctx->abi->load_inputs(ctx->abi, instr->variables[0]->var->data.location,
						     instr->variables[0]->var->data.driver_location,
						     instr->variables[0]->var->data.location_frac, ve,
						     vertex_index, const_index,
						     nir2llvmtype(ctx, instr->variables[0]->var->type));
		}

		for (unsigned chan = comp; chan < ve + comp; chan++) {
			if (indir_index) {
				/* Dynamic indexing: gather every candidate
				 * channel into a vector, then extract the
				 * dynamically selected element. */
				unsigned count = glsl_count_attribute_slots(
						instr->variables[0]->var->type,
						ctx->stage == MESA_SHADER_VERTEX);
				count -= chan / 4;
				LLVMValueRef tmp_vec = ac_build_gather_values_extended(
						&ctx->ac, ctx->abi->inputs + idx + chan, count,
						stride, false, true);

				values[chan] = LLVMBuildExtractElement(ctx->ac.builder,
								       tmp_vec,
								       indir_index, "");
			} else
				values[chan] = ctx->abi->inputs[idx + chan + const_index * stride];
		}
		break;
	case nir_var_local:
		/* Locals live in allocas; load each channel separately. */
		for (unsigned chan = 0; chan < ve; chan++) {
			if (indir_index) {
				unsigned count = glsl_count_attribute_slots(
					instr->variables[0]->var->type, false);
				count -= chan / 4;
				LLVMValueRef tmp_vec = ac_build_gather_values_extended(
						&ctx->ac, ctx->locals + idx + chan, count,
						stride, true, true);

				values[chan] = LLVMBuildExtractElement(ctx->ac.builder,
								       tmp_vec,
								       indir_index, "");
			} else {
				values[chan] = LLVMBuildLoad(ctx->ac.builder, ctx->locals[idx + chan + const_index * stride], "");
			}
		}
		break;
	case nir_var_shared: {
		/* Shared (LDS) variables are addressed via a GEP on the
		 * deref chain; a single load covers the whole value. */
		LLVMValueRef address = build_gep_for_deref(ctx,
							   instr->variables[0]);
		LLVMValueRef val = LLVMBuildLoad(ctx->ac.builder, address, "");
		return LLVMBuildBitCast(ctx->ac.builder, val,
					get_def_type(ctx, &instr->dest.ssa),
					"");
	}
	case nir_var_shader_out:
		/* TCS outputs are read back through the varying loader. */
		if (ctx->stage == MESA_SHADER_TESS_CTRL) {
			return load_tess_varyings(ctx, instr, false);
		}

		for (unsigned chan = comp; chan < ve + comp; chan++) {
			if (indir_index) {
				unsigned count = glsl_count_attribute_slots(
						instr->variables[0]->var->type, false);
				count -= chan / 4;
				LLVMValueRef tmp_vec = ac_build_gather_values_extended(
						&ctx->ac, ctx->outputs + idx + chan, count,
						stride, true, true);

				values[chan] = LLVMBuildExtractElement(ctx->ac.builder,
								       tmp_vec,
								       indir_index, "");
			} else {
				values[chan] = LLVMBuildLoad(ctx->ac.builder,
						     ctx->outputs[idx + chan + const_index * stride],
						     "");
			}
		}
		break;
	default:
		unreachable("unhandle variable mode");
	}
	ret = ac_build_varying_gather_values(&ctx->ac, values, ve, comp);
	return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), "");
}
   3292 
/* Lower a NIR store_var intrinsic.
 *
 * Writes the source value into shader outputs, local temporaries, or
 * shared (LDS) storage, honoring the per-channel writemask in
 * const_index[0].  TCS outputs are forwarded to the ABI callback since
 * they live in LDS/ring memory rather than allocas.
 */
static void
visit_store_var(struct ac_nir_context *ctx,
		nir_intrinsic_instr *instr)
{
	LLVMValueRef temp_ptr, value;
	int idx = instr->variables[0]->var->data.driver_location;
	unsigned comp = instr->variables[0]->var->data.location_frac;
	LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[0]));
	/* Shift the writemask to account for the component offset. */
	int writemask = instr->const_index[0] << comp;
	LLVMValueRef indir_index;
	unsigned const_index;
	get_deref_offset(ctx, instr->variables[0], false,
		         NULL, NULL, &const_index, &indir_index);

	if (get_elem_bits(&ctx->ac, LLVMTypeOf(src)) == 64) {

		/* Split 64-bit components into pairs of 32-bit channels and
		 * widen the writemask accordingly. */
		src = LLVMBuildBitCast(ctx->ac.builder, src,
		                       LLVMVectorType(ctx->ac.f32, ac_get_llvm_num_components(src) * 2),
		                       "");

		writemask = widen_mask(writemask, 2);
	}

	switch (instr->variables[0]->var->data.mode) {
	case nir_var_shader_out:

		if (ctx->stage == MESA_SHADER_TESS_CTRL) {
			LLVMValueRef vertex_index = NULL;
			LLVMValueRef indir_index = NULL;
			unsigned const_index = 0;
			const bool is_patch = instr->variables[0]->var->data.patch;

			/* Per-vertex TCS outputs need the vertex index;
			 * patch outputs do not. */
			get_deref_offset(ctx, instr->variables[0],
					 false, NULL, is_patch ? NULL : &vertex_index,
					 &const_index, &indir_index);

			ctx->abi->store_tcs_outputs(ctx->abi, instr->variables[0]->var,
						    vertex_index, indir_index,
						    const_index, src, writemask);
			return;
		}

		/* Up to 8 channels after 64-bit widening. */
		for (unsigned chan = 0; chan < 8; chan++) {
			int stride = 4;
			if (!(writemask & (1 << chan)))
				continue;

			value = ac_llvm_extract_elem(&ctx->ac, src, chan - comp);

			/* Compact variables pack one scalar per slot. */
			if (instr->variables[0]->var->data.compact)
				stride = 1;
			if (indir_index) {
				/* Dynamic indexing: read-modify-write the
				 * gathered vector of candidate channels. */
				unsigned count = glsl_count_attribute_slots(
						instr->variables[0]->var->type, false);
				count -= chan / 4;
				LLVMValueRef tmp_vec = ac_build_gather_values_extended(
						&ctx->ac, ctx->outputs + idx + chan, count,
						stride, true, true);

				tmp_vec = LLVMBuildInsertElement(ctx->ac.builder, tmp_vec,
							         value, indir_index, "");
				build_store_values_extended(&ctx->ac, ctx->outputs + idx + chan,
							    count, stride, tmp_vec);

			} else {
				temp_ptr = ctx->outputs[idx + chan + const_index * stride];

				LLVMBuildStore(ctx->ac.builder, value, temp_ptr);
			}
		}
		break;
	case nir_var_local:
		for (unsigned chan = 0; chan < 8; chan++) {
			if (!(writemask & (1 << chan)))
				continue;

			value = ac_llvm_extract_elem(&ctx->ac, src, chan);
			if (indir_index) {
				unsigned count = glsl_count_attribute_slots(
					instr->variables[0]->var->type, false);
				count -= chan / 4;
				LLVMValueRef tmp_vec = ac_build_gather_values_extended(
					&ctx->ac, ctx->locals + idx + chan, count,
					4, true, true);

				tmp_vec = LLVMBuildInsertElement(ctx->ac.builder, tmp_vec,
								 value, indir_index, "");
				build_store_values_extended(&ctx->ac, ctx->locals + idx + chan,
							    count, 4, tmp_vec);
			} else {
				temp_ptr = ctx->locals[idx + chan + const_index * 4];

				LLVMBuildStore(ctx->ac.builder, value, temp_ptr);
			}
		}
		break;
	case nir_var_shared: {
		/* Shared storage uses the unshifted writemask: components
		 * are addressed through the deref GEP, not location_frac. */
		int writemask = instr->const_index[0];
		LLVMValueRef address = build_gep_for_deref(ctx,
							   instr->variables[0]);
		LLVMValueRef val = get_src(ctx, instr->src[0]);
		unsigned components =
			glsl_get_vector_elements(
			   nir_deref_tail(&instr->variables[0]->deref)->type);
		if (writemask == (1 << components) - 1) {
			/* Full write: one store of the whole vector. */
			val = LLVMBuildBitCast(
			   ctx->ac.builder, val,
			   LLVMGetElementType(LLVMTypeOf(address)), "");
			LLVMBuildStore(ctx->ac.builder, val, address);
		} else {
			/* Partial write: store the selected channels
			 * individually through struct GEPs. */
			for (unsigned chan = 0; chan < 4; chan++) {
				if (!(writemask & (1 << chan)))
					continue;
				LLVMValueRef ptr =
					LLVMBuildStructGEP(ctx->ac.builder,
							   address, chan, "");
				LLVMValueRef src = ac_llvm_extract_elem(&ctx->ac, val,
									chan);
				src = LLVMBuildBitCast(
				   ctx->ac.builder, src,
				   LLVMGetElementType(LLVMTypeOf(ptr)), "");
				LLVMBuildStore(ctx->ac.builder, src, ptr);
			}
		}
		break;
	}
	default:
		break;
	}
}
   3423 
   3424 static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
   3425 {
   3426 	switch (dim) {
   3427 	case GLSL_SAMPLER_DIM_BUF:
   3428 		return 1;
   3429 	case GLSL_SAMPLER_DIM_1D:
   3430 		return array ? 2 : 1;
   3431 	case GLSL_SAMPLER_DIM_2D:
   3432 		return array ? 3 : 2;
   3433 	case GLSL_SAMPLER_DIM_MS:
   3434 		return array ? 4 : 3;
   3435 	case GLSL_SAMPLER_DIM_3D:
   3436 	case GLSL_SAMPLER_DIM_CUBE:
   3437 		return 3;
   3438 	case GLSL_SAMPLER_DIM_RECT:
   3439 	case GLSL_SAMPLER_DIM_SUBPASS:
   3440 		return 2;
   3441 	case GLSL_SAMPLER_DIM_SUBPASS_MS:
   3442 		return 3;
   3443 	default:
   3444 		break;
   3445 	}
   3446 	return 0;
   3447 }
   3448 
   3449 
   3450 
/* Adjust the sample index according to FMASK.
 *
 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
 * which is the identity mapping. Each nibble says which physical sample
 * should be fetched to get that sample.
 *
 * For example, 0x11111100 means there are only 2 samples stored and
 * the second sample covers 3/4 of the pixel. When reading samples 0
 * and 1, return physical sample 0 (determined by the first two 0s
 * in FMASK), otherwise return physical sample 1.
 *
 * The sample index should be adjusted as follows:
 *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
 */
static LLVMValueRef adjust_sample_index_using_fmask(struct ac_llvm_context *ctx,
						    LLVMValueRef coord_x, LLVMValueRef coord_y,
						    LLVMValueRef coord_z,
						    LLVMValueRef sample_index,
						    LLVMValueRef fmask_desc_ptr)
{
	LLVMValueRef fmask_load_address[4];
	LLVMValueRef res;

	/* Build the FMASK texel address; coord_z is the layer for arrays
	 * (NULL for non-array surfaces). */
	fmask_load_address[0] = coord_x;
	fmask_load_address[1] = coord_y;
	if (coord_z) {
		fmask_load_address[2] = coord_z;
		fmask_load_address[3] = LLVMGetUndef(ctx->i32);
	}

	struct ac_image_args args = {0};

	args.opcode = ac_image_load;
	args.da = coord_z ? true : false;
	args.resource = fmask_desc_ptr;
	args.dmask = 0xf;
	args.addr = ac_build_gather_values(ctx, fmask_load_address, coord_z ? 4 : 2);

	res = ac_build_image_opcode(ctx, &args);

	res = ac_to_integer(ctx, res);
	LLVMValueRef four = LLVMConstInt(ctx->i32, 4, false);
	LLVMValueRef F = LLVMConstInt(ctx->i32, 0xf, false);

	/* The FMASK word is in the first channel of the loaded texel. */
	LLVMValueRef fmask = LLVMBuildExtractElement(ctx->builder,
						     res,
						     ctx->i32_0, "");

	/* sample_index = (fmask >> (sample_index * 4)) & 0xF */
	LLVMValueRef sample_index4 =
		LLVMBuildMul(ctx->builder, sample_index, four, "");
	LLVMValueRef shifted_fmask =
		LLVMBuildLShr(ctx->builder, fmask, sample_index4, "");
	LLVMValueRef final_sample =
		LLVMBuildAnd(ctx->builder, shifted_fmask, F, "");

	/* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
	 * resource descriptor is 0 (invalid),
	 */
	LLVMValueRef fmask_desc =
		LLVMBuildBitCast(ctx->builder, fmask_desc_ptr,
				 ctx->v8i32, "");

	LLVMValueRef fmask_word1 =
		LLVMBuildExtractElement(ctx->builder, fmask_desc,
					ctx->i32_1, "");

	LLVMValueRef word1_is_nonzero =
		LLVMBuildICmp(ctx->builder, LLVMIntNE,
			      fmask_word1, ctx->i32_0, "");

	/* Replace the MSAA sample index. */
	sample_index =
		LLVMBuildSelect(ctx->builder, word1_is_nonzero,
				final_sample, sample_index, "");
	return sample_index;
}
   3527 
/* Assemble the coordinate vector for an image intrinsic from src[0].
 *
 * Handles layer coordinates for arrays, fragment-position offsets for
 * subpass attachments, FMASK-based sample index remapping for MSAA
 * images, and the GFX9 quirk that treats 1D images as 2D.  Pads a
 * 3-component address to 4 as required by the image intrinsics.
 */
static LLVMValueRef get_image_coords(struct ac_nir_context *ctx,
				     const nir_intrinsic_instr *instr)
{
	const struct glsl_type *type = glsl_without_array(instr->variables[0]->var->type);

	LLVMValueRef src0 = get_src(ctx, instr->src[0]);
	LLVMValueRef coords[4];
	LLVMValueRef masks[] = {
		LLVMConstInt(ctx->ac.i32, 0, false), LLVMConstInt(ctx->ac.i32, 1, false),
		LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false),
	};
	LLVMValueRef res;
	/* src[1] carries the sample index for MS images. */
	LLVMValueRef sample_index = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[1]), 0);

	int count;
	enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
	bool is_array = glsl_sampler_type_is_array(type);
	/* Subpass attachments are addressed relative to the fragment position. */
	bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS ||
			     dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
	bool is_ms = (dim == GLSL_SAMPLER_DIM_MS ||
		      dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
	/* GFX9 has no native 1D images; they are laid out as 2D. */
	bool gfx9_1d = ctx->ac.chip_class >= GFX9 && dim == GLSL_SAMPLER_DIM_1D;
	count = image_type_to_components_count(dim, is_array);

	if (is_ms) {
		/* Remap the sample index through FMASK for compressed MSAA. */
		LLVMValueRef fmask_load_address[3];
		int chan;

		fmask_load_address[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], "");
		fmask_load_address[1] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[1], "");
		if (is_array)
			fmask_load_address[2] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[2], "");
		else
			fmask_load_address[2] = NULL;
		if (add_frag_pos) {
			for (chan = 0; chan < 2; ++chan)
				fmask_load_address[chan] =
					LLVMBuildAdd(ctx->ac.builder, fmask_load_address[chan],
						LLVMBuildFPToUI(ctx->ac.builder, ctx->abi->frag_pos[chan],
								ctx->ac.i32, ""), "");
			fmask_load_address[2] = ac_to_integer(&ctx->ac, ctx->abi->inputs[radeon_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)]);
		}
		sample_index = adjust_sample_index_using_fmask(&ctx->ac,
							       fmask_load_address[0],
							       fmask_load_address[1],
							       fmask_load_address[2],
							       sample_index,
							       get_sampler_desc(ctx, instr->variables[0], AC_DESC_FMASK, NULL, true, false));
	}
	if (count == 1 && !gfx9_1d) {
		/* Single coordinate: pass it through as a scalar. */
		if (instr->src[0].ssa->num_components)
			res = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], "");
		else
			res = src0;
	} else {
		int chan;
		/* The sample index is appended separately below. */
		if (is_ms)
			count--;
		for (chan = 0; chan < count; ++chan) {
			coords[chan] = ac_llvm_extract_elem(&ctx->ac, src0, chan);
		}
		if (add_frag_pos) {
			/* Offset by the fragment position and use the
			 * current layer for subpass attachments. */
			for (chan = 0; chan < 2; ++chan)
				coords[chan] = LLVMBuildAdd(ctx->ac.builder, coords[chan], LLVMBuildFPToUI(ctx->ac.builder, ctx->abi->frag_pos[chan],
						ctx->ac.i32, ""), "");
			coords[2] = ac_to_integer(&ctx->ac, ctx->abi->inputs[radeon_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)]);
			count++;
		}

		if (gfx9_1d) {
			/* Insert y = 0: GFX9 addresses 1D images as 2D, with
			 * the layer moving to the z coordinate for arrays. */
			if (is_array) {
				coords[2] = coords[1];
				coords[1] = ctx->ac.i32_0;
			} else
				coords[1] = ctx->ac.i32_0;
			count++;
		}

		if (is_ms) {
			coords[count] = sample_index;
			count++;
		}

		/* Image intrinsics take a 1-, 2- or 4-component address. */
		if (count == 3) {
			coords[3] = LLVMGetUndef(ctx->ac.i32);
			count = 4;
		}
		res = ac_build_gather_values(&ctx->ac, coords, count);
	}
	return res;
}
   3619 
   3620 static LLVMValueRef get_image_buffer_descriptor(struct ac_nir_context *ctx,
   3621                                                 const nir_intrinsic_instr *instr, bool write)
   3622 {
   3623 	LLVMValueRef rsrc = get_sampler_desc(ctx, instr->variables[0], AC_DESC_BUFFER, NULL, true, write);
   3624 	if (ctx->abi->gfx9_stride_size_workaround) {
   3625 		LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 2, 0), "");
   3626 		LLVMValueRef stride = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 1, 0), "");
   3627 		stride = LLVMBuildLShr(ctx->ac.builder, stride, LLVMConstInt(ctx->ac.i32, 16, 0), "");
   3628 
   3629 		LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->ac.builder,
   3630 		                                              LLVMBuildICmp(ctx->ac.builder, LLVMIntUGT, elem_count, stride, ""),
   3631 		                                              elem_count, stride, "");
   3632 
   3633 		rsrc = LLVMBuildInsertElement(ctx->ac.builder, rsrc, new_elem_count,
   3634 		                              LLVMConstInt(ctx->ac.i32, 2, 0), "");
   3635 	}
   3636 	return rsrc;
   3637 }
   3638 
/* Lower a NIR image load intrinsic.
 *
 * Buffer images use llvm.amdgcn.buffer.load.format; all other
 * dimensionalities use the llvm.amdgcn.image.load family, with the
 * parameter layout depending on the LLVM version.  Returns the loaded
 * value as integers.
 */
static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
				     const nir_intrinsic_instr *instr)
{
	LLVMValueRef params[7];
	LLVMValueRef res;
	char intrinsic_name[64];
	const nir_variable *var = instr->variables[0]->var;
	const struct glsl_type *type = var->type;

	/* For array-of-image derefs, use the dereferenced element type. */
	if(instr->variables[0]->deref.child)
		type = instr->variables[0]->deref.child->type;

	type = glsl_without_array(type);
	if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
		params[0] = get_image_buffer_descriptor(ctx, instr, false);
		params[1] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[0]),
						    ctx->ac.i32_0, ""); /* vindex */
		params[2] = ctx->ac.i32_0; /* voffset */
		params[3] = ctx->ac.i1false;  /* glc */
		params[4] = ctx->ac.i1false;  /* slc */
		res = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.buffer.load.format.v4f32", ctx->ac.v4f32,
					 params, 5, 0);

		/* Drop unused trailing channels and convert to integers. */
		res = trim_vector(&ctx->ac, res, instr->dest.ssa.num_components);
		res = ac_to_integer(&ctx->ac, res);
	} else {
		/* "da" (declare array) covers arrays, cubes, 3D and
		 * subpass attachments. */
		bool is_da = glsl_sampler_type_is_array(type) ||
			     glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE ||
			     glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_3D ||
			     glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_SUBPASS ||
			     glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_SUBPASS_MS;
		LLVMValueRef da = is_da ? ctx->ac.i1true : ctx->ac.i1false;
		LLVMValueRef glc = ctx->ac.i1false;
		LLVMValueRef slc = ctx->ac.i1false;

		params[0] = get_image_coords(ctx, instr);
		params[1] = get_sampler_desc(ctx, instr->variables[0], AC_DESC_IMAGE, NULL, true, false);
		params[2] = LLVMConstInt(ctx->ac.i32, 15, false); /* dmask */
		/* The trailing i1 flags changed between LLVM versions. */
		if (HAVE_LLVM <= 0x0309) {
			params[3] = ctx->ac.i1false;  /* r128 */
			params[4] = da;
			params[5] = glc;
			params[6] = slc;
		} else {
			LLVMValueRef lwe = ctx->ac.i1false;
			params[3] = glc;
			params[4] = slc;
			params[5] = lwe;
			params[6] = da;
		}

		ac_get_image_intr_name("llvm.amdgcn.image.load",
				       ctx->ac.v4f32, /* vdata */
				       LLVMTypeOf(params[0]), /* coords */
				       LLVMTypeOf(params[1]), /* rsrc */
				       intrinsic_name, sizeof(intrinsic_name));

		res = ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->ac.v4f32,
					 params, 7, AC_FUNC_ATTR_READONLY);
	}
	return ac_to_integer(&ctx->ac, res);
}
   3701 
/* Lower a NIR image store intrinsic.
 *
 * The value to store is in src[2] and the coordinates in src[0].
 * Buffer images use llvm.amdgcn.buffer.store.format; everything else
 * uses the llvm.amdgcn.image.store family.
 */
static void visit_image_store(struct ac_nir_context *ctx,
			      nir_intrinsic_instr *instr)
{
	LLVMValueRef params[8];
	char intrinsic_name[64];
	const nir_variable *var = instr->variables[0]->var;
	const struct glsl_type *type = glsl_without_array(var->type);
	LLVMValueRef glc = ctx->ac.i1false;
	/* GLC is forced on SI for image stores -- NOTE(review): presumably
	 * a coherency workaround on that generation; confirm upstream. */
	bool force_glc = ctx->ac.chip_class == SI;
	if (force_glc)
		glc = ctx->ac.i1true;

	if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
		LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, true);

		params[0] = ac_to_float(&ctx->ac, get_src(ctx, instr->src[2])); /* data */
		params[1] = rsrc;
		params[2] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[0]),
						    ctx->ac.i32_0, ""); /* vindex */
		params[3] = ctx->ac.i32_0; /* voffset */
		params[4] = glc;  /* glc */
		params[5] = ctx->ac.i1false;  /* slc */
		ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.buffer.store.format.v4f32", ctx->ac.voidt,
				   params, 6, 0);
	} else {
		/* "da" covers arrays, cubes and 3D images. */
		bool is_da = glsl_sampler_type_is_array(type) ||
			     glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE ||
			     glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_3D;
		LLVMValueRef da = is_da ? ctx->ac.i1true : ctx->ac.i1false;
		LLVMValueRef slc = ctx->ac.i1false;

		params[0] = ac_to_float(&ctx->ac, get_src(ctx, instr->src[2]));
		params[1] = get_image_coords(ctx, instr); /* coords */
		params[2] = get_sampler_desc(ctx, instr->variables[0], AC_DESC_IMAGE, NULL, true, true);
		params[3] = LLVMConstInt(ctx->ac.i32, 15, false); /* dmask */
		/* The trailing i1 flags changed between LLVM versions. */
		if (HAVE_LLVM <= 0x0309) {
			params[4] = ctx->ac.i1false;  /* r128 */
			params[5] = da;
			params[6] = glc;
			params[7] = slc;
		} else {
			LLVMValueRef lwe = ctx->ac.i1false;
			params[4] = glc;
			params[5] = slc;
			params[6] = lwe;
			params[7] = da;
		}

		ac_get_image_intr_name("llvm.amdgcn.image.store",
				       LLVMTypeOf(params[0]), /* vdata */
				       LLVMTypeOf(params[1]), /* coords */
				       LLVMTypeOf(params[2]), /* rsrc */
				       intrinsic_name, sizeof(intrinsic_name));

		ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->ac.voidt,
				   params, 8, 0);
	}

}
   3761 
/* Lower a NIR image atomic intrinsic.
 *
 * Maps the NIR atomic opcode to the llvm.amdgcn.{buffer,image}.atomic.*
 * intrinsic name and packs the parameters in the order the intrinsic
 * expects: [compare,] data, rsrc/coords, then addressing flags.
 * Returns the pre-operation value (i32).
 */
static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx,
                                       const nir_intrinsic_instr *instr)
{
	LLVMValueRef params[7];
	int param_count = 0;
	const nir_variable *var = instr->variables[0]->var;

	const char *atomic_name;
	char intrinsic_name[41];
	const struct glsl_type *type = glsl_without_array(var->type);
	MAYBE_UNUSED int length;

	/* min/max pick the signed or unsigned flavor from the result type. */
	bool is_unsigned = glsl_get_sampler_result_type(type) == GLSL_TYPE_UINT;

	switch (instr->intrinsic) {
	case nir_intrinsic_image_atomic_add:
		atomic_name = "add";
		break;
	case nir_intrinsic_image_atomic_min:
		atomic_name = is_unsigned ? "umin" : "smin";
		break;
	case nir_intrinsic_image_atomic_max:
		atomic_name = is_unsigned ? "umax" : "smax";
		break;
	case nir_intrinsic_image_atomic_and:
		atomic_name = "and";
		break;
	case nir_intrinsic_image_atomic_or:
		atomic_name = "or";
		break;
	case nir_intrinsic_image_atomic_xor:
		atomic_name = "xor";
		break;
	case nir_intrinsic_image_atomic_exchange:
		atomic_name = "swap";
		break;
	case nir_intrinsic_image_atomic_comp_swap:
		atomic_name = "cmpswap";
		break;
	default:
		abort();
	}

	/* cmpswap takes the comparison value (src[3]) before the data. */
	if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap)
		params[param_count++] = get_src(ctx, instr->src[3]);
	params[param_count++] = get_src(ctx, instr->src[2]);

	if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
		params[param_count++] = get_image_buffer_descriptor(ctx, instr, true);
		params[param_count++] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[0]),
								ctx->ac.i32_0, ""); /* vindex */
		params[param_count++] = ctx->ac.i32_0; /* voffset */
		params[param_count++] = ctx->ac.i1false;  /* slc */

		length = snprintf(intrinsic_name, sizeof(intrinsic_name),
				  "llvm.amdgcn.buffer.atomic.%s", atomic_name);
	} else {
		char coords_type[8];

		bool da = glsl_sampler_type_is_array(type) ||
		          glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE;

		LLVMValueRef coords = params[param_count++] = get_image_coords(ctx, instr);
		params[param_count++] = get_sampler_desc(ctx, instr->variables[0], AC_DESC_IMAGE,
							 NULL, true, true);
		params[param_count++] = ctx->ac.i1false; /* r128 */
		params[param_count++] = da ? ctx->ac.i1true : ctx->ac.i1false;      /* da */
		params[param_count++] = ctx->ac.i1false;  /* slc */

		/* The intrinsic name is suffixed with the coord type. */
		build_int_type_name(LLVMTypeOf(coords),
				    coords_type, sizeof(coords_type));

		length = snprintf(intrinsic_name, sizeof(intrinsic_name),
				  "llvm.amdgcn.image.atomic.%s.%s", atomic_name, coords_type);
	}

	assert(length < sizeof(intrinsic_name));
	return ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->ac.i32, params, param_count, 0);
}
   3841 
/* Lower a NIR image size query.
 *
 * Buffer images return the buffer element count; other images issue a
 * resinfo opcode and then fix up the result for cube arrays (layer
 * count is reported in cube faces, so divide by 6) and GFX9 1D arrays
 * (layer count is in the z channel because 1D is laid out as 2D).
 */
static LLVMValueRef visit_image_size(struct ac_nir_context *ctx,
				     const nir_intrinsic_instr *instr)
{
	LLVMValueRef res;
	const nir_variable *var = instr->variables[0]->var;
	const struct glsl_type *type = instr->variables[0]->var->type;
	bool da = glsl_sampler_type_is_array(var->type) ||
		  glsl_get_sampler_dim(var->type) == GLSL_SAMPLER_DIM_CUBE ||
		  glsl_get_sampler_dim(var->type) == GLSL_SAMPLER_DIM_3D;
	/* For array-of-image derefs, query the dereferenced element type. */
	if(instr->variables[0]->deref.child)
		type = instr->variables[0]->deref.child->type;

	if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF)
		return get_buffer_size(ctx,
			get_sampler_desc(ctx, instr->variables[0],
					 AC_DESC_BUFFER, NULL, true, false), true);

	struct ac_image_args args = { 0 };

	args.da = da;
	args.dmask = 0xf;
	args.resource = get_sampler_desc(ctx, instr->variables[0], AC_DESC_IMAGE, NULL, true, false);
	args.opcode = ac_image_get_resinfo;
	/* LOD 0. */
	args.addr = ctx->ac.i32_0;

	res = ac_build_image_opcode(&ctx->ac, &args);

	LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false);

	if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE &&
	    glsl_sampler_type_is_array(type)) {
		/* Cube arrays report faces, not layers: divide z by 6. */
		LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false);
		LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, res, two, "");
		z = LLVMBuildSDiv(ctx->ac.builder, z, six, "");
		res = LLVMBuildInsertElement(ctx->ac.builder, res, z, two, "");
	}
	if (ctx->ac.chip_class >= GFX9 &&
	    glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
	    glsl_sampler_type_is_array(type)) {
		/* GFX9 stores 1D arrays as 2D: move the layer count from
		 * the z channel into y where callers expect it. */
		LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, res, two, "");
		res = LLVMBuildInsertElement(ctx->ac.builder, res, layers,
						ctx->ac.i32_1, "");

	}
	return res;
}
   3888 
/* s_waitcnt immediate encodings: vmcnt occupies bits [3:0], expcnt
 * bits [6:4] and lgkmcnt bits [11:8].  NOOP_WAITCNT leaves every
 * counter at its maximum (wait for nothing); ANDing it with one of the
 * masks below zeroes that counter's field, i.e. waits until the
 * corresponding memory operations have completed. */
#define NOOP_WAITCNT 0xf7f
#define LGKM_CNT 0x07f
#define VM_CNT 0xf70
   3892 
   3893 static void emit_membar(struct nir_to_llvm_context *ctx,
   3894 			const nir_intrinsic_instr *instr)
   3895 {
   3896 	unsigned waitcnt = NOOP_WAITCNT;
   3897 
   3898 	switch (instr->intrinsic) {
   3899 	case nir_intrinsic_memory_barrier:
   3900 	case nir_intrinsic_group_memory_barrier:
   3901 		waitcnt &= VM_CNT & LGKM_CNT;
   3902 		break;
   3903 	case nir_intrinsic_memory_barrier_atomic_counter:
   3904 	case nir_intrinsic_memory_barrier_buffer:
   3905 	case nir_intrinsic_memory_barrier_image:
   3906 		waitcnt &= VM_CNT;
   3907 		break;
   3908 	case nir_intrinsic_memory_barrier_shared:
   3909 		waitcnt &= LGKM_CNT;
   3910 		break;
   3911 	default:
   3912 		break;
   3913 	}
   3914 	if (waitcnt != NOOP_WAITCNT)
   3915 		ac_build_waitcnt(&ctx->ac, waitcnt);
   3916 }
   3917 
   3918 static void emit_barrier(struct ac_llvm_context *ac, gl_shader_stage stage)
   3919 {
   3920 	/* SI only (thanks to a hw bug workaround):
   3921 	 * The real barrier instruction isnt needed, because an entire patch
   3922 	 * always fits into a single wave.
   3923 	 */
   3924 	if (ac->chip_class == SI && stage == MESA_SHADER_TESS_CTRL) {
   3925 		ac_build_waitcnt(ac, LGKM_CNT & VM_CNT);
   3926 		return;
   3927 	}
   3928 	ac_build_intrinsic(ac, "llvm.amdgcn.s.barrier",
   3929 			   ac->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT);
   3930 }
   3931 
   3932 static void emit_discard(struct ac_nir_context *ctx,
   3933 			 const nir_intrinsic_instr *instr)
   3934 {
   3935 	LLVMValueRef cond;
   3936 
   3937 	if (instr->intrinsic == nir_intrinsic_discard_if) {
   3938 		cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
   3939 				     get_src(ctx, instr->src[0]),
   3940 				     ctx->ac.i32_0, "");
   3941 	} else {
   3942 		assert(instr->intrinsic == nir_intrinsic_discard);
   3943 		cond = LLVMConstInt(ctx->ac.i1, false, 0);
   3944 	}
   3945 
   3946 	ac_build_kill_if_false(&ctx->ac, cond);
   3947 }
   3948 
   3949 static LLVMValueRef
   3950 visit_load_helper_invocation(struct ac_nir_context *ctx)
   3951 {
   3952 	LLVMValueRef result = ac_build_intrinsic(&ctx->ac,
   3953 						 "llvm.amdgcn.ps.live",
   3954 						 ctx->ac.i1, NULL, 0,
   3955 						 AC_FUNC_ATTR_READNONE);
   3956 	result = LLVMBuildNot(ctx->ac.builder, result, "");
   3957 	return LLVMBuildSExt(ctx->ac.builder, result, ctx->ac.i32, "");
   3958 }
   3959 
/* Compute the flat local invocation index as
 * (wave id within the threadgroup) * 64 + lane id.
 * The 0xfc0 mask keeps bits [11:6] of tg_size in place, which is
 * already the wave id scaled by the 64-lane wave size --
 * NOTE(review): field layout assumed from this usage; confirm against
 * the compute tg_size SGPR encoding.
 */
static LLVMValueRef
visit_load_local_invocation_index(struct nir_to_llvm_context *ctx)
{
	LLVMValueRef result;
	LLVMValueRef thread_id = ac_get_thread_id(&ctx->ac);
	result = LLVMBuildAnd(ctx->builder, ctx->tg_size,
			      LLVMConstInt(ctx->ac.i32, 0xfc0, false), "");

	return LLVMBuildAdd(ctx->builder, result, thread_id, "");
}
   3970 
/* Lower a NIR variable atomic (shared-variable atomics) to an LLVM
 * atomicrmw / cmpxchg on the GEP'd address of the variable.
 * Returns the original (pre-operation) value, or NULL for an
 * unrecognized intrinsic.
 */
static LLVMValueRef visit_var_atomic(struct nir_to_llvm_context *ctx,
				     const nir_intrinsic_instr *instr)
{
	LLVMValueRef ptr, result;
	LLVMValueRef src = get_src(ctx->nir, instr->src[0]);
	ptr = build_gep_for_deref(ctx->nir, instr->variables[0]);

	if (instr->intrinsic == nir_intrinsic_var_atomic_comp_swap) {
		/* compare = src[0], new value = src[1]. */
		LLVMValueRef src1 = get_src(ctx->nir, instr->src[1]);
		result = LLVMBuildAtomicCmpXchg(ctx->builder,
						ptr, src, src1,
						LLVMAtomicOrderingSequentiallyConsistent,
						LLVMAtomicOrderingSequentiallyConsistent,
						false);
	} else {
		/* Map the NIR atomic op onto the LLVM atomicrmw binop. */
		LLVMAtomicRMWBinOp op;
		switch (instr->intrinsic) {
		case nir_intrinsic_var_atomic_add:
			op = LLVMAtomicRMWBinOpAdd;
			break;
		case nir_intrinsic_var_atomic_umin:
			op = LLVMAtomicRMWBinOpUMin;
			break;
		case nir_intrinsic_var_atomic_umax:
			op = LLVMAtomicRMWBinOpUMax;
			break;
		case nir_intrinsic_var_atomic_imin:
			op = LLVMAtomicRMWBinOpMin;
			break;
		case nir_intrinsic_var_atomic_imax:
			op = LLVMAtomicRMWBinOpMax;
			break;
		case nir_intrinsic_var_atomic_and:
			op = LLVMAtomicRMWBinOpAnd;
			break;
		case nir_intrinsic_var_atomic_or:
			op = LLVMAtomicRMWBinOpOr;
			break;
		case nir_intrinsic_var_atomic_xor:
			op = LLVMAtomicRMWBinOpXor;
			break;
		case nir_intrinsic_var_atomic_exchange:
			op = LLVMAtomicRMWBinOpXchg;
			break;
		default:
			return NULL;
		}

		result = LLVMBuildAtomicRMW(ctx->builder, op, ptr, ac_to_integer(&ctx->ac, src),
					    LLVMAtomicOrderingSequentiallyConsistent,
					    false);
	}
	return result;
}
   4025 
   4026 #define INTERP_CENTER 0
   4027 #define INTERP_CENTROID 1
   4028 #define INTERP_SAMPLE 2
   4029 
   4030 static LLVMValueRef lookup_interp_param(struct nir_to_llvm_context *ctx,
   4031 					enum glsl_interp_mode interp, unsigned location)
   4032 {
   4033 	switch (interp) {
   4034 	case INTERP_MODE_FLAT:
   4035 	default:
   4036 		return NULL;
   4037 	case INTERP_MODE_SMOOTH:
   4038 	case INTERP_MODE_NONE:
   4039 		if (location == INTERP_CENTER)
   4040 			return ctx->persp_center;
   4041 		else if (location == INTERP_CENTROID)
   4042 			return ctx->persp_centroid;
   4043 		else if (location == INTERP_SAMPLE)
   4044 			return ctx->persp_sample;
   4045 		break;
   4046 	case INTERP_MODE_NOPERSPECTIVE:
   4047 		if (location == INTERP_CENTER)
   4048 			return ctx->linear_center;
   4049 		else if (location == INTERP_CENTROID)
   4050 			return ctx->linear_centroid;
   4051 		else if (location == INTERP_SAMPLE)
   4052 			return ctx->linear_sample;
   4053 		break;
   4054 	}
   4055 	return NULL;
   4056 }
   4057 
   4058 static LLVMValueRef load_sample_position(struct nir_to_llvm_context *ctx,
   4059 					 LLVMValueRef sample_id)
   4060 {
   4061 	LLVMValueRef result;
   4062 	LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_PS_SAMPLE_POSITIONS, false));
   4063 
   4064 	ptr = LLVMBuildBitCast(ctx->builder, ptr,
   4065 			       const_array(ctx->ac.v2f32, 64), "");
   4066 
   4067 	sample_id = LLVMBuildAdd(ctx->builder, sample_id, ctx->sample_pos_offset, "");
   4068 	result = ac_build_load_invariant(&ctx->ac, ptr, sample_id);
   4069 
   4070 	return result;
   4071 }
   4072 
   4073 static LLVMValueRef load_sample_pos(struct ac_nir_context *ctx)
   4074 {
   4075 	LLVMValueRef values[2];
   4076 
   4077 	values[0] = emit_ffract(&ctx->ac, ctx->abi->frag_pos[0]);
   4078 	values[1] = emit_ffract(&ctx->ac, ctx->abi->frag_pos[1]);
   4079 	return ac_build_gather_values(&ctx->ac, values, 2);
   4080 }
   4081 
   4082 static LLVMValueRef load_sample_mask_in(struct ac_nir_context *ctx)
   4083 {
   4084 	uint8_t log2_ps_iter_samples = ctx->nctx->shader_info->info.ps.force_persample ? ctx->nctx->options->key.fs.log2_num_samples : ctx->nctx->options->key.fs.log2_ps_iter_samples;
   4085 
   4086 	/* The bit pattern matches that used by fixed function fragment
   4087 	 * processing. */
   4088 	static const uint16_t ps_iter_masks[] = {
   4089 		0xffff, /* not used */
   4090 		0x5555,
   4091 		0x1111,
   4092 		0x0101,
   4093 		0x0001,
   4094 	};
   4095 	assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));
   4096 
   4097 	uint32_t ps_iter_mask = ps_iter_masks[log2_ps_iter_samples];
   4098 
   4099 	LLVMValueRef result, sample_id;
   4100 	sample_id = unpack_param(&ctx->ac, ctx->abi->ancillary, 8, 4);
   4101 	sample_id = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false), sample_id, "");
   4102 	result = LLVMBuildAnd(ctx->ac.builder, sample_id, ctx->abi->sample_coverage, "");
   4103 	return result;
   4104 }
   4105 
   4106 static LLVMValueRef visit_interp(struct nir_to_llvm_context *ctx,
   4107 				 const nir_intrinsic_instr *instr)
   4108 {
   4109 	LLVMValueRef result[4];
   4110 	LLVMValueRef interp_param, attr_number;
   4111 	unsigned location;
   4112 	unsigned chan;
   4113 	LLVMValueRef src_c0 = NULL;
   4114 	LLVMValueRef src_c1 = NULL;
   4115 	LLVMValueRef src0 = NULL;
   4116 	int input_index = instr->variables[0]->var->data.location - VARYING_SLOT_VAR0;
   4117 	switch (instr->intrinsic) {
   4118 	case nir_intrinsic_interp_var_at_centroid:
   4119 		location = INTERP_CENTROID;
   4120 		break;
   4121 	case nir_intrinsic_interp_var_at_sample:
   4122 	case nir_intrinsic_interp_var_at_offset:
   4123 		location = INTERP_CENTER;
   4124 		src0 = get_src(ctx->nir, instr->src[0]);
   4125 		break;
   4126 	default:
   4127 		break;
   4128 	}
   4129 
   4130 	if (instr->intrinsic == nir_intrinsic_interp_var_at_offset) {
   4131 		src_c0 = ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->builder, src0, ctx->ac.i32_0, ""));
   4132 		src_c1 = ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->builder, src0, ctx->ac.i32_1, ""));
   4133 	} else if (instr->intrinsic == nir_intrinsic_interp_var_at_sample) {
   4134 		LLVMValueRef sample_position;
   4135 		LLVMValueRef halfval = LLVMConstReal(ctx->ac.f32, 0.5f);
   4136 
   4137 		/* fetch sample ID */
   4138 		sample_position = load_sample_position(ctx, src0);
   4139 
   4140 		src_c0 = LLVMBuildExtractElement(ctx->builder, sample_position, ctx->ac.i32_0, "");
   4141 		src_c0 = LLVMBuildFSub(ctx->builder, src_c0, halfval, "");
   4142 		src_c1 = LLVMBuildExtractElement(ctx->builder, sample_position, ctx->ac.i32_1, "");
   4143 		src_c1 = LLVMBuildFSub(ctx->builder, src_c1, halfval, "");
   4144 	}
   4145 	interp_param = lookup_interp_param(ctx, instr->variables[0]->var->data.interpolation, location);
   4146 	attr_number = LLVMConstInt(ctx->ac.i32, input_index, false);
   4147 
   4148 	if (location == INTERP_CENTER) {
   4149 		LLVMValueRef ij_out[2];
   4150 		LLVMValueRef ddxy_out = emit_ddxy_interp(ctx->nir, interp_param);
   4151 
   4152 		/*
   4153 		 * take the I then J parameters, and the DDX/Y for it, and
   4154 		 * calculate the IJ inputs for the interpolator.
   4155 		 * temp1 = ddx * offset/sample.x + I;
   4156 		 * interp_param.I = ddy * offset/sample.y + temp1;
   4157 		 * temp1 = ddx * offset/sample.x + J;
   4158 		 * interp_param.J = ddy * offset/sample.y + temp1;
   4159 		 */
   4160 		for (unsigned i = 0; i < 2; i++) {
   4161 			LLVMValueRef ix_ll = LLVMConstInt(ctx->ac.i32, i, false);
   4162 			LLVMValueRef iy_ll = LLVMConstInt(ctx->ac.i32, i + 2, false);
   4163 			LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->builder,
   4164 								      ddxy_out, ix_ll, "");
   4165 			LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->builder,
   4166 								      ddxy_out, iy_ll, "");
   4167 			LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->builder,
   4168 									 interp_param, ix_ll, "");
   4169 			LLVMValueRef temp1, temp2;
   4170 
   4171 			interp_el = LLVMBuildBitCast(ctx->builder, interp_el,
   4172 						     ctx->ac.f32, "");
   4173 
   4174 			temp1 = LLVMBuildFMul(ctx->builder, ddx_el, src_c0, "");
   4175 			temp1 = LLVMBuildFAdd(ctx->builder, temp1, interp_el, "");
   4176 
   4177 			temp2 = LLVMBuildFMul(ctx->builder, ddy_el, src_c1, "");
   4178 			temp2 = LLVMBuildFAdd(ctx->builder, temp2, temp1, "");
   4179 
   4180 			ij_out[i] = LLVMBuildBitCast(ctx->builder,
   4181 						     temp2, ctx->ac.i32, "");
   4182 		}
   4183 		interp_param = ac_build_gather_values(&ctx->ac, ij_out, 2);
   4184 
   4185 	}
   4186 
   4187 	for (chan = 0; chan < 4; chan++) {
   4188 		LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
   4189 
   4190 		if (interp_param) {
   4191 			interp_param = LLVMBuildBitCast(ctx->builder,
   4192 							interp_param, ctx->ac.v2f32, "");
   4193 			LLVMValueRef i = LLVMBuildExtractElement(
   4194 				ctx->builder, interp_param, ctx->ac.i32_0, "");
   4195 			LLVMValueRef j = LLVMBuildExtractElement(
   4196 				ctx->builder, interp_param, ctx->ac.i32_1, "");
   4197 
   4198 			result[chan] = ac_build_fs_interp(&ctx->ac,
   4199 							  llvm_chan, attr_number,
   4200 							  ctx->prim_mask, i, j);
   4201 		} else {
   4202 			result[chan] = ac_build_fs_interp_mov(&ctx->ac,
   4203 							      LLVMConstInt(ctx->ac.i32, 2, false),
   4204 							      llvm_chan, attr_number,
   4205 							      ctx->prim_mask);
   4206 		}
   4207 	}
   4208 	return ac_build_varying_gather_values(&ctx->ac, result, instr->num_components,
   4209 					      instr->variables[0]->var->data.location_frac);
   4210 }
   4211 
/* GS EmitVertex() ABI hook: store the current values of all enabled GS
 * outputs to the GSVS ring buffer, then send the hardware "emit" message.
 * 'addrs' holds the output temporaries, 4 channel pointers per varying
 * slot.  Only stream 0 is supported (asserted below).
 */
static void
visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs)
{
	LLVMValueRef gs_next_vertex; /* vertices emitted so far by this thread */
	LLVMValueRef can_emit;
	int idx;
	struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);

	assert(stream == 0);

	/* Write vertex attribute values to GSVS ring */
	gs_next_vertex = LLVMBuildLoad(ctx->builder,
				       ctx->gs_next_vertex,
				       "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, kill it: excessive vertex emissions are not supposed to
	 * have any effect, and GS threads have no externally observable
	 * effects other than emitting vertices.
	 */
	can_emit = LLVMBuildICmp(ctx->builder, LLVMIntULT, gs_next_vertex,
				 LLVMConstInt(ctx->ac.i32, ctx->gs_max_out_vertices, false), "");
	ac_build_kill_if_false(&ctx->ac, can_emit);

	/* loop num outputs */
	idx = 0;
	for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
		LLVMValueRef *out_ptr = &addrs[i * 4];
		int length = 4;
		int slot = idx;
		int slot_inc = 1;

		/* Skip varyings the shader does not write. */
		if (!(ctx->output_mask & (1ull << i)))
			continue;

		if (i == VARYING_SLOT_CLIP_DIST0) {
			/* pack clip and cull into a single set of slots */
			length = ctx->num_output_clips + ctx->num_output_culls;
			if (length > 4)
				slot_inc = 2; /* spills into a second slot */
		}
		for (unsigned j = 0; j < length; j++) {
			LLVMValueRef out_val = LLVMBuildLoad(ctx->builder,
							     out_ptr[j], "");
			/* Ring layout per the expression below: each component
			 * occupies a run of gs_max_out_vertices dwords, indexed
			 * by the vertex counter, then scaled to bytes (*4). */
			LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, (slot * 4 + j) * ctx->gs_max_out_vertices, false);
			voffset = LLVMBuildAdd(ctx->builder, voffset, gs_next_vertex, "");
			voffset = LLVMBuildMul(ctx->builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), "");

			out_val = LLVMBuildBitCast(ctx->builder, out_val, ctx->ac.i32, "");

			ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring,
						    out_val, 1,
						    voffset, ctx->gs2vs_offset, 0,
						    1, 1, true, true);
		}
		idx += slot_inc;
	}

	/* Advance the per-thread emitted-vertex counter. */
	gs_next_vertex = LLVMBuildAdd(ctx->builder, gs_next_vertex,
				      ctx->ac.i32_1, "");
	LLVMBuildStore(ctx->builder, gs_next_vertex, ctx->gs_next_vertex);

	/* Notify the GS hardware that a vertex was emitted on stream 0. */
	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (0 << 8), ctx->gs_wave_id);
}
   4276 
   4277 static void
   4278 visit_end_primitive(struct ac_shader_abi *abi, unsigned stream)
   4279 {
   4280 	struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
   4281 	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), ctx->gs_wave_id);
   4282 }
   4283 
   4284 static LLVMValueRef
   4285 load_tess_coord(struct ac_shader_abi *abi, LLVMTypeRef type,
   4286 		unsigned num_components)
   4287 {
   4288 	struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
   4289 
   4290 	LLVMValueRef coord[4] = {
   4291 		ctx->tes_u,
   4292 		ctx->tes_v,
   4293 		ctx->ac.f32_0,
   4294 		ctx->ac.f32_0,
   4295 	};
   4296 
   4297 	if (ctx->tes_primitive_mode == GL_TRIANGLES)
   4298 		coord[2] = LLVMBuildFSub(ctx->builder, ctx->ac.f32_1,
   4299 					LLVMBuildFAdd(ctx->builder, coord[0], coord[1], ""), "");
   4300 
   4301 	LLVMValueRef result = ac_build_gather_values(&ctx->ac, coord, num_components);
   4302 	return LLVMBuildBitCast(ctx->builder, result, type, "");
   4303 }
   4304 
   4305 static LLVMValueRef
   4306 load_patch_vertices_in(struct ac_shader_abi *abi)
   4307 {
   4308 	struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
   4309 	return LLVMConstInt(ctx->ac.i32, ctx->options->key.tcs.input_vertices, false);
   4310 }
   4311 
/* Translate a single NIR intrinsic into LLVM IR.
 *
 * Dispatch is a flat switch: system-value loads come from the shader
 * ABI (ctx->abi) or radv-specific state (ctx->nctx); memory, image and
 * atomic intrinsics are forwarded to their visit_* helpers.  If the
 * intrinsic produces an SSA value, the result is registered in
 * ctx->defs at the bottom so later instructions can look it up.
 */
static void visit_intrinsic(struct ac_nir_context *ctx,
                            nir_intrinsic_instr *instr)
{
	LLVMValueRef result = NULL;

	switch (instr->intrinsic) {
	case nir_intrinsic_ballot:
		result = ac_build_ballot(&ctx->ac, get_src(ctx, instr->src[0]));
		break;
	case nir_intrinsic_read_invocation:
	case nir_intrinsic_read_first_invocation: {
		LLVMValueRef args[2];

		/* Value */
		args[0] = get_src(ctx, instr->src[0]);

		unsigned num_args;
		const char *intr_name;
		if (instr->intrinsic == nir_intrinsic_read_invocation) {
			num_args = 2;
			intr_name = "llvm.amdgcn.readlane";

			/* Invocation */
			args[1] = get_src(ctx, instr->src[1]);
		} else {
			num_args = 1;
			intr_name = "llvm.amdgcn.readfirstlane";
		}

		/* We currently have no other way to prevent LLVM from lifting the icmp
		 * calls to a dominating basic block.
		 */
		ac_build_optimization_barrier(&ctx->ac, &args[0]);

		result = ac_build_intrinsic(&ctx->ac, intr_name,
					    ctx->ac.i32, args, num_args,
					    AC_FUNC_ATTR_READNONE |
					    AC_FUNC_ATTR_CONVERGENT);
		break;
	}
	case nir_intrinsic_load_subgroup_invocation:
		result = ac_get_thread_id(&ctx->ac);
		break;
	/* --- System-value loads --- */
	case nir_intrinsic_load_work_group_id: {
		LLVMValueRef values[3];

		/* Missing workgroup-id components default to zero. */
		for (int i = 0; i < 3; i++) {
			values[i] = ctx->nctx->workgroup_ids[i] ?
				    ctx->nctx->workgroup_ids[i] : ctx->ac.i32_0;
		}

		result = ac_build_gather_values(&ctx->ac, values, 3);
		break;
	}
	case nir_intrinsic_load_base_vertex: {
		result = ctx->abi->base_vertex;
		break;
	}
	case nir_intrinsic_load_vertex_id_zero_base: {
		result = ctx->abi->vertex_id;
		break;
	}
	case nir_intrinsic_load_local_invocation_id: {
		result = ctx->nctx->local_invocation_ids;
		break;
	}
	case nir_intrinsic_load_base_instance:
		result = ctx->abi->start_instance;
		break;
	case nir_intrinsic_load_draw_id:
		result = ctx->abi->draw_id;
		break;
	case nir_intrinsic_load_view_index:
		result = ctx->nctx->view_index ? ctx->nctx->view_index : ctx->ac.i32_0;
		break;
	case nir_intrinsic_load_invocation_id:
		/* TCS packs the invocation id into tcs_rel_ids bits [12:8]. */
		if (ctx->stage == MESA_SHADER_TESS_CTRL)
			result = unpack_param(&ctx->ac, ctx->abi->tcs_rel_ids, 8, 5);
		else
			result = ctx->abi->gs_invocation_id;
		break;
	case nir_intrinsic_load_primitive_id:
		if (ctx->stage == MESA_SHADER_GEOMETRY) {
			result = ctx->abi->gs_prim_id;
		} else if (ctx->stage == MESA_SHADER_TESS_CTRL) {
			result = ctx->abi->tcs_patch_id;
		} else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
			result = ctx->abi->tes_patch_id;
		} else
			fprintf(stderr, "Unknown primitive id intrinsic: %d", ctx->stage);
		break;
	case nir_intrinsic_load_sample_id:
		result = unpack_param(&ctx->ac, ctx->abi->ancillary, 8, 4);
		break;
	case nir_intrinsic_load_sample_pos:
		result = load_sample_pos(ctx);
		break;
	case nir_intrinsic_load_sample_mask_in:
		/* ctx->nctx is radv-only state; without it, fall back to the
		 * raw coverage input. */
		if (ctx->nctx)
			result = load_sample_mask_in(ctx);
		else
			result = ctx->abi->sample_coverage;
		break;
	case nir_intrinsic_load_frag_coord: {
		/* gl_FragCoord.w is delivered as 1/w by the hardware inputs,
		 * per the reciprocal below. */
		LLVMValueRef values[4] = {
			ctx->abi->frag_pos[0],
			ctx->abi->frag_pos[1],
			ctx->abi->frag_pos[2],
			ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, ctx->abi->frag_pos[3])
		};
		result = ac_build_gather_values(&ctx->ac, values, 4);
		break;
	}
	case nir_intrinsic_load_front_face:
		result = ctx->abi->front_face;
		break;
	case nir_intrinsic_load_helper_invocation:
		result = visit_load_helper_invocation(ctx);
		break;
	case nir_intrinsic_load_instance_id:
		result = ctx->abi->instance_id;
		break;
	case nir_intrinsic_load_num_work_groups:
		result = ctx->nctx->num_work_groups;
		break;
	case nir_intrinsic_load_local_invocation_index:
		result = visit_load_local_invocation_index(ctx->nctx);
		break;
	/* --- Buffer / descriptor access --- */
	case nir_intrinsic_load_push_constant:
		result = visit_load_push_constant(ctx->nctx, instr);
		break;
	case nir_intrinsic_vulkan_resource_index:
		result = visit_vulkan_resource_index(ctx->nctx, instr);
		break;
	case nir_intrinsic_vulkan_resource_reindex:
		result = visit_vulkan_resource_reindex(ctx->nctx, instr);
		break;
	case nir_intrinsic_store_ssbo:
		visit_store_ssbo(ctx, instr);
		break;
	case nir_intrinsic_load_ssbo:
		result = visit_load_buffer(ctx, instr);
		break;
	case nir_intrinsic_ssbo_atomic_add:
	case nir_intrinsic_ssbo_atomic_imin:
	case nir_intrinsic_ssbo_atomic_umin:
	case nir_intrinsic_ssbo_atomic_imax:
	case nir_intrinsic_ssbo_atomic_umax:
	case nir_intrinsic_ssbo_atomic_and:
	case nir_intrinsic_ssbo_atomic_or:
	case nir_intrinsic_ssbo_atomic_xor:
	case nir_intrinsic_ssbo_atomic_exchange:
	case nir_intrinsic_ssbo_atomic_comp_swap:
		result = visit_atomic_ssbo(ctx, instr);
		break;
	case nir_intrinsic_load_ubo:
		result = visit_load_ubo_buffer(ctx, instr);
		break;
	case nir_intrinsic_get_buffer_size:
		result = visit_get_buffer_size(ctx, instr);
		break;
	case nir_intrinsic_load_var:
		result = visit_load_var(ctx, instr);
		break;
	case nir_intrinsic_store_var:
		visit_store_var(ctx, instr);
		break;
	/* --- Image access --- */
	case nir_intrinsic_image_load:
		result = visit_image_load(ctx, instr);
		break;
	case nir_intrinsic_image_store:
		visit_image_store(ctx, instr);
		break;
	case nir_intrinsic_image_atomic_add:
	case nir_intrinsic_image_atomic_min:
	case nir_intrinsic_image_atomic_max:
	case nir_intrinsic_image_atomic_and:
	case nir_intrinsic_image_atomic_or:
	case nir_intrinsic_image_atomic_xor:
	case nir_intrinsic_image_atomic_exchange:
	case nir_intrinsic_image_atomic_comp_swap:
		result = visit_image_atomic(ctx, instr);
		break;
	case nir_intrinsic_image_size:
		result = visit_image_size(ctx, instr);
		break;
	case nir_intrinsic_discard:
	case nir_intrinsic_discard_if:
		emit_discard(ctx, instr);
		break;
	/* --- Barriers --- */
	case nir_intrinsic_memory_barrier:
	case nir_intrinsic_group_memory_barrier:
	case nir_intrinsic_memory_barrier_atomic_counter:
	case nir_intrinsic_memory_barrier_buffer:
	case nir_intrinsic_memory_barrier_image:
	case nir_intrinsic_memory_barrier_shared:
		emit_membar(ctx->nctx, instr);
		break;
	case nir_intrinsic_barrier:
		emit_barrier(&ctx->ac, ctx->stage);
		break;
	case nir_intrinsic_var_atomic_add:
	case nir_intrinsic_var_atomic_imin:
	case nir_intrinsic_var_atomic_umin:
	case nir_intrinsic_var_atomic_imax:
	case nir_intrinsic_var_atomic_umax:
	case nir_intrinsic_var_atomic_and:
	case nir_intrinsic_var_atomic_or:
	case nir_intrinsic_var_atomic_xor:
	case nir_intrinsic_var_atomic_exchange:
	case nir_intrinsic_var_atomic_comp_swap:
		result = visit_var_atomic(ctx->nctx, instr);
		break;
	case nir_intrinsic_interp_var_at_centroid:
	case nir_intrinsic_interp_var_at_sample:
	case nir_intrinsic_interp_var_at_offset:
		result = visit_interp(ctx->nctx, instr);
		break;
	/* --- Geometry / tessellation --- */
	case nir_intrinsic_emit_vertex:
		ctx->abi->emit_vertex(ctx->abi, nir_intrinsic_stream_id(instr), ctx->outputs);
		break;
	case nir_intrinsic_end_primitive:
		ctx->abi->emit_primitive(ctx->abi, nir_intrinsic_stream_id(instr));
		break;
	case nir_intrinsic_load_tess_coord: {
		LLVMTypeRef type = ctx->nctx ?
			get_def_type(ctx->nctx->nir, &instr->dest.ssa) :
			NULL;
		result = ctx->abi->load_tess_coord(ctx->abi, type, instr->num_components);
		break;
	}
	case nir_intrinsic_load_tess_level_outer:
		result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_OUTER);
		break;
	case nir_intrinsic_load_tess_level_inner:
		result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_INNER);
		break;
	case nir_intrinsic_load_patch_vertices_in:
		result = ctx->abi->load_patch_vertices_in(ctx->abi);
		break;
	/* --- Subgroup votes: boolean result sign-extended to i32 --- */
	case nir_intrinsic_vote_all: {
		LLVMValueRef tmp = ac_build_vote_all(&ctx->ac, get_src(ctx, instr->src[0]));
		result = LLVMBuildSExt(ctx->ac.builder, tmp, ctx->ac.i32, "");
		break;
	}
	case nir_intrinsic_vote_any: {
		LLVMValueRef tmp = ac_build_vote_any(&ctx->ac, get_src(ctx, instr->src[0]));
		result = LLVMBuildSExt(ctx->ac.builder, tmp, ctx->ac.i32, "");
		break;
	}
	case nir_intrinsic_vote_eq: {
		LLVMValueRef tmp = ac_build_vote_eq(&ctx->ac, get_src(ctx, instr->src[0]));
		result = LLVMBuildSExt(ctx->ac.builder, tmp, ctx->ac.i32, "");
		break;
	}
	default:
		fprintf(stderr, "Unknown intrinsic: ");
		nir_print_instr(&instr->instr, stderr);
		fprintf(stderr, "\n");
		break;
	}
	if (result) {
		/* Register the SSA def so later uses can resolve it. */
		_mesa_hash_table_insert(ctx->defs, &instr->dest.ssa, result);
	}
}
   4577 
   4578 static LLVMValueRef radv_load_ssbo(struct ac_shader_abi *abi,
   4579 				   LLVMValueRef buffer_ptr, bool write)
   4580 {
   4581 	struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
   4582 
   4583 	if (write && ctx->stage == MESA_SHADER_FRAGMENT)
   4584 		ctx->shader_info->fs.writes_memory = true;
   4585 
   4586 	return LLVMBuildLoad(ctx->builder, buffer_ptr, "");
   4587 }
   4588 
   4589 static LLVMValueRef radv_load_ubo(struct ac_shader_abi *abi, LLVMValueRef buffer_ptr)
   4590 {
   4591 	struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
   4592 	LLVMValueRef result;
   4593 
   4594 	LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
   4595 
   4596 	result = LLVMBuildLoad(ctx->builder, buffer_ptr, "");
   4597 	LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);
   4598 
   4599 	return result;
   4600 }
   4601 
/* radv implementation of the load_sampler_desc ABI hook: fetch an
 * image/fmask/sampler/buffer descriptor for a given binding within a
 * descriptor set, with optional constant and dynamic array indexing.
 *
 * Per the offsets below, a combined image+sampler binding places the
 * fmask at +32 bytes and the sampler at +64 bytes from the image
 * descriptor.  Bindings with all-equal immutable samplers are inlined
 * as constants instead of being loaded from memory.
 */
static LLVMValueRef radv_get_sampler_desc(struct ac_shader_abi *abi,
					  unsigned descriptor_set,
					  unsigned base_index,
					  unsigned constant_index,
					  LLVMValueRef index,
					  enum ac_descriptor_type desc_type,
					  bool image, bool write)
{
	struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
	LLVMValueRef list = ctx->descriptor_sets[descriptor_set];
	struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout;
	struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
	unsigned offset = binding->offset;      /* byte offset of the binding in the set */
	unsigned stride = binding->size;        /* byte size of one array element */
	unsigned type_size;                     /* byte size of the descriptor we load */
	LLVMBuilderRef builder = ctx->builder;
	LLVMTypeRef type;

	assert(base_index < layout->binding_count);

	/* Track writable descriptor use from fragment shaders. */
	if (write && ctx->stage == MESA_SHADER_FRAGMENT)
		ctx->shader_info->fs.writes_memory = true;

	switch (desc_type) {
	case AC_DESC_IMAGE:
		type = ctx->ac.v8i32;
		type_size = 32;
		break;
	case AC_DESC_FMASK:
		type = ctx->ac.v8i32;
		offset += 32; /* fmask follows the image descriptor */
		type_size = 32;
		break;
	case AC_DESC_SAMPLER:
		type = ctx->ac.v4i32;
		if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
			offset += 64; /* sampler follows image + fmask */

		type_size = 16;
		break;
	case AC_DESC_BUFFER:
		type = ctx->ac.v4i32;
		type_size = 16;
		break;
	default:
		unreachable("invalid desc_type\n");
	}

	offset += constant_index * stride;

	/* Immutable samplers can be emitted as literal constants when the
	 * array index is constant, or when every element is identical. */
	if (desc_type == AC_DESC_SAMPLER && binding->immutable_samplers_offset &&
	    (!index || binding->immutable_samplers_equal)) {
		if (binding->immutable_samplers_equal)
			constant_index = 0;

		const uint32_t *samplers = radv_immutable_samplers(layout, binding);

		/* A sampler descriptor is 4 dwords. */
		LLVMValueRef constants[] = {
			LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 0], 0),
			LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 1], 0),
			LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 2], 0),
			LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 3], 0),
		};
		return ac_build_gather_values(&ctx->ac, constants, 4);
	}

	assert(stride % type_size == 0);

	if (!index)
		index = ctx->ac.i32_0;

	/* Scale the dynamic index from array elements to descriptors. */
	index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, stride / type_size, 0), "");

	list = ac_build_gep0(&ctx->ac, list, LLVMConstInt(ctx->ac.i32, offset, 0));
	list = LLVMBuildPointerCast(builder, list, const_array(type, 0), "");

	return ac_build_load_to_sgpr(&ctx->ac, list, index);
}
   4680 
   4681 static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx,
   4682 				     const nir_deref_var *deref,
   4683 				     enum ac_descriptor_type desc_type,
   4684 				     const nir_tex_instr *tex_instr,
   4685 				     bool image, bool write)
   4686 {
   4687 	LLVMValueRef index = NULL;
   4688 	unsigned constant_index = 0;
   4689 	unsigned descriptor_set;
   4690 	unsigned base_index;
   4691 
   4692 	if (!deref) {
   4693 		assert(tex_instr && !image);
   4694 		descriptor_set = 0;
   4695 		base_index = tex_instr->sampler_index;
   4696 	} else {
   4697 		const nir_deref *tail = &deref->deref;
   4698 		while (tail->child) {
   4699 			const nir_deref_array *child = nir_deref_as_array(tail->child);
   4700 			unsigned array_size = glsl_get_aoa_size(tail->child->type);
   4701 
   4702 			if (!array_size)
   4703 				array_size = 1;
   4704 
   4705 			assert(child->deref_array_type != nir_deref_array_type_wildcard);
   4706 
   4707 			if (child->deref_array_type == nir_deref_array_type_indirect) {
   4708 				LLVMValueRef indirect = get_src(ctx, child->indirect);
   4709 
   4710 				indirect = LLVMBuildMul(ctx->ac.builder, indirect,
   4711 					LLVMConstInt(ctx->ac.i32, array_size, false), "");
   4712 
   4713 				if (!index)
   4714 					index = indirect;
   4715 				else
   4716 					index = LLVMBuildAdd(ctx->ac.builder, index, indirect, "");
   4717 			}
   4718 
   4719 			constant_index += child->base_offset * array_size;
   4720 
   4721 			tail = &child->deref;
   4722 		}
   4723 		descriptor_set = deref->var->data.descriptor_set;
   4724 		base_index = deref->var->data.binding;
   4725 	}
   4726 
   4727 	return ctx->abi->load_sampler_desc(ctx->abi,
   4728 					  descriptor_set,
   4729 					  base_index,
   4730 					  constant_index, index,
   4731 					  desc_type, image, write);
   4732 }
   4733 
   4734 static void set_tex_fetch_args(struct ac_llvm_context *ctx,
   4735 			       struct ac_image_args *args,
   4736 			       const nir_tex_instr *instr,
   4737 			       nir_texop op,
   4738 			       LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
   4739 			       LLVMValueRef *param, unsigned count,
   4740 			       unsigned dmask)
   4741 {
   4742 	unsigned is_rect = 0;
   4743 	bool da = instr->is_array || instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
   4744 
   4745 	if (op == nir_texop_lod)
   4746 		da = false;
   4747 	/* Pad to power of two vector */
   4748 	while (count < util_next_power_of_two(count))
   4749 		param[count++] = LLVMGetUndef(ctx->i32);
   4750 
   4751 	if (count > 1)
   4752 		args->addr = ac_build_gather_values(ctx, param, count);
   4753 	else
   4754 		args->addr = param[0];
   4755 
   4756 	args->resource = res_ptr;
   4757 	args->sampler = samp_ptr;
   4758 
   4759 	if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF && op == nir_texop_txf) {
   4760 		args->addr = param[0];
   4761 		return;
   4762 	}
   4763 
   4764 	args->dmask = dmask;
   4765 	args->unorm = is_rect;
   4766 	args->da = da;
   4767 }
   4768 
   4769 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
   4770  *
   4771  * SI-CI:
   4772  *   If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
   4773  *   filtering manually. The driver sets img7 to a mask clearing
   4774  *   MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
   4775  *     s_and_b32 samp0, samp0, img7
   4776  *
   4777  * VI:
   4778  *   The ANISO_OVERRIDE sampler field enables this fix in TA.
   4779  */
   4780 static LLVMValueRef sici_fix_sampler_aniso(struct ac_nir_context *ctx,
   4781                                            LLVMValueRef res, LLVMValueRef samp)
   4782 {
   4783 	LLVMBuilderRef builder = ctx->ac.builder;
   4784 	LLVMValueRef img7, samp0;
   4785 
   4786 	if (ctx->ac.chip_class >= VI)
   4787 		return samp;
   4788 
   4789 	img7 = LLVMBuildExtractElement(builder, res,
   4790 	                               LLVMConstInt(ctx->ac.i32, 7, 0), "");
   4791 	samp0 = LLVMBuildExtractElement(builder, samp,
   4792 	                                LLVMConstInt(ctx->ac.i32, 0, 0), "");
   4793 	samp0 = LLVMBuildAnd(builder, samp0, img7, "");
   4794 	return LLVMBuildInsertElement(builder, samp, samp0,
   4795 	                              LLVMConstInt(ctx->ac.i32, 0, 0), "");
   4796 }
   4797 
   4798 static void tex_fetch_ptrs(struct ac_nir_context *ctx,
   4799 			   nir_tex_instr *instr,
   4800 			   LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr,
   4801 			   LLVMValueRef *fmask_ptr)
   4802 {
   4803 	if (instr->sampler_dim  == GLSL_SAMPLER_DIM_BUF)
   4804 		*res_ptr = get_sampler_desc(ctx, instr->texture, AC_DESC_BUFFER, instr, false, false);
   4805 	else
   4806 		*res_ptr = get_sampler_desc(ctx, instr->texture, AC_DESC_IMAGE, instr, false, false);
   4807 	if (samp_ptr) {
   4808 		if (instr->sampler)
   4809 			*samp_ptr = get_sampler_desc(ctx, instr->sampler, AC_DESC_SAMPLER, instr, false, false);
   4810 		else
   4811 			*samp_ptr = get_sampler_desc(ctx, instr->texture, AC_DESC_SAMPLER, instr, false, false);
   4812 		if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT)
   4813 			*samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
   4814 	}
   4815 	if (fmask_ptr && !instr->sampler && (instr->op == nir_texop_txf_ms ||
   4816 					     instr->op == nir_texop_samples_identical))
   4817 		*fmask_ptr = get_sampler_desc(ctx, instr->texture, AC_DESC_FMASK, instr, false, false);
   4818 }
   4819 
   4820 static LLVMValueRef apply_round_slice(struct ac_llvm_context *ctx,
   4821 				      LLVMValueRef coord)
   4822 {
   4823 	coord = ac_to_float(ctx, coord);
   4824 	coord = ac_build_intrinsic(ctx, "llvm.rint.f32", ctx->f32, &coord, 1, 0);
   4825 	coord = ac_to_integer(ctx, coord);
   4826 	return coord;
   4827 }
   4828 
/* Translate a NIR texture instruction (sample, fetch, query, gather, ...)
 * into an AMD image intrinsic via build_tex_intrinsic().
 *
 * Most of the work is collecting the instruction's sources and packing them
 * into the hardware address layout, in order: packed offsets, LOD bias,
 * depth-compare value, derivatives, coordinates, then LOD / sample index.
 * Query ops (txs on buffers, texture_samples) are answered directly from
 * the descriptor and short-circuit to write_result.
 */
static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
{
	LLVMValueRef result = NULL;
	struct ac_image_args args = { 0 };
	unsigned dmask = 0xf;
	LLVMValueRef address[16];
	LLVMValueRef coords[5];
	LLVMValueRef coord = NULL, lod = NULL, comparator = NULL;
	LLVMValueRef bias = NULL, offsets = NULL;
	LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL, sample_index = NULL;
	LLVMValueRef ddx = NULL, ddy = NULL;
	LLVMValueRef derivs[6];
	unsigned chan, count = 0;
	unsigned const_src = 0, num_deriv_comp = 0;
	bool lod_is_zero = false;

	tex_fetch_ptrs(ctx, instr, &res_ptr, &samp_ptr, &fmask_ptr);

	/* Collect the instruction sources. Projector and MCS sources are
	 * deliberately ignored here. */
	for (unsigned i = 0; i < instr->num_srcs; i++) {
		switch (instr->src[i].src_type) {
		case nir_tex_src_coord:
			coord = get_src(ctx, instr->src[i].src);
			break;
		case nir_tex_src_projector:
			break;
		case nir_tex_src_comparator:
			comparator = get_src(ctx, instr->src[i].src);
			break;
		case nir_tex_src_offset:
			offsets = get_src(ctx, instr->src[i].src);
			const_src = i;
			break;
		case nir_tex_src_bias:
			bias = get_src(ctx, instr->src[i].src);
			break;
		case nir_tex_src_lod: {
			nir_const_value *val = nir_src_as_const_value(instr->src[i].src);

			/* A constant-zero LOD lets build_tex_intrinsic pick a
			 * cheaper variant. */
			if (val && val->i32[0] == 0)
				lod_is_zero = true;
			lod = get_src(ctx, instr->src[i].src);
			break;
		}
		case nir_tex_src_ms_index:
			sample_index = get_src(ctx, instr->src[i].src);
			break;
		case nir_tex_src_ms_mcs:
			break;
		case nir_tex_src_ddx:
			ddx = get_src(ctx, instr->src[i].src);
			num_deriv_comp = instr->src[i].src.ssa->num_components;
			break;
		case nir_tex_src_ddy:
			ddy = get_src(ctx, instr->src[i].src);
			break;
		case nir_tex_src_texture_offset:
		case nir_tex_src_sampler_offset:
		case nir_tex_src_plane:
		default:
			break;
		}
	}

	/* Buffer-texture size query: read it from the buffer descriptor. */
	if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
		result = get_buffer_size(ctx, res_ptr, true);
		goto write_result;
	}

	/* texture_samples: decode the sample count from dword 3 of the image
	 * descriptor (log2 count in bits 16..19). Bits 28..31 hold the
	 * resource type; a value of 14 or 15 (0xe after masking with 0xe)
	 * presumably indicates an MSAA resource — non-MSAA returns 1. */
	if (instr->op == nir_texop_texture_samples) {
		LLVMValueRef res, samples, is_msaa;
		res = LLVMBuildBitCast(ctx->ac.builder, res_ptr, ctx->ac.v8i32, "");
		samples = LLVMBuildExtractElement(ctx->ac.builder, res,
						  LLVMConstInt(ctx->ac.i32, 3, false), "");
		is_msaa = LLVMBuildLShr(ctx->ac.builder, samples,
					LLVMConstInt(ctx->ac.i32, 28, false), "");
		is_msaa = LLVMBuildAnd(ctx->ac.builder, is_msaa,
				       LLVMConstInt(ctx->ac.i32, 0xe, false), "");
		is_msaa = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, is_msaa,
					LLVMConstInt(ctx->ac.i32, 0xe, false), "");

		samples = LLVMBuildLShr(ctx->ac.builder, samples,
					LLVMConstInt(ctx->ac.i32, 16, false), "");
		samples = LLVMBuildAnd(ctx->ac.builder, samples,
				       LLVMConstInt(ctx->ac.i32, 0xf, false), "");
		samples = LLVMBuildShl(ctx->ac.builder, ctx->ac.i32_1,
				       samples, "");
		samples = LLVMBuildSelect(ctx->ac.builder, is_msaa, samples,
					  ctx->ac.i32_1, "");
		result = samples;
		goto write_result;
	}

	if (coord)
		for (chan = 0; chan < instr->coord_components; chan++)
			coords[chan] = ac_llvm_extract_elem(&ctx->ac, coord, chan);

	/* Pack texel offsets: 6 bits per channel, 8-bit fields, one i32.
	 * txf applies its offsets by address arithmetic further below instead. */
	if (offsets && instr->op != nir_texop_txf) {
		LLVMValueRef offset[3], pack;
		for (chan = 0; chan < 3; ++chan)
			offset[chan] = ctx->ac.i32_0;

		args.offset = true;
		for (chan = 0; chan < ac_get_llvm_num_components(offsets); chan++) {
			offset[chan] = ac_llvm_extract_elem(&ctx->ac, offsets, chan);
			offset[chan] = LLVMBuildAnd(ctx->ac.builder, offset[chan],
						    LLVMConstInt(ctx->ac.i32, 0x3f, false), "");
			if (chan)
				offset[chan] = LLVMBuildShl(ctx->ac.builder, offset[chan],
							    LLVMConstInt(ctx->ac.i32, chan * 8, false), "");
		}
		pack = LLVMBuildOr(ctx->ac.builder, offset[0], offset[1], "");
		pack = LLVMBuildOr(ctx->ac.builder, pack, offset[2], "");
		address[count++] = pack;

	}
	/* pack LOD bias value */
	if (instr->op == nir_texop_txb && bias) {
		address[count++] = bias;
	}

	/* Pack depth comparison value */
	if (instr->is_shadow && comparator) {
		LLVMValueRef z = ac_to_float(&ctx->ac,
		                             ac_llvm_extract_elem(&ctx->ac, comparator, 0));

		/* TC-compatible HTILE on radeonsi promotes Z16 and Z24 to Z32_FLOAT,
		 * so the depth comparison value isn't clamped for Z16 and
		 * Z24 anymore. Do it manually here.
		 *
		 * It's unnecessary if the original texture format was
		 * Z32_FLOAT, but we don't know that here.
		 */
		if (ctx->ac.chip_class == VI && ctx->abi->clamp_shadow_reference)
			z = ac_build_clamp(&ctx->ac, z);

		address[count++] = z;
	}

	/* pack derivatives: all ddx channels first, then all ddy channels.
	 * On GFX9, 1D is promoted to 2D, so a zero Y derivative is inserted. */
	if (ddx || ddy) {
		int num_src_deriv_channels, num_dest_deriv_channels;
		switch (instr->sampler_dim) {
		case GLSL_SAMPLER_DIM_3D:
		case GLSL_SAMPLER_DIM_CUBE:
			num_deriv_comp = 3;
			num_src_deriv_channels = 3;
			num_dest_deriv_channels = 3;
			break;
		case GLSL_SAMPLER_DIM_2D:
		default:
			num_src_deriv_channels = 2;
			num_dest_deriv_channels = 2;
			num_deriv_comp = 2;
			break;
		case GLSL_SAMPLER_DIM_1D:
			num_src_deriv_channels = 1;
			if (ctx->ac.chip_class >= GFX9) {
				num_dest_deriv_channels = 2;
				num_deriv_comp = 2;
			} else {
				num_dest_deriv_channels = 1;
				num_deriv_comp = 1;
			}
			break;
		}

		for (unsigned i = 0; i < num_src_deriv_channels; i++) {
			derivs[i] = ac_to_float(&ctx->ac, ac_llvm_extract_elem(&ctx->ac, ddx, i));
			derivs[num_dest_deriv_channels + i] = ac_to_float(&ctx->ac, ac_llvm_extract_elem(&ctx->ac, ddy, i));
		}
		for (unsigned i = num_src_deriv_channels; i < num_dest_deriv_channels; i++) {
			derivs[i] = ctx->ac.f32_0;
			derivs[num_dest_deriv_channels + i] = ctx->ac.f32_0;
		}
	}

	/* Cube coordinates (and derivatives) need the face-selection math. */
	if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && coord) {
		for (chan = 0; chan < instr->coord_components; chan++)
			coords[chan] = ac_to_float(&ctx->ac, coords[chan]);
		if (instr->coord_components == 3)
			coords[3] = LLVMGetUndef(ctx->ac.f32);
		ac_prepare_cube_coords(&ctx->ac,
			instr->op == nir_texop_txd, instr->is_array,
			instr->op == nir_texop_lod, coords, derivs);
		if (num_deriv_comp)
			num_deriv_comp--;
	}

	if (ddx || ddy) {
		for (unsigned i = 0; i < num_deriv_comp * 2; i++)
			address[count++] = derivs[i];
	}

	/* Pack texture coordinates */
	if (coord) {
		address[count++] = coords[0];
		if (instr->coord_components > 1) {
			/* 1D-array layer index must be rounded (except for txf,
			 * which takes integer coordinates). */
			if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->is_array && instr->op != nir_texop_txf) {
				coords[1] = apply_round_slice(&ctx->ac, coords[1]);
			}
			address[count++] = coords[1];
		}
		if (instr->coord_components > 2) {
			/* This seems like a bit of a hack - but it passes Vulkan CTS with it */
			if (instr->sampler_dim != GLSL_SAMPLER_DIM_3D &&
			    instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE &&
			    instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
				coords[2] = apply_round_slice(&ctx->ac, coords[2]);
			}
			address[count++] = coords[2];
		}

		/* GFX9 treats 1D textures as 2D: insert a filler Y coordinate
		 * (0 for integer fetches, 0.5 = texel center otherwise), keeping
		 * the array layer last. */
		if (ctx->ac.chip_class >= GFX9) {
			LLVMValueRef filler;
			if (instr->op == nir_texop_txf)
				filler = ctx->ac.i32_0;
			else
				filler = LLVMConstReal(ctx->ac.f32, 0.5);

			if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D) {
				/* No nir_texop_lod, because it does not take a slice
				 * even with array textures. */
				if (instr->is_array && instr->op != nir_texop_lod ) {
					address[count] = address[count - 1];
					address[count - 1] = filler;
					count++;
				} else
					address[count++] = filler;
			}
		}
	}

	/* Pack LOD */
	if (lod && ((instr->op == nir_texop_txl && !lod_is_zero) ||
		    instr->op == nir_texop_txf)) {
		address[count++] = lod;
	} else if (instr->op == nir_texop_txf_ms && sample_index) {
		address[count++] = sample_index;
	} else if(instr->op == nir_texop_txs) {
		/* txs only takes the LOD — discard everything packed so far. */
		count = 0;
		if (lod)
			address[count++] = lod;
		else
			address[count++] = ctx->ac.i32_0;
	}

	for (chan = 0; chan < count; chan++) {
		address[chan] = LLVMBuildBitCast(ctx->ac.builder,
						 address[chan], ctx->ac.i32, "");
	}

	/* samples_identical: fetch the FMASK value and compare it with 0.
	 * All samples are identical iff the FMASK word is zero. */
	if (instr->op == nir_texop_samples_identical) {
		LLVMValueRef txf_address[4];
		struct ac_image_args txf_args = { 0 };
		unsigned txf_count = count;
		memcpy(txf_address, address, sizeof(txf_address));

		if (!instr->is_array)
			txf_address[2] = ctx->ac.i32_0;
		txf_address[3] = ctx->ac.i32_0;

		set_tex_fetch_args(&ctx->ac, &txf_args, instr, nir_texop_txf,
				   fmask_ptr, NULL,
				   txf_address, txf_count, 0xf);

		result = build_tex_intrinsic(ctx, instr, false, &txf_args);

		result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, "");
		result = emit_int_cmp(&ctx->ac, LLVMIntEQ, result, ctx->ac.i32_0);
		goto write_result;
	}

	/* MSAA fetches must translate the sample index through FMASK. */
	if (instr->sampler_dim == GLSL_SAMPLER_DIM_MS &&
	    instr->op != nir_texop_txs) {
		unsigned sample_chan = instr->is_array ? 3 : 2;
		address[sample_chan] = adjust_sample_index_using_fmask(&ctx->ac,
								       address[0],
								       address[1],
								       instr->is_array ? address[2] : NULL,
								       address[sample_chan],
								       fmask_ptr);
	}

	/* txf offsets are constant and applied by plain coordinate addition. */
	if (offsets && instr->op == nir_texop_txf) {
		nir_const_value *const_offset =
			nir_src_as_const_value(instr->src[const_src].src);
		int num_offsets = instr->src[const_src].src.ssa->num_components;
		assert(const_offset);
		num_offsets = MIN2(num_offsets, instr->coord_components);
		if (num_offsets > 2)
			address[2] = LLVMBuildAdd(ctx->ac.builder,
						  address[2], LLVMConstInt(ctx->ac.i32, const_offset->i32[2], false), "");
		if (num_offsets > 1)
			address[1] = LLVMBuildAdd(ctx->ac.builder,
						  address[1], LLVMConstInt(ctx->ac.i32, const_offset->i32[1], false), "");
		address[0] = LLVMBuildAdd(ctx->ac.builder,
					  address[0], LLVMConstInt(ctx->ac.i32, const_offset->i32[0], false), "");

	}

	/* TODO TG4 support */
	if (instr->op == nir_texop_tg4) {
		if (instr->is_shadow)
			dmask = 1;
		else
			dmask = 1 << instr->component;
	}
	set_tex_fetch_args(&ctx->ac, &args, instr, instr->op,
			   res_ptr, samp_ptr, address, count, dmask);

	result = build_tex_intrinsic(ctx, instr, lod_is_zero, &args);

	/* Post-process the intrinsic result into the NIR-visible value. */
	if (instr->op == nir_texop_query_levels)
		result = LLVMBuildExtractElement(ctx->ac.builder, result, LLVMConstInt(ctx->ac.i32, 3, false), "");
	else if (instr->is_shadow && instr->is_new_style_shadow &&
		 instr->op != nir_texop_txs && instr->op != nir_texop_lod &&
		 instr->op != nir_texop_tg4)
		result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, "");
	else if (instr->op == nir_texop_txs &&
		 instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
		 instr->is_array) {
		/* Cube-array size queries report layer-faces; divide by 6. */
		LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false);
		LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false);
		LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, result, two, "");
		z = LLVMBuildSDiv(ctx->ac.builder, z, six, "");
		result = LLVMBuildInsertElement(ctx->ac.builder, result, z, two, "");
	} else if (ctx->ac.chip_class >= GFX9 &&
		   instr->op == nir_texop_txs &&
		   instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
		   instr->is_array) {
		/* GFX9 1D-as-2D: the layer count sits in component 2; move it
		 * to component 1 where 1D-array consumers expect it. */
		LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false);
		LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, result, two, "");
		result = LLVMBuildInsertElement(ctx->ac.builder, result, layers,
						ctx->ac.i32_1, "");
	} else if (instr->dest.ssa.num_components != 4)
		result = trim_vector(&ctx->ac, result, instr->dest.ssa.num_components);

write_result:
	if (result) {
		assert(instr->dest.is_ssa);
		result = ac_to_integer(&ctx->ac, result);
		_mesa_hash_table_insert(ctx->defs, &instr->dest.ssa, result);
	}
}
   5173 
   5174 
   5175 static void visit_phi(struct ac_nir_context *ctx, nir_phi_instr *instr)
   5176 {
   5177 	LLVMTypeRef type = get_def_type(ctx, &instr->dest.ssa);
   5178 	LLVMValueRef result = LLVMBuildPhi(ctx->ac.builder, type, "");
   5179 
   5180 	_mesa_hash_table_insert(ctx->defs, &instr->dest.ssa, result);
   5181 	_mesa_hash_table_insert(ctx->phis, instr, result);
   5182 }
   5183 
   5184 static void visit_post_phi(struct ac_nir_context *ctx,
   5185                            nir_phi_instr *instr,
   5186                            LLVMValueRef llvm_phi)
   5187 {
   5188 	nir_foreach_phi_src(src, instr) {
   5189 		LLVMBasicBlockRef block = get_block(ctx, src->pred);
   5190 		LLVMValueRef llvm_src = get_src(ctx, src->src);
   5191 
   5192 		LLVMAddIncoming(llvm_phi, &llvm_src, &block, 1);
   5193 	}
   5194 }
   5195 
   5196 static void phi_post_pass(struct ac_nir_context *ctx)
   5197 {
   5198 	struct hash_entry *entry;
   5199 	hash_table_foreach(ctx->phis, entry) {
   5200 		visit_post_phi(ctx, (nir_phi_instr*)entry->key,
   5201 		               (LLVMValueRef)entry->data);
   5202 	}
   5203 }
   5204 
   5205 
   5206 static void visit_ssa_undef(struct ac_nir_context *ctx,
   5207 			    const nir_ssa_undef_instr *instr)
   5208 {
   5209 	unsigned num_components = instr->def.num_components;
   5210 	LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size);
   5211 	LLVMValueRef undef;
   5212 
   5213 	if (num_components == 1)
   5214 		undef = LLVMGetUndef(type);
   5215 	else {
   5216 		undef = LLVMGetUndef(LLVMVectorType(type, num_components));
   5217 	}
   5218 	_mesa_hash_table_insert(ctx->defs, &instr->def, undef);
   5219 }
   5220 
   5221 static void visit_jump(struct ac_llvm_context *ctx,
   5222 		       const nir_jump_instr *instr)
   5223 {
   5224 	switch (instr->type) {
   5225 	case nir_jump_break:
   5226 		ac_build_break(ctx);
   5227 		break;
   5228 	case nir_jump_continue:
   5229 		ac_build_continue(ctx);
   5230 		break;
   5231 	default:
   5232 		fprintf(stderr, "Unknown NIR jump instr: ");
   5233 		nir_print_instr(&instr->instr, stderr);
   5234 		fprintf(stderr, "\n");
   5235 		abort();
   5236 	}
   5237 }
   5238 
   5239 static void visit_cf_list(struct ac_nir_context *ctx,
   5240                           struct exec_list *list);
   5241 
   5242 static void visit_block(struct ac_nir_context *ctx, nir_block *block)
   5243 {
   5244 	LLVMBasicBlockRef llvm_block = LLVMGetInsertBlock(ctx->ac.builder);
   5245 	nir_foreach_instr(instr, block)
   5246 	{
   5247 		switch (instr->type) {
   5248 		case nir_instr_type_alu:
   5249 			visit_alu(ctx, nir_instr_as_alu(instr));
   5250 			break;
   5251 		case nir_instr_type_load_const:
   5252 			visit_load_const(ctx, nir_instr_as_load_const(instr));
   5253 			break;
   5254 		case nir_instr_type_intrinsic:
   5255 			visit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
   5256 			break;
   5257 		case nir_instr_type_tex:
   5258 			visit_tex(ctx, nir_instr_as_tex(instr));
   5259 			break;
   5260 		case nir_instr_type_phi:
   5261 			visit_phi(ctx, nir_instr_as_phi(instr));
   5262 			break;
   5263 		case nir_instr_type_ssa_undef:
   5264 			visit_ssa_undef(ctx, nir_instr_as_ssa_undef(instr));
   5265 			break;
   5266 		case nir_instr_type_jump:
   5267 			visit_jump(&ctx->ac, nir_instr_as_jump(instr));
   5268 			break;
   5269 		default:
   5270 			fprintf(stderr, "Unknown NIR instr type: ");
   5271 			nir_print_instr(instr, stderr);
   5272 			fprintf(stderr, "\n");
   5273 			abort();
   5274 		}
   5275 	}
   5276 
   5277 	_mesa_hash_table_insert(ctx->defs, block, llvm_block);
   5278 }
   5279 
   5280 static void visit_if(struct ac_nir_context *ctx, nir_if *if_stmt)
   5281 {
   5282 	LLVMValueRef value = get_src(ctx, if_stmt->condition);
   5283 
   5284 	nir_block *then_block =
   5285 		(nir_block *) exec_list_get_head(&if_stmt->then_list);
   5286 
   5287 	ac_build_uif(&ctx->ac, value, then_block->index);
   5288 
   5289 	visit_cf_list(ctx, &if_stmt->then_list);
   5290 
   5291 	if (!exec_list_is_empty(&if_stmt->else_list)) {
   5292 		nir_block *else_block =
   5293 			(nir_block *) exec_list_get_head(&if_stmt->else_list);
   5294 
   5295 		ac_build_else(&ctx->ac, else_block->index);
   5296 		visit_cf_list(ctx, &if_stmt->else_list);
   5297 	}
   5298 
   5299 	ac_build_endif(&ctx->ac, then_block->index);
   5300 }
   5301 
   5302 static void visit_loop(struct ac_nir_context *ctx, nir_loop *loop)
   5303 {
   5304 	nir_block *first_loop_block =
   5305 		(nir_block *) exec_list_get_head(&loop->body);
   5306 
   5307 	ac_build_bgnloop(&ctx->ac, first_loop_block->index);
   5308 
   5309 	visit_cf_list(ctx, &loop->body);
   5310 
   5311 	ac_build_endloop(&ctx->ac, first_loop_block->index);
   5312 }
   5313 
   5314 static void visit_cf_list(struct ac_nir_context *ctx,
   5315                           struct exec_list *list)
   5316 {
   5317 	foreach_list_typed(nir_cf_node, node, node, list)
   5318 	{
   5319 		switch (node->type) {
   5320 		case nir_cf_node_block:
   5321 			visit_block(ctx, nir_cf_node_as_block(node));
   5322 			break;
   5323 
   5324 		case nir_cf_node_if:
   5325 			visit_if(ctx, nir_cf_node_as_if(node));
   5326 			break;
   5327 
   5328 		case nir_cf_node_loop:
   5329 			visit_loop(ctx, nir_cf_node_as_loop(node));
   5330 			break;
   5331 
   5332 		default:
   5333 			assert(0);
   5334 		}
   5335 	}
   5336 }
   5337 
   5338 static void
   5339 handle_vs_input_decl(struct nir_to_llvm_context *ctx,
   5340 		     struct nir_variable *variable)
   5341 {
   5342 	LLVMValueRef t_list_ptr = ctx->vertex_buffers;
   5343 	LLVMValueRef t_offset;
   5344 	LLVMValueRef t_list;
   5345 	LLVMValueRef input;
   5346 	LLVMValueRef buffer_index;
   5347 	int index = variable->data.location - VERT_ATTRIB_GENERIC0;
   5348 	int idx = variable->data.location;
   5349 	unsigned attrib_count = glsl_count_attribute_slots(variable->type, true);
   5350 
   5351 	variable->data.driver_location = idx * 4;
   5352 
   5353 	for (unsigned i = 0; i < attrib_count; ++i, ++idx) {
   5354 		if (ctx->options->key.vs.instance_rate_inputs & (1u << (index + i))) {
   5355 			buffer_index = LLVMBuildAdd(ctx->builder, ctx->abi.instance_id,
   5356 			                            ctx->abi.start_instance, "");
   5357 			if (ctx->options->key.vs.as_ls) {
   5358 				ctx->shader_info->vs.vgpr_comp_cnt =
   5359 					MAX2(2, ctx->shader_info->vs.vgpr_comp_cnt);
   5360 			} else {
   5361 				ctx->shader_info->vs.vgpr_comp_cnt =
   5362 					MAX2(1, ctx->shader_info->vs.vgpr_comp_cnt);
   5363 			}
   5364 		} else
   5365 			buffer_index = LLVMBuildAdd(ctx->builder, ctx->abi.vertex_id,
   5366 			                            ctx->abi.base_vertex, "");
   5367 		t_offset = LLVMConstInt(ctx->ac.i32, index + i, false);
   5368 
   5369 		t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
   5370 
   5371 		input = ac_build_buffer_load_format(&ctx->ac, t_list,
   5372 						    buffer_index,
   5373 						    ctx->ac.i32_0,
   5374 						    true);
   5375 
   5376 		for (unsigned chan = 0; chan < 4; chan++) {
   5377 			LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
   5378 			ctx->inputs[radeon_llvm_reg_index_soa(idx, chan)] =
   5379 				ac_to_integer(&ctx->ac, LLVMBuildExtractElement(ctx->builder,
   5380 							input, llvm_chan, ""));
   5381 		}
   5382 	}
   5383 }
   5384 
   5385 static void interp_fs_input(struct nir_to_llvm_context *ctx,
   5386 			    unsigned attr,
   5387 			    LLVMValueRef interp_param,
   5388 			    LLVMValueRef prim_mask,
   5389 			    LLVMValueRef result[4])
   5390 {
   5391 	LLVMValueRef attr_number;
   5392 	unsigned chan;
   5393 	LLVMValueRef i, j;
   5394 	bool interp = interp_param != NULL;
   5395 
   5396 	attr_number = LLVMConstInt(ctx->ac.i32, attr, false);
   5397 
   5398 	/* fs.constant returns the param from the middle vertex, so it's not
   5399 	 * really useful for flat shading. It's meant to be used for custom
   5400 	 * interpolation (but the intrinsic can't fetch from the other two
   5401 	 * vertices).
   5402 	 *
   5403 	 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
   5404 	 * to do the right thing. The only reason we use fs.constant is that
   5405 	 * fs.interp cannot be used on integers, because they can be equal
   5406 	 * to NaN.
   5407 	 */
   5408 	if (interp) {
   5409 		interp_param = LLVMBuildBitCast(ctx->builder, interp_param,
   5410 						ctx->ac.v2f32, "");
   5411 
   5412 		i = LLVMBuildExtractElement(ctx->builder, interp_param,
   5413 						ctx->ac.i32_0, "");
   5414 		j = LLVMBuildExtractElement(ctx->builder, interp_param,
   5415 						ctx->ac.i32_1, "");
   5416 	}
   5417 
   5418 	for (chan = 0; chan < 4; chan++) {
   5419 		LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
   5420 
   5421 		if (interp) {
   5422 			result[chan] = ac_build_fs_interp(&ctx->ac,
   5423 							  llvm_chan,
   5424 							  attr_number,
   5425 							  prim_mask, i, j);
   5426 		} else {
   5427 			result[chan] = ac_build_fs_interp_mov(&ctx->ac,
   5428 							      LLVMConstInt(ctx->ac.i32, 2, false),
   5429 							      llvm_chan,
   5430 							      attr_number,
   5431 							      prim_mask);
   5432 		}
   5433 	}
   5434 }
   5435 
   5436 static void
   5437 handle_fs_input_decl(struct nir_to_llvm_context *ctx,
   5438 		     struct nir_variable *variable)
   5439 {
   5440 	int idx = variable->data.location;
   5441 	unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
   5442 	LLVMValueRef interp;
   5443 
   5444 	variable->data.driver_location = idx * 4;
   5445 	ctx->input_mask |= ((1ull << attrib_count) - 1) << variable->data.location;
   5446 
   5447 	if (glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_FLOAT) {
   5448 		unsigned interp_type;
   5449 		if (variable->data.sample) {
   5450 			interp_type = INTERP_SAMPLE;
   5451 			ctx->shader_info->info.ps.force_persample = true;
   5452 		} else if (variable->data.centroid)
   5453 			interp_type = INTERP_CENTROID;
   5454 		else
   5455 			interp_type = INTERP_CENTER;
   5456 
   5457 		interp = lookup_interp_param(ctx, variable->data.interpolation, interp_type);
   5458 	} else
   5459 		interp = NULL;
   5460 
   5461 	for (unsigned i = 0; i < attrib_count; ++i)
   5462 		ctx->inputs[radeon_llvm_reg_index_soa(idx + i, 0)] = interp;
   5463 
   5464 }
   5465 
   5466 static void
   5467 handle_vs_inputs(struct nir_to_llvm_context *ctx,
   5468                  struct nir_shader *nir) {
   5469 	nir_foreach_variable(variable, &nir->inputs)
   5470 		handle_vs_input_decl(ctx, variable);
   5471 }
   5472 
   5473 static void
   5474 prepare_interp_optimize(struct nir_to_llvm_context *ctx,
   5475                         struct nir_shader *nir)
   5476 {
   5477 	if (!ctx->options->key.fs.multisample)
   5478 		return;
   5479 
   5480 	bool uses_center = false;
   5481 	bool uses_centroid = false;
   5482 	nir_foreach_variable(variable, &nir->inputs) {
   5483 		if (glsl_get_base_type(glsl_without_array(variable->type)) != GLSL_TYPE_FLOAT ||
   5484 		    variable->data.sample)
   5485 			continue;
   5486 
   5487 		if (variable->data.centroid)
   5488 			uses_centroid = true;
   5489 		else
   5490 			uses_center = true;
   5491 	}
   5492 
   5493 	if (uses_center && uses_centroid) {
   5494 		LLVMValueRef sel = LLVMBuildICmp(ctx->builder, LLVMIntSLT, ctx->prim_mask, ctx->ac.i32_0, "");
   5495 		ctx->persp_centroid = LLVMBuildSelect(ctx->builder, sel, ctx->persp_center, ctx->persp_centroid, "");
   5496 		ctx->linear_centroid = LLVMBuildSelect(ctx->builder, sel, ctx->linear_center, ctx->linear_centroid, "");
   5497 	}
   5498 }
   5499 
   5500 static void
   5501 handle_fs_inputs(struct nir_to_llvm_context *ctx,
   5502                  struct nir_shader *nir)
   5503 {
   5504 	prepare_interp_optimize(ctx, nir);
   5505 
   5506 	nir_foreach_variable(variable, &nir->inputs)
   5507 		handle_fs_input_decl(ctx, variable);
   5508 
   5509 	unsigned index = 0;
   5510 
   5511 	if (ctx->shader_info->info.ps.uses_input_attachments ||
   5512 	    ctx->shader_info->info.needs_multiview_view_index)
   5513 		ctx->input_mask |= 1ull << VARYING_SLOT_LAYER;
   5514 
   5515 	for (unsigned i = 0; i < RADEON_LLVM_MAX_INPUTS; ++i) {
   5516 		LLVMValueRef interp_param;
   5517 		LLVMValueRef *inputs = ctx->inputs +radeon_llvm_reg_index_soa(i, 0);
   5518 
   5519 		if (!(ctx->input_mask & (1ull << i)))
   5520 			continue;
   5521 
   5522 		if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC ||
   5523 		    i == VARYING_SLOT_PRIMITIVE_ID || i == VARYING_SLOT_LAYER) {
   5524 			interp_param = *inputs;
   5525 			interp_fs_input(ctx, index, interp_param, ctx->prim_mask,
   5526 					inputs);
   5527 
   5528 			if (!interp_param)
   5529 				ctx->shader_info->fs.flat_shaded_mask |= 1u << index;
   5530 			++index;
   5531 		} else if (i == VARYING_SLOT_POS) {
   5532 			for(int i = 0; i < 3; ++i)
   5533 				inputs[i] = ctx->abi.frag_pos[i];
   5534 
   5535 			inputs[3] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,
   5536 						  ctx->abi.frag_pos[3]);
   5537 		}
   5538 	}
   5539 	ctx->shader_info->fs.num_interp = index;
   5540 	if (ctx->input_mask & (1 << VARYING_SLOT_PNTC))
   5541 		ctx->shader_info->fs.has_pcoord = true;
   5542 	if (ctx->input_mask & (1 << VARYING_SLOT_PRIMITIVE_ID))
   5543 		ctx->shader_info->fs.prim_id_input = true;
   5544 	if (ctx->input_mask & (1 << VARYING_SLOT_LAYER))
   5545 		ctx->shader_info->fs.layer_input = true;
   5546 	ctx->shader_info->fs.input_mask = ctx->input_mask >> VARYING_SLOT_VAR0;
   5547 
   5548 	if (ctx->shader_info->info.needs_multiview_view_index)
   5549 		ctx->view_index = ctx->inputs[radeon_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)];
   5550 }
   5551 
   5552 static LLVMValueRef
   5553 ac_build_alloca(struct ac_llvm_context *ac,
   5554                 LLVMTypeRef type,
   5555                 const char *name)
   5556 {
   5557 	LLVMBuilderRef builder = ac->builder;
   5558 	LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
   5559 	LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
   5560 	LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
   5561 	LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
   5562 	LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
   5563 	LLVMValueRef res;
   5564 
   5565 	if (first_instr) {
   5566 		LLVMPositionBuilderBefore(first_builder, first_instr);
   5567 	} else {
   5568 		LLVMPositionBuilderAtEnd(first_builder, first_block);
   5569 	}
   5570 
   5571 	res = LLVMBuildAlloca(first_builder, type, name);
   5572 	LLVMBuildStore(builder, LLVMConstNull(type), res);
   5573 
   5574 	LLVMDisposeBuilder(first_builder);
   5575 
   5576 	return res;
   5577 }
   5578 
   5579 static LLVMValueRef si_build_alloca_undef(struct ac_llvm_context *ac,
   5580 					  LLVMTypeRef type,
   5581 					  const char *name)
   5582 {
   5583 	LLVMValueRef ptr = ac_build_alloca(ac, type, name);
   5584 	LLVMBuildStore(ac->builder, LLVMGetUndef(type), ptr);
   5585 	return ptr;
   5586 }
   5587 
/* Record an output variable in ctx->output_mask and set its driver
 * location. For VS/TES/GS, clip/cull distances get special handling:
 * their slot count is derived from the combined clip+cull array sizes,
 * and the per-stage clip/cull masks are filled in. */
static void
scan_shader_output_decl(struct nir_to_llvm_context *ctx,
			struct nir_variable *variable,
			struct nir_shader *shader,
			gl_shader_stage stage)
{
	int idx = variable->data.location + variable->data.index;
	unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
	uint64_t mask_attribs;

	variable->data.driver_location = idx * 4;

	/* tess ctrl has its own load/store paths for outputs */
	if (stage == MESA_SHADER_TESS_CTRL)
		return;

	mask_attribs = ((1ull << attrib_count) - 1) << idx;
	if (stage == MESA_SHADER_VERTEX ||
	    stage == MESA_SHADER_TESS_EVAL ||
	    stage == MESA_SHADER_GEOMETRY) {
		if (idx == VARYING_SLOT_CLIP_DIST0) {
			/* Clip and cull distances share a combined array. */
			int length = shader->info.clip_distance_array_size +
			             shader->info.cull_distance_array_size;
			if (stage == MESA_SHADER_VERTEX) {
				ctx->shader_info->vs.outinfo.clip_dist_mask = (1 << shader->info.clip_distance_array_size) - 1;
				ctx->shader_info->vs.outinfo.cull_dist_mask = (1 << shader->info.cull_distance_array_size) - 1;
			}
			if (stage == MESA_SHADER_TESS_EVAL) {
				ctx->shader_info->tes.outinfo.clip_dist_mask = (1 << shader->info.clip_distance_array_size) - 1;
				ctx->shader_info->tes.outinfo.cull_dist_mask = (1 << shader->info.cull_distance_array_size) - 1;
			}

			/* More than 4 combined distances spill into a second slot,
			 * but only the first slot is recorded in the mask. */
			if (length > 4)
				attrib_count = 2;
			else
				attrib_count = 1;
			mask_attribs = 1ull << idx;
		}
	}

	ctx->output_mask |= mask_attribs;
}
   5630 
   5631 static void
   5632 handle_shader_output_decl(struct ac_nir_context *ctx,
   5633 			  struct nir_shader *nir,
   5634 			  struct nir_variable *variable)
   5635 {
   5636 	unsigned output_loc = variable->data.driver_location / 4;
   5637 	unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
   5638 
   5639 	/* tess ctrl has it's own load/store paths for outputs */
   5640 	if (ctx->stage == MESA_SHADER_TESS_CTRL)
   5641 		return;
   5642 
   5643 	if (ctx->stage == MESA_SHADER_VERTEX ||
   5644 	    ctx->stage == MESA_SHADER_TESS_EVAL ||
   5645 	    ctx->stage == MESA_SHADER_GEOMETRY) {
   5646 		int idx = variable->data.location + variable->data.index;
   5647 		if (idx == VARYING_SLOT_CLIP_DIST0) {
   5648 			int length = nir->info.clip_distance_array_size +
   5649 				     nir->info.cull_distance_array_size;
   5650 
   5651 			if (length > 4)
   5652 				attrib_count = 2;
   5653 			else
   5654 				attrib_count = 1;
   5655 		}
   5656 	}
   5657 
   5658 	for (unsigned i = 0; i < attrib_count; ++i) {
   5659 		for (unsigned chan = 0; chan < 4; chan++) {
   5660 			ctx->outputs[radeon_llvm_reg_index_soa(output_loc + i, chan)] =
   5661 		                       si_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
   5662 		}
   5663 	}
   5664 }
   5665 
   5666 static LLVMTypeRef
   5667 glsl_base_to_llvm_type(struct nir_to_llvm_context *ctx,
   5668 		       enum glsl_base_type type)
   5669 {
   5670 	switch (type) {
   5671 	case GLSL_TYPE_INT:
   5672 	case GLSL_TYPE_UINT:
   5673 	case GLSL_TYPE_BOOL:
   5674 	case GLSL_TYPE_SUBROUTINE:
   5675 		return ctx->ac.i32;
   5676 	case GLSL_TYPE_FLOAT: /* TODO handle mediump */
   5677 		return ctx->ac.f32;
   5678 	case GLSL_TYPE_INT64:
   5679 	case GLSL_TYPE_UINT64:
   5680 		return ctx->ac.i64;
   5681 	case GLSL_TYPE_DOUBLE:
   5682 		return ctx->ac.f64;
   5683 	default:
   5684 		unreachable("unknown GLSL type");
   5685 	}
   5686 }
   5687 
   5688 static LLVMTypeRef
   5689 glsl_to_llvm_type(struct nir_to_llvm_context *ctx,
   5690 		  const struct glsl_type *type)
   5691 {
   5692 	if (glsl_type_is_scalar(type)) {
   5693 		return glsl_base_to_llvm_type(ctx, glsl_get_base_type(type));
   5694 	}
   5695 
   5696 	if (glsl_type_is_vector(type)) {
   5697 		return LLVMVectorType(
   5698 		   glsl_base_to_llvm_type(ctx, glsl_get_base_type(type)),
   5699 		   glsl_get_vector_elements(type));
   5700 	}
   5701 
   5702 	if (glsl_type_is_matrix(type)) {
   5703 		return LLVMArrayType(
   5704 		   glsl_to_llvm_type(ctx, glsl_get_column_type(type)),
   5705 		   glsl_get_matrix_columns(type));
   5706 	}
   5707 
   5708 	if (glsl_type_is_array(type)) {
   5709 		return LLVMArrayType(
   5710 		   glsl_to_llvm_type(ctx, glsl_get_array_element(type)),
   5711 		   glsl_get_length(type));
   5712 	}
   5713 
   5714 	assert(glsl_type_is_struct(type));
   5715 
   5716 	LLVMTypeRef member_types[glsl_get_length(type)];
   5717 
   5718 	for (unsigned i = 0; i < glsl_get_length(type); i++) {
   5719 		member_types[i] =
   5720 			glsl_to_llvm_type(ctx,
   5721 					  glsl_get_struct_field(type, i));
   5722 	}
   5723 
   5724 	return LLVMStructTypeInContext(ctx->context, member_types,
   5725 				       glsl_get_length(type), false);
   5726 }
   5727 
   5728 static void
   5729 setup_locals(struct ac_nir_context *ctx,
   5730 	     struct nir_function *func)
   5731 {
   5732 	int i, j;
   5733 	ctx->num_locals = 0;
   5734 	nir_foreach_variable(variable, &func->impl->locals) {
   5735 		unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
   5736 		variable->data.driver_location = ctx->num_locals * 4;
   5737 		variable->data.location_frac = 0;
   5738 		ctx->num_locals += attrib_count;
   5739 	}
   5740 	ctx->locals = malloc(4 * ctx->num_locals * sizeof(LLVMValueRef));
   5741 	if (!ctx->locals)
   5742 	    return;
   5743 
   5744 	for (i = 0; i < ctx->num_locals; i++) {
   5745 		for (j = 0; j < 4; j++) {
   5746 			ctx->locals[i * 4 + j] =
   5747 				si_build_alloca_undef(&ctx->ac, ctx->ac.f32, "temp");
   5748 		}
   5749 	}
   5750 }
   5751 
   5752 static void
   5753 setup_shared(struct ac_nir_context *ctx,
   5754 	     struct nir_shader *nir)
   5755 {
   5756 	nir_foreach_variable(variable, &nir->shared) {
   5757 		LLVMValueRef shared =
   5758 			LLVMAddGlobalInAddressSpace(
   5759 			   ctx->ac.module, glsl_to_llvm_type(ctx->nctx, variable->type),
   5760 			   variable->name ? variable->name : "",
   5761 			   LOCAL_ADDR_SPACE);
   5762 		_mesa_hash_table_insert(ctx->vars, variable, shared);
   5763 	}
   5764 }
   5765 
   5766 static LLVMValueRef
   5767 emit_float_saturate(struct ac_llvm_context *ctx, LLVMValueRef v, float lo, float hi)
   5768 {
   5769 	v = ac_to_float(ctx, v);
   5770 	v = emit_intrin_2f_param(ctx, "llvm.maxnum", ctx->f32, v, LLVMConstReal(ctx->f32, lo));
   5771 	return emit_intrin_2f_param(ctx, "llvm.minnum", ctx->f32, v, LLVMConstReal(ctx->f32, hi));
   5772 }
   5773 
   5774 
   5775 static LLVMValueRef emit_pack_int16(struct nir_to_llvm_context *ctx,
   5776 					LLVMValueRef src0, LLVMValueRef src1)
   5777 {
   5778 	LLVMValueRef const16 = LLVMConstInt(ctx->ac.i32, 16, false);
   5779 	LLVMValueRef comp[2];
   5780 
   5781 	comp[0] = LLVMBuildAnd(ctx->builder, src0, LLVMConstInt(ctx->ac.i32, 65535, 0), "");
   5782 	comp[1] = LLVMBuildAnd(ctx->builder, src1, LLVMConstInt(ctx->ac.i32, 65535, 0), "");
   5783 	comp[1] = LLVMBuildShl(ctx->builder, comp[1], const16, "");
   5784 	return LLVMBuildOr(ctx->builder, comp[0], comp[1], "");
   5785 }
   5786 
/* Initialize arguments for the shader export intrinsic.
 *
 * VALUES holds the four 32-bit channel values to export to TARGET (it may
 * be NULL, producing an undef export).  For fragment-shader MRT exports
 * the channels are clamped/packed according to the color-buffer format in
 * the pipeline key; every other export passes the values through
 * unchanged.  All outputs are bitcast to float at the end because the
 * export intrinsic takes float operands.
 */
static void
si_llvm_init_export_args(struct nir_to_llvm_context *ctx,
			 LLVMValueRef *values,
			 unsigned target,
			 struct ac_export_args *args)
{
	/* Default is 0xf. Adjusted below depending on the format. */
	args->enabled_channels = 0xf;

	/* Specify whether the EXEC mask represents the valid mask */
	args->valid_mask = 0;

	/* Specify whether this is the last export */
	args->done = 0;

	/* Specify the target we are exporting */
	args->target = target;

	args->compr = false;
	args->out[0] = LLVMGetUndef(ctx->ac.f32);
	args->out[1] = LLVMGetUndef(ctx->ac.f32);
	args->out[2] = LLVMGetUndef(ctx->ac.f32);
	args->out[3] = LLVMGetUndef(ctx->ac.f32);

	/* No data: leave the four undef channels in place. */
	if (!values)
		return;

	if (ctx->stage == MESA_SHADER_FRAGMENT && target >= V_008DFC_SQ_EXP_MRT) {
		LLVMValueRef val[4];
		unsigned index = target - V_008DFC_SQ_EXP_MRT;
		/* Per-MRT format and int8/int10 flags from the pipeline key. */
		unsigned col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf;
		bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1;
		bool is_int10 = (ctx->options->key.fs.is_int10 >> index) & 1;

		switch(col_format) {
		case V_028714_SPI_SHADER_ZERO:
			/* Nothing is written: turn this into a null export. */
			args->enabled_channels = 0; /* writemask */
			args->target = V_008DFC_SQ_EXP_NULL;
			break;

		case V_028714_SPI_SHADER_32_R:
			args->enabled_channels = 1;
			args->out[0] = values[0];
			break;

		case V_028714_SPI_SHADER_32_GR:
			args->enabled_channels = 0x3;
			args->out[0] = values[0];
			args->out[1] = values[1];
			break;

		case V_028714_SPI_SHADER_32_AR:
			args->enabled_channels = 0x9;
			args->out[0] = values[0];
			args->out[3] = values[3];
			break;

		case V_028714_SPI_SHADER_FP16_ABGR:
			/* Compressed export: pack channel pairs into f16x2. */
			args->compr = 1;

			for (unsigned chan = 0; chan < 2; chan++) {
				LLVMValueRef pack_args[2] = {
					values[2 * chan],
					values[2 * chan + 1]
				};
				LLVMValueRef packed;

				packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
				args->out[chan] = packed;
			}
			break;

		case V_028714_SPI_SHADER_UNORM16_ABGR:
			/* Clamp to [0,1], scale to 16-bit unorm (adding 0.5
			 * rounds to nearest) and pack pairs into dwords. */
			for (unsigned chan = 0; chan < 4; chan++) {
				val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
				val[chan] = LLVMBuildFMul(ctx->builder, val[chan],
							LLVMConstReal(ctx->ac.f32, 65535), "");
				val[chan] = LLVMBuildFAdd(ctx->builder, val[chan],
							LLVMConstReal(ctx->ac.f32, 0.5), "");
				val[chan] = LLVMBuildFPToUI(ctx->builder, val[chan],
							ctx->ac.i32, "");
			}

			args->compr = 1;
			args->out[0] = emit_pack_int16(ctx, val[0], val[1]);
			args->out[1] = emit_pack_int16(ctx, val[2], val[3]);
			break;

		case V_028714_SPI_SHADER_SNORM16_ABGR:
			/* Clamp to [-1,1], scale to 16-bit snorm and pack. */
			for (unsigned chan = 0; chan < 4; chan++) {
				val[chan] = emit_float_saturate(&ctx->ac, values[chan], -1, 1);
				val[chan] = LLVMBuildFMul(ctx->builder, val[chan],
							LLVMConstReal(ctx->ac.f32, 32767), "");

				/* If positive, add 0.5, else add -0.5. */
				val[chan] = LLVMBuildFAdd(ctx->builder, val[chan],
						LLVMBuildSelect(ctx->builder,
							LLVMBuildFCmp(ctx->builder, LLVMRealOGE,
								val[chan], ctx->ac.f32_0, ""),
							LLVMConstReal(ctx->ac.f32, 0.5),
							LLVMConstReal(ctx->ac.f32, -0.5), ""), "");
				val[chan] = LLVMBuildFPToSI(ctx->builder, val[chan], ctx->ac.i32, "");
			}

			args->compr = 1;
			args->out[0] = emit_pack_int16(ctx, val[0], val[1]);
			args->out[1] = emit_pack_int16(ctx, val[2], val[3]);
			break;

		case V_028714_SPI_SHADER_UINT16_ABGR: {
			/* Clamp each channel to the largest unsigned value
			 * that fits the channel's bit width (alpha gets only
			 * the value 0..3 when is_int10). */
			LLVMValueRef max_rgb = LLVMConstInt(ctx->ac.i32,
							    is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
			LLVMValueRef max_alpha = !is_int10 ? max_rgb : LLVMConstInt(ctx->ac.i32, 3, 0);

			for (unsigned chan = 0; chan < 4; chan++) {
				val[chan] = ac_to_integer(&ctx->ac, values[chan]);
				val[chan] = emit_minmax_int(&ctx->ac, LLVMIntULT, val[chan], chan == 3 ? max_alpha : max_rgb);
			}

			args->compr = 1;
			args->out[0] = emit_pack_int16(ctx, val[0], val[1]);
			args->out[1] = emit_pack_int16(ctx, val[2], val[3]);
			break;
		}

		case V_028714_SPI_SHADER_SINT16_ABGR: {
			/* Clamp each channel into the signed range for the
			 * channel's bit width, then pack. */
			LLVMValueRef max_rgb = LLVMConstInt(ctx->ac.i32,
							    is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
			LLVMValueRef min_rgb = LLVMConstInt(ctx->ac.i32,
							    is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
			LLVMValueRef max_alpha = !is_int10 ? max_rgb : ctx->ac.i32_1;
			LLVMValueRef min_alpha = !is_int10 ? min_rgb : LLVMConstInt(ctx->ac.i32, -2, 0);

			/* Clamp. */
			for (unsigned chan = 0; chan < 4; chan++) {
				val[chan] = ac_to_integer(&ctx->ac, values[chan]);
				val[chan] = emit_minmax_int(&ctx->ac, LLVMIntSLT, val[chan], chan == 3 ? max_alpha : max_rgb);
				val[chan] = emit_minmax_int(&ctx->ac, LLVMIntSGT, val[chan], chan == 3 ? min_alpha : min_rgb);
			}

			args->compr = 1;
			args->out[0] = emit_pack_int16(ctx, val[0], val[1]);
			args->out[1] = emit_pack_int16(ctx, val[2], val[3]);
			break;
		}

		default:
		case V_028714_SPI_SHADER_32_ABGR:
			memcpy(&args->out[0], values, sizeof(values[0]) * 4);
			break;
		}
	} else
		memcpy(&args->out[0], values, sizeof(values[0]) * 4);

	/* The export intrinsic takes float operands: bitcast everything. */
	for (unsigned i = 0; i < 4; ++i)
		args->out[i] = ac_to_float(&ctx->ac, args->out[i]);
}
   5945 
/* Emit the position and parameter exports for a vertex-processing stage
 * once all outputs have been written to their allocas, filling OUTINFO
 * with the export layout (param offsets, pos export count, clip/cull and
 * write flags) for the driver.
 */
static void
handle_vs_outputs_post(struct nir_to_llvm_context *ctx,
		       bool export_prim_id,
		       struct ac_vs_output_info *outinfo)
{
	uint32_t param_count = 0;
	unsigned target;
	unsigned pos_idx, num_pos_exports = 0;
	struct ac_export_args args, pos_args[4] = {};
	LLVMValueRef psize_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	int i;

	/* Multiview: force a layer output carrying the view index, creating
	 * the layer allocas if the shader didn't declare the output itself. */
	if (ctx->options->key.has_multiview_view_index) {
		LLVMValueRef* tmp_out = &ctx->nir->outputs[radeon_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)];
		if(!*tmp_out) {
			for(unsigned i = 0; i < 4; ++i)
				ctx->nir->outputs[radeon_llvm_reg_index_soa(VARYING_SLOT_LAYER, i)] =
				            si_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
		}

		LLVMBuildStore(ctx->builder, ac_to_float(&ctx->ac, ctx->view_index),  *tmp_out);
		ctx->output_mask |= 1ull << VARYING_SLOT_LAYER;
	}

	memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
	       sizeof(outinfo->vs_output_param_offset));

	/* Clip/cull distances: gather up to 8 values and emit them as one or
	 * two extra position exports (POS+2, and POS+3 when more than 4). */
	if (ctx->output_mask & (1ull << VARYING_SLOT_CLIP_DIST0)) {
		LLVMValueRef slots[8];
		unsigned j;

		/* Cull distances follow the clip distances in the slots. */
		if (outinfo->cull_dist_mask)
			outinfo->cull_dist_mask <<= ctx->num_output_clips;

		i = VARYING_SLOT_CLIP_DIST0;
		for (j = 0; j < ctx->num_output_clips + ctx->num_output_culls; j++)
			slots[j] = ac_to_float(&ctx->ac, LLVMBuildLoad(ctx->builder,
							       ctx->nir->outputs[radeon_llvm_reg_index_soa(i, j)], ""));

		for (i = ctx->num_output_clips + ctx->num_output_culls; i < 8; i++)
			slots[i] = LLVMGetUndef(ctx->ac.f32);

		if (ctx->num_output_clips + ctx->num_output_culls > 4) {
			target = V_008DFC_SQ_EXP_POS + 3;
			si_llvm_init_export_args(ctx, &slots[4], target, &args);
			memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
			       &args, sizeof(args));
		}

		target = V_008DFC_SQ_EXP_POS + 2;
		si_llvm_init_export_args(ctx, &slots[0], target, &args);
		memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
		       &args, sizeof(args));

	}

	/* Position itself defaults to (0, 0, 0, 1) if never written. */
	LLVMValueRef pos_values[4] = {ctx->ac.f32_0, ctx->ac.f32_0, ctx->ac.f32_0, ctx->ac.f32_1};
	if (ctx->output_mask & (1ull << VARYING_SLOT_POS)) {
		for (unsigned j = 0; j < 4; j++)
			pos_values[j] = LLVMBuildLoad(ctx->builder,
			                         ctx->nir->outputs[radeon_llvm_reg_index_soa(VARYING_SLOT_POS, j)], "");
	}
	si_llvm_init_export_args(ctx, pos_values, V_008DFC_SQ_EXP_POS, &pos_args[0]);

	/* Load the misc system values that share the POS+1 export. */
	if (ctx->output_mask & (1ull << VARYING_SLOT_PSIZ)) {
		outinfo->writes_pointsize = true;
		psize_value = LLVMBuildLoad(ctx->builder,
		                            ctx->nir->outputs[radeon_llvm_reg_index_soa(VARYING_SLOT_PSIZ, 0)], "");
	}

	if (ctx->output_mask & (1ull << VARYING_SLOT_LAYER)) {
		outinfo->writes_layer = true;
		layer_value = LLVMBuildLoad(ctx->builder,
		                            ctx->nir->outputs[radeon_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)], "");
	}

	if (ctx->output_mask & (1ull << VARYING_SLOT_VIEWPORT)) {
		outinfo->writes_viewport_index = true;
		viewport_index_value = LLVMBuildLoad(ctx->builder,
		                                     ctx->nir->outputs[radeon_llvm_reg_index_soa(VARYING_SLOT_VIEWPORT, 0)], "");
	}

	/* Pack point size (X), layer (Z) and viewport index (Z or W) into
	 * the second position export. */
	if (outinfo->writes_pointsize ||
	    outinfo->writes_layer ||
	    outinfo->writes_viewport_index) {
		pos_args[1].enabled_channels = ((outinfo->writes_pointsize == true ? 1 : 0) |
						(outinfo->writes_layer == true ? 4 : 0));
		pos_args[1].valid_mask = 0;
		pos_args[1].done = 0;
		pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
		pos_args[1].compr = 0;
		pos_args[1].out[0] = ctx->ac.f32_0; /* X */
		pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
		pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
		pos_args[1].out[3] = ctx->ac.f32_0;  /* W */

		if (outinfo->writes_pointsize == true)
			pos_args[1].out[0] = psize_value;
		if (outinfo->writes_layer == true)
			pos_args[1].out[2] = layer_value;
		if (outinfo->writes_viewport_index == true) {
			if (ctx->options->chip_class >= GFX9) {
				/* GFX9 has the layer in out.z[10:0] and the viewport
				 * index in out.z[19:16].
				 */
				LLVMValueRef v = viewport_index_value;
				v = ac_to_integer(&ctx->ac, v);
				v = LLVMBuildShl(ctx->builder, v,
						 LLVMConstInt(ctx->ac.i32, 16, false),
						 "");
				v = LLVMBuildOr(ctx->builder, v,
						ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");

				pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
				pos_args[1].enabled_channels |= 1 << 2;
			} else {
				pos_args[1].out[3] = viewport_index_value;
				pos_args[1].enabled_channels |= 1 << 3;
			}
		}
	}
	for (i = 0; i < 4; i++) {
		if (pos_args[i].out[0])
			num_pos_exports++;
	}

	/* Emit the position exports contiguously; the last one gets the
	 * "done" bit. */
	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i].out[0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
		if (pos_idx == num_pos_exports)
			pos_args[i].done = 1;
		ac_build_export(&ctx->ac, &pos_args[i]);
	}

	/* Emit the parameter exports (varyings consumed by the FS) and
	 * record each one's param slot in vs_output_param_offset. */
	for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
		LLVMValueRef values[4];
		if (!(ctx->output_mask & (1ull << i)))
			continue;

		for (unsigned j = 0; j < 4; j++)
			values[j] = ac_to_float(&ctx->ac, LLVMBuildLoad(ctx->builder,
					        ctx->nir->outputs[radeon_llvm_reg_index_soa(i, j)], ""));

		if (i == VARYING_SLOT_LAYER) {
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = param_count;
			param_count++;
		} else if (i == VARYING_SLOT_PRIMITIVE_ID) {
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = param_count;
			param_count++;
		} else if (i >= VARYING_SLOT_VAR0) {
			outinfo->export_mask |= 1u << (i - VARYING_SLOT_VAR0);
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			outinfo->vs_output_param_offset[i] = param_count;
			param_count++;
		} else
			continue;

		si_llvm_init_export_args(ctx, values, target, &args);

		if (target >= V_008DFC_SQ_EXP_POS &&
		    target <= (V_008DFC_SQ_EXP_POS + 3)) {
			memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
			       &args, sizeof(args));
		} else {
			ac_build_export(&ctx->ac, &args);
		}
	}

	/* Optionally export the primitive id as an extra parameter. */
	if (export_prim_id) {
		LLVMValueRef values[4];
		target = V_008DFC_SQ_EXP_PARAM + param_count;
		outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = param_count;
		param_count++;

		values[0] = ctx->vs_prim_id;
		/* Make sure the VGPR carrying the primitive id is enabled. */
		ctx->shader_info->vs.vgpr_comp_cnt = MAX2(2,
							  ctx->shader_info->vs.vgpr_comp_cnt);
		for (unsigned j = 1; j < 4; j++)
			values[j] = ctx->ac.f32_0;
		si_llvm_init_export_args(ctx, values, target, &args);
		ac_build_export(&ctx->ac, &args);
		outinfo->export_prim_id = true;
	}

	outinfo->pos_exports = num_pos_exports;
	outinfo->param_exports = param_count;
}
   6139 
/* Finalize ES (export shader) outputs: compute the per-vertex ESGS item
 * size and copy every written output either into LDS (GFX9, where ES and
 * GS run as one merged shader) or into the ESGS ring buffer (pre-GFX9).
 */
static void
handle_es_outputs_post(struct nir_to_llvm_context *ctx,
		       struct ac_es_output_info *outinfo)
{
	int j;
	uint64_t max_output_written = 0;
	LLVMValueRef lds_base = NULL;

	/* First pass: find the highest used unique slot index to size the
	 * per-vertex ESGS item. */
	for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
		int param_index;
		int length = 4;

		if (!(ctx->output_mask & (1ull << i)))
			continue;

		if (i == VARYING_SLOT_CLIP_DIST0)
			length = ctx->num_output_clips + ctx->num_output_culls;

		param_index = shader_io_get_unique_index(i);

		/* Clip/cull distances spanning >4 values use two slots. */
		max_output_written = MAX2(param_index + (length > 4), max_output_written);
	}

	outinfo->esgs_itemsize = (max_output_written + 1) * 16;

	if (ctx->ac.chip_class  >= GFX9) {
		/* GFX9: compute this vertex's LDS base offset in dwords.
		 * The wave index lives in merged_wave_info[27:24] and each
		 * wave holds 64 lanes. */
		unsigned itemsize_dw = outinfo->esgs_itemsize / 4;
		LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
		LLVMValueRef wave_idx = ac_build_bfe(&ctx->ac, ctx->merged_wave_info,
		                                     LLVMConstInt(ctx->ac.i32, 24, false),
		                                     LLVMConstInt(ctx->ac.i32, 4, false), false);
		vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
					 LLVMBuildMul(ctx->ac.builder, wave_idx,
						      LLVMConstInt(ctx->ac.i32, 64, false), ""), "");
		lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
					LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
	}

	/* Second pass: store each written component. */
	for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
		LLVMValueRef dw_addr = NULL;
		LLVMValueRef *out_ptr = &ctx->nir->outputs[i * 4];
		int param_index;
		int length = 4;

		if (!(ctx->output_mask & (1ull << i)))
			continue;

		if (i == VARYING_SLOT_CLIP_DIST0)
			length = ctx->num_output_clips + ctx->num_output_culls;

		param_index = shader_io_get_unique_index(i);

		if (lds_base) {
			dw_addr = LLVMBuildAdd(ctx->builder, lds_base,
			                       LLVMConstInt(ctx->ac.i32, param_index * 4, false),
			                       "");
		}
		for (j = 0; j < length; j++) {
			LLVMValueRef out_val = LLVMBuildLoad(ctx->builder, out_ptr[j], "");
			out_val = LLVMBuildBitCast(ctx->builder, out_val, ctx->ac.i32, "");

			if (ctx->ac.chip_class  >= GFX9) {
				/* Merged shader: hand the value to the GS half
				 * through LDS. */
				ac_lds_store(&ctx->ac, dw_addr,
					     LLVMBuildLoad(ctx->builder, out_ptr[j], ""));
				dw_addr = LLVMBuildAdd(ctx->builder, dw_addr, ctx->ac.i32_1, "");
			} else {
				/* Older chips: write through the ESGS ring. */
				ac_build_buffer_store_dword(&ctx->ac,
				                            ctx->esgs_ring,
				                            out_val, 1,
				                            NULL, ctx->es2gs_offset,
				                            (4 * param_index + j) * 4,
				                            1, 1, true, true);
			}
		}
	}
}
   6216 
   6217 static void
   6218 handle_ls_outputs_post(struct nir_to_llvm_context *ctx)
   6219 {
   6220 	LLVMValueRef vertex_id = ctx->rel_auto_id;
   6221 	LLVMValueRef vertex_dw_stride = unpack_param(&ctx->ac, ctx->ls_out_layout, 13, 8);
   6222 	LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->builder, vertex_id,
   6223 						 vertex_dw_stride, "");
   6224 
   6225 	for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
   6226 		LLVMValueRef *out_ptr = &ctx->nir->outputs[i * 4];
   6227 		int length = 4;
   6228 
   6229 		if (!(ctx->output_mask & (1ull << i)))
   6230 			continue;
   6231 
   6232 		if (i == VARYING_SLOT_CLIP_DIST0)
   6233 			length = ctx->num_output_clips + ctx->num_output_culls;
   6234 		int param = shader_io_get_unique_index(i);
   6235 		mark_tess_output(ctx, false, param, 1);
   6236 		if (length > 4)
   6237 			mark_tess_output(ctx, false, param + 1, 1);
   6238 		LLVMValueRef dw_addr = LLVMBuildAdd(ctx->builder, base_dw_addr,
   6239 						    LLVMConstInt(ctx->ac.i32, param * 4, false),
   6240 						    "");
   6241 		for (unsigned j = 0; j < length; j++) {
   6242 			ac_lds_store(&ctx->ac, dw_addr,
   6243 				     LLVMBuildLoad(ctx->builder, out_ptr[j], ""));
   6244 			dw_addr = LLVMBuildAdd(ctx->builder, dw_addr, ctx->ac.i32_1, "");
   6245 		}
   6246 	}
   6247 }
   6248 
/* Bookkeeping for an if/then(/else) construct open-coded out of basic
 * blocks by ac_nir_build_if()/ac_nir_build_endif(). */
struct ac_build_if_state
{
	struct nir_to_llvm_context *ctx;
	LLVMValueRef condition;           /* branch condition */
	LLVMBasicBlockRef entry_block;    /* block that will end in the conditional branch */
	LLVMBasicBlockRef true_block;     /* "then" side */
	LLVMBasicBlockRef false_block;    /* "else" side; NULL when there is no else */
	LLVMBasicBlockRef merge_block;    /* block where both sides rejoin */
};
   6258 
   6259 static LLVMBasicBlockRef
   6260 ac_build_insert_new_block(struct nir_to_llvm_context *ctx, const char *name)
   6261 {
   6262 	LLVMBasicBlockRef current_block;
   6263 	LLVMBasicBlockRef next_block;
   6264 	LLVMBasicBlockRef new_block;
   6265 
   6266 	/* get current basic block */
   6267 	current_block = LLVMGetInsertBlock(ctx->builder);
   6268 
   6269 	/* chqeck if there's another block after this one */
   6270 	next_block = LLVMGetNextBasicBlock(current_block);
   6271 	if (next_block) {
   6272 		/* insert the new block before the next block */
   6273 		new_block = LLVMInsertBasicBlockInContext(ctx->context, next_block, name);
   6274 	}
   6275 	else {
   6276 		/* append new block after current block */
   6277 		LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
   6278 		new_block = LLVMAppendBasicBlockInContext(ctx->context, function, name);
   6279 	}
   6280 	return new_block;
   6281 }
   6282 
   6283 static void
   6284 ac_nir_build_if(struct ac_build_if_state *ifthen,
   6285 		struct nir_to_llvm_context *ctx,
   6286 		LLVMValueRef condition)
   6287 {
   6288 	LLVMBasicBlockRef block = LLVMGetInsertBlock(ctx->builder);
   6289 
   6290 	memset(ifthen, 0, sizeof *ifthen);
   6291 	ifthen->ctx = ctx;
   6292 	ifthen->condition = condition;
   6293 	ifthen->entry_block = block;
   6294 
   6295 	/* create endif/merge basic block for the phi functions */
   6296 	ifthen->merge_block = ac_build_insert_new_block(ctx, "endif-block");
   6297 
   6298 	/* create/insert true_block before merge_block */
   6299 	ifthen->true_block =
   6300 		LLVMInsertBasicBlockInContext(ctx->context,
   6301 					      ifthen->merge_block,
   6302 					      "if-true-block");
   6303 
   6304 	/* successive code goes into the true block */
   6305 	LLVMPositionBuilderAtEnd(ctx->builder, ifthen->true_block);
   6306 }
   6307 
   6308 /**
   6309  * End a conditional.
   6310  */
   6311 static void
   6312 ac_nir_build_endif(struct ac_build_if_state *ifthen)
   6313 {
   6314 	LLVMBuilderRef builder = ifthen->ctx->builder;
   6315 
   6316 	/* Insert branch to the merge block from current block */
   6317 	LLVMBuildBr(builder, ifthen->merge_block);
   6318 
   6319 	/*
   6320 	 * Now patch in the various branch instructions.
   6321 	 */
   6322 
   6323 	/* Insert the conditional branch instruction at the end of entry_block */
   6324 	LLVMPositionBuilderAtEnd(builder, ifthen->entry_block);
   6325 	if (ifthen->false_block) {
   6326 		/* we have an else clause */
   6327 		LLVMBuildCondBr(builder, ifthen->condition,
   6328 				ifthen->true_block, ifthen->false_block);
   6329 	}
   6330 	else {
   6331 		/* no else clause */
   6332 		LLVMBuildCondBr(builder, ifthen->condition,
   6333 				ifthen->true_block, ifthen->merge_block);
   6334 	}
   6335 
   6336 	/* Resume building code at end of the ifthen->merge_block */
   6337 	LLVMPositionBuilderAtEnd(builder, ifthen->merge_block);
   6338 }
   6339 
/* Read the tessellation factors of the current patch out of LDS and write
 * them to the tess-factor ring buffer (and, when the TES reads them, also
 * to the off-chip buffer).  Only invocation 0 of each patch performs the
 * stores.
 */
static void
write_tess_factors(struct nir_to_llvm_context *ctx)
{
	unsigned stride, outer_comps, inner_comps;
	struct ac_build_if_state if_ctx, inner_if_ctx;
	/* tcs_rel_ids packs the invocation id in bits [12:8] and the
	 * relative patch id in bits [7:0]. */
	LLVMValueRef invocation_id = unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 8, 5);
	LLVMValueRef rel_patch_id = unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 0, 8);
	unsigned tess_inner_index, tess_outer_index;
	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
	int i;
	/* Make sure all invocations finished writing before reading the
	 * factors back from LDS. */
	emit_barrier(&ctx->ac, ctx->stage);

	/* Component counts per primitive mode; stride is the number of
	 * dwords written per patch to the TF buffer. */
	switch (ctx->options->key.tcs.primitive_mode) {
	case GL_ISOLINES:
		stride = 2;
		outer_comps = 2;
		inner_comps = 0;
		break;
	case GL_TRIANGLES:
		stride = 4;
		outer_comps = 3;
		inner_comps = 1;
		break;
	case GL_QUADS:
		stride = 6;
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		return;
	}

	/* Only invocation 0 of the patch writes the factors. */
	ac_nir_build_if(&if_ctx, ctx,
			LLVMBuildICmp(ctx->builder, LLVMIntEQ,
				      invocation_id, ctx->ac.i32_0, ""));

	tess_inner_index = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_INNER);
	tess_outer_index = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_OUTER);

	mark_tess_output(ctx, true, tess_inner_index, 1);
	mark_tess_output(ctx, true, tess_outer_index, 1);
	/* LDS addresses of the inner/outer factor arrays for this patch. */
	lds_base = get_tcs_out_current_patch_data_offset(ctx);
	lds_inner = LLVMBuildAdd(ctx->builder, lds_base,
				 LLVMConstInt(ctx->ac.i32, tess_inner_index * 4, false), "");
	lds_outer = LLVMBuildAdd(ctx->builder, lds_base,
				 LLVMConstInt(ctx->ac.i32, tess_outer_index * 4, false), "");

	for (i = 0; i < 4; i++) {
		inner[i] = LLVMGetUndef(ctx->ac.i32);
		outer[i] = LLVMGetUndef(ctx->ac.i32);
	}

	/* Isolines: the two outer factors are stored in the TF buffer in
	 * reversed order (out[1] gets the first value, out[0] the second). */
	if (ctx->options->key.tcs.primitive_mode == GL_ISOLINES) {
		outer[0] = out[1] = ac_lds_load(&ctx->ac, lds_outer);
		lds_outer = LLVMBuildAdd(ctx->builder, lds_outer,
					 ctx->ac.i32_1, "");
		outer[1] = out[0] = ac_lds_load(&ctx->ac, lds_outer);
	} else {
		/* Outer factors first, then the inner ones. */
		for (i = 0; i < outer_comps; i++) {
			outer[i] = out[i] =
				ac_lds_load(&ctx->ac, lds_outer);
			lds_outer = LLVMBuildAdd(ctx->builder, lds_outer,
						 ctx->ac.i32_1, "");
		}
		for (i = 0; i < inner_comps; i++) {
			inner[i] = out[outer_comps+i] =
				ac_lds_load(&ctx->ac, lds_inner);
			lds_inner = LLVMBuildAdd(ctx->builder, lds_inner,
						 ctx->ac.i32_1, "");
		}
	}

	/* Convert the outputs to vectors for stores. */
	vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4));
	vec1 = NULL;

	if (stride > 4)
		vec1 = ac_build_gather_values(&ctx->ac, out + 4, stride - 4);


	buffer = ctx->hs_ring_tess_factor;
	tf_base = ctx->tess_factor_offset;
	byteoffset = LLVMBuildMul(ctx->builder, rel_patch_id,
				  LLVMConstInt(ctx->ac.i32, 4 * stride, false), "");
	unsigned tf_offset = 0;

	if (ctx->options->chip_class <= VI) {
		/* Only the first patch of the threadgroup writes the
		 * control word. */
		ac_nir_build_if(&inner_if_ctx, ctx,
		                LLVMBuildICmp(ctx->builder, LLVMIntEQ,
		                              rel_patch_id, ctx->ac.i32_0, ""));

		/* Store the dynamic HS control word. */
		ac_build_buffer_store_dword(&ctx->ac, buffer,
					    LLVMConstInt(ctx->ac.i32, 0x80000000, false),
					    1, ctx->ac.i32_0, tf_base,
					    0, 1, 0, true, false);
		tf_offset += 4;

		ac_nir_build_endif(&inner_if_ctx);
	}

	/* Store the tessellation factors. */
	ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
				    MIN2(stride, 4), byteoffset, tf_base,
				    tf_offset, 1, 0, true, false);
	if (vec1)
		ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
					    stride - 4, byteoffset, tf_base,
					    16 + tf_offset, 1, 0, true, false);

	/* Store to offchip for TES to read - only if TES reads them. */
	if (ctx->options->key.tcs.tes_reads_tess_factors) {
		LLVMValueRef inner_vec, outer_vec, tf_outer_offset;
		LLVMValueRef tf_inner_offset;
		unsigned param_outer, param_inner;

		param_outer = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_OUTER);
		tf_outer_offset = get_tcs_tes_buffer_address(ctx, NULL,
							     LLVMConstInt(ctx->ac.i32, param_outer, 0));

		outer_vec = ac_build_gather_values(&ctx->ac, outer,
						   util_next_power_of_two(outer_comps));

		ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, outer_vec,
					    outer_comps, tf_outer_offset,
					    ctx->oc_lds, 0, 1, 0, true, false);
		if (inner_comps) {
			param_inner = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_INNER);
			tf_inner_offset = get_tcs_tes_buffer_address(ctx, NULL,
								     LLVMConstInt(ctx->ac.i32, param_inner, 0));

			inner_vec = inner_comps == 1 ? inner[0] :
				ac_build_gather_values(&ctx->ac, inner, inner_comps);
			ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, inner_vec,
						    inner_comps, tf_inner_offset,
						    ctx->oc_lds, 0, 1, 0, true, false);
		}
	}
	ac_nir_build_endif(&if_ctx);
}
   6482 
/* TCS epilogue: the only remaining work after the shader body is
 * writing the tessellation factors out to the TF ring (and optionally
 * the off-chip buffer) via write_tess_factors(). */
static void
handle_tcs_outputs_post(struct nir_to_llvm_context *ctx)
{
	write_tess_factors(ctx);
}
   6488 
   6489 static bool
   6490 si_export_mrt_color(struct nir_to_llvm_context *ctx,
   6491 		    LLVMValueRef *color, unsigned param, bool is_last,
   6492 		    struct ac_export_args *args)
   6493 {
   6494 	/* Export */
   6495 	si_llvm_init_export_args(ctx, color, param,
   6496 				 args);
   6497 
   6498 	if (is_last) {
   6499 		args->valid_mask = 1; /* whether the EXEC mask is valid */
   6500 		args->done = 1; /* DONE bit */
   6501 	} else if (!args->enabled_channels)
   6502 		return false; /* unnecessary NULL export */
   6503 
   6504 	return true;
   6505 }
   6506 
/* Emit the fragment shader's combined depth/stencil/sample-mask (MRTZ)
 * export.  Callers pass NULL for any component the shader does not
 * write; building the actual export arguments is delegated to the
 * shared ac_export_mrt_z() helper. */
static void
radv_export_mrt_z(struct nir_to_llvm_context *ctx,
		  LLVMValueRef depth, LLVMValueRef stencil,
		  LLVMValueRef samplemask)
{
	struct ac_export_args args;

	ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args);

	ac_build_export(&ctx->ac, &args);
}
   6518 
/* Fragment shader epilogue: load every written output back from its
 * alloca and turn it into hardware exports.
 *
 * Depth/stencil/sample-mask outputs are remembered (and flagged in
 * shader_info) for one combined MRTZ export at the end; every other
 * output becomes an MRT color export.  If nothing was exported at all,
 * a single NULL export is emitted instead. */
static void
handle_fs_outputs_post(struct nir_to_llvm_context *ctx)
{
	unsigned index = 0; /* number of color exports collected so far */
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	struct ac_export_args color_args[8];

	for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
		LLVMValueRef values[4];

		if (!(ctx->output_mask & (1ull << i)))
			continue;

		if (i == FRAG_RESULT_DEPTH) {
			ctx->shader_info->fs.writes_z = true;
			depth = ac_to_float(&ctx->ac, LLVMBuildLoad(ctx->builder,
							    ctx->nir->outputs[radeon_llvm_reg_index_soa(i, 0)], ""));
		} else if (i == FRAG_RESULT_STENCIL) {
			ctx->shader_info->fs.writes_stencil = true;
			stencil = ac_to_float(&ctx->ac, LLVMBuildLoad(ctx->builder,
							      ctx->nir->outputs[radeon_llvm_reg_index_soa(i, 0)], ""));
		} else if (i == FRAG_RESULT_SAMPLE_MASK) {
			ctx->shader_info->fs.writes_sample_mask = true;
			samplemask = ac_to_float(&ctx->ac, LLVMBuildLoad(ctx->builder,
								  ctx->nir->outputs[radeon_llvm_reg_index_soa(i, 0)], ""));
		} else {
			bool last = false;
			for (unsigned j = 0; j < 4; j++)
				values[j] = ac_to_float(&ctx->ac, LLVMBuildLoad(ctx->builder,
									ctx->nir->outputs[radeon_llvm_reg_index_soa(i, j)], ""));

			/* Only tag the highest color export with the DONE bit
			 * when no MRTZ export will be emitted after the colors.
			 * (Z/stencil/mask slots precede FRAG_RESULT_DATA0, so
			 * the writes_* flags are already set by this point.) */
			if (!ctx->shader_info->fs.writes_z && !ctx->shader_info->fs.writes_stencil && !ctx->shader_info->fs.writes_sample_mask)
				last = ctx->output_mask <= ((1ull << (i + 1)) - 1);

			/* si_export_mrt_color() returns false for an export
			 * with no enabled channels, which we drop. */
			bool ret = si_export_mrt_color(ctx, values, V_008DFC_SQ_EXP_MRT + (i - FRAG_RESULT_DATA0), last, &color_args[index]);
			if (ret)
				index++;
		}
	}

	for (unsigned i = 0; i < index; i++)
		ac_build_export(&ctx->ac, &color_args[i]);
	if (depth || stencil || samplemask)
		radv_export_mrt_z(ctx, depth, stencil, samplemask);
	else if (!index) {
		/* No color and no MRTZ export: emit a NULL export. */
		si_export_mrt_color(ctx, NULL, V_008DFC_SQ_EXP_NULL, true, &color_args[0]);
		ac_build_export(&ctx->ac, &color_args[0]);
	}
}
   6568 
/* GS epilogue: send the GS_DONE message (with a NOP opcode) to tell the
 * hardware this wave has finished emitting primitives. */
static void
emit_gs_epilogue(struct nir_to_llvm_context *ctx)
{
	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, ctx->gs_wave_id);
}
   6574 
   6575 static void
   6576 handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs,
   6577 			   LLVMValueRef *addrs)
   6578 {
   6579 	struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
   6580 
   6581 	switch (ctx->stage) {
   6582 	case MESA_SHADER_VERTEX:
   6583 		if (ctx->options->key.vs.as_ls)
   6584 			handle_ls_outputs_post(ctx);
   6585 		else if (ctx->options->key.vs.as_es)
   6586 			handle_es_outputs_post(ctx, &ctx->shader_info->vs.es_info);
   6587 		else
   6588 			handle_vs_outputs_post(ctx, ctx->options->key.vs.export_prim_id,
   6589 					       &ctx->shader_info->vs.outinfo);
   6590 		break;
   6591 	case MESA_SHADER_FRAGMENT:
   6592 		handle_fs_outputs_post(ctx);
   6593 		break;
   6594 	case MESA_SHADER_GEOMETRY:
   6595 		emit_gs_epilogue(ctx);
   6596 		break;
   6597 	case MESA_SHADER_TESS_CTRL:
   6598 		handle_tcs_outputs_post(ctx);
   6599 		break;
   6600 	case MESA_SHADER_TESS_EVAL:
   6601 		if (ctx->options->key.tes.as_es)
   6602 			handle_es_outputs_post(ctx, &ctx->shader_info->tes.es_info);
   6603 		else
   6604 			handle_vs_outputs_post(ctx, ctx->options->key.tes.export_prim_id,
   6605 					       &ctx->shader_info->tes.outinfo);
   6606 		break;
   6607 	default:
   6608 		break;
   6609 	}
   6610 }
   6611 
/* Run a small per-function optimization pipeline over the shader's main
 * function, then tear down the builder, the pass manager and the ac
 * context.  The LLVM module itself stays alive: it is consumed and
 * disposed later by ac_compile_llvm_module(). */
static void ac_llvm_finalize_module(struct nir_to_llvm_context * ctx)
{
	LLVMPassManagerRef passmgr;
	/* Create the pass manager */
	passmgr = LLVMCreateFunctionPassManagerForModule(
							ctx->module);

	/* This pass should eliminate all the load and store instructions */
	LLVMAddPromoteMemoryToRegisterPass(passmgr);

	/* Add some optimization passes */
	LLVMAddScalarReplAggregatesPass(passmgr);
	LLVMAddLICMPass(passmgr);
	LLVMAddAggressiveDCEPass(passmgr);
	LLVMAddCFGSimplificationPass(passmgr);
	LLVMAddInstructionCombiningPass(passmgr);

	/* Run the pass */
	LLVMInitializeFunctionPassManager(passmgr);
	LLVMRunFunctionPassManager(passmgr, ctx->main_function);
	LLVMFinalizeFunctionPassManager(passmgr);

	LLVMDisposeBuilder(ctx->builder);
	LLVMDisposePassManager(passmgr);

	ac_llvm_context_dispose(&ctx->ac);
}
   6639 
   6640 static void
   6641 ac_nir_eliminate_const_vs_outputs(struct nir_to_llvm_context *ctx)
   6642 {
   6643 	struct ac_vs_output_info *outinfo;
   6644 
   6645 	switch (ctx->stage) {
   6646 	case MESA_SHADER_FRAGMENT:
   6647 	case MESA_SHADER_COMPUTE:
   6648 	case MESA_SHADER_TESS_CTRL:
   6649 	case MESA_SHADER_GEOMETRY:
   6650 		return;
   6651 	case MESA_SHADER_VERTEX:
   6652 		if (ctx->options->key.vs.as_ls ||
   6653 		    ctx->options->key.vs.as_es)
   6654 			return;
   6655 		outinfo = &ctx->shader_info->vs.outinfo;
   6656 		break;
   6657 	case MESA_SHADER_TESS_EVAL:
   6658 		if (ctx->options->key.vs.as_es)
   6659 			return;
   6660 		outinfo = &ctx->shader_info->tes.outinfo;
   6661 		break;
   6662 	default:
   6663 		unreachable("Unhandled shader type");
   6664 	}
   6665 
   6666 	ac_optimize_vs_outputs(&ctx->ac,
   6667 			       ctx->main_function,
   6668 			       outinfo->vs_output_param_offset,
   6669 			       VARYING_SLOT_MAX,
   6670 			       &outinfo->param_exports);
   6671 }
   6672 
/* Load the buffer descriptors for the rings the current stage uses from
 * the ring-offsets array (ESGS, GSVS, HS tess-factor and off-chip
 * rings).  For the GS stage the GSVS descriptor is additionally
 * patched: dword 2 is replaced with the number of ring entries and
 * dword 1 is OR'ed with the stride bits. */
static void
ac_setup_rings(struct nir_to_llvm_context *ctx)
{
	/* VS-as-ES / TES-as-ES need the ESGS ring (ES side). */
	if ((ctx->stage == MESA_SHADER_VERTEX && ctx->options->key.vs.as_es) ||
	    (ctx->stage == MESA_SHADER_TESS_EVAL && ctx->options->key.tes.as_es)) {
		ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_ESGS_VS, false));
	}

	/* The GS copy shader reads GS results back through the GSVS ring. */
	if (ctx->is_gs_copy_shader) {
		ctx->gsvs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_GSVS_VS, false));
	}
	if (ctx->stage == MESA_SHADER_GEOMETRY) {
		LLVMValueRef tmp;
		ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_ESGS_GS, false));
		ctx->gsvs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_GSVS_GS, false));

		ctx->gsvs_ring = LLVMBuildBitCast(ctx->builder, ctx->gsvs_ring, ctx->ac.v4i32, "");

		/* Patch the GSVS descriptor: entry count into dword 2,
		 * stride OR'ed into dword 1. */
		ctx->gsvs_ring = LLVMBuildInsertElement(ctx->builder, ctx->gsvs_ring, ctx->gsvs_num_entries, LLVMConstInt(ctx->ac.i32, 2, false), "");
		tmp = LLVMBuildExtractElement(ctx->builder, ctx->gsvs_ring, ctx->ac.i32_1, "");
		tmp = LLVMBuildOr(ctx->builder, tmp, ctx->gsvs_ring_stride, "");
		ctx->gsvs_ring = LLVMBuildInsertElement(ctx->builder, ctx->gsvs_ring, tmp, ctx->ac.i32_1, "");
	}

	if (ctx->stage == MESA_SHADER_TESS_CTRL ||
	    ctx->stage == MESA_SHADER_TESS_EVAL) {
		ctx->hs_ring_tess_offchip = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_OFFCHIP, false));
		ctx->hs_ring_tess_factor = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_FACTOR, false));
	}
}
   6703 
   6704 static unsigned
   6705 ac_nir_get_max_workgroup_size(enum chip_class chip_class,
   6706 			      const struct nir_shader *nir)
   6707 {
   6708 	switch (nir->info.stage) {
   6709 	case MESA_SHADER_TESS_CTRL:
   6710 		return chip_class >= CIK ? 128 : 64;
   6711 	case MESA_SHADER_GEOMETRY:
   6712 		return chip_class >= GFX9 ? 128 : 64;
   6713 	case MESA_SHADER_COMPUTE:
   6714 		break;
   6715 	default:
   6716 		return 0;
   6717 	}
   6718 
   6719 	unsigned max_workgroup_size = nir->info.cs.local_size[0] *
   6720 		nir->info.cs.local_size[1] *
   6721 		nir->info.cs.local_size[2];
   6722 	return max_workgroup_size;
   6723 }
   6724 
/* Fixup the HW not emitting the TCS regs if there are no HS threads. */
static void ac_nir_fixup_ls_hs_input_vgprs(struct nir_to_llvm_context *ctx)
{
	/* HS thread count lives in bits [15:8] of merged_wave_info. */
	LLVMValueRef count = ac_build_bfe(&ctx->ac, ctx->merged_wave_info,
	                                  LLVMConstInt(ctx->ac.i32, 8, false),
	                                  LLVMConstInt(ctx->ac.i32, 8, false), false);
	LLVMValueRef hs_empty = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, count,
	                                      ctx->ac.i32_0, "");
	/* With zero HS threads the LS inputs arrive shifted into the slots
	 * normally holding the TCS system values, so for each register
	 * select the alternate source in that case. */
	ctx->abi.instance_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->rel_auto_id, ctx->abi.instance_id, "");
	ctx->vs_prim_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.vertex_id, ctx->vs_prim_id, "");
	ctx->rel_auto_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.tcs_rel_ids, ctx->rel_auto_id, "");
	ctx->abi.vertex_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.tcs_patch_id, ctx->abi.vertex_id, "");
}
   6738 
/* Unpack the GS input VGPRs for merged shaders: the six per-vertex
 * offsets arrive packed two per VGPR, 16 bits each, so expand
 * gs_vtx_offset[0..5] in place.  Iterating downwards ensures the packed
 * source in each even slot is fully consumed before it is overwritten.
 * Also extract the GS wave id from bits [23:16] of merged_wave_info. */
static void prepare_gs_input_vgprs(struct nir_to_llvm_context *ctx)
{
	for(int i = 5; i >= 0; --i) {
		ctx->gs_vtx_offset[i] = ac_build_bfe(&ctx->ac, ctx->gs_vtx_offset[i & ~1],
		                                     LLVMConstInt(ctx->ac.i32, (i & 1) * 16, false),
		                                     LLVMConstInt(ctx->ac.i32, 16, false), false);
	}

	ctx->gs_wave_id = ac_build_bfe(&ctx->ac, ctx->merged_wave_info,
	                               LLVMConstInt(ctx->ac.i32, 16, false),
	                               LLVMConstInt(ctx->ac.i32, 8, false), false);
}
   6751 
/* Translate one NIR shader into LLVM IR through the given ABI.
 *
 * nctx may be NULL; when non-NULL the driver context and this
 * per-shader context are cross-linked for the duration of the
 * translation and unlinked again before returning. */
void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi,
		      struct nir_shader *nir, struct nir_to_llvm_context *nctx)
{
	struct ac_nir_context ctx = {};
	struct nir_function *func;

	ctx.ac = *ac;
	ctx.abi = abi;

	ctx.nctx = nctx;
	if (nctx)
		nctx->nir = &ctx;

	ctx.stage = nir->info.stage;

	/* Keep emitting into whatever function the caller's builder is
	 * currently positioned in. */
	ctx.main_function = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));

	nir_foreach_variable(variable, &nir->outputs)
		handle_shader_output_decl(&ctx, nir, variable);

	/* Pointer-keyed maps from NIR objects to their LLVM counterparts,
	 * filled in while walking the control-flow list below. */
	ctx.defs = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
	                                   _mesa_key_pointer_equal);
	ctx.phis = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
	                                   _mesa_key_pointer_equal);
	ctx.vars = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
	                                   _mesa_key_pointer_equal);

	func = (struct nir_function *)exec_list_get_head(&nir->functions);

	setup_locals(&ctx, func);

	if (nir->info.stage == MESA_SHADER_COMPUTE)
		setup_shared(&ctx, nir);

	visit_cf_list(&ctx, &func->impl->body);
	/* Phi incoming values can only be resolved after every block has
	 * been emitted. */
	phi_post_pass(&ctx);

	/* Stage-specific output epilogue (exports, ring stores, ...). */
	ctx.abi->emit_outputs(ctx.abi, RADEON_LLVM_MAX_OUTPUTS,
			      ctx.outputs);

	free(ctx.locals);
	ralloc_free(ctx.defs);
	ralloc_free(ctx.phis);
	ralloc_free(ctx.vars);

	if (nctx)
		nctx->nir = NULL;
}
   6800 
/*
 * Translate one or more NIR shaders (more than one only for the merged
 * GFX9 stages, e.g. ES+GS or LS+HS) into a single LLVM module.
 *
 * For merged shaders each stage's code is wrapped in an
 * "if (thread_id < this stage's wave count)" block with a barrier
 * between stages.  Returns the module; the caller compiles and
 * disposes it (see ac_compile_llvm_module()).
 */
static
LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
                                       struct nir_shader *const *shaders,
                                       int shader_count,
                                       struct ac_shader_variant_info *shader_info,
                                       const struct ac_nir_compiler_options *options)
{
	struct nir_to_llvm_context ctx = {0};
	unsigned i;
	ctx.options = options;
	ctx.shader_info = shader_info;
	ctx.context = LLVMContextCreate();
	ctx.module = LLVMModuleCreateWithNameInContext("shader", ctx.context);

	ac_llvm_context_init(&ctx.ac, ctx.context, options->chip_class,
			     options->family);
	ctx.ac.module = ctx.module;
	LLVMSetTarget(ctx.module, options->supports_spill ? "amdgcn-mesa-mesa3d" : "amdgcn--");

	/* Copy the target's data layout string onto the module. */
	LLVMTargetDataRef data_layout = LLVMCreateTargetDataLayout(tm);
	char *data_layout_str = LLVMCopyStringRepOfTargetData(data_layout);
	LLVMSetDataLayout(ctx.module, data_layout_str);
	LLVMDisposeTargetData(data_layout);
	LLVMDisposeMessage(data_layout_str);

	enum ac_float_mode float_mode =
		options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH :
				       AC_FLOAT_MODE_DEFAULT;

	ctx.builder = ac_create_builder(ctx.context, float_mode);
	ctx.ac.builder = ctx.builder;

	memset(shader_info, 0, sizeof(*shader_info));

	/* Gather shader-level info across all merged stages. */
	for(int i = 0; i < shader_count; ++i)
		ac_nir_shader_info_pass(shaders[i], options, &shader_info->info);

	/* Mark all user-SGPR locations as unused until assigned. */
	for (i = 0; i < AC_UD_MAX_SETS; i++)
		shader_info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1;
	for (i = 0; i < AC_UD_MAX_UD; i++)
		shader_info->user_sgprs_locs.shader_data[i].sgpr_idx = -1;

	ctx.max_workgroup_size = 0;
	for (int i = 0; i < shader_count; ++i) {
		ctx.max_workgroup_size = MAX2(ctx.max_workgroup_size,
		                              ac_nir_get_max_workgroup_size(ctx.options->chip_class,
		                                                            shaders[i]));
	}

	/* The main function is declared for the LAST (hardware) stage; the
	 * previous stage's identity is needed for the merged-shader ABI. */
	create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2,
	                shader_count >= 2 ? shaders[shader_count - 2]->info.stage  : MESA_SHADER_VERTEX);

	ctx.abi.inputs = &ctx.inputs[0];
	ctx.abi.emit_outputs = handle_shader_outputs_post;
	ctx.abi.emit_vertex = visit_emit_vertex;
	ctx.abi.load_ubo = radv_load_ubo;
	ctx.abi.load_ssbo = radv_load_ssbo;
	ctx.abi.load_sampler_desc = radv_get_sampler_desc;
	ctx.abi.clamp_shadow_reference = false;
	ctx.abi.gfx9_stride_size_workaround = ctx.ac.chip_class == GFX9;

	/* Merged shaders start with a full EXEC mask; each stage then
	 * narrows it with its own thread-count check below. */
	if (shader_count >= 2)
		ac_init_exec_full_mask(&ctx.ac);

	if (ctx.ac.chip_class == GFX9 &&
	    shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL)
		ac_nir_fixup_ls_hs_input_vgprs(&ctx);

	for(int i = 0; i < shader_count; ++i) {
		/* Reset the per-stage state before translating this stage. */
		ctx.stage = shaders[i]->info.stage;
		ctx.output_mask = 0;
		ctx.tess_outputs_written = 0;
		ctx.num_output_clips = shaders[i]->info.clip_distance_array_size;
		ctx.num_output_culls = shaders[i]->info.cull_distance_array_size;

		/* Install the stage-specific ABI callbacks and record the
		 * stage metadata the epilogues need. */
		if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY) {
			ctx.gs_next_vertex = ac_build_alloca(&ctx.ac, ctx.ac.i32, "gs_next_vertex");
			ctx.gs_max_out_vertices = shaders[i]->info.gs.vertices_out;
			ctx.abi.load_inputs = load_gs_input;
			ctx.abi.emit_primitive = visit_end_primitive;
		} else if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) {
			ctx.tcs_outputs_read = shaders[i]->info.outputs_read;
			ctx.tcs_patch_outputs_read = shaders[i]->info.patch_outputs_read;
			ctx.abi.load_tess_varyings = load_tcs_varyings;
			ctx.abi.load_patch_vertices_in = load_patch_vertices_in;
			ctx.abi.store_tcs_outputs = store_tcs_output;
		} else if (shaders[i]->info.stage == MESA_SHADER_TESS_EVAL) {
			ctx.tes_primitive_mode = shaders[i]->info.tess.primitive_mode;
			ctx.abi.load_tess_varyings = load_tes_input;
			ctx.abi.load_tess_coord = load_tess_coord;
			ctx.abi.load_patch_vertices_in = load_patch_vertices_in;
		} else if (shaders[i]->info.stage == MESA_SHADER_VERTEX) {
			if (shader_info->info.vs.needs_instance_id) {
				/* Instance id occupies VGPR 2 in LS mode but
				 * VGPR 1 otherwise. */
				if (ctx.options->key.vs.as_ls) {
					ctx.shader_info->vs.vgpr_comp_cnt =
						MAX2(2, ctx.shader_info->vs.vgpr_comp_cnt);
				} else {
					ctx.shader_info->vs.vgpr_comp_cnt =
						MAX2(1, ctx.shader_info->vs.vgpr_comp_cnt);
				}
			}
		} else if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT) {
			shader_info->fs.can_discard = shaders[i]->info.fs.uses_discard;
		}

		/* Synchronize with the previous merged stage. */
		if (i)
			emit_barrier(&ctx.ac, ctx.stage);

		ac_setup_rings(&ctx);

		/* For merged shaders, guard this stage's code so only the
		 * threads belonging to it execute.  merged_wave_info packs one
		 * 8-bit wave count per stage. */
		LLVMBasicBlockRef merge_block;
		if (shader_count >= 2) {
			LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
			LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
			merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");

			LLVMValueRef count = ac_build_bfe(&ctx.ac, ctx.merged_wave_info,
			                                  LLVMConstInt(ctx.ac.i32, 8 * i, false),
			                                  LLVMConstInt(ctx.ac.i32, 8, false), false);
			LLVMValueRef thread_id = ac_get_thread_id(&ctx.ac);
			LLVMValueRef cond = LLVMBuildICmp(ctx.ac.builder, LLVMIntULT,
			                                  thread_id, count, "");
			LLVMBuildCondBr(ctx.ac.builder, cond, then_block, merge_block);

			LLVMPositionBuilderAtEnd(ctx.ac.builder, then_block);
		}

		if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT)
			handle_fs_inputs(&ctx, shaders[i]);
		else if(shaders[i]->info.stage == MESA_SHADER_VERTEX)
			handle_vs_inputs(&ctx, shaders[i]);
		else if(shader_count >= 2 && shaders[i]->info.stage == MESA_SHADER_GEOMETRY)
			prepare_gs_input_vgprs(&ctx);

		nir_foreach_variable(variable, &shaders[i]->outputs)
			scan_shader_output_decl(&ctx, variable, shaders[i], shaders[i]->info.stage);

		ac_nir_translate(&ctx.ac, &ctx.abi, shaders[i], &ctx);

		if (shader_count >= 2) {
			LLVMBuildBr(ctx.ac.builder, merge_block);
			LLVMPositionBuilderAtEnd(ctx.ac.builder, merge_block);
		}

		/* Record per-stage results needed by the pipeline setup. */
		if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY) {
			unsigned addclip = shaders[i]->info.clip_distance_array_size +
					shaders[i]->info.cull_distance_array_size > 4;
			shader_info->gs.gsvs_vertex_size = (util_bitcount64(ctx.output_mask) + addclip) * 16;
			shader_info->gs.max_gsvs_emit_size = shader_info->gs.gsvs_vertex_size *
				shaders[i]->info.gs.vertices_out;
		} else if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) {
			shader_info->tcs.outputs_written = ctx.tess_outputs_written;
			shader_info->tcs.patch_outputs_written = ctx.tess_patch_outputs_written;
		} else if (shaders[i]->info.stage == MESA_SHADER_VERTEX && ctx.options->key.vs.as_ls) {
			shader_info->vs.outputs_written = ctx.tess_outputs_written;
		}
	}

	LLVMBuildRetVoid(ctx.builder);

	if (options->dump_preoptir)
		ac_dump_module(ctx.module);

	/* Runs the function pass pipeline and frees the builder. */
	ac_llvm_finalize_module(&ctx);

	if (shader_count == 1)
		ac_nir_eliminate_const_vs_outputs(&ctx);

	return ctx.module;
}
   6971 
   6972 static void ac_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)
   6973 {
   6974 	unsigned *retval = (unsigned *)context;
   6975 	LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
   6976 	char *description = LLVMGetDiagInfoDescription(di);
   6977 
   6978 	if (severity == LLVMDSError) {
   6979 		*retval = 1;
   6980 		fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n",
   6981 		        description);
   6982 	}
   6983 
   6984 	LLVMDisposeMessage(description);
   6985 }
   6986 
/* Compile an LLVM module to machine code with the given target machine
 * and read the resulting ELF object into 'binary'.
 * Returns 0 on success, 1 on failure (reported either through the
 * diagnostic handler or by EmitToMemoryBuffer itself). */
static unsigned ac_llvm_compile(LLVMModuleRef M,
                                struct ac_shader_binary *binary,
                                LLVMTargetMachineRef tm)
{
	unsigned retval = 0;
	char *err;
	LLVMContextRef llvm_ctx;
	LLVMMemoryBufferRef out_buffer;
	unsigned buffer_size;
	const char *buffer_data;
	LLVMBool mem_err;

	/* Setup Diagnostic Handler*/
	llvm_ctx = LLVMGetModuleContext(M);

	/* ac_diagnostic_handler() sets retval to 1 on an LLVM error. */
	LLVMContextSetDiagnosticHandler(llvm_ctx, ac_diagnostic_handler,
	                                &retval);

	/* Compile IR*/
	mem_err = LLVMTargetMachineEmitToMemoryBuffer(tm, M, LLVMObjectFile,
	                                              &err, &out_buffer);

	/* Process Errors/Warnings */
	if (mem_err) {
		fprintf(stderr, "%s: %s", __FUNCTION__, err);
		/* 'err' is allocated by LLVM and owned by us on failure. */
		free(err);
		retval = 1;
		goto out;
	}

	/* Extract Shader Code*/
	buffer_size = LLVMGetBufferSize(out_buffer);
	buffer_data = LLVMGetBufferStart(out_buffer);

	/* Parse the ELF object emitted by LLVM into 'binary'. */
	ac_elf_read(buffer_data, buffer_size, binary);

	/* Clean up */
	LLVMDisposeMemoryBuffer(out_buffer);

out:
	return retval;
}
   7029 
/* Compile the finished LLVM module, fill in the shader binary and
 * hardware config, then dispose of the module and its context.
 *
 * For fragment shaders the input VGPR count is recomputed here from the
 * SPI_PS_INPUT_ADDR enable bits: each enabled interpolant or position
 * component contributes a fixed number of VGPRs. */
static void ac_compile_llvm_module(LLVMTargetMachineRef tm,
				   LLVMModuleRef llvm_module,
				   struct ac_shader_binary *binary,
				   struct ac_shader_config *config,
				   struct ac_shader_variant_info *shader_info,
				   gl_shader_stage stage,
				   bool dump_shader, bool supports_spill)
{
	if (dump_shader)
		ac_dump_module(llvm_module);

	memset(binary, 0, sizeof(*binary));
	int v = ac_llvm_compile(llvm_module, binary, tm);
	if (v) {
		fprintf(stderr, "compile failed\n");
	}

	if (dump_shader)
		fprintf(stderr, "disasm:\n%s\n", binary->disasm_string);

	ac_shader_binary_read_config(binary, config, 0, supports_spill);

	/* The module (and its context) is no longer needed once the binary
	 * has been extracted. */
	LLVMContextRef ctx = LLVMGetModuleContext(llvm_module);
	LLVMDisposeModule(llvm_module);
	LLVMContextDispose(ctx);

	if (stage == MESA_SHADER_FRAGMENT) {
		/* Count input VGPRs from the PS input enable bits. */
		shader_info->num_input_vgprs = 0;
		if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr))
			shader_info->num_input_vgprs += 2;
		if (G_0286CC_PERSP_CENTER_ENA(config->spi_ps_input_addr))
			shader_info->num_input_vgprs += 2;
		if (G_0286CC_PERSP_CENTROID_ENA(config->spi_ps_input_addr))
			shader_info->num_input_vgprs += 2;
		if (G_0286CC_PERSP_PULL_MODEL_ENA(config->spi_ps_input_addr))
			shader_info->num_input_vgprs += 3;
		if (G_0286CC_LINEAR_SAMPLE_ENA(config->spi_ps_input_addr))
			shader_info->num_input_vgprs += 2;
		if (G_0286CC_LINEAR_CENTER_ENA(config->spi_ps_input_addr))
			shader_info->num_input_vgprs += 2;
		if (G_0286CC_LINEAR_CENTROID_ENA(config->spi_ps_input_addr))
			shader_info->num_input_vgprs += 2;
		if (G_0286CC_LINE_STIPPLE_TEX_ENA(config->spi_ps_input_addr))
			shader_info->num_input_vgprs += 1;
		if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr))
			shader_info->num_input_vgprs += 1;
		if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr))
			shader_info->num_input_vgprs += 1;
		if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr))
			shader_info->num_input_vgprs += 1;
		if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr))
			shader_info->num_input_vgprs += 1;
		if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr))
			shader_info->num_input_vgprs += 1;
		if (G_0286CC_ANCILLARY_ENA(config->spi_ps_input_addr))
			shader_info->num_input_vgprs += 1;
		if (G_0286CC_SAMPLE_COVERAGE_ENA(config->spi_ps_input_addr))
			shader_info->num_input_vgprs += 1;
		if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr))
			shader_info->num_input_vgprs += 1;
	}
	config->num_vgprs = MAX2(config->num_vgprs, shader_info->num_input_vgprs);

	/* +3 for scratch wave offset and VCC */
	config->num_sgprs = MAX2(config->num_sgprs,
	                         shader_info->num_input_sgprs + 3);

	/* Enable 64-bit and 16-bit denormals, because there is no performance
	 * cost.
	 *
	 * If denormals are enabled, all floating-point output modifiers are
	 * ignored.
	 *
	 * Don't enable denormals for 32-bit floats, because:
	 * - Floating-point output modifiers would be ignored by the hw.
	 * - Some opcodes don't support denormals, such as v_mad_f32. We would
	 *   have to stop using those.
	 * - SI & CI would be very slow.
	 */
	config->float_mode |= V_00B028_FP_64_DENORMS;
}
   7111 
   7112 static void
   7113 ac_fill_shader_info(struct ac_shader_variant_info *shader_info, struct nir_shader *nir, const struct ac_nir_compiler_options *options)
   7114 {
   7115         switch (nir->info.stage) {
   7116         case MESA_SHADER_COMPUTE:
   7117                 for (int i = 0; i < 3; ++i)
   7118                         shader_info->cs.block_size[i] = nir->info.cs.local_size[i];
   7119                 break;
   7120         case MESA_SHADER_FRAGMENT:
   7121                 shader_info->fs.early_fragment_test = nir->info.fs.early_fragment_tests;
   7122                 break;
   7123         case MESA_SHADER_GEOMETRY:
   7124                 shader_info->gs.vertices_in = nir->info.gs.vertices_in;
   7125                 shader_info->gs.vertices_out = nir->info.gs.vertices_out;
   7126                 shader_info->gs.output_prim = nir->info.gs.output_primitive;
   7127                 shader_info->gs.invocations = nir->info.gs.invocations;
   7128                 break;
   7129         case MESA_SHADER_TESS_EVAL:
   7130                 shader_info->tes.primitive_mode = nir->info.tess.primitive_mode;
   7131                 shader_info->tes.spacing = nir->info.tess.spacing;
   7132                 shader_info->tes.ccw = nir->info.tess.ccw;
   7133                 shader_info->tes.point_mode = nir->info.tess.point_mode;
   7134                 shader_info->tes.as_es = options->key.tes.as_es;
   7135                 break;
   7136         case MESA_SHADER_TESS_CTRL:
   7137                 shader_info->tcs.tcs_vertices_out = nir->info.tess.tcs_vertices_out;
   7138                 break;
   7139         case MESA_SHADER_VERTEX:
   7140                 shader_info->vs.as_es = options->key.vs.as_es;
   7141                 shader_info->vs.as_ls = options->key.vs.as_ls;
   7142                 /* in LS mode we need at least 1, invocation id needs 2, handled elsewhere */
   7143                 if (options->key.vs.as_ls)
   7144                         shader_info->vs.vgpr_comp_cnt = MAX2(1, shader_info->vs.vgpr_comp_cnt);
   7145                 break;
   7146         default:
   7147                 break;
   7148         }
   7149 }
   7150 
/* Top-level compile entry point: translate one or more NIR shaders
 * (more than one only for the merged GFX9 stages) into a single LLVM
 * module, compile it, and fill in binary/config/shader_info. */
void ac_compile_nir_shader(LLVMTargetMachineRef tm,
                           struct ac_shader_binary *binary,
                           struct ac_shader_config *config,
                           struct ac_shader_variant_info *shader_info,
                           struct nir_shader *const *nir,
                           int nir_count,
                           const struct ac_nir_compiler_options *options,
			   bool dump_shader)
{

	LLVMModuleRef llvm_module = ac_translate_nir_to_llvm(tm, nir, nir_count, shader_info,
	                                                     options);

	ac_compile_llvm_module(tm, llvm_module, binary, config, shader_info, nir[0]->info.stage, dump_shader, options->supports_spill);
	for (int i = 0; i < nir_count; ++i)
		ac_fill_shader_info(shader_info, nir[i], options);

	/* Determine the ES type (VS or TES) for the GS on GFX9. */
	if (options->chip_class == GFX9) {
		if (nir_count == 2 &&
		    nir[1]->info.stage == MESA_SHADER_GEOMETRY) {
			shader_info->gs.es_type = nir[0]->info.stage;
		}
	}
}
   7176 
/* Emit the body of the GS copy shader: for every GS output, read the
 * per-vertex values back from the GSVS ring with buffer loads and then
 * run the regular VS export path on them.
 *
 * args[] is laid out for the legacy llvm.SI.buffer.load.dword intrinsic;
 * args[1] holds the per-vertex offset and args[2] (set per component in
 * the loop below) selects the component's region of the ring. */
static void
ac_gs_copy_shader_emit(struct nir_to_llvm_context *ctx)
{
	LLVMValueRef args[9];
	args[0] = ctx->gsvs_ring;
	/* Per-vertex byte offset: vertex_id * 4 (one dword per vertex). */
	args[1] = LLVMBuildMul(ctx->builder, ctx->abi.vertex_id, LLVMConstInt(ctx->ac.i32, 4, false), "");
	args[3] = ctx->ac.i32_0;
	args[4] = ctx->ac.i32_1;  /* OFFEN */
	args[5] = ctx->ac.i32_0; /* IDXEN */
	args[6] = ctx->ac.i32_1;  /* GLC */
	args[7] = ctx->ac.i32_1;  /* SLC */
	args[8] = ctx->ac.i32_0; /* TFE */

	int idx = 0;

	for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
		int length = 4;
		int slot = idx;
		int slot_inc = 1;
		if (!(ctx->output_mask & (1ull << i)))
			continue;

		if (i == VARYING_SLOT_CLIP_DIST0) {
			/* unpack clip and cull from a single set of slots */
			length = ctx->num_output_clips + ctx->num_output_culls;
			if (length > 4)
				slot_inc = 2;
		}

		for (unsigned j = 0; j < length; j++) {
			LLVMValueRef value;
			/* Ring layout: each component of each slot occupies a
			 * contiguous gs_max_out_vertices * 16 * 4 byte region. */
			args[2] = LLVMConstInt(ctx->ac.i32,
					       (slot * 4 + j) *
					       ctx->gs_max_out_vertices * 16 * 4, false);

			value = ac_build_intrinsic(&ctx->ac,
						   "llvm.SI.buffer.load.dword.i32.i32",
						   ctx->ac.i32, args, 9,
						   AC_FUNC_ATTR_READONLY |
						   AC_FUNC_ATTR_LEGACY);

			/* Stash the loaded dword in the output alloca so the
			 * VS export path below can pick it up. */
			LLVMBuildStore(ctx->builder,
				       ac_to_float(&ctx->ac, value), ctx->nir->outputs[radeon_llvm_reg_index_soa(i, j)]);
		}
		idx += slot_inc;
	}
	handle_vs_outputs_post(ctx, false, &ctx->shader_info->vs.outinfo);
}
   7225 
   7226 void ac_create_gs_copy_shader(LLVMTargetMachineRef tm,
   7227 			      struct nir_shader *geom_shader,
   7228 			      struct ac_shader_binary *binary,
   7229 			      struct ac_shader_config *config,
   7230 			      struct ac_shader_variant_info *shader_info,
   7231 			      const struct ac_nir_compiler_options *options,
   7232 			      bool dump_shader)
   7233 {
   7234 	struct nir_to_llvm_context ctx = {0};
   7235 	ctx.context = LLVMContextCreate();
   7236 	ctx.module = LLVMModuleCreateWithNameInContext("shader", ctx.context);
   7237 	ctx.options = options;
   7238 	ctx.shader_info = shader_info;
   7239 
   7240 	ac_llvm_context_init(&ctx.ac, ctx.context, options->chip_class,
   7241 			     options->family);
   7242 	ctx.ac.module = ctx.module;
   7243 
   7244 	ctx.is_gs_copy_shader = true;
   7245 	LLVMSetTarget(ctx.module, "amdgcn--");
   7246 
   7247 	enum ac_float_mode float_mode =
   7248 		options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH :
   7249 				       AC_FLOAT_MODE_DEFAULT;
   7250 
   7251 	ctx.builder = ac_create_builder(ctx.context, float_mode);
   7252 	ctx.ac.builder = ctx.builder;
   7253 	ctx.stage = MESA_SHADER_VERTEX;
   7254 
   7255 	create_function(&ctx, MESA_SHADER_VERTEX, false, MESA_SHADER_VERTEX);
   7256 
   7257 	ctx.gs_max_out_vertices = geom_shader->info.gs.vertices_out;
   7258 	ac_setup_rings(&ctx);
   7259 
   7260 	ctx.num_output_clips = geom_shader->info.clip_distance_array_size;
   7261 	ctx.num_output_culls = geom_shader->info.cull_distance_array_size;
   7262 
   7263 	struct ac_nir_context nir_ctx = {};
   7264 	nir_ctx.ac = ctx.ac;
   7265 	nir_ctx.abi = &ctx.abi;
   7266 
   7267 	nir_ctx.nctx = &ctx;
   7268 	ctx.nir = &nir_ctx;
   7269 
   7270 	nir_foreach_variable(variable, &geom_shader->outputs) {
   7271 		scan_shader_output_decl(&ctx, variable, geom_shader, MESA_SHADER_VERTEX);
   7272 		handle_shader_output_decl(&nir_ctx, geom_shader, variable);
   7273 	}
   7274 
   7275 	ac_gs_copy_shader_emit(&ctx);
   7276 
   7277 	ctx.nir = NULL;
   7278 
   7279 	LLVMBuildRetVoid(ctx.builder);
   7280 
   7281 	ac_llvm_finalize_module(&ctx);
   7282 
   7283 	ac_compile_llvm_module(tm, ctx.module, binary, config, shader_info,
   7284 			       MESA_SHADER_VERTEX,
   7285 			       dump_shader, options->supports_spill);
   7286 }
   7287