      1 /*
      2  * Copyright 2014 Advanced Micro Devices, Inc.
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the
      6  * "Software"), to deal in the Software without restriction, including
      7  * without limitation the rights to use, copy, modify, merge, publish,
      8  * distribute, sub license, and/or sell copies of the Software, and to
      9  * permit persons to whom the Software is furnished to do so, subject to
     10  * the following conditions:
     11  *
     12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     14  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
     15  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
     16  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
     17  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
     18  * USE OR OTHER DEALINGS IN THE SOFTWARE.
     19  *
     20  * The above copyright notice and this permission notice (including the
     21  * next paragraph) shall be included in all copies or substantial portions
     22  * of the Software.
     23  *
     24  */
     25 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
     26 #include "ac_llvm_build.h"
     27 
     28 #include <llvm-c/Core.h>
     29 
     30 #include "c11/threads.h"
     31 
     32 #include <assert.h>
     33 #include <stdio.h>
     34 
     35 #include "ac_llvm_util.h"
     36 #include "ac_exp_param.h"
     37 #include "util/bitscan.h"
     38 #include "util/macros.h"
     39 #include "util/u_atomic.h"
     40 #include "sid.h"
     41 
     42 #include "shader_enums.h"
     43 
     44 #define AC_LLVM_INITIAL_CF_DEPTH 4
     45 
     46 /* Data for if/else/endif and bgnloop/endloop control flow structures.
     47  */
     48 struct ac_llvm_flow {
     49 	/* Loop exit or next part of if/else/endif. */
     50 	LLVMBasicBlockRef next_block;
     51 	LLVMBasicBlockRef loop_entry_block;
     52 };
     53 
     54 /* Initialize module-independent parts of the context.
     55  *
     56  * The caller is responsible for initializing ctx::module and ctx::builder.
     57  */
     58 void
     59 ac_llvm_context_init(struct ac_llvm_context *ctx, LLVMContextRef context,
     60 		     enum chip_class chip_class, enum radeon_family family)
     61 {
     62 	LLVMValueRef args[1];
     63 
     64 	ctx->chip_class = chip_class;
     65 	ctx->family = family;
     66 
     67 	ctx->context = context;
     68 	ctx->module = NULL;
     69 	ctx->builder = NULL;
     70 
     71 	ctx->voidt = LLVMVoidTypeInContext(ctx->context);
     72 	ctx->i1 = LLVMInt1TypeInContext(ctx->context);
     73 	ctx->i8 = LLVMInt8TypeInContext(ctx->context);
     74 	ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
     75 	ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
     76 	ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
     77 	ctx->f16 = LLVMHalfTypeInContext(ctx->context);
     78 	ctx->f32 = LLVMFloatTypeInContext(ctx->context);
     79 	ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
     80 	ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
     81 	ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
     82 	ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
     83 	ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
     84 	ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
     85 	ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
     86 
     87 	ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
     88 	ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
     89 	ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
     90 	ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
     91 	ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
     92 	ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
     93 	ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
     94 	ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);
     95 
     96 	ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
     97 	ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
     98 
     99 	ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context,
    100 						     "range", 5);
    101 
    102 	ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context,
    103 							       "invariant.load", 14);
    104 
    105 	ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6);
    106 
    107 	args[0] = LLVMConstReal(ctx->f32, 2.5);
    108 	ctx->fpmath_md_2p5_ulp = LLVMMDNodeInContext(ctx->context, args, 1);
    109 
    110 	ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context,
    111 							"amdgpu.uniform", 14);
    112 
    113 	ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
    114 }
    115 
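/* Illustrative sketch (not part of the driver): how a caller might pair
 * ac_llvm_context_init() with the module/builder setup it documents as the
 * caller's responsibility. The helper name and the "example" module name
 * are hypothetical.
 */
static void example_setup_ac_context(struct ac_llvm_context *ac,
				     enum chip_class chip_class,
				     enum radeon_family family)
{
	LLVMContextRef llvm_ctx = LLVMContextCreate();

	ac_llvm_context_init(ac, llvm_ctx, chip_class, family);

	/* ac_llvm_context_init() leaves these NULL; the caller owns them. */
	ac->module = LLVMModuleCreateWithNameInContext("example", llvm_ctx);
	ac->builder = LLVMCreateBuilderInContext(llvm_ctx);
}
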
    116 void
    117 ac_llvm_context_dispose(struct ac_llvm_context *ctx)
    118 {
    119 	free(ctx->flow);
    120 	ctx->flow = NULL;
    121 	ctx->flow_depth_max = 0;
    122 }
    123 
    124 int
    125 ac_get_llvm_num_components(LLVMValueRef value)
    126 {
    127 	LLVMTypeRef type = LLVMTypeOf(value);
    128 	unsigned num_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind
    129 	                              ? LLVMGetVectorSize(type)
    130 	                              : 1;
    131 	return num_components;
    132 }
    133 
    134 LLVMValueRef
    135 ac_llvm_extract_elem(struct ac_llvm_context *ac,
    136 		     LLVMValueRef value,
    137 		     int index)
    138 {
    139 	if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) {
    140 		assert(index == 0);
    141 		return value;
    142 	}
    143 
    144 	return LLVMBuildExtractElement(ac->builder, value,
    145 				       LLVMConstInt(ac->i32, index, false), "");
    146 }
    147 
    148 unsigned
    149 ac_get_type_size(LLVMTypeRef type)
    150 {
    151 	LLVMTypeKind kind = LLVMGetTypeKind(type);
    152 
    153 	switch (kind) {
    154 	case LLVMIntegerTypeKind:
    155 		return LLVMGetIntTypeWidth(type) / 8;
    156 	case LLVMFloatTypeKind:
    157 		return 4;
    158 	case LLVMDoubleTypeKind:
    159 	case LLVMPointerTypeKind:
    160 		return 8;
    161 	case LLVMVectorTypeKind:
    162 		return LLVMGetVectorSize(type) *
    163 		       ac_get_type_size(LLVMGetElementType(type));
    164 	case LLVMArrayTypeKind:
    165 		return LLVMGetArrayLength(type) *
    166 		       ac_get_type_size(LLVMGetElementType(type));
    167 	default:
    168 		assert(0);
    169 		return 0;
    170 	}
    171 }
    172 
    173 static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
    174 {
    175 	if (t == ctx->f16 || t == ctx->i16)
    176 		return ctx->i16;
    177 	else if (t == ctx->f32 || t == ctx->i32)
    178 		return ctx->i32;
    179 	else if (t == ctx->f64 || t == ctx->i64)
    180 		return ctx->i64;
    181 	else
    182 		unreachable("Unhandled integer size");
    183 }
    184 
    185 LLVMTypeRef
    186 ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
    187 {
    188 	if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
    189 		LLVMTypeRef elem_type = LLVMGetElementType(t);
    190 		return LLVMVectorType(to_integer_type_scalar(ctx, elem_type),
    191 		                      LLVMGetVectorSize(t));
    192 	}
    193 	return to_integer_type_scalar(ctx, t);
    194 }
    195 
    196 LLVMValueRef
    197 ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
    198 {
    199 	LLVMTypeRef type = LLVMTypeOf(v);
    200 	return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
    201 }
    202 
    203 static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
    204 {
    205 	if (t == ctx->i16 || t == ctx->f16)
    206 		return ctx->f16;
    207 	else if (t == ctx->i32 || t == ctx->f32)
    208 		return ctx->f32;
    209 	else if (t == ctx->i64 || t == ctx->f64)
    210 		return ctx->f64;
    211 	else
    212 		unreachable("Unhandled float size");
    213 }
    214 
    215 LLVMTypeRef
    216 ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
    217 {
    218 	if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
    219 		LLVMTypeRef elem_type = LLVMGetElementType(t);
    220 		return LLVMVectorType(to_float_type_scalar(ctx, elem_type),
    221 		                      LLVMGetVectorSize(t));
    222 	}
    223 	return to_float_type_scalar(ctx, t);
    224 }
    225 
    226 LLVMValueRef
    227 ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
    228 {
    229 	LLVMTypeRef type = LLVMTypeOf(v);
    230 	return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), "");
    231 }
    232 
    233 
    234 LLVMValueRef
    235 ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
    236 		   LLVMTypeRef return_type, LLVMValueRef *params,
    237 		   unsigned param_count, unsigned attrib_mask)
    238 {
    239 	LLVMValueRef function, call;
    240 	bool set_callsite_attrs = HAVE_LLVM >= 0x0400 &&
    241 				  !(attrib_mask & AC_FUNC_ATTR_LEGACY);
    242 
    243 	function = LLVMGetNamedFunction(ctx->module, name);
    244 	if (!function) {
    245 		LLVMTypeRef param_types[32], function_type;
    246 		unsigned i;
    247 
    248 		assert(param_count <= 32);
    249 
    250 		for (i = 0; i < param_count; ++i) {
    251 			assert(params[i]);
    252 			param_types[i] = LLVMTypeOf(params[i]);
    253 		}
    254 		function_type =
    255 		    LLVMFunctionType(return_type, param_types, param_count, 0);
    256 		function = LLVMAddFunction(ctx->module, name, function_type);
    257 
    258 		LLVMSetFunctionCallConv(function, LLVMCCallConv);
    259 		LLVMSetLinkage(function, LLVMExternalLinkage);
    260 
    261 		if (!set_callsite_attrs)
    262 			ac_add_func_attributes(ctx->context, function, attrib_mask);
    263 	}
    264 
    265 	call = LLVMBuildCall(ctx->builder, function, params, param_count, "");
    266 	if (set_callsite_attrs)
    267 		ac_add_func_attributes(ctx->context, call, attrib_mask);
    268 	return call;
    269 }
    270 
    271 /**
     272  * Given a scalar or vector \p type with integer, f32 or f64 elements, generate
     273  * the textual name (e.g. for use with intrinsic names).
    274  */
    275 void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
    276 {
    277 	LLVMTypeRef elem_type = type;
    278 
    279 	assert(bufsize >= 8);
    280 
    281 	if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
    282 		int ret = snprintf(buf, bufsize, "v%u",
    283 					LLVMGetVectorSize(type));
    284 		if (ret < 0) {
    285 			char *type_name = LLVMPrintTypeToString(type);
    286 			fprintf(stderr, "Error building type name for: %s\n",
    287 				type_name);
    288 			return;
    289 		}
    290 		elem_type = LLVMGetElementType(type);
    291 		buf += ret;
    292 		bufsize -= ret;
    293 	}
    294 	switch (LLVMGetTypeKind(elem_type)) {
    295 	default: break;
    296 	case LLVMIntegerTypeKind:
    297 		snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
    298 		break;
    299 	case LLVMFloatTypeKind:
    300 		snprintf(buf, bufsize, "f32");
    301 		break;
    302 	case LLVMDoubleTypeKind:
    303 		snprintf(buf, bufsize, "f64");
    304 		break;
    305 	}
    306 }
    307 
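/* Illustrative sketch: composing an overloaded intrinsic name such as
 * "llvm.amdgcn.buffer.load.v4f32" from a value's type, the way
 * ac_build_buffer_load() and ac_get_image_intr_name() below do. The helper
 * and its parameters are hypothetical.
 */
static void example_build_intr_name(LLVMValueRef value, char *out, unsigned len)
{
	char type[8];

	/* For a <4 x float> value this writes "v4f32" into 'type'. */
	ac_build_type_name_for_intr(LLVMTypeOf(value), type, sizeof(type));
	snprintf(out, len, "llvm.amdgcn.buffer.load.%s", type);
}
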
    308 /**
    309  * Helper function that builds an LLVM IR PHI node and immediately adds
    310  * incoming edges.
    311  */
    312 LLVMValueRef
    313 ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
    314 	     unsigned count_incoming, LLVMValueRef *values,
    315 	     LLVMBasicBlockRef *blocks)
    316 {
    317 	LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
    318 	LLVMAddIncoming(phi, values, blocks, count_incoming);
    319 	return phi;
    320 }
    321 
    322 /* Prevent optimizations (at least of memory accesses) across the current
    323  * point in the program by emitting empty inline assembly that is marked as
    324  * having side effects.
    325  *
    326  * Optionally, a value can be passed through the inline assembly to prevent
    327  * LLVM from hoisting calls to ReadNone functions.
    328  */
    329 void
    330 ac_build_optimization_barrier(struct ac_llvm_context *ctx,
    331 			      LLVMValueRef *pvgpr)
    332 {
    333 	static int counter = 0;
    334 
    335 	LLVMBuilderRef builder = ctx->builder;
    336 	char code[16];
    337 
    338 	snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
    339 
    340 	if (!pvgpr) {
    341 		LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
    342 		LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
    343 		LLVMBuildCall(builder, inlineasm, NULL, 0, "");
    344 	} else {
    345 		LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
    346 		LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
    347 		LLVMValueRef vgpr = *pvgpr;
    348 		LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
    349 		unsigned vgpr_size = ac_get_type_size(vgpr_type);
    350 		LLVMValueRef vgpr0;
    351 
    352 		assert(vgpr_size % 4 == 0);
    353 
    354 		vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
    355 		vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
    356 		vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
    357 		vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
    358 		vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
    359 
    360 		*pvgpr = vgpr;
    361 	}
    362 }
    363 
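/* Illustrative sketch: pinning a value at the current point so that later
 * ReadNone calls which use it cannot be hoisted above this point; this is
 * exactly how ac_build_ballot() below uses the barrier. The helper name is
 * made up, and 'value' is assumed to be 32 bits wide.
 */
static LLVMValueRef example_pin_value(struct ac_llvm_context *ctx,
				      LLVMValueRef value)
{
	ac_build_optimization_barrier(ctx, &value);
	return value;
}
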
    364 LLVMValueRef
    365 ac_build_ballot(struct ac_llvm_context *ctx,
    366 		LLVMValueRef value)
    367 {
    368 	LLVMValueRef args[3] = {
    369 		value,
    370 		ctx->i32_0,
    371 		LLVMConstInt(ctx->i32, LLVMIntNE, 0)
    372 	};
    373 
    374 	/* We currently have no other way to prevent LLVM from lifting the icmp
    375 	 * calls to a dominating basic block.
    376 	 */
    377 	ac_build_optimization_barrier(ctx, &args[0]);
    378 
    379 	if (LLVMTypeOf(args[0]) != ctx->i32)
    380 		args[0] = LLVMBuildBitCast(ctx->builder, args[0], ctx->i32, "");
    381 
    382 	return ac_build_intrinsic(ctx,
    383 				  "llvm.amdgcn.icmp.i32",
    384 				  ctx->i64, args, 3,
    385 				  AC_FUNC_ATTR_NOUNWIND |
    386 				  AC_FUNC_ATTR_READNONE |
    387 				  AC_FUNC_ATTR_CONVERGENT);
    388 }
    389 
    390 LLVMValueRef
    391 ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
    392 {
    393 	LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
    394 	LLVMValueRef vote_set = ac_build_ballot(ctx, value);
    395 	return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
    396 }
    397 
    398 LLVMValueRef
    399 ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
    400 {
    401 	LLVMValueRef vote_set = ac_build_ballot(ctx, value);
    402 	return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set,
    403 			     LLVMConstInt(ctx->i64, 0, 0), "");
    404 }
    405 
    406 LLVMValueRef
    407 ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
    408 {
    409 	LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
    410 	LLVMValueRef vote_set = ac_build_ballot(ctx, value);
    411 
    412 	LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
    413 					 vote_set, active_set, "");
    414 	LLVMValueRef none = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
    415 					  vote_set,
    416 					  LLVMConstInt(ctx->i64, 0, 0), "");
    417 	return LLVMBuildOr(ctx->builder, all, none, "");
    418 }
    419 
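/* Illustrative sketch: reducing a 32-bit boolean (0 or ~0) to a wave-wide
 * predicate with the vote helpers above. For example, with four active
 * lanes and a value that is true only in lane 0, ac_build_ballot() returns
 * 0x1 while the active set is 0xf, so vote_all() is false, vote_any() is
 * true and vote_eq() is false. The helper name is made up.
 */
static LLVMValueRef example_wave_agrees(struct ac_llvm_context *ctx,
					LLVMValueRef bool32)
{
	return ac_build_vote_eq(ctx, bool32);
}
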
    420 LLVMValueRef
    421 ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
    422 			       unsigned value_count, unsigned component)
    423 {
    424 	LLVMValueRef vec = NULL;
    425 
    426 	if (value_count == 1) {
    427 		return values[component];
    428 	} else if (!value_count)
    429 		unreachable("value_count is 0");
    430 
    431 	for (unsigned i = component; i < value_count + component; i++) {
    432 		LLVMValueRef value = values[i];
    433 
    434 		if (i == component)
    435 			vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count));
    436 		LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false);
    437 		vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, "");
    438 	}
    439 	return vec;
    440 }
    441 
    442 LLVMValueRef
    443 ac_build_gather_values_extended(struct ac_llvm_context *ctx,
    444 				LLVMValueRef *values,
    445 				unsigned value_count,
    446 				unsigned value_stride,
    447 				bool load,
    448 				bool always_vector)
    449 {
    450 	LLVMBuilderRef builder = ctx->builder;
    451 	LLVMValueRef vec = NULL;
    452 	unsigned i;
    453 
    454 	if (value_count == 1 && !always_vector) {
    455 		if (load)
    456 			return LLVMBuildLoad(builder, values[0], "");
    457 		return values[0];
    458 	} else if (!value_count)
    459 		unreachable("value_count is 0");
    460 
    461 	for (i = 0; i < value_count; i++) {
    462 		LLVMValueRef value = values[i * value_stride];
    463 		if (load)
    464 			value = LLVMBuildLoad(builder, value, "");
    465 
    466 		if (!i)
    467 			vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count));
    468 		LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
    469 		vec = LLVMBuildInsertElement(builder, vec, value, index, "");
    470 	}
    471 	return vec;
    472 }
    473 
    474 LLVMValueRef
    475 ac_build_gather_values(struct ac_llvm_context *ctx,
    476 		       LLVMValueRef *values,
    477 		       unsigned value_count)
    478 {
    479 	return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
    480 }
    481 
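/* Illustrative sketch: packing four scalars of the same type into a
 * <4 x ...> vector, as several callers below do before emitting vector
 * intrinsics. The helper name is hypothetical.
 */
static LLVMValueRef example_pack_vec4(struct ac_llvm_context *ctx,
				      LLVMValueRef x, LLVMValueRef y,
				      LLVMValueRef z, LLVMValueRef w)
{
	LLVMValueRef elems[4] = {x, y, z, w};
	return ac_build_gather_values(ctx, elems, 4);
}
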
    482 LLVMValueRef
    483 ac_build_fdiv(struct ac_llvm_context *ctx,
    484 	      LLVMValueRef num,
    485 	      LLVMValueRef den)
    486 {
    487 	LLVMValueRef ret = LLVMBuildFDiv(ctx->builder, num, den, "");
    488 
    489 	/* Use v_rcp_f32 instead of precise division. */
    490 	if (!LLVMIsConstant(ret))
    491 		LLVMSetMetadata(ret, ctx->fpmath_md_kind, ctx->fpmath_md_2p5_ulp);
    492 	return ret;
    493 }
    494 
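/* Illustrative sketch: a fast reciprocal built on ac_build_fdiv(). The
 * 2.5 ULP fpmath metadata attached above is what lets the backend select
 * v_rcp_f32; ac_prepare_cube_coords() below uses this pattern for 1/|ma|.
 */
static LLVMValueRef example_rcp(struct ac_llvm_context *ctx, LLVMValueRef x)
{
	return ac_build_fdiv(ctx, ctx->f32_1, x);
}
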
    495 /* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
    496  * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
    497  * already multiplied by two. id is the cube face number.
    498  */
    499 struct cube_selection_coords {
    500 	LLVMValueRef stc[2];
    501 	LLVMValueRef ma;
    502 	LLVMValueRef id;
    503 };
    504 
    505 static void
    506 build_cube_intrinsic(struct ac_llvm_context *ctx,
    507 		     LLVMValueRef in[3],
    508 		     struct cube_selection_coords *out)
    509 {
    510 	LLVMTypeRef f32 = ctx->f32;
    511 
    512 	out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc",
    513 					 f32, in, 3, AC_FUNC_ATTR_READNONE);
    514 	out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc",
    515 					 f32, in, 3, AC_FUNC_ATTR_READNONE);
    516 	out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema",
    517 				     f32, in, 3, AC_FUNC_ATTR_READNONE);
    518 	out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid",
    519 				     f32, in, 3, AC_FUNC_ATTR_READNONE);
    520 }
    521 
    522 /**
    523  * Build a manual selection sequence for cube face sc/tc coordinates and
    524  * major axis vector (multiplied by 2 for consistency) for the given
    525  * vec3 \p coords, for the face implied by \p selcoords.
    526  *
    527  * For the major axis, we always adjust the sign to be in the direction of
    528  * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
    529  * the selcoords major axis.
    530  */
    531 static void build_cube_select(struct ac_llvm_context *ctx,
    532 			      const struct cube_selection_coords *selcoords,
    533 			      const LLVMValueRef *coords,
    534 			      LLVMValueRef *out_st,
    535 			      LLVMValueRef *out_ma)
    536 {
    537 	LLVMBuilderRef builder = ctx->builder;
    538 	LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
    539 	LLVMValueRef is_ma_positive;
    540 	LLVMValueRef sgn_ma;
    541 	LLVMValueRef is_ma_z, is_not_ma_z;
    542 	LLVMValueRef is_ma_y;
    543 	LLVMValueRef is_ma_x;
    544 	LLVMValueRef sgn;
    545 	LLVMValueRef tmp;
    546 
    547 	is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE,
    548 		selcoords->ma, LLVMConstReal(f32, 0.0), "");
    549 	sgn_ma = LLVMBuildSelect(builder, is_ma_positive,
    550 		LLVMConstReal(f32, 1.0), LLVMConstReal(f32, -1.0), "");
    551 
    552 	is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
    553 	is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
    554 	is_ma_y = LLVMBuildAnd(builder, is_not_ma_z,
    555 		LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
    556 	is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");
    557 
    558 	/* Select sc */
    559 	tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
    560 	sgn = LLVMBuildSelect(builder, is_ma_y, LLVMConstReal(f32, 1.0),
    561 		LLVMBuildSelect(builder, is_ma_z, sgn_ma,
    562 			LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
    563 	out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");
    564 
    565 	/* Select tc */
    566 	tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
    567 	sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma,
    568 		LLVMConstReal(f32, -1.0), "");
    569 	out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");
    570 
    571 	/* Select ma */
    572 	tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
    573 		LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
    574 	tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32",
    575 				 ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
    576 	*out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
    577 }
    578 
    579 void
    580 ac_prepare_cube_coords(struct ac_llvm_context *ctx,
    581 		       bool is_deriv, bool is_array, bool is_lod,
    582 		       LLVMValueRef *coords_arg,
    583 		       LLVMValueRef *derivs_arg)
    584 {
    585 
    586 	LLVMBuilderRef builder = ctx->builder;
    587 	struct cube_selection_coords selcoords;
    588 	LLVMValueRef coords[3];
    589 	LLVMValueRef invma;
    590 
    591 	if (is_array && !is_lod) {
    592 		LLVMValueRef tmp = coords_arg[3];
    593 		tmp = ac_build_intrinsic(ctx, "llvm.rint.f32", ctx->f32, &tmp, 1, 0);
    594 
    595 		/* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
    596 		 *
    597 		 *    "For Array forms, the array layer used will be
    598 		 *
     599 		 *       max(0, min(d-1, floor(layer+0.5)))
    600 		 *
    601 		 *     where d is the depth of the texture array and layer
     602 		 *     comes from the component indicated in the tables below."
     603 		 *
     604 		 * Clamping the layer also works around an issue where the layer is
     605 		 * taken from a helper invocation which happens to fall on a different
     606 		 * layer due to extrapolation.
    607 		 * VI and earlier attempt to implement this in hardware by
    608 		 * clamping the value of coords[2] = (8 * layer) + face.
     609 		 * Unfortunately, this means that we end up with the wrong
    610 		 * face when clamping occurs.
    611 		 *
    612 		 * Clamp the layer earlier to work around the issue.
    613 		 */
    614 		if (ctx->chip_class <= VI) {
    615 			LLVMValueRef ge0;
    616 			ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
    617 			tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
    618 		}
    619 
    620 		coords_arg[3] = tmp;
    621 	}
    622 
    623 	build_cube_intrinsic(ctx, coords_arg, &selcoords);
    624 
    625 	invma = ac_build_intrinsic(ctx, "llvm.fabs.f32",
    626 			ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
    627 	invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);
    628 
    629 	for (int i = 0; i < 2; ++i)
    630 		coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");
    631 
    632 	coords[2] = selcoords.id;
    633 
    634 	if (is_deriv && derivs_arg) {
    635 		LLVMValueRef derivs[4];
    636 		int axis;
    637 
    638 		/* Convert cube derivatives to 2D derivatives. */
    639 		for (axis = 0; axis < 2; axis++) {
    640 			LLVMValueRef deriv_st[2];
    641 			LLVMValueRef deriv_ma;
    642 
    643 			/* Transform the derivative alongside the texture
    644 			 * coordinate. Mathematically, the correct formula is
    645 			 * as follows. Assume we're projecting onto the +Z face
    646 			 * and denote by dx/dh the derivative of the (original)
    647 			 * X texture coordinate with respect to horizontal
    648 			 * window coordinates. The projection onto the +Z face
    649 			 * plane is:
    650 			 *
    651 			 *   f(x,z) = x/z
    652 			 *
    653 			 * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
    654 			 *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
    655 			 *
     656 			 * This motivates the implementation below.
    657 			 *
    658 			 * Whether this actually gives the expected results for
    659 			 * apps that might feed in derivatives obtained via
    660 			 * finite differences is anyone's guess. The OpenGL spec
    661 			 * seems awfully quiet about how textureGrad for cube
    662 			 * maps should be handled.
    663 			 */
    664 			build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3],
    665 					  deriv_st, &deriv_ma);
    666 
    667 			deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");
    668 
    669 			for (int i = 0; i < 2; ++i)
    670 				derivs[axis * 2 + i] =
    671 					LLVMBuildFSub(builder,
    672 						LLVMBuildFMul(builder, deriv_st[i], invma, ""),
    673 						LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
    674 		}
    675 
    676 		memcpy(derivs_arg, derivs, sizeof(derivs));
    677 	}
    678 
    679 	/* Shift the texture coordinate. This must be applied after the
    680 	 * derivative calculation.
    681 	 */
    682 	for (int i = 0; i < 2; ++i)
    683 		coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");
    684 
    685 	if (is_array) {
    686 		/* for cube arrays coord.z = coord.w(array_index) * 8 + face */
    687 		/* coords_arg.w component - array_index for cube arrays */
    688 		LLVMValueRef tmp = LLVMBuildFMul(ctx->builder, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), "");
    689 		coords[2] = LLVMBuildFAdd(ctx->builder, tmp, coords[2], "");
    690 	}
    691 
    692 	memcpy(coords_arg, coords, sizeof(coords));
    693 }
    694 
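/*
 * Worked example for the coordinate math above (illustrative; assumes the
 * usual GL cube-face selection): for a direction (0.2, -0.4, 1.0) the +Z
 * face is selected, so cubeid = 4, cubema = 2 * 1.0 = 2.0, cubesc = 0.2 and
 * cubetc = 0.4 (tc = -ry for +Z). With invma = 1/|ma| = 0.5 the final
 * coordinates are s = 0.2 * 0.5 + 1.5 = 1.6, t = 0.4 * 0.5 + 1.5 = 1.7, and
 * coords[2] = 4 (plus 8 * layer for cube arrays).
 */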
    695 
    696 LLVMValueRef
    697 ac_build_fs_interp(struct ac_llvm_context *ctx,
    698 		   LLVMValueRef llvm_chan,
    699 		   LLVMValueRef attr_number,
    700 		   LLVMValueRef params,
    701 		   LLVMValueRef i,
    702 		   LLVMValueRef j)
    703 {
    704 	LLVMValueRef args[5];
    705 	LLVMValueRef p1;
    706 
    707 	if (HAVE_LLVM < 0x0400) {
    708 		LLVMValueRef ij[2];
    709 		ij[0] = LLVMBuildBitCast(ctx->builder, i, ctx->i32, "");
    710 		ij[1] = LLVMBuildBitCast(ctx->builder, j, ctx->i32, "");
    711 
    712 		args[0] = llvm_chan;
    713 		args[1] = attr_number;
    714 		args[2] = params;
    715 		args[3] = ac_build_gather_values(ctx, ij, 2);
    716 		return ac_build_intrinsic(ctx, "llvm.SI.fs.interp",
    717 					  ctx->f32, args, 4,
    718 					  AC_FUNC_ATTR_READNONE);
    719 	}
    720 
    721 	args[0] = i;
    722 	args[1] = llvm_chan;
    723 	args[2] = attr_number;
    724 	args[3] = params;
    725 
    726 	p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
    727 				ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
    728 
    729 	args[0] = p1;
    730 	args[1] = j;
    731 	args[2] = llvm_chan;
    732 	args[3] = attr_number;
    733 	args[4] = params;
    734 
    735 	return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
    736 				  ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
    737 }
    738 
    739 LLVMValueRef
    740 ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
    741 		       LLVMValueRef parameter,
    742 		       LLVMValueRef llvm_chan,
    743 		       LLVMValueRef attr_number,
    744 		       LLVMValueRef params)
    745 {
    746 	LLVMValueRef args[4];
    747 	if (HAVE_LLVM < 0x0400) {
    748 		args[0] = llvm_chan;
    749 		args[1] = attr_number;
    750 		args[2] = params;
    751 
    752 		return ac_build_intrinsic(ctx,
    753 					  "llvm.SI.fs.constant",
    754 					  ctx->f32, args, 3,
    755 					  AC_FUNC_ATTR_READNONE);
    756 	}
    757 
    758 	args[0] = parameter;
    759 	args[1] = llvm_chan;
    760 	args[2] = attr_number;
    761 	args[3] = params;
    762 
    763 	return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov",
    764 				  ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
    765 }
    766 
    767 LLVMValueRef
    768 ac_build_gep0(struct ac_llvm_context *ctx,
    769 	      LLVMValueRef base_ptr,
    770 	      LLVMValueRef index)
    771 {
    772 	LLVMValueRef indices[2] = {
    773 		LLVMConstInt(ctx->i32, 0, 0),
    774 		index,
    775 	};
    776 	return LLVMBuildGEP(ctx->builder, base_ptr,
    777 			    indices, 2, "");
    778 }
    779 
    780 void
    781 ac_build_indexed_store(struct ac_llvm_context *ctx,
    782 		       LLVMValueRef base_ptr, LLVMValueRef index,
    783 		       LLVMValueRef value)
    784 {
    785 	LLVMBuildStore(ctx->builder, value,
    786 		       ac_build_gep0(ctx, base_ptr, index));
    787 }
    788 
    789 /**
     790  * Build an LLVM IR indexed load using LLVMBuildGEP + LLVMBuildLoad.
    791  * It's equivalent to doing a load from &base_ptr[index].
    792  *
    793  * \param base_ptr  Where the array starts.
    794  * \param index     The element index into the array.
    795  * \param uniform   Whether the base_ptr and index can be assumed to be
    796  *                  dynamically uniform (i.e. load to an SGPR)
    797  * \param invariant Whether the load is invariant (no other opcodes affect it)
    798  */
    799 static LLVMValueRef
    800 ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
    801 		     LLVMValueRef index, bool uniform, bool invariant)
    802 {
    803 	LLVMValueRef pointer, result;
    804 
    805 	pointer = ac_build_gep0(ctx, base_ptr, index);
    806 	if (uniform)
    807 		LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
    808 	result = LLVMBuildLoad(ctx->builder, pointer, "");
    809 	if (invariant)
    810 		LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
    811 	return result;
    812 }
    813 
    814 LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
    815 			   LLVMValueRef index)
    816 {
    817 	return ac_build_load_custom(ctx, base_ptr, index, false, false);
    818 }
    819 
    820 LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx,
    821 				     LLVMValueRef base_ptr, LLVMValueRef index)
    822 {
    823 	return ac_build_load_custom(ctx, base_ptr, index, false, true);
    824 }
    825 
    826 LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx,
    827 				   LLVMValueRef base_ptr, LLVMValueRef index)
    828 {
    829 	return ac_build_load_custom(ctx, base_ptr, index, true, true);
    830 }
    831 
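/* Illustrative sketch: loading a descriptor through a dynamically uniform
 * index so the result can live in SGPRs; the amdgpu.uniform and
 * invariant.load metadata added by ac_build_load_custom() is what enables
 * the scalar load. 'desc_array' and 'idx' are hypothetical values.
 */
static LLVMValueRef example_load_desc(struct ac_llvm_context *ctx,
				      LLVMValueRef desc_array,
				      LLVMValueRef idx)
{
	return ac_build_load_to_sgpr(ctx, desc_array, idx);
}
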
    832 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
    833  * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
    834  * or v4i32 (num_channels=3,4).
    835  */
    836 void
    837 ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
    838 			    LLVMValueRef rsrc,
    839 			    LLVMValueRef vdata,
    840 			    unsigned num_channels,
    841 			    LLVMValueRef voffset,
    842 			    LLVMValueRef soffset,
    843 			    unsigned inst_offset,
    844 			    bool glc,
    845 			    bool slc,
    846 			    bool writeonly_memory,
    847 			    bool swizzle_enable_hint)
    848 {
    849 	/* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
    850 	 * (voffset is swizzled, but soffset isn't swizzled).
    851 	 * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
    852 	 */
    853 	if (!swizzle_enable_hint) {
     854 		/* Split 3-channel stores, because LLVM doesn't support 3-channel
     855 		 * intrinsics. */
    856 		if (num_channels == 3) {
    857 			LLVMValueRef v[3], v01;
    858 
    859 			for (int i = 0; i < 3; i++) {
    860 				v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
    861 						LLVMConstInt(ctx->i32, i, 0), "");
    862 			}
    863 			v01 = ac_build_gather_values(ctx, v, 2);
    864 
    865 			ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
    866 						    soffset, inst_offset, glc, slc,
    867 						    writeonly_memory, swizzle_enable_hint);
    868 			ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
    869 						    soffset, inst_offset + 8,
    870 						    glc, slc,
    871 						    writeonly_memory, swizzle_enable_hint);
    872 			return;
    873 		}
    874 
    875 		unsigned func = CLAMP(num_channels, 1, 3) - 1;
    876 		static const char *types[] = {"f32", "v2f32", "v4f32"};
    877 		char name[256];
    878 		LLVMValueRef offset = soffset;
    879 
    880 		if (inst_offset)
    881 			offset = LLVMBuildAdd(ctx->builder, offset,
    882 					      LLVMConstInt(ctx->i32, inst_offset, 0), "");
    883 		if (voffset)
    884 			offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
    885 
    886 		LLVMValueRef args[] = {
    887 			ac_to_float(ctx, vdata),
    888 			LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
    889 			LLVMConstInt(ctx->i32, 0, 0),
    890 			offset,
    891 			LLVMConstInt(ctx->i1, glc, 0),
    892 			LLVMConstInt(ctx->i1, slc, 0),
    893 		};
    894 
    895 		snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s",
    896 			 types[func]);
    897 
    898 		ac_build_intrinsic(ctx, name, ctx->voidt,
    899 				   args, ARRAY_SIZE(args),
    900 				   writeonly_memory ?
    901 					   AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
    902 					   AC_FUNC_ATTR_WRITEONLY);
    903 		return;
    904 	}
    905 
    906 	static unsigned dfmt[] = {
    907 		V_008F0C_BUF_DATA_FORMAT_32,
    908 		V_008F0C_BUF_DATA_FORMAT_32_32,
    909 		V_008F0C_BUF_DATA_FORMAT_32_32_32,
    910 		V_008F0C_BUF_DATA_FORMAT_32_32_32_32
    911 	};
    912 	assert(num_channels >= 1 && num_channels <= 4);
    913 
    914 	LLVMValueRef args[] = {
    915 		rsrc,
    916 		vdata,
    917 		LLVMConstInt(ctx->i32, num_channels, 0),
    918 		voffset ? voffset : LLVMGetUndef(ctx->i32),
    919 		soffset,
    920 		LLVMConstInt(ctx->i32, inst_offset, 0),
    921 		LLVMConstInt(ctx->i32, dfmt[num_channels - 1], 0),
    922 		LLVMConstInt(ctx->i32, V_008F0C_BUF_NUM_FORMAT_UINT, 0),
    923 		LLVMConstInt(ctx->i32, voffset != NULL, 0),
    924 		LLVMConstInt(ctx->i32, 0, 0), /* idxen */
    925 		LLVMConstInt(ctx->i32, glc, 0),
    926 		LLVMConstInt(ctx->i32, slc, 0),
    927 		LLVMConstInt(ctx->i32, 0, 0), /* tfe*/
    928 	};
    929 
    930 	/* The instruction offset field has 12 bits */
    931 	assert(voffset || inst_offset < (1 << 12));
    932 
    933 	/* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
    934 	unsigned func = CLAMP(num_channels, 1, 3) - 1;
    935 	const char *types[] = {"i32", "v2i32", "v4i32"};
    936 	char name[256];
    937 	snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);
    938 
    939 	ac_build_intrinsic(ctx, name, ctx->voidt,
    940 			   args, ARRAY_SIZE(args),
    941 			   AC_FUNC_ATTR_LEGACY);
    942 }
    943 
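/* Illustrative sketch: storing a v4i32 to a buffer at a per-lane byte
 * offset, taking the non-swizzled llvm.amdgcn.buffer.store path above.
 * All parameter names are hypothetical.
 */
static void example_store_vec4(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
			       LLVMValueRef vdata_v4i32, LLVMValueRef voffset)
{
	ac_build_buffer_store_dword(ctx, rsrc, vdata_v4i32, 4,
				    voffset, ctx->i32_0, /* soffset */
				    0,                   /* inst_offset */
				    false, false,        /* glc, slc */
				    false,               /* writeonly_memory */
				    false);              /* swizzle_enable_hint */
}
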
    944 LLVMValueRef
    945 ac_build_buffer_load(struct ac_llvm_context *ctx,
    946 		     LLVMValueRef rsrc,
    947 		     int num_channels,
    948 		     LLVMValueRef vindex,
    949 		     LLVMValueRef voffset,
    950 		     LLVMValueRef soffset,
    951 		     unsigned inst_offset,
    952 		     unsigned glc,
    953 		     unsigned slc,
    954 		     bool can_speculate,
    955 		     bool allow_smem)
    956 {
    957 	LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
    958 	if (voffset)
    959 		offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
    960 	if (soffset)
    961 		offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");
    962 
     963 	/* TODO: VI and later generations can use SMEM with GLC=1. */
    964 	if (allow_smem && !glc && !slc) {
    965 		assert(vindex == NULL);
    966 
    967 		LLVMValueRef result[4];
    968 
    969 		for (int i = 0; i < num_channels; i++) {
    970 			if (i) {
    971 				offset = LLVMBuildAdd(ctx->builder, offset,
    972 						      LLVMConstInt(ctx->i32, 4, 0), "");
    973 			}
    974 			LLVMValueRef args[2] = {rsrc, offset};
    975 			result[i] = ac_build_intrinsic(ctx, "llvm.SI.load.const.v4i32",
    976 						       ctx->f32, args, 2,
    977 						       AC_FUNC_ATTR_READNONE |
    978 						       AC_FUNC_ATTR_LEGACY);
    979 		}
    980 		if (num_channels == 1)
    981 			return result[0];
    982 
    983 		if (num_channels == 3)
    984 			result[num_channels++] = LLVMGetUndef(ctx->f32);
    985 		return ac_build_gather_values(ctx, result, num_channels);
    986 	}
    987 
    988 	unsigned func = CLAMP(num_channels, 1, 3) - 1;
    989 
    990 	LLVMValueRef args[] = {
    991 		LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
    992 		vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
    993 		offset,
    994 		LLVMConstInt(ctx->i1, glc, 0),
    995 		LLVMConstInt(ctx->i1, slc, 0)
    996 	};
    997 
    998 	LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
    999 			       ctx->v4f32};
   1000 	const char *type_names[] = {"f32", "v2f32", "v4f32"};
   1001 	char name[256];
   1002 
   1003 	snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
   1004 		 type_names[func]);
   1005 
   1006 	return ac_build_intrinsic(ctx, name, types[func], args,
   1007 				  ARRAY_SIZE(args),
   1008 				  ac_get_load_intr_attribs(can_speculate));
   1009 }
   1010 
   1011 LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
   1012 					 LLVMValueRef rsrc,
   1013 					 LLVMValueRef vindex,
   1014 					 LLVMValueRef voffset,
   1015 					 bool can_speculate)
   1016 {
   1017 	LLVMValueRef args [] = {
   1018 		LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
   1019 		vindex,
   1020 		voffset,
   1021 		ctx->i1false, /* glc */
   1022 		ctx->i1false, /* slc */
   1023 	};
   1024 
   1025 	return ac_build_intrinsic(ctx,
   1026 				  "llvm.amdgcn.buffer.load.format.v4f32",
   1027 				  ctx->v4f32, args, ARRAY_SIZE(args),
   1028 				  ac_get_load_intr_attribs(can_speculate));
   1029 }
   1030 
   1031 LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx,
   1032                                                   LLVMValueRef rsrc,
   1033                                                   LLVMValueRef vindex,
   1034                                                   LLVMValueRef voffset,
   1035                                                   bool can_speculate)
   1036 {
   1037 	LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 2, 0), "");
   1038 	LLVMValueRef stride = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 1, 0), "");
   1039 	stride = LLVMBuildLShr(ctx->builder, stride, LLVMConstInt(ctx->i32, 16, 0), "");
   1040 
   1041 	LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->builder,
   1042 	                                              LLVMBuildICmp(ctx->builder, LLVMIntUGT, elem_count, stride, ""),
   1043 	                                              elem_count, stride, "");
   1044 
   1045 	LLVMValueRef new_rsrc = LLVMBuildInsertElement(ctx->builder, rsrc, new_elem_count,
   1046 	                                               LLVMConstInt(ctx->i32, 2, 0), "");
   1047 
   1048 	return ac_build_buffer_load_format(ctx, new_rsrc, vindex, voffset, can_speculate);
   1049 }
   1050 
   1051 /**
   1052  * Set range metadata on an instruction.  This can only be used on load and
   1053  * call instructions.  If you know an instruction can only produce the values
   1054  * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
   1055  * \p lo is the minimum value inclusive.
   1056  * \p hi is the maximum value exclusive.
   1057  */
   1058 static void set_range_metadata(struct ac_llvm_context *ctx,
   1059 			       LLVMValueRef value, unsigned lo, unsigned hi)
   1060 {
   1061 	LLVMValueRef range_md, md_args[2];
   1062 	LLVMTypeRef type = LLVMTypeOf(value);
   1063 	LLVMContextRef context = LLVMGetTypeContext(type);
   1064 
   1065 	md_args[0] = LLVMConstInt(type, lo, false);
   1066 	md_args[1] = LLVMConstInt(type, hi, false);
   1067 	range_md = LLVMMDNodeInContext(context, md_args, 2);
   1068 	LLVMSetMetadata(value, ctx->range_md_kind, range_md);
   1069 }
   1070 
   1071 LLVMValueRef
   1072 ac_get_thread_id(struct ac_llvm_context *ctx)
   1073 {
   1074 	LLVMValueRef tid;
   1075 
   1076 	LLVMValueRef tid_args[2];
   1077 	tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false);
   1078 	tid_args[1] = LLVMConstInt(ctx->i32, 0, false);
   1079 	tid_args[1] = ac_build_intrinsic(ctx,
   1080 					 "llvm.amdgcn.mbcnt.lo", ctx->i32,
   1081 					 tid_args, 2, AC_FUNC_ATTR_READNONE);
   1082 
   1083 	tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi",
   1084 				 ctx->i32, tid_args,
   1085 				 2, AC_FUNC_ATTR_READNONE);
   1086 	set_range_metadata(ctx, tid, 0, 64);
   1087 	return tid;
   1088 }
   1089 
   1090 /*
    1091  * SI implements derivatives using the local data store (LDS).
   1092  * All writes to the LDS happen in all executing threads at
   1093  * the same time. TID is the Thread ID for the current
   1094  * thread and is a value between 0 and 63, representing
   1095  * the thread's position in the wavefront.
   1096  *
    1097  * For the pixel shader, threads are grouped into quads of four pixels.
   1098  * The TIDs of the pixels of a quad are:
   1099  *
   1100  *  +------+------+
   1101  *  |4n + 0|4n + 1|
   1102  *  +------+------+
   1103  *  |4n + 2|4n + 3|
   1104  *  +------+------+
   1105  *
   1106  * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
   1107  * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
   1108  * the current pixel's column, and masking with 0xfffffffe yields the TID
   1109  * of the left pixel of the current pixel's row.
   1110  *
   1111  * Adding 1 yields the TID of the pixel to the right of the left pixel, and
   1112  * adding 2 yields the TID of the pixel below the top pixel.
   1113  */
   1114 LLVMValueRef
   1115 ac_build_ddxy(struct ac_llvm_context *ctx,
   1116 	      uint32_t mask,
   1117 	      int idx,
   1118 	      LLVMValueRef val)
   1119 {
   1120 	LLVMValueRef tl, trbl, args[2];
   1121 	LLVMValueRef result;
   1122 
   1123 	if (ctx->chip_class >= VI) {
   1124 		LLVMValueRef thread_id, tl_tid, trbl_tid;
   1125 		thread_id = ac_get_thread_id(ctx);
   1126 
   1127 		tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
   1128 				      LLVMConstInt(ctx->i32, mask, false), "");
   1129 
   1130 		trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
   1131 					LLVMConstInt(ctx->i32, idx, false), "");
   1132 
   1133 		args[0] = LLVMBuildMul(ctx->builder, tl_tid,
   1134 				       LLVMConstInt(ctx->i32, 4, false), "");
   1135 		args[1] = val;
   1136 		tl = ac_build_intrinsic(ctx,
   1137 					"llvm.amdgcn.ds.bpermute", ctx->i32,
   1138 					args, 2,
   1139 					AC_FUNC_ATTR_READNONE |
   1140 					AC_FUNC_ATTR_CONVERGENT);
   1141 
   1142 		args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
   1143 				       LLVMConstInt(ctx->i32, 4, false), "");
   1144 		trbl = ac_build_intrinsic(ctx,
   1145 					  "llvm.amdgcn.ds.bpermute", ctx->i32,
   1146 					  args, 2,
   1147 					  AC_FUNC_ATTR_READNONE |
   1148 					  AC_FUNC_ATTR_CONVERGENT);
   1149 	} else {
   1150 		uint32_t masks[2] = {};
   1151 
   1152 		switch (mask) {
   1153 		case AC_TID_MASK_TOP_LEFT:
   1154 			masks[0] = 0x8000;
   1155 			if (idx == 1)
   1156 				masks[1] = 0x8055;
   1157 			else
   1158 				masks[1] = 0x80aa;
   1159 
   1160 			break;
   1161 		case AC_TID_MASK_TOP:
   1162 			masks[0] = 0x8044;
   1163 			masks[1] = 0x80ee;
   1164 			break;
   1165 		case AC_TID_MASK_LEFT:
   1166 			masks[0] = 0x80a0;
   1167 			masks[1] = 0x80f5;
   1168 			break;
   1169 		default:
   1170 			assert(0);
   1171 		}
   1172 
   1173 		args[0] = val;
   1174 		args[1] = LLVMConstInt(ctx->i32, masks[0], false);
   1175 
   1176 		tl = ac_build_intrinsic(ctx,
   1177 					"llvm.amdgcn.ds.swizzle", ctx->i32,
   1178 					args, 2,
   1179 					AC_FUNC_ATTR_READNONE |
   1180 					AC_FUNC_ATTR_CONVERGENT);
   1181 
   1182 		args[1] = LLVMConstInt(ctx->i32, masks[1], false);
   1183 		trbl = ac_build_intrinsic(ctx,
   1184 					"llvm.amdgcn.ds.swizzle", ctx->i32,
   1185 					args, 2,
   1186 					AC_FUNC_ATTR_READNONE |
   1187 					AC_FUNC_ATTR_CONVERGENT);
   1188 	}
   1189 
   1190 	tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
   1191 	trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
   1192 	result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
   1193 	return result;
   1194 }
   1195 
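/* Illustrative sketch: a coarse ddx using the quad layout documented above.
 * With mask AC_TID_MASK_TOP_LEFT and idx 1, every lane of a quad reads the
 * top-left and top-right pixels; e.g. for lane 7, 7 & 0xfffffffc gives
 * lane 4 and adding 1 gives lane 5, so the result is
 * value(lane 5) - value(lane 4). 'val' is assumed to be the i32 bit pattern
 * of the value being differentiated; the result is f32.
 */
static LLVMValueRef example_coarse_ddx(struct ac_llvm_context *ctx,
				       LLVMValueRef val)
{
	return ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, val);
}
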
   1196 void
   1197 ac_build_sendmsg(struct ac_llvm_context *ctx,
   1198 		 uint32_t msg,
   1199 		 LLVMValueRef wave_id)
   1200 {
   1201 	LLVMValueRef args[2];
   1202 	const char *intr_name = (HAVE_LLVM < 0x0400) ? "llvm.SI.sendmsg" : "llvm.amdgcn.s.sendmsg";
   1203 	args[0] = LLVMConstInt(ctx->i32, msg, false);
   1204 	args[1] = wave_id;
   1205 	ac_build_intrinsic(ctx, intr_name, ctx->voidt, args, 2, 0);
   1206 }
   1207 
   1208 LLVMValueRef
   1209 ac_build_imsb(struct ac_llvm_context *ctx,
   1210 	      LLVMValueRef arg,
   1211 	      LLVMTypeRef dst_type)
   1212 {
   1213 	const char *intr_name = (HAVE_LLVM < 0x0400) ? "llvm.AMDGPU.flbit.i32" :
   1214 						       "llvm.amdgcn.sffbh.i32";
   1215 	LLVMValueRef msb = ac_build_intrinsic(ctx, intr_name,
   1216 					      dst_type, &arg, 1,
   1217 					      AC_FUNC_ATTR_READNONE);
   1218 
   1219 	/* The HW returns the last bit index from MSB, but NIR/TGSI wants
   1220 	 * the index from LSB. Invert it by doing "31 - msb". */
   1221 	msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false),
   1222 			   msb, "");
   1223 
   1224 	LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
   1225 	LLVMValueRef cond = LLVMBuildOr(ctx->builder,
   1226 					LLVMBuildICmp(ctx->builder, LLVMIntEQ,
   1227 						      arg, LLVMConstInt(ctx->i32, 0, 0), ""),
   1228 					LLVMBuildICmp(ctx->builder, LLVMIntEQ,
   1229 						      arg, all_ones, ""), "");
   1230 
   1231 	return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, "");
   1232 }
   1233 
   1234 LLVMValueRef
   1235 ac_build_umsb(struct ac_llvm_context *ctx,
   1236 	      LLVMValueRef arg,
   1237 	      LLVMTypeRef dst_type)
   1238 {
   1239 	LLVMValueRef args[2] = {
   1240 		arg,
   1241 		ctx->i1true,
   1242 	};
   1243 	LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.ctlz.i32",
   1244 					      dst_type, args, ARRAY_SIZE(args),
   1245 					      AC_FUNC_ATTR_READNONE);
   1246 
   1247 	/* The HW returns the last bit index from MSB, but TGSI/NIR wants
   1248 	 * the index from LSB. Invert it by doing "31 - msb". */
   1249 	msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false),
   1250 			   msb, "");
   1251 
   1252 	/* check for zero */
   1253 	return LLVMBuildSelect(ctx->builder,
   1254 			       LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg,
   1255 					     LLVMConstInt(ctx->i32, 0, 0), ""),
   1256 			       LLVMConstInt(ctx->i32, -1, true), msb, "");
   1257 }
   1258 
   1259 LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a,
   1260 			   LLVMValueRef b)
   1261 {
   1262 	LLVMValueRef args[2] = {a, b};
   1263 	return ac_build_intrinsic(ctx, "llvm.minnum.f32", ctx->f32, args, 2,
   1264 				  AC_FUNC_ATTR_READNONE);
   1265 }
   1266 
   1267 LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a,
   1268 			   LLVMValueRef b)
   1269 {
   1270 	LLVMValueRef args[2] = {a, b};
   1271 	return ac_build_intrinsic(ctx, "llvm.maxnum.f32", ctx->f32, args, 2,
   1272 				  AC_FUNC_ATTR_READNONE);
   1273 }
   1274 
   1275 LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a,
   1276 			   LLVMValueRef b)
   1277 {
   1278 	LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
   1279 	return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
   1280 }
   1281 
   1282 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
   1283 {
   1284 	if (HAVE_LLVM >= 0x0500) {
   1285 		return ac_build_fmin(ctx, ac_build_fmax(ctx, value, ctx->f32_0),
   1286 				     ctx->f32_1);
   1287 	}
   1288 
   1289 	LLVMValueRef args[3] = {
   1290 		value,
   1291 		LLVMConstReal(ctx->f32, 0),
   1292 		LLVMConstReal(ctx->f32, 1),
   1293 	};
   1294 
   1295 	return ac_build_intrinsic(ctx, "llvm.AMDGPU.clamp.", ctx->f32, args, 3,
   1296 				  AC_FUNC_ATTR_READNONE |
   1297 				  AC_FUNC_ATTR_LEGACY);
   1298 }
   1299 
   1300 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
   1301 {
   1302 	LLVMValueRef args[9];
   1303 
   1304 	if (HAVE_LLVM >= 0x0500) {
   1305 		args[0] = LLVMConstInt(ctx->i32, a->target, 0);
   1306 		args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
   1307 
   1308 		if (a->compr) {
   1309 			LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
   1310 			LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
   1311 
   1312 			args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
   1313 						   v2i16, "");
   1314 			args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
   1315 						   v2i16, "");
   1316 			args[4] = LLVMConstInt(ctx->i1, a->done, 0);
   1317 			args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
   1318 
   1319 			ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
   1320 					   ctx->voidt, args, 6, 0);
   1321 		} else {
   1322 			args[2] = a->out[0];
   1323 			args[3] = a->out[1];
   1324 			args[4] = a->out[2];
   1325 			args[5] = a->out[3];
   1326 			args[6] = LLVMConstInt(ctx->i1, a->done, 0);
   1327 			args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
   1328 
   1329 			ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
   1330 					   ctx->voidt, args, 8, 0);
   1331 		}
   1332 		return;
   1333 	}
   1334 
   1335 	args[0] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
   1336 	args[1] = LLVMConstInt(ctx->i32, a->valid_mask, 0);
   1337 	args[2] = LLVMConstInt(ctx->i32, a->done, 0);
   1338 	args[3] = LLVMConstInt(ctx->i32, a->target, 0);
   1339 	args[4] = LLVMConstInt(ctx->i32, a->compr, 0);
   1340 	memcpy(args + 5, a->out, sizeof(a->out[0]) * 4);
   1341 
   1342 	ac_build_intrinsic(ctx, "llvm.SI.export", ctx->voidt, args, 9,
   1343 			   AC_FUNC_ATTR_LEGACY);
   1344 }
   1345 
   1346 LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
   1347 				   struct ac_image_args *a)
   1348 {
   1349 	LLVMTypeRef dst_type;
   1350 	LLVMValueRef args[11];
   1351 	unsigned num_args = 0;
   1352 	const char *name = NULL;
   1353 	char intr_name[128], type[64];
   1354 
   1355 	if (HAVE_LLVM >= 0x0400) {
   1356 		bool sample = a->opcode == ac_image_sample ||
   1357 			      a->opcode == ac_image_gather4 ||
   1358 			      a->opcode == ac_image_get_lod;
   1359 
   1360 		if (sample)
   1361 			args[num_args++] = ac_to_float(ctx, a->addr);
   1362 		else
   1363 			args[num_args++] = a->addr;
   1364 
   1365 		args[num_args++] = a->resource;
   1366 		if (sample)
   1367 			args[num_args++] = a->sampler;
   1368 		args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, 0);
   1369 		if (sample)
   1370 			args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, 0);
   1371 		args[num_args++] = ctx->i1false; /* glc */
   1372 		args[num_args++] = ctx->i1false; /* slc */
   1373 		args[num_args++] = ctx->i1false; /* lwe */
   1374 		args[num_args++] = LLVMConstInt(ctx->i1, a->da, 0);
   1375 
   1376 		switch (a->opcode) {
   1377 		case ac_image_sample:
   1378 			name = "llvm.amdgcn.image.sample";
   1379 			break;
   1380 		case ac_image_gather4:
   1381 			name = "llvm.amdgcn.image.gather4";
   1382 			break;
   1383 		case ac_image_load:
   1384 			name = "llvm.amdgcn.image.load";
   1385 			break;
   1386 		case ac_image_load_mip:
   1387 			name = "llvm.amdgcn.image.load.mip";
   1388 			break;
   1389 		case ac_image_get_lod:
   1390 			name = "llvm.amdgcn.image.getlod";
   1391 			break;
   1392 		case ac_image_get_resinfo:
   1393 			name = "llvm.amdgcn.image.getresinfo";
   1394 			break;
   1395 		default:
   1396 			unreachable("invalid image opcode");
   1397 		}
   1398 
   1399 		ac_build_type_name_for_intr(LLVMTypeOf(args[0]), type,
   1400 					    sizeof(type));
   1401 
   1402 		snprintf(intr_name, sizeof(intr_name), "%s%s%s%s.v4f32.%s.v8i32",
   1403 			name,
   1404 			a->compare ? ".c" : "",
   1405 			a->bias ? ".b" :
   1406 			a->lod ? ".l" :
   1407 			a->deriv ? ".d" :
   1408 			a->level_zero ? ".lz" : "",
   1409 			a->offset ? ".o" : "",
   1410 			type);
   1411 
   1412 		LLVMValueRef result =
   1413 			ac_build_intrinsic(ctx, intr_name,
   1414 					   ctx->v4f32, args, num_args,
   1415 					   AC_FUNC_ATTR_READNONE);
   1416 		if (!sample) {
   1417 			result = LLVMBuildBitCast(ctx->builder, result,
   1418 						  ctx->v4i32, "");
   1419 		}
   1420 		return result;
   1421 	}
   1422 
   1423 	args[num_args++] = a->addr;
   1424 	args[num_args++] = a->resource;
   1425 
   1426 	if (a->opcode == ac_image_load ||
   1427 	    a->opcode == ac_image_load_mip ||
   1428 	    a->opcode == ac_image_get_resinfo) {
   1429 		dst_type = ctx->v4i32;
   1430 	} else {
   1431 		dst_type = ctx->v4f32;
   1432 		args[num_args++] = a->sampler;
   1433 	}
   1434 
   1435 	args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, 0);
   1436 	args[num_args++] = LLVMConstInt(ctx->i32, a->unorm, 0);
   1437 	args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* r128 */
   1438 	args[num_args++] = LLVMConstInt(ctx->i32, a->da, 0);
   1439 	args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* glc */
   1440 	args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* slc */
   1441 	args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* tfe */
   1442 	args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* lwe */
   1443 
   1444 	switch (a->opcode) {
   1445 	case ac_image_sample:
   1446 		name = "llvm.SI.image.sample";
   1447 		break;
   1448 	case ac_image_gather4:
   1449 		name = "llvm.SI.gather4";
   1450 		break;
   1451 	case ac_image_load:
   1452 		name = "llvm.SI.image.load";
   1453 		break;
   1454 	case ac_image_load_mip:
   1455 		name = "llvm.SI.image.load.mip";
   1456 		break;
   1457 	case ac_image_get_lod:
   1458 		name = "llvm.SI.getlod";
   1459 		break;
   1460 	case ac_image_get_resinfo:
   1461 		name = "llvm.SI.getresinfo";
   1462 		break;
   1463 	}
   1464 
   1465 	ac_build_type_name_for_intr(LLVMTypeOf(a->addr), type, sizeof(type));
   1466 	snprintf(intr_name, sizeof(intr_name), "%s%s%s%s.%s",
   1467 		name,
   1468 		a->compare ? ".c" : "",
   1469 		a->bias ? ".b" :
   1470 		a->lod ? ".l" :
   1471 		a->deriv ? ".d" :
   1472 		a->level_zero ? ".lz" : "",
   1473 		a->offset ? ".o" : "",
   1474 		type);
   1475 
   1476 	return ac_build_intrinsic(ctx, intr_name,
   1477 				  dst_type, args, num_args,
   1478 				  AC_FUNC_ATTR_READNONE |
   1479 				  AC_FUNC_ATTR_LEGACY);
   1480 }
   1481 
   1482 LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx,
   1483 				    LLVMValueRef args[2])
   1484 {
   1485 	if (HAVE_LLVM >= 0x0500) {
   1486 		LLVMTypeRef v2f16 =
   1487 			LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
   1488 		LLVMValueRef res =
   1489 			ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz",
   1490 					   v2f16, args, 2,
   1491 					   AC_FUNC_ATTR_READNONE);
   1492 		return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
   1493 	}
   1494 
   1495 	return ac_build_intrinsic(ctx, "llvm.SI.packf16", ctx->i32, args, 2,
   1496 				  AC_FUNC_ATTR_READNONE |
   1497 				  AC_FUNC_ATTR_LEGACY);
   1498 }
   1499 
   1500 LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
   1501 {
   1502 	assert(HAVE_LLVM >= 0x0600);
   1503 	return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1,
   1504 				  &i1, 1, AC_FUNC_ATTR_READNONE);
   1505 }
   1506 
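/* Discard the current invocation if i1 is false. LLVM 6.0+ provides
 * llvm.amdgcn.kill directly; older LLVM only has the legacy
 * llvm.AMDGPU.kill, which kills when its float argument is negative,
 * hence the select of +1.0 / -1.0 below.
 */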
   1507 void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
   1508 {
   1509 	if (HAVE_LLVM >= 0x0600) {
   1510 		ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt,
   1511 				   &i1, 1, 0);
   1512 		return;
   1513 	}
   1514 
   1515 	LLVMValueRef value = LLVMBuildSelect(ctx->builder, i1,
   1516 					     LLVMConstReal(ctx->f32, 1),
   1517 					     LLVMConstReal(ctx->f32, -1), "");
   1518 	ac_build_intrinsic(ctx, "llvm.AMDGPU.kill", ctx->voidt,
   1519 			   &value, 1, AC_FUNC_ATTR_LEGACY);
   1520 }
   1521 
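/* Bitfield extract: return "width" bits of "input" starting at bit
 * "offset", sign-extended if is_signed and zero-extended otherwise
 * (v_bfe_i32 / v_bfe_u32). All three operands are i32 values.
 */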
   1522 LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input,
   1523 			  LLVMValueRef offset, LLVMValueRef width,
   1524 			  bool is_signed)
   1525 {
   1526 	LLVMValueRef args[] = {
   1527 		input,
   1528 		offset,
   1529 		width,
   1530 	};
   1531 
   1532 	if (HAVE_LLVM >= 0x0500) {
   1533 		return ac_build_intrinsic(ctx,
   1534 					  is_signed ? "llvm.amdgcn.sbfe.i32" :
   1535 						      "llvm.amdgcn.ubfe.i32",
   1536 					  ctx->i32, args, 3,
   1537 					  AC_FUNC_ATTR_READNONE);
   1538 	}
   1539 
   1540 	return ac_build_intrinsic(ctx,
   1541 				  is_signed ? "llvm.AMDGPU.bfe.i32" :
   1542 					      "llvm.AMDGPU.bfe.u32",
   1543 				  ctx->i32, args, 3,
   1544 				  AC_FUNC_ATTR_READNONE |
   1545 				  AC_FUNC_ATTR_LEGACY);
   1546 }
   1547 
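/* Emit s_waitcnt with the given immediate; simm16 encodes the
 * vmcnt/expcnt/lgkmcnt fields to wait for.
 */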
   1548 void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned simm16)
   1549 {
   1550 	LLVMValueRef args[1] = {
   1551 		LLVMConstInt(ctx->i32, simm16, false),
   1552 	};
   1553 	ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt",
   1554 			   ctx->voidt, args, 1, 0);
   1555 }
   1556 
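/* Build the overloaded name of an image intrinsic. LLVM <= 3.9 mangles
 * only the coordinate type; newer LLVM also appends the data and resource
 * types. For example (illustrative), a base_name of
 * "llvm.amdgcn.image.load" with v4f32 data, v4i32 coords and v8i32 rsrc
 * yields "llvm.amdgcn.image.load.v4f32.v4i32.v8i32".
 */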
   1557 void ac_get_image_intr_name(const char *base_name,
   1558 			    LLVMTypeRef data_type,
   1559 			    LLVMTypeRef coords_type,
   1560 			    LLVMTypeRef rsrc_type,
   1561 			    char *out_name, unsigned out_len)
   1562 {
   1563 	char coords_type_name[8];
   1564 
   1565 	ac_build_type_name_for_intr(coords_type, coords_type_name,
   1566 				    sizeof(coords_type_name));
   1567 
   1568 	if (HAVE_LLVM <= 0x0309) {
   1569 		snprintf(out_name, out_len, "%s.%s", base_name, coords_type_name);
   1570 	} else {
   1571 		char data_type_name[8];
   1572 		char rsrc_type_name[8];
   1573 
   1574 		ac_build_type_name_for_intr(data_type, data_type_name,
   1575 					    sizeof(data_type_name));
   1576 		ac_build_type_name_for_intr(rsrc_type, rsrc_type_name,
   1577 					    sizeof(rsrc_type_name));
   1578 		snprintf(out_name, out_len, "%s.%s.%s.%s", base_name,
   1579 			 data_type_name, coords_type_name, rsrc_type_name);
   1580 	}
   1581 }
   1582 
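/* Operand indices into an export call: AC_EXP_TARGET is the export target
 * operand and AC_EXP_OUT0 the first output channel. The indices differ
 * between llvm.amdgcn.exp.f32 (LLVM 5.0+) and the legacy llvm.SI.export.
 */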
   1583 #define AC_EXP_TARGET (HAVE_LLVM >= 0x0500 ? 0 : 3)
   1584 #define AC_EXP_OUT0 (HAVE_LLVM >= 0x0500 ? 2 : 5)
   1585 
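/* Bookkeeping for the VS output optimization below: each PARAM export is
 * recorded with its offset, the export call itself, and a per-channel
 * classification (undef, compile-time constant, or run-time value).
 */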
   1586 enum ac_ir_type {
   1587 	AC_IR_UNDEF,
   1588 	AC_IR_CONST,
   1589 	AC_IR_VALUE,
   1590 };
   1591 
   1592 struct ac_vs_exp_chan
   1593 {
   1594 	LLVMValueRef value;
   1595 	float const_float;
   1596 	enum ac_ir_type type;
   1597 };
   1598 
   1599 struct ac_vs_exp_inst {
   1600 	unsigned offset;
   1601 	LLVMValueRef inst;
   1602 	struct ac_vs_exp_chan chan[4];
   1603 };
   1604 
   1605 struct ac_vs_exports {
   1606 	unsigned num;
   1607 	struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
   1608 };
   1609 
   1610 /* Return true if the PARAM export has been eliminated. */
   1611 static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset,
   1612 				      uint32_t num_outputs,
   1613 				      struct ac_vs_exp_inst *exp)
   1614 {
   1615 	unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
   1616 	bool is_zero[4] = {}, is_one[4] = {};
   1617 
   1618 	for (i = 0; i < 4; i++) {
   1619 		/* It's a constant expression. Undef outputs are eliminated too. */
   1620 		if (exp->chan[i].type == AC_IR_UNDEF) {
   1621 			is_zero[i] = true;
   1622 			is_one[i] = true;
   1623 		} else if (exp->chan[i].type == AC_IR_CONST) {
   1624 			if (exp->chan[i].const_float == 0)
   1625 				is_zero[i] = true;
   1626 			else if (exp->chan[i].const_float == 1)
   1627 				is_one[i] = true;
   1628 			else
   1629 				return false; /* other constant */
   1630 		} else
   1631 			return false;
   1632 	}
   1633 
   1634 	/* Only certain combinations of 0 and 1 can be eliminated. */
   1635 	if (is_zero[0] && is_zero[1] && is_zero[2])
   1636 		default_val = is_zero[3] ? 0 : 1;
   1637 	else if (is_one[0] && is_one[1] && is_one[2])
   1638 		default_val = is_zero[3] ? 2 : 3;
   1639 	else
   1640 		return false;
   1641 
   1642 	/* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
   1643 	LLVMInstructionEraseFromParent(exp->inst);
   1644 
   1645 	/* Change OFFSET to DEFAULT_VAL. */
   1646 	for (i = 0; i < num_outputs; i++) {
   1647 		if (vs_output_param_offset[i] == exp->offset) {
   1648 			vs_output_param_offset[i] =
   1649 				AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
   1650 			break;
   1651 		}
   1652 	}
   1653 	return true;
   1654 }
   1655 
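/* Return true if the PARAM export is a duplicate of an already processed
 * export and has been eliminated.
 */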
   1656 static bool ac_eliminate_duplicated_output(uint8_t *vs_output_param_offset,
   1657 					   uint32_t num_outputs,
   1658 					   struct ac_vs_exports *processed,
   1659 					   struct ac_vs_exp_inst *exp)
   1660 {
   1661 	unsigned p, copy_back_channels = 0;
   1662 
   1663 	/* See if the output is already in the list of processed outputs.
   1664 	 * The LLVMValueRef comparison relies on SSA.
   1665 	 */
   1666 	for (p = 0; p < processed->num; p++) {
   1667 		bool different = false;
   1668 
   1669 		for (unsigned j = 0; j < 4; j++) {
   1670 			struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
   1671 			struct ac_vs_exp_chan *c2 = &exp->chan[j];
   1672 
   1673 			/* Treat undef as a match. */
   1674 			if (c2->type == AC_IR_UNDEF)
   1675 				continue;
   1676 
   1677 			/* If c1 is undef but c2 isn't, we can copy c2 to c1
   1678 			 * and consider the instruction duplicated.
   1679 			 */
   1680 			if (c1->type == AC_IR_UNDEF) {
   1681 				copy_back_channels |= 1 << j;
   1682 				continue;
   1683 			}
   1684 
   1685 			/* Test whether the channels are not equal. */
   1686 			if (c1->type != c2->type ||
   1687 			    (c1->type == AC_IR_CONST &&
   1688 			     c1->const_float != c2->const_float) ||
   1689 			    (c1->type == AC_IR_VALUE &&
   1690 			     c1->value != c2->value)) {
   1691 				different = true;
   1692 				break;
   1693 			}
   1694 		}
   1695 		if (!different)
   1696 			break;
   1697 
   1698 		copy_back_channels = 0;
   1699 	}
   1700 	if (p == processed->num)
   1701 		return false;
   1702 
   1703 	/* If a match was found, but the matching export has undef where the new
   1704 	 * one has a normal value, copy the normal value to the undef channel.
   1705 	 */
   1706 	struct ac_vs_exp_inst *match = &processed->exp[p];
   1707 
   1708 	while (copy_back_channels) {
   1709 		unsigned chan = u_bit_scan(&copy_back_channels);
   1710 
   1711 		assert(match->chan[chan].type == AC_IR_UNDEF);
   1712 		LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan,
   1713 			       exp->chan[chan].value);
   1714 		match->chan[chan] = exp->chan[chan];
   1715 	}
   1716 
   1717 	/* The PARAM export is duplicated. Kill it. */
   1718 	LLVMInstructionEraseFromParent(exp->inst);
   1719 
   1720 	/* Change OFFSET to the matching export. */
   1721 	for (unsigned i = 0; i < num_outputs; i++) {
   1722 		if (vs_output_param_offset[i] == exp->offset) {
   1723 			vs_output_param_offset[i] = match->offset;
   1724 			break;
   1725 		}
   1726 	}
   1727 	return true;
   1728 }
   1729 
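/* Optimize the VS PARAM exports of main_fn: remove exports that only
 * contain constants/undef (they can be replaced by SPI_PS_INPUT_CNTL
 * DEFAULT_VAL), remove duplicated exports, and renumber the remaining ones
 * so there are no holes. vs_output_param_offset[] is rewritten to match,
 * and *num_param_exports is updated when anything was removed.
 */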
   1730 void ac_optimize_vs_outputs(struct ac_llvm_context *ctx,
   1731 			    LLVMValueRef main_fn,
   1732 			    uint8_t *vs_output_param_offset,
   1733 			    uint32_t num_outputs,
   1734 			    uint8_t *num_param_exports)
   1735 {
   1736 	LLVMBasicBlockRef bb;
   1737 	bool removed_any = false;
   1738 	struct ac_vs_exports exports;
   1739 
   1740 	exports.num = 0;
   1741 
   1742 	/* Process all LLVM instructions. */
   1743 	bb = LLVMGetFirstBasicBlock(main_fn);
   1744 	while (bb) {
   1745 		LLVMValueRef inst = LLVMGetFirstInstruction(bb);
   1746 
   1747 		while (inst) {
   1748 			LLVMValueRef cur = inst;
   1749 			inst = LLVMGetNextInstruction(inst);
   1750 			struct ac_vs_exp_inst exp;
   1751 
   1752 			if (LLVMGetInstructionOpcode(cur) != LLVMCall)
   1753 				continue;
   1754 
   1755 			LLVMValueRef callee = ac_llvm_get_called_value(cur);
   1756 
   1757 			if (!ac_llvm_is_function(callee))
   1758 				continue;
   1759 
   1760 			const char *name = LLVMGetValueName(callee);
   1761 			unsigned num_args = LLVMCountParams(callee);
   1762 
   1763 			/* Check if this is an export instruction. */
   1764 			if ((num_args != 9 && num_args != 8) ||
   1765 			    (strcmp(name, "llvm.SI.export") &&
   1766 			     strcmp(name, "llvm.amdgcn.exp.f32")))
   1767 				continue;
   1768 
   1769 			LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);
   1770 			unsigned target = LLVMConstIntGetZExtValue(arg);
   1771 
   1772 			if (target < V_008DFC_SQ_EXP_PARAM)
   1773 				continue;
   1774 
   1775 			target -= V_008DFC_SQ_EXP_PARAM;
   1776 
   1777 			/* Parse the instruction. */
   1778 			memset(&exp, 0, sizeof(exp));
   1779 			exp.offset = target;
   1780 			exp.inst = cur;
   1781 
   1782 			for (unsigned i = 0; i < 4; i++) {
   1783 				LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);
   1784 
   1785 				exp.chan[i].value = v;
   1786 
   1787 				if (LLVMIsUndef(v)) {
   1788 					exp.chan[i].type = AC_IR_UNDEF;
   1789 				} else if (LLVMIsAConstantFP(v)) {
   1790 					LLVMBool loses_info;
   1791 					exp.chan[i].type = AC_IR_CONST;
   1792 					exp.chan[i].const_float =
   1793 						LLVMConstRealGetDouble(v, &loses_info);
   1794 				} else {
   1795 					exp.chan[i].type = AC_IR_VALUE;
   1796 				}
   1797 			}
   1798 
   1799 			/* Eliminate constant and duplicated PARAM exports. */
   1800 			if (ac_eliminate_const_output(vs_output_param_offset,
   1801 						      num_outputs, &exp) ||
   1802 			    ac_eliminate_duplicated_output(vs_output_param_offset,
   1803 							   num_outputs, &exports,
   1804 							   &exp)) {
   1805 				removed_any = true;
   1806 			} else {
   1807 				exports.exp[exports.num++] = exp;
   1808 			}
   1809 		}
   1810 		bb = LLVMGetNextBasicBlock(bb);
   1811 	}
   1812 
   1813 	/* Remove holes in export memory due to removed PARAM exports.
   1814 	 * This is done by renumbering all PARAM exports.
   1815 	 */
   1816 	if (removed_any) {
   1817 		uint8_t old_offset[VARYING_SLOT_MAX];
   1818 		unsigned out, i;
   1819 
   1820 		/* Make a copy of the offsets. We need the old version while
   1821 		 * we are modifying some of them. */
   1822 		memcpy(old_offset, vs_output_param_offset,
   1823 		       sizeof(old_offset));
   1824 
   1825 		for (i = 0; i < exports.num; i++) {
   1826 			unsigned offset = exports.exp[i].offset;
   1827 
   1828 			/* Update vs_output_param_offset. Multiple outputs can
   1829 			 * have the same offset.
   1830 			 */
   1831 			for (out = 0; out < num_outputs; out++) {
   1832 				if (old_offset[out] == offset)
   1833 					vs_output_param_offset[out] = i;
   1834 			}
   1835 
   1836 			/* Change the PARAM offset in the instruction. */
   1837 			LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
   1838 				       LLVMConstInt(ctx->i32,
   1839 						    V_008DFC_SQ_EXP_PARAM + i, 0));
   1840 		}
   1841 		*num_param_exports = exports.num;
   1842 	}
   1843 }
   1844 
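/* Set the EXEC mask to all ones at the start of the wave
 * (llvm.amdgcn.init.exec).
 */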
   1845 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
   1846 {
   1847 	LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
   1848 	ac_build_intrinsic(ctx,
   1849 			   "llvm.amdgcn.init.exec", ctx->voidt,
   1850 			   &full_mask, 1, AC_FUNC_ATTR_CONVERGENT);
   1851 }
   1852 
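/* Declare the whole LDS block (64 KB on CIK+, 32 KB otherwise) as a
 * pointer to an i32 array in the local address space, so it can be
 * accessed with the dword-indexed helpers below.
 */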
   1853 void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
   1854 {
   1855 	unsigned lds_size = ctx->chip_class >= CIK ? 65536 : 32768;
   1856 	ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
   1857 				     LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_LOCAL_ADDR_SPACE),
   1858 				     "lds");
   1859 }
   1860 
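/* Load one dword from LDS at the given dword address. */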
   1861 LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx,
   1862 			 LLVMValueRef dw_addr)
   1863 {
   1864 	return ac_build_load(ctx, ctx->lds, dw_addr);
   1865 }
   1866 
   1867 void ac_lds_store(struct ac_llvm_context *ctx,
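/* Store one dword to LDS at the given dword address; the value is bitcast
 * to an integer first.
 */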
   1868 		  LLVMValueRef dw_addr,
   1869 		  LLVMValueRef value)
   1870 {
   1871 	value = ac_to_integer(ctx, value);
   1872 	ac_build_indexed_store(ctx, ctx->lds,
   1873 			       dw_addr, value);
   1874 }
   1875 
   1876 LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
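/* Implementation of GLSL findLSB(): return the index of the least
 * significant set bit of src0, or -1 if src0 is 0.
 */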
   1877 			 LLVMTypeRef dst_type,
   1878 			 LLVMValueRef src0)
   1879 {
   1880 	LLVMValueRef params[2] = {
   1881 		src0,
   1882 
   1883 		/* Passing true here tells llvm.cttz that the result for
   1884 		 * src0 == 0 is undefined, so LLVM won't emit extra code
   1885 		 * to handle that case. We can't rely on LLVM's zero
   1886 		 * handling anyway: llvm.cttz never returns -1, while GLSL
   1887 		 * requires findLSB(0) == -1, so the conditional select
   1888 		 * below is still needed.
   1889 		 *
   1890 		 * The hardware already implements the correct behavior.
   1891 		 */
   1892 		LLVMConstInt(ctx->i1, 1, false),
   1893 	};
   1894 
   1895 	LLVMValueRef lsb = ac_build_intrinsic(ctx, "llvm.cttz.i32", ctx->i32,
   1896 					      params, 2,
   1897 					      AC_FUNC_ATTR_READNONE);
   1898 
   1899 	/* TODO: We need an intrinsic to skip this conditional. */
   1900 	/* Check for zero: */
   1901 	return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder,
   1902 							   LLVMIntEQ, src0,
   1903 							   ctx->i32_0, ""),
   1904 			       LLVMConstInt(ctx->i32, -1, 0), lsb, "");
   1905 }
   1906 
   1907 static struct ac_llvm_flow *
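/* The following helpers manage the stack of open control-flow constructs
 * (ctx->flow / ctx->flow_depth) used by the structured if/else/endif and
 * bgnloop/endloop builders below.
 */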
   1908 get_current_flow(struct ac_llvm_context *ctx)
   1909 {
   1910 	if (ctx->flow_depth > 0)
   1911 		return &ctx->flow[ctx->flow_depth - 1];
   1912 	return NULL;
   1913 }
   1914 
   1915 static struct ac_llvm_flow *
   1916 get_innermost_loop(struct ac_llvm_context *ctx)
   1917 {
   1918 	for (unsigned i = ctx->flow_depth; i > 0; --i) {
   1919 		if (ctx->flow[i - 1].loop_entry_block)
   1920 			return &ctx->flow[i - 1];
   1921 	}
   1922 	return NULL;
   1923 }
   1924 
   1925 static struct ac_llvm_flow *
   1926 push_flow(struct ac_llvm_context *ctx)
   1927 {
   1928 	struct ac_llvm_flow *flow;
   1929 
   1930 	if (ctx->flow_depth >= ctx->flow_depth_max) {
   1931 		unsigned new_max = MAX2(ctx->flow_depth << 1,
   1932 					AC_LLVM_INITIAL_CF_DEPTH);
   1933 
   1934 		ctx->flow = realloc(ctx->flow, new_max * sizeof(*ctx->flow));
   1935 		ctx->flow_depth_max = new_max;
   1936 	}
   1937 
   1938 	flow = &ctx->flow[ctx->flow_depth];
   1939 	ctx->flow_depth++;
   1940 
   1941 	flow->next_block = NULL;
   1942 	flow->loop_entry_block = NULL;
   1943 	return flow;
   1944 }
   1945 
   1946 static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base,
   1947 				int label_id)
   1948 {
   1949 	char buf[32];
   1950 	snprintf(buf, sizeof(buf), "%s%d", base, label_id);
   1951 	LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf);
   1952 }
   1953 
   1954 /* Append a basic block at the level of the parent flow.
   1955  */
   1956 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx,
   1957 					    const char *name)
   1958 {
   1959 	assert(ctx->flow_depth >= 1);
   1960 
   1961 	if (ctx->flow_depth >= 2) {
   1962 		struct ac_llvm_flow *flow = &ctx->flow[ctx->flow_depth - 2];
   1963 
   1964 		return LLVMInsertBasicBlockInContext(ctx->context,
   1965 						     flow->next_block, name);
   1966 	}
   1967 
   1968 	LLVMValueRef main_fn =
   1969 		LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
   1970 	return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
   1971 }
   1972 
   1973 /* Emit a branch to the given default target for the current block if
   1974  * applicable -- that is, if the current block does not already contain a
   1975  * branch from a break or continue.
   1976  */
   1977 static void emit_default_branch(LLVMBuilderRef builder,
   1978 				LLVMBasicBlockRef target)
   1979 {
   1980 	if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
   1981 		 LLVMBuildBr(builder, target);
   1982 		LLVMBuildBr(builder, target);
   1983 
   1984 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
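/* Structured control-flow builders. An illustrative usage sketch, assuming
 * ctx->builder is positioned inside the current function and "cond" is a
 * caller-provided i32 value:
 *
 *	ac_build_bgnloop(ctx, 1);
 *	ac_build_uif(ctx, cond, 2);
 *	ac_build_break(ctx);
 *	ac_build_endif(ctx, 2);
 *	...loop body...
 *	ac_build_endloop(ctx, 1);
 *
 * The label_id arguments are only used to name the generated basic blocks;
 * constructs must be opened and closed in properly nested order.
 */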
   1985 {
   1986 	struct ac_llvm_flow *flow = push_flow(ctx);
   1987 	flow->loop_entry_block = append_basic_block(ctx, "LOOP");
   1988 	flow->next_block = append_basic_block(ctx, "ENDLOOP");
   1989 	set_basicblock_name(flow->loop_entry_block, "loop", label_id);
   1990 	LLVMBuildBr(ctx->builder, flow->loop_entry_block);
   1991 	LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
   1992 }
   1993 
   1994 void ac_build_break(struct ac_llvm_context *ctx)
   1995 {
   1996 	struct ac_llvm_flow *flow = get_innermost_loop(ctx);
   1997 	LLVMBuildBr(ctx->builder, flow->next_block);
   1998 }
   1999 
   2000 void ac_build_continue(struct ac_llvm_context *ctx)
   2001 {
   2002 	struct ac_llvm_flow *flow = get_innermost_loop(ctx);
   2003 	LLVMBuildBr(ctx->builder, flow->loop_entry_block);
   2004 }
   2005 
   2006 void ac_build_else(struct ac_llvm_context *ctx, int label_id)
   2007 {
   2008 	struct ac_llvm_flow *current_branch = get_current_flow(ctx);
   2009 	LLVMBasicBlockRef endif_block;
   2010 
   2011 	assert(!current_branch->loop_entry_block);
   2012 
   2013 	endif_block = append_basic_block(ctx, "ENDIF");
   2014 	emit_default_branch(ctx->builder, endif_block);
   2015 
   2016 	LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
   2017 	set_basicblock_name(current_branch->next_block, "else", label_id);
   2018 
   2019 	current_branch->next_block = endif_block;
   2020 }
   2021 
   2022 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
   2023 {
   2024 	struct ac_llvm_flow *current_branch = get_current_flow(ctx);
   2025 
   2026 	assert(!current_branch->loop_entry_block);
   2027 
   2028 	emit_default_branch(ctx->builder, current_branch->next_block);
   2029 	LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
   2030 	set_basicblock_name(current_branch->next_block, "endif", label_id);
   2031 
   2032 	ctx->flow_depth--;
   2033 }
   2034 
   2035 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
   2036 {
   2037 	struct ac_llvm_flow *current_loop = get_current_flow(ctx);
   2038 
   2039 	assert(current_loop->loop_entry_block);
   2040 
   2041 	emit_default_branch(ctx->builder, current_loop->loop_entry_block);
   2042 
   2043 	LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
   2044 	set_basicblock_name(current_loop->next_block, "endloop", label_id);
   2045 	ctx->flow_depth--;
   2046 }
   2047 
   2048 static void if_cond_emit(struct ac_llvm_context *ctx, LLVMValueRef cond,
   2049 			 int label_id)
   2050 {
   2051 	struct ac_llvm_flow *flow = push_flow(ctx);
   2052 	LLVMBasicBlockRef if_block;
   2053 
   2054 	if_block = append_basic_block(ctx, "IF");
   2055 	flow->next_block = append_basic_block(ctx, "ELSE");
   2056 	set_basicblock_name(if_block, "if", label_id);
   2057 	LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
   2058 	LLVMPositionBuilderAtEnd(ctx->builder, if_block);
   2059 }
   2060 
   2061 void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value,
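/* Open an if-block on a floating-point condition: the "then" side is taken
 * when value != 0.0 (unordered compare, so NaN also enters the block).
 * ac_build_uif below is the integer variant (value != 0).
 */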
   2062 		 int label_id)
   2063 {
   2064 	LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
   2065 					  value, ctx->f32_0, "");
   2066 	if_cond_emit(ctx, cond, label_id);
   2067 }
   2068 
   2069 void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value,
   2070 		  int label_id)
   2071 {
   2072 	LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE,
   2073 					  ac_to_integer(ctx, value),
   2074 					  ctx->i32_0, "");
   2075 	if_cond_emit(ctx, cond, label_id);
   2076 }
   2077