Home | History | Annotate | Download | only in a4xx
      1 /* -*- mode: C; c-file-style: "k&r"; tab-width: 4; indent-tabs-mode: t; -*- */
      2 
      3 /*
      4  * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the "Software"),
      8  * to deal in the Software without restriction, including without limitation
      9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     10  * and/or sell copies of the Software, and to permit persons to whom the
     11  * Software is furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice (including the next
     14  * paragraph) shall be included in all copies or substantial portions of the
     15  * Software.
     16  *
     17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     23  * SOFTWARE.
     24  *
     25  * Authors:
     26  *    Rob Clark <robclark@freedesktop.org>
     27  */
     28 
     29 #include "pipe/p_state.h"
     30 #include "util/u_string.h"
     31 #include "util/u_memory.h"
     32 #include "util/u_inlines.h"
     33 #include "util/u_format.h"
     34 
     35 #include "freedreno_program.h"
     36 
     37 #include "fd4_program.h"
     38 #include "fd4_emit.h"
     39 #include "fd4_texture.h"
     40 #include "fd4_format.h"
     41 
     42 static void
     43 delete_shader_stateobj(struct fd4_shader_stateobj *so)
     44 {
     45 	ir3_shader_destroy(so->shader);
     46 	free(so);
     47 }
     48 
     49 static struct fd4_shader_stateobj *
     50 create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state *cso,
     51 		enum shader_t type)
     52 {
     53 	struct fd_context *ctx = fd_context(pctx);
     54 	struct ir3_compiler *compiler = ctx->screen->compiler;
     55 	struct fd4_shader_stateobj *so = CALLOC_STRUCT(fd4_shader_stateobj);
     56 	so->shader = ir3_shader_create(compiler, cso, type, &ctx->debug);
     57 	return so;
     58 }
     59 
     60 static void *
     61 fd4_fp_state_create(struct pipe_context *pctx,
     62 		const struct pipe_shader_state *cso)
     63 {
     64 	return create_shader_stateobj(pctx, cso, SHADER_FRAGMENT);
     65 }
     66 
     67 static void
     68 fd4_fp_state_delete(struct pipe_context *pctx, void *hwcso)
     69 {
     70 	struct fd4_shader_stateobj *so = hwcso;
     71 	delete_shader_stateobj(so);
     72 }
     73 
     74 static void *
     75 fd4_vp_state_create(struct pipe_context *pctx,
     76 		const struct pipe_shader_state *cso)
     77 {
     78 	return create_shader_stateobj(pctx, cso, SHADER_VERTEX);
     79 }
     80 
     81 static void
     82 fd4_vp_state_delete(struct pipe_context *pctx, void *hwcso)
     83 {
     84 	struct fd4_shader_stateobj *so = hwcso;
     85 	delete_shader_stateobj(so);
     86 }
     87 
     88 static void
     89 emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so)
     90 {
     91 	const struct ir3_info *si = &so->info;
     92 	enum adreno_state_block sb;
     93 	enum adreno_state_src src;
     94 	uint32_t i, sz, *bin;
     95 
     96 	if (so->type == SHADER_VERTEX) {
     97 		sb = SB_VERT_SHADER;
     98 	} else {
     99 		sb = SB_FRAG_SHADER;
    100 	}
    101 
    102 	if (fd_mesa_debug & FD_DBG_DIRECT) {
    103 		sz = si->sizedwords;
    104 		src = SS_DIRECT;
    105 		bin = fd_bo_map(so->bo);
    106 	} else {
    107 		sz = 0;
    108 		src = 2;  // enums different on a4xx..
    109 		bin = NULL;
    110 	}
    111 
    112 	OUT_PKT3(ring, CP_LOAD_STATE, 2 + sz);
    113 	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(0) |
    114 			CP_LOAD_STATE_0_STATE_SRC(src) |
    115 			CP_LOAD_STATE_0_STATE_BLOCK(sb) |
    116 			CP_LOAD_STATE_0_NUM_UNIT(so->instrlen));
    117 	if (bin) {
    118 		OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
    119 				CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER));
    120 	} else {
    121 		OUT_RELOC(ring, so->bo, 0,
    122 				CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER), 0);
    123 	}
    124 
    125 	/* for how clever coverity is, it is sometimes rather dull, and
    126 	 * doesn't realize that the only case where bin==NULL, sz==0:
    127 	 */
    128 	assume(bin || (sz == 0));
    129 
    130 	for (i = 0; i < sz; i++) {
    131 		OUT_RING(ring, bin[i]);
    132 	}
    133 }
    134 
    135 struct stage {
    136 	const struct ir3_shader_variant *v;
    137 	const struct ir3_info *i;
    138 	/* const sizes are in units of 4 * vec4 */
    139 	uint8_t constoff;
    140 	uint8_t constlen;
    141 	/* instr sizes are in units of 16 instructions */
    142 	uint8_t instroff;
    143 	uint8_t instrlen;
    144 };
    145 
    146 enum {
    147 	VS = 0,
    148 	FS = 1,
    149 	HS = 2,
    150 	DS = 3,
    151 	GS = 4,
    152 	MAX_STAGES
    153 };
    154 
    155 static void
    156 setup_stages(struct fd4_emit *emit, struct stage *s)
    157 {
    158 	unsigned i;
    159 
    160 	s[VS].v = fd4_emit_get_vp(emit);
    161 	s[FS].v = fd4_emit_get_fp(emit);
    162 
    163 	s[HS].v = s[DS].v = s[GS].v = NULL;  /* for now */
    164 
    165 	for (i = 0; i < MAX_STAGES; i++) {
    166 		if (s[i].v) {
    167 			s[i].i = &s[i].v->info;
    168 			/* constlen is in units of 4 * vec4: */
    169 			s[i].constlen = align(s[i].v->constlen, 4) / 4;
    170 			/* instrlen is already in units of 16 instr.. although
    171 			 * probably we should ditch that and not make the compiler
    172 			 * care about instruction group size of a3xx vs a4xx
    173 			 */
    174 			s[i].instrlen = s[i].v->instrlen;
    175 		} else {
    176 			s[i].i = NULL;
    177 			s[i].constlen = 0;
    178 			s[i].instrlen = 0;
    179 		}
    180 	}
    181 
    182 	/* NOTE: at least for gles2, blob partitions VS at bottom of const
    183 	 * space and FS taking entire remaining space.  We probably don't
    184 	 * need to do that the same way, but for now mimic what the blob
    185 	 * does to make it easier to diff against register values from blob
    186 	 *
    187 	 * NOTE: if VS.instrlen + FS.instrlen > 64, then one or both shaders
    188 	 * is run from external memory.
    189 	 */
    190 	if ((s[VS].instrlen + s[FS].instrlen) > 64) {
    191 		/* prioritize FS for internal memory: */
    192 		if (s[FS].instrlen < 64) {
    193 			/* if FS can fit, kick VS out to external memory: */
    194 			s[VS].instrlen = 0;
    195 		} else if (s[VS].instrlen < 64) {
    196 			/* otherwise if VS can fit, kick out FS: */
    197 			s[FS].instrlen = 0;
    198 		} else {
    199 			/* neither can fit, run both from external memory: */
    200 			s[VS].instrlen = 0;
    201 			s[FS].instrlen = 0;
    202 		}
    203 	}
    204 	s[VS].constlen = 66;
    205 	s[FS].constlen = 128 - s[VS].constlen;
    206 	s[VS].instroff = 0;
    207 	s[VS].constoff = 0;
    208 	s[FS].instroff = 64 - s[FS].instrlen;
    209 	s[FS].constoff = s[VS].constlen;
    210 	s[HS].instroff = s[DS].instroff = s[GS].instroff = s[FS].instroff;
    211 	s[HS].constoff = s[DS].constoff = s[GS].constoff = s[FS].constoff;
    212 }
    213 
    214 void
    215 fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
    216 		int nr, struct pipe_surface **bufs)
    217 {
    218 	struct stage s[MAX_STAGES];
    219 	uint32_t pos_regid, posz_regid, psize_regid, color_regid[8];
    220 	uint32_t face_regid, coord_regid, zwcoord_regid;
    221 	enum a3xx_threadsize fssz;
    222 	int constmode;
    223 	int i, j;
    224 
    225 	debug_assert(nr <= ARRAY_SIZE(color_regid));
    226 
    227 	if (emit->key.binning_pass)
    228 		nr = 0;
    229 
    230 	setup_stages(emit, s);
    231 
    232 	fssz = (s[FS].i->max_reg >= 24) ? TWO_QUADS : FOUR_QUADS;
    233 
    234 	/* blob seems to always use constmode currently: */
    235 	constmode = 1;
    236 
    237 	pos_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_POS);
    238 	if (pos_regid == regid(63, 0)) {
    239 		/* hw dislikes when there is no position output, which can
    240 		 * happen for transform-feedback vertex shaders.  Just tell
    241 		 * the hw to use r0.x, with whatever random value is there:
    242 		 */
    243 		pos_regid = regid(0, 0);
    244 	}
    245 	posz_regid = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DEPTH);
    246 	psize_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_PSIZ);
    247 	if (s[FS].v->color0_mrt) {
    248 		color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] =
    249 		color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] =
    250 			ir3_find_output_regid(s[FS].v, FRAG_RESULT_COLOR);
    251 	} else {
    252 		color_regid[0] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA0);
    253 		color_regid[1] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA1);
    254 		color_regid[2] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA2);
    255 		color_regid[3] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA3);
    256 		color_regid[4] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA4);
    257 		color_regid[5] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA5);
    258 		color_regid[6] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA6);
    259 		color_regid[7] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA7);
    260 	}
    261 
    262 	/* TODO get these dynamically: */
    263 	face_regid = s[FS].v->frag_face ? regid(0,0) : regid(63,0);
    264 	coord_regid = s[FS].v->frag_coord ? regid(0,0) : regid(63,0);
    265 	zwcoord_regid = s[FS].v->frag_coord ? regid(0,2) : regid(63,0);
    266 
    267 	/* we could probably divide this up into things that need to be
    268 	 * emitted if frag-prog is dirty vs if vert-prog is dirty..
    269 	 */
    270 
    271 	OUT_PKT0(ring, REG_A4XX_HLSQ_UPDATE_CONTROL, 1);
    272 	OUT_RING(ring, 0x00000003);
    273 
    274 	OUT_PKT0(ring, REG_A4XX_HLSQ_CONTROL_0_REG, 5);
    275 	OUT_RING(ring, A4XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(fssz) |
    276 			A4XX_HLSQ_CONTROL_0_REG_CONSTMODE(constmode) |
    277 			A4XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE |
    278 			/* NOTE:  I guess SHADERRESTART and CONSTFULLUPDATE maybe
    279 			 * flush some caches? I think we only need to set those
    280 			 * bits if we have updated const or shader..
    281 			 */
    282 			A4XX_HLSQ_CONTROL_0_REG_SPSHADERRESTART |
    283 			A4XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE);
    284 	OUT_RING(ring, A4XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) |
    285 			A4XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE |
    286 			A4XX_HLSQ_CONTROL_1_REG_COORDREGID(coord_regid) |
    287 			A4XX_HLSQ_CONTROL_1_REG_ZWCOORDREGID(zwcoord_regid));
    288 	OUT_RING(ring, A4XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(63) |
    289 			0x3f3f000 |           /* XXX */
    290 			A4XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid));
    291 	OUT_RING(ring, A4XX_HLSQ_CONTROL_3_REG_REGID(s[FS].v->pos_regid) |
    292 			0xfcfcfc00);
    293 	OUT_RING(ring, 0x00fcfcfc);   /* XXX HLSQ_CONTROL_4 */
    294 
    295 	OUT_PKT0(ring, REG_A4XX_HLSQ_VS_CONTROL_REG, 5);
    296 	OUT_RING(ring, A4XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(s[VS].constlen) |
    297 			A4XX_HLSQ_VS_CONTROL_REG_CONSTOBJECTOFFSET(s[VS].constoff) |
    298 			A4XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(s[VS].instrlen) |
    299 			A4XX_HLSQ_VS_CONTROL_REG_SHADEROBJOFFSET(s[VS].instroff));
    300 	OUT_RING(ring, A4XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(s[FS].constlen) |
    301 			A4XX_HLSQ_FS_CONTROL_REG_CONSTOBJECTOFFSET(s[FS].constoff) |
    302 			A4XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(s[FS].instrlen) |
    303 			A4XX_HLSQ_FS_CONTROL_REG_SHADEROBJOFFSET(s[FS].instroff));
    304 	OUT_RING(ring, A4XX_HLSQ_HS_CONTROL_REG_CONSTLENGTH(s[HS].constlen) |
    305 			A4XX_HLSQ_HS_CONTROL_REG_CONSTOBJECTOFFSET(s[HS].constoff) |
    306 			A4XX_HLSQ_HS_CONTROL_REG_INSTRLENGTH(s[HS].instrlen) |
    307 			A4XX_HLSQ_HS_CONTROL_REG_SHADEROBJOFFSET(s[HS].instroff));
    308 	OUT_RING(ring, A4XX_HLSQ_DS_CONTROL_REG_CONSTLENGTH(s[DS].constlen) |
    309 			A4XX_HLSQ_DS_CONTROL_REG_CONSTOBJECTOFFSET(s[DS].constoff) |
    310 			A4XX_HLSQ_DS_CONTROL_REG_INSTRLENGTH(s[DS].instrlen) |
    311 			A4XX_HLSQ_DS_CONTROL_REG_SHADEROBJOFFSET(s[DS].instroff));
    312 	OUT_RING(ring, A4XX_HLSQ_GS_CONTROL_REG_CONSTLENGTH(s[GS].constlen) |
    313 			A4XX_HLSQ_GS_CONTROL_REG_CONSTOBJECTOFFSET(s[GS].constoff) |
    314 			A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH(s[GS].instrlen) |
    315 			A4XX_HLSQ_GS_CONTROL_REG_SHADEROBJOFFSET(s[GS].instroff));
    316 
    317 	OUT_PKT0(ring, REG_A4XX_SP_SP_CTRL_REG, 1);
    318 	OUT_RING(ring, 0x140010 | /* XXX */
    319 			COND(emit->key.binning_pass, A4XX_SP_SP_CTRL_REG_BINNING_PASS));
    320 
    321 	OUT_PKT0(ring, REG_A4XX_SP_INSTR_CACHE_CTRL, 1);
    322 	OUT_RING(ring, 0x7f | /* XXX */
    323 			COND(s[VS].instrlen, A4XX_SP_INSTR_CACHE_CTRL_VS_BUFFER) |
    324 			COND(s[FS].instrlen, A4XX_SP_INSTR_CACHE_CTRL_FS_BUFFER) |
    325 			COND(s[VS].instrlen && s[FS].instrlen,
    326 					A4XX_SP_INSTR_CACHE_CTRL_INSTR_BUFFER));
    327 
    328 	OUT_PKT0(ring, REG_A4XX_SP_VS_LENGTH_REG, 1);
    329 	OUT_RING(ring, s[VS].v->instrlen);      /* SP_VS_LENGTH_REG */
    330 
    331 	OUT_PKT0(ring, REG_A4XX_SP_VS_CTRL_REG0, 3);
    332 	OUT_RING(ring, A4XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) |
    333 			A4XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(s[VS].i->max_half_reg + 1) |
    334 			A4XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(s[VS].i->max_reg + 1) |
    335 			A4XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) |
    336 			A4XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) |
    337 			A4XX_SP_VS_CTRL_REG0_SUPERTHREADMODE |
    338 			COND(s[VS].v->has_samp, A4XX_SP_VS_CTRL_REG0_PIXLODENABLE));
    339 	OUT_RING(ring, A4XX_SP_VS_CTRL_REG1_CONSTLENGTH(s[VS].constlen) |
    340 			A4XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(s[VS].v->total_in));
    341 	OUT_RING(ring, A4XX_SP_VS_PARAM_REG_POSREGID(pos_regid) |
    342 			A4XX_SP_VS_PARAM_REG_PSIZEREGID(psize_regid) |
    343 			A4XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(s[FS].v->varying_in));
    344 
    345 	struct ir3_shader_linkage l = {0};
    346 	ir3_link_shaders(&l, s[VS].v, s[FS].v);
    347 
    348 	for (i = 0, j = 0; (i < 16) && (j < l.cnt); i++) {
    349 		uint32_t reg = 0;
    350 
    351 		OUT_PKT0(ring, REG_A4XX_SP_VS_OUT_REG(i), 1);
    352 
    353 		reg |= A4XX_SP_VS_OUT_REG_A_REGID(l.var[j].regid);
    354 		reg |= A4XX_SP_VS_OUT_REG_A_COMPMASK(l.var[j].compmask);
    355 		j++;
    356 
    357 		reg |= A4XX_SP_VS_OUT_REG_B_REGID(l.var[j].regid);
    358 		reg |= A4XX_SP_VS_OUT_REG_B_COMPMASK(l.var[j].compmask);
    359 		j++;
    360 
    361 		OUT_RING(ring, reg);
    362 	}
    363 
    364 	for (i = 0, j = 0; (i < 8) && (j < l.cnt); i++) {
    365 		uint32_t reg = 0;
    366 
    367 		OUT_PKT0(ring, REG_A4XX_SP_VS_VPC_DST_REG(i), 1);
    368 
    369 		reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc + 8);
    370 		reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc + 8);
    371 		reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc + 8);
    372 		reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc + 8);
    373 
    374 		OUT_RING(ring, reg);
    375 	}
    376 
    377 	OUT_PKT0(ring, REG_A4XX_SP_VS_OBJ_OFFSET_REG, 2);
    378 	OUT_RING(ring, A4XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[VS].constoff) |
    379 			A4XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[VS].instroff));
    380 	OUT_RELOC(ring, s[VS].v->bo, 0, 0, 0);  /* SP_VS_OBJ_START_REG */
    381 
    382 	if (emit->key.binning_pass) {
    383 		OUT_PKT0(ring, REG_A4XX_SP_FS_LENGTH_REG, 1);
    384 		OUT_RING(ring, 0x00000000);         /* SP_FS_LENGTH_REG */
    385 
    386 		OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG0, 2);
    387 		OUT_RING(ring, A4XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
    388 				COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG0_VARYING) |
    389 				A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(0) |
    390 				A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(0) |
    391 				A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
    392 				A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) |
    393 				A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE);
    394 		OUT_RING(ring, A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) |
    395 				0x80000000);
    396 
    397 		OUT_PKT0(ring, REG_A4XX_SP_FS_OBJ_OFFSET_REG, 2);
    398 		OUT_RING(ring, A4XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[FS].constoff) |
    399 				A4XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[FS].instroff));
    400 		OUT_RING(ring, 0x00000000);
    401 	} else {
    402 		OUT_PKT0(ring, REG_A4XX_SP_FS_LENGTH_REG, 1);
    403 		OUT_RING(ring, s[FS].v->instrlen);  /* SP_FS_LENGTH_REG */
    404 
    405 		OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG0, 2);
    406 		OUT_RING(ring, A4XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
    407 				COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG0_VARYING) |
    408 				A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) |
    409 				A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) |
    410 				A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
    411 				A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) |
    412 				A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE |
    413 				COND(s[FS].v->has_samp, A4XX_SP_FS_CTRL_REG0_PIXLODENABLE));
    414 		OUT_RING(ring, A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) |
    415 				0x80000000 |      /* XXX */
    416 				COND(s[FS].v->frag_face, A4XX_SP_FS_CTRL_REG1_FACENESS) |
    417 				COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG1_VARYING) |
    418 				COND(s[FS].v->frag_coord, A4XX_SP_FS_CTRL_REG1_FRAGCOORD));
    419 
    420 		OUT_PKT0(ring, REG_A4XX_SP_FS_OBJ_OFFSET_REG, 2);
    421 		OUT_RING(ring, A4XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[FS].constoff) |
    422 				A4XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[FS].instroff));
    423 		OUT_RELOC(ring, s[FS].v->bo, 0, 0, 0);  /* SP_FS_OBJ_START_REG */
    424 	}
    425 
    426 	OUT_PKT0(ring, REG_A4XX_SP_HS_OBJ_OFFSET_REG, 1);
    427 	OUT_RING(ring, A4XX_SP_HS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[HS].constoff) |
    428 			A4XX_SP_HS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[HS].instroff));
    429 
    430 	OUT_PKT0(ring, REG_A4XX_SP_DS_OBJ_OFFSET_REG, 1);
    431 	OUT_RING(ring, A4XX_SP_DS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[DS].constoff) |
    432 			A4XX_SP_DS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[DS].instroff));
    433 
    434 	OUT_PKT0(ring, REG_A4XX_SP_GS_OBJ_OFFSET_REG, 1);
    435 	OUT_RING(ring, A4XX_SP_GS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[GS].constoff) |
    436 			A4XX_SP_GS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[GS].instroff));
    437 
    438 	OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL2, 1);
    439 	OUT_RING(ring, A4XX_RB_RENDER_CONTROL2_MSAA_SAMPLES(0) |
    440 			COND(s[FS].v->total_in > 0, A4XX_RB_RENDER_CONTROL2_VARYING) |
    441 			COND(s[FS].v->frag_face, A4XX_RB_RENDER_CONTROL2_FACENESS) |
    442 			COND(s[FS].v->frag_coord, A4XX_RB_RENDER_CONTROL2_XCOORD |
    443 					A4XX_RB_RENDER_CONTROL2_YCOORD |
    444 					A4XX_RB_RENDER_CONTROL2_ZCOORD |
    445 					A4XX_RB_RENDER_CONTROL2_WCOORD));
    446 
    447 	OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT_REG, 1);
    448 	OUT_RING(ring, A4XX_RB_FS_OUTPUT_REG_MRT(nr) |
    449 			COND(s[FS].v->writes_pos, A4XX_RB_FS_OUTPUT_REG_FRAG_WRITES_Z));
    450 
    451 	OUT_PKT0(ring, REG_A4XX_SP_FS_OUTPUT_REG, 1);
    452 	OUT_RING(ring, A4XX_SP_FS_OUTPUT_REG_MRT(nr) |
    453 			COND(s[FS].v->writes_pos, A4XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE) |
    454 			A4XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid));
    455 
    456 	OUT_PKT0(ring, REG_A4XX_SP_FS_MRT_REG(0), 8);
    457 	for (i = 0; i < 8; i++) {
    458 		enum a4xx_color_fmt format = 0;
    459 		bool srgb = false;
    460 		if (i < nr) {
    461 			format = fd4_emit_format(bufs[i]);
    462 			if (bufs[i] && !emit->no_decode_srgb)
    463 				srgb = util_format_is_srgb(bufs[i]->format);
    464 		}
    465 		OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(color_regid[i]) |
    466 				A4XX_SP_FS_MRT_REG_MRTFORMAT(format) |
    467 				COND(srgb, A4XX_SP_FS_MRT_REG_COLOR_SRGB) |
    468 				COND(emit->key.half_precision,
    469 					A4XX_SP_FS_MRT_REG_HALF_PRECISION));
    470 	}
    471 
    472 	if (emit->key.binning_pass) {
    473 		OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2);
    474 		OUT_RING(ring, A4XX_VPC_ATTR_THRDASSIGN(1) |
    475 				0x40000000 |      /* XXX */
    476 				COND(s[VS].v->writes_psize, A4XX_VPC_ATTR_PSIZE));
    477 		OUT_RING(ring, 0x00000000);
    478 	} else {
    479 		uint32_t vinterp[8], vpsrepl[8];
    480 
    481 		memset(vinterp, 0, sizeof(vinterp));
    482 		memset(vpsrepl, 0, sizeof(vpsrepl));
    483 
    484 		/* looks like we need to do int varyings in the frag
    485 		 * shader on a4xx (no flatshad reg?  or a420.0 bug?):
    486 		 *
    487 		 *    (sy)(ss)nop
    488 		 *    (sy)ldlv.u32 r0.x,l[r0.x], 1
    489 		 *    ldlv.u32 r0.y,l[r0.x+1], 1
    490 		 *    (ss)bary.f (ei)r63.x, 0, r0.x
    491 		 *    (ss)(rpt1)cov.s32f16 hr0.x, (r)r0.x
    492 		 *    (rpt5)nop
    493 		 *    sam (f16)(xyzw)hr0.x, hr0.x, s#0, t#0
    494 		 *
    495 		 * Possibly on later a4xx variants we'll be able to use
    496 		 * something like the code below instead of workaround
    497 		 * in the shader:
    498 		 */
    499 		/* figure out VARYING_INTERP / VARYING_PS_REPL register values: */
    500 		for (j = -1; (j = ir3_next_varying(s[FS].v, j)) < (int)s[FS].v->inputs_count; ) {
    501 			/* NOTE: varyings are packed, so if compmask is 0xb
    502 			 * then first, third, and fourth component occupy
    503 			 * three consecutive varying slots:
    504 			 */
    505 			unsigned compmask = s[FS].v->inputs[j].compmask;
    506 
    507 			uint32_t inloc = s[FS].v->inputs[j].inloc;
    508 
    509 			if ((s[FS].v->inputs[j].interpolate == INTERP_MODE_FLAT) ||
    510 					(s[FS].v->inputs[j].rasterflat && emit->rasterflat)) {
    511 				uint32_t loc = inloc;
    512 
    513 				for (i = 0; i < 4; i++) {
    514 					if (compmask & (1 << i)) {
    515 						vinterp[loc / 16] |= 1 << ((loc % 16) * 2);
    516 						//flatshade[loc / 32] |= 1 << (loc % 32);
    517 						loc++;
    518 					}
    519 				}
    520 			}
    521 
    522 			gl_varying_slot slot = s[FS].v->inputs[j].slot;
    523 
    524 			/* since we don't enable PIPE_CAP_TGSI_TEXCOORD: */
    525 			if (slot >= VARYING_SLOT_VAR0) {
    526 				unsigned texmask = 1 << (slot - VARYING_SLOT_VAR0);
    527 				/* Replace the .xy coordinates with S/T from the point sprite. Set
    528 				 * interpolation bits for .zw such that they become .01
    529 				 */
    530 				if (emit->sprite_coord_enable & texmask) {
    531 					/* mask is two 2-bit fields, where:
    532 					 *   '01' -> S
    533 					 *   '10' -> T
    534 					 *   '11' -> 1 - T  (flip mode)
    535 					 */
    536 					unsigned mask = emit->sprite_coord_mode ? 0b1101 : 0b1001;
    537 					uint32_t loc = inloc;
    538 					if (compmask & 0x1) {
    539 						vpsrepl[loc / 16] |= ((mask >> 0) & 0x3) << ((loc % 16) * 2);
    540 						loc++;
    541 					}
    542 					if (compmask & 0x2) {
    543 						vpsrepl[loc / 16] |= ((mask >> 2) & 0x3) << ((loc % 16) * 2);
    544 						loc++;
    545 					}
    546 					if (compmask & 0x4) {
    547 						/* .z <- 0.0f */
    548 						vinterp[loc / 16] |= 0b10 << ((loc % 16) * 2);
    549 						loc++;
    550 					}
    551 					if (compmask & 0x8) {
    552 						/* .w <- 1.0f */
    553 						vinterp[loc / 16] |= 0b11 << ((loc % 16) * 2);
    554 						loc++;
    555 					}
    556 				}
    557 			}
    558 		}
    559 
    560 		OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2);
    561 		OUT_RING(ring, A4XX_VPC_ATTR_TOTALATTR(s[FS].v->total_in) |
    562 				A4XX_VPC_ATTR_THRDASSIGN(1) |
    563 				COND(s[FS].v->total_in > 0, A4XX_VPC_ATTR_ENABLE) |
    564 				0x40000000 |      /* XXX */
    565 				COND(s[VS].v->writes_psize, A4XX_VPC_ATTR_PSIZE));
    566 		OUT_RING(ring, A4XX_VPC_PACK_NUMFPNONPOSVAR(s[FS].v->total_in) |
    567 				A4XX_VPC_PACK_NUMNONPOSVSVAR(s[FS].v->total_in));
    568 
    569 		OUT_PKT0(ring, REG_A4XX_VPC_VARYING_INTERP_MODE(0), 8);
    570 		for (i = 0; i < 8; i++)
    571 			OUT_RING(ring, vinterp[i]);     /* VPC_VARYING_INTERP[i].MODE */
    572 
    573 		OUT_PKT0(ring, REG_A4XX_VPC_VARYING_PS_REPL_MODE(0), 8);
    574 		for (i = 0; i < 8; i++)
    575 			OUT_RING(ring, vpsrepl[i]);   /* VPC_VARYING_PS_REPL[i] */
    576 	}
    577 
    578 	if (s[VS].instrlen)
    579 		emit_shader(ring, s[VS].v);
    580 
    581 	if (!emit->key.binning_pass)
    582 		if (s[FS].instrlen)
    583 			emit_shader(ring, s[FS].v);
    584 }
    585 
    586 void
    587 fd4_prog_init(struct pipe_context *pctx)
    588 {
    589 	pctx->create_fs_state = fd4_fp_state_create;
    590 	pctx->delete_fs_state = fd4_fp_state_delete;
    591 
    592 	pctx->create_vs_state = fd4_vp_state_create;
    593 	pctx->delete_vs_state = fd4_vp_state_delete;
    594 
    595 	fd_prog_init(pctx);
    596 }
    597