Home | History | Annotate | Download | only in r600
      1 /*
      2  * Copyright 2010 Jerome Glisse <glisse (at) freedesktop.org>
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * on the rights to use, copy, modify, merge, publish, distribute, sub
      8  * license, and/or sell copies of the Software, and to permit persons to whom
      9  * the Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
     18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
     19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
     20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
     21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
     22  */
     23 #include "r600_sq.h"
     24 #include "r600_opcodes.h"
     25 #include "r600_formats.h"
     26 #include "r600d.h"
     27 
     28 #include <errno.h>
     29 #include <byteswap.h>
     30 #include "util/u_memory.h"
     31 #include "pipe/p_shader_tokens.h"
     32 
/* First dimension of alu_bank_swizzle.hw_gpr (indexed by cycle). */
#define NUM_OF_CYCLES 3
/* Second dimension of alu_bank_swizzle.hw_gpr (indexed by channel). */
#define NUM_OF_COMPONENTS 4
     35 
     36 static inline unsigned int r600_bytecode_get_num_operands(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
     37 {
     38 	if(alu->is_op3)
     39 		return 3;
     40 
     41 	switch (bc->chip_class) {
     42 	case R600:
     43 	case R700:
     44 		switch (alu->inst) {
     45 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP:
     46 			return 0;
     47 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD:
     48 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT:
     49 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT:
     50 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT:
     51 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT:
     52 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE:
     53 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT:
     54 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE:
     55 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE:
     56 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL:
     57 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL_IEEE:
     58 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT:
     59 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT:
     60 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT:
     61 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT:
     62 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX:
     63 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN:
     64 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT:
     65 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT:
     66 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT:
     67 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT:
     68 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE:
     69 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT:
     70 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE:
     71 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT:
     72 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT:
     73 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT:
     74 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT:
     75 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE:
     76 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT:
     77 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT:
     78 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
     79 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT:
     80 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE:
     81 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE:
     82 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT:
     83 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT:
     84 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4:
     85 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE:
     86 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE:
     87 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT:
     88 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT:
     89 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT:
     90 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT:
     91 			return 2;
     92 
     93 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
     94 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA:
     95 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR:
     96 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT:
     97 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
     98 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
     99 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL:
    100 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
    101 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
    102 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
    103 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED:
    104 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE:
    105 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED:
    106 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE:
    107 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT:
    108 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT:
    109 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED:
    110 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE:
    111 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT:
    112 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT:
    113 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT:
    114 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT:
    115 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN:
    116 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS:
    117 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE:
    118 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT:
    119 			return 1;
    120 		default: R600_ERR(
    121 			"Need instruction operand number for 0x%x.\n", alu->inst);
    122 		}
    123 		break;
    124 	case EVERGREEN:
    125 	case CAYMAN:
    126 		switch (alu->inst) {
    127 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP:
    128 			return 0;
    129 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD:
    130 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT:
    131 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT:
    132 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT:
    133 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT:
    134 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE:
    135 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT:
    136 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE:
    137 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE:
    138 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL:
    139 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL_IEEE:
    140 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT:
    141 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT:
    142 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT:
    143 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT:
    144 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX:
    145 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN:
    146 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT:
    147 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT:
    148 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT:
    149 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT:
    150 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE:
    151 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT:
    152 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE:
    153 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT:
    154 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT:
    155 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT:
    156 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT:
    157 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE:
    158 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT:
    159 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT:
    160 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
    161 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT:
    162 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT:
    163 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE:
    164 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE:
    165 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT:
    166 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4:
    167 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE:
    168 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE:
    169 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_XY:
    170 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_ZW:
    171 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT:
    172 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT:
    173 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT:
    174 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT:
    175 			return 2;
    176 
    177 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
    178 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
    179 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
    180 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL:
    181 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
    182 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
    183 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
    184 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED:
    185 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE:
    186 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED:
    187 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE:
    188 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED:
    189 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE:
    190 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT:
    191 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR:
    192 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT:
    193 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT:
    194 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT:
    195 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN:
    196 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS:
    197 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE:
    198 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT:
    199 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_LOAD_P0:
    200 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT:
    201 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT:
    202 			return 1;
    203 		default: R600_ERR(
    204 			"Need instruction operand number for 0x%x.\n", alu->inst);
    205 		}
    206 		break;
    207 	}
    208 
    209 	return 3;
    210 }
    211 
/* Forward declaration; the definition is not part of this section of the file. */
int r700_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, unsigned id);
    213 
    214 static struct r600_bytecode_cf *r600_bytecode_cf(void)
    215 {
    216 	struct r600_bytecode_cf *cf = CALLOC_STRUCT(r600_bytecode_cf);
    217 
    218 	if (cf == NULL)
    219 		return NULL;
    220 	LIST_INITHEAD(&cf->list);
    221 	LIST_INITHEAD(&cf->alu);
    222 	LIST_INITHEAD(&cf->vtx);
    223 	LIST_INITHEAD(&cf->tex);
    224 	return cf;
    225 }
    226 
    227 static struct r600_bytecode_alu *r600_bytecode_alu(void)
    228 {
    229 	struct r600_bytecode_alu *alu = CALLOC_STRUCT(r600_bytecode_alu);
    230 
    231 	if (alu == NULL)
    232 		return NULL;
    233 	LIST_INITHEAD(&alu->list);
    234 	return alu;
    235 }
    236 
    237 static struct r600_bytecode_vtx *r600_bytecode_vtx(void)
    238 {
    239 	struct r600_bytecode_vtx *vtx = CALLOC_STRUCT(r600_bytecode_vtx);
    240 
    241 	if (vtx == NULL)
    242 		return NULL;
    243 	LIST_INITHEAD(&vtx->list);
    244 	return vtx;
    245 }
    246 
    247 static struct r600_bytecode_tex *r600_bytecode_tex(void)
    248 {
    249 	struct r600_bytecode_tex *tex = CALLOC_STRUCT(r600_bytecode_tex);
    250 
    251 	if (tex == NULL)
    252 		return NULL;
    253 	LIST_INITHEAD(&tex->list);
    254 	return tex;
    255 }
    256 
    257 void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class, enum radeon_family family)
    258 {
    259 	if ((chip_class == R600) &&
    260 	    (family != CHIP_RV670 && family != CHIP_RS780 && family != CHIP_RS880)) {
    261 		bc->ar_handling = AR_HANDLE_RV6XX;
    262 		bc->r6xx_nop_after_rel_dst = 1;
    263 	} else {
    264 		bc->ar_handling = AR_HANDLE_NORMAL;
    265 		bc->r6xx_nop_after_rel_dst = 0;
    266 	}
    267 
    268 	LIST_INITHEAD(&bc->cf);
    269 	bc->chip_class = chip_class;
    270 }
    271 
    272 static int r600_bytecode_add_cf(struct r600_bytecode *bc)
    273 {
    274 	struct r600_bytecode_cf *cf = r600_bytecode_cf();
    275 
    276 	if (cf == NULL)
    277 		return -ENOMEM;
    278 	LIST_ADDTAIL(&cf->list, &bc->cf);
    279 	if (bc->cf_last) {
    280 		cf->id = bc->cf_last->id + 2;
    281 		if (bc->cf_last->eg_alu_extended) {
    282 			/* take into account extended alu size */
    283 			cf->id += 2;
    284 			bc->ndw += 2;
    285 		}
    286 	}
    287 	bc->cf_last = cf;
    288 	bc->ncf++;
    289 	bc->ndw += 2;
    290 	bc->force_add_cf = 0;
    291 	bc->ar_loaded = 0;
    292 	return 0;
    293 }
    294 
    295 int r600_bytecode_add_output(struct r600_bytecode *bc, const struct r600_bytecode_output *output)
    296 {
    297 	int r;
    298 
    299 	if (output->gpr >= bc->ngpr)
    300 		bc->ngpr = output->gpr + 1;
    301 
    302 	if (bc->cf_last && (bc->cf_last->inst == output->inst ||
    303 		(bc->cf_last->inst == BC_INST(bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT) &&
    304 		output->inst == BC_INST(bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE))) &&
    305 		output->type == bc->cf_last->output.type &&
    306 		output->elem_size == bc->cf_last->output.elem_size &&
    307 		output->swizzle_x == bc->cf_last->output.swizzle_x &&
    308 		output->swizzle_y == bc->cf_last->output.swizzle_y &&
    309 		output->swizzle_z == bc->cf_last->output.swizzle_z &&
    310 		output->swizzle_w == bc->cf_last->output.swizzle_w &&
    311 		(output->burst_count + bc->cf_last->output.burst_count) <= 16) {
    312 
    313 		if ((output->gpr + output->burst_count) == bc->cf_last->output.gpr &&
    314 			(output->array_base + output->burst_count) == bc->cf_last->output.array_base) {
    315 
    316 			bc->cf_last->output.end_of_program |= output->end_of_program;
    317 			bc->cf_last->output.inst = output->inst;
    318 			bc->cf_last->output.gpr = output->gpr;
    319 			bc->cf_last->output.array_base = output->array_base;
    320 			bc->cf_last->output.burst_count += output->burst_count;
    321 			return 0;
    322 
    323 		} else if (output->gpr == (bc->cf_last->output.gpr + bc->cf_last->output.burst_count) &&
    324 			output->array_base == (bc->cf_last->output.array_base + bc->cf_last->output.burst_count)) {
    325 
    326 			bc->cf_last->output.end_of_program |= output->end_of_program;
    327 			bc->cf_last->output.inst = output->inst;
    328 			bc->cf_last->output.burst_count += output->burst_count;
    329 			return 0;
    330 		}
    331 	}
    332 
    333 	r = r600_bytecode_add_cf(bc);
    334 	if (r)
    335 		return r;
    336 	bc->cf_last->inst = output->inst;
    337 	memcpy(&bc->cf_last->output, output, sizeof(struct r600_bytecode_output));
    338 	return 0;
    339 }
    340 
    341 /* alu instructions that can ony exits once per group */
    342 static int is_alu_once_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
    343 {
    344 	switch (bc->chip_class) {
    345 	case R600:
    346 	case R700:
    347 		return !alu->is_op3 && (
    348 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
    349 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
    350 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
    351 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
    352 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
    353 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
    354 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
    355 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
    356 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
    357 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT ||
    358 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT ||
    359 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT ||
    360 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE ||
    361 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT ||
    362 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE ||
    363 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE ||
    364 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV ||
    365 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP ||
    366 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR ||
    367 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE ||
    368 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH ||
    369 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH ||
    370 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH ||
    371 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH ||
    372 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT ||
    373 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT ||
    374 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT ||
    375 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT ||
    376 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT ||
    377 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT ||
    378 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT ||
    379 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT ||
    380 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
    381 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
    382 	case EVERGREEN:
    383 	case CAYMAN:
    384 	default:
    385 		return !alu->is_op3 && (
    386 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
    387 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
    388 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
    389 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
    390 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
    391 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
    392 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
    393 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
    394 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
    395 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT ||
    396 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT ||
    397 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT ||
    398 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE ||
    399 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT ||
    400 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE ||
    401 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE ||
    402 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV ||
    403 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP ||
    404 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR ||
    405 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE ||
    406 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH ||
    407 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH ||
    408 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH ||
    409 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH ||
    410 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT ||
    411 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT ||
    412 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT ||
    413 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT ||
    414 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT ||
    415 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT ||
    416 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT ||
    417 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT ||
    418 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
    419 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
    420 	}
    421 }
    422 
    423 static int is_alu_reduction_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
    424 {
    425 	switch (bc->chip_class) {
    426 	case R600:
    427 	case R700:
    428 		return !alu->is_op3 && (
    429 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
    430 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4 ||
    431 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
    432 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
    433 	case EVERGREEN:
    434 	case CAYMAN:
    435 	default:
    436 		return !alu->is_op3 && (
    437 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
    438 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4 ||
    439 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
    440 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
    441 	}
    442 }
    443 
    444 static int is_alu_cube_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
    445 {
    446 	switch (bc->chip_class) {
    447 	case R600:
    448 	case R700:
    449 		return !alu->is_op3 &&
    450 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE;
    451 	case EVERGREEN:
    452 	case CAYMAN:
    453 	default:
    454 		return !alu->is_op3 &&
    455 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE;
    456 	}
    457 }
    458 
    459 static int is_alu_mova_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
    460 {
    461 	switch (bc->chip_class) {
    462 	case R600:
    463 	case R700:
    464 		return !alu->is_op3 && (
    465 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA ||
    466 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR ||
    467 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT ||
    468 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT);
    469 	case EVERGREEN:
    470 	case CAYMAN:
    471 	default:
    472 		return !alu->is_op3 && (
    473 			alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
    474 	}
    475 }
    476 
    477 static int is_opcode_in_range(unsigned opcode, unsigned min, unsigned max)
    478 {
    479 	return min <= opcode && opcode <= max;
    480 }
    481 
    482 /* ALU instructions that can only execute on the vector unit:
    483  *
    484  * opcode ranges:
    485  * R6xx/R7xx:
    486  *   op3 : [0x08 - 0x0B]
    487  *   op2 : 0x07, [0x15 - 0x18], [0x1B - 0x1D], [0x50 - 0x53], [0x7A - 0x7E]
    488  *
    489  * EVERGREEN:
    490  *   op3: [0x04 - 0x11]
    491  *   op2: [0xA0 - 0xE2]
    492  */
    493 static int is_alu_vec_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
    494 {
    495 	switch (bc->chip_class) {
    496 	case R600:
    497 	case R700:
    498 		if (alu->is_op3)
    499 			return is_opcode_in_range(alu->inst,
    500 					V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_64,
    501 					V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD_64_D2);
    502 		else
    503 			return (alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FREXP_64) ||
    504 					is_opcode_in_range(alu->inst,
    505 						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA,
    506 						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT) ||
    507 					is_opcode_in_range(alu->inst,
    508 						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL_64,
    509 						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT32_TO_FLT64) ||
    510 					is_opcode_in_range(alu->inst,
    511 						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4,
    512 						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4) ||
    513 					is_opcode_in_range(alu->inst,
    514 						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LDEXP_64,
    515 						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_64);
    516 
    517 	case EVERGREEN:
    518 		if (alu->is_op3)
    519 			return is_opcode_in_range(alu->inst,
    520 					EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_BFE_UINT,
    521 					EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_LDS_IDX_OP);
    522 		else
    523 			return is_opcode_in_range(alu->inst,
    524 					EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_BFM_INT,
    525 					EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_LOAD_P20);
    526 	case CAYMAN:
    527 	default:
    528 		assert(0);
    529 		return 0;
    530 	}
    531 }
    532 
    533 /* ALU instructions that can only execute on the trans unit:
    534  *
    535  * opcode ranges:
    536  * R600:
    537  *   op3: 0x0C
    538  *   op2: [0x60 - 0x79]
    539  *
    540  * R700:
    541  *   op3: 0x0C
    542  *   op2: [0x60 - 0x6F], [0x73 - 0x79]
    543  *
    544  * EVERGREEN:
    545  *   op3: 0x1F
    546  *   op2: [0x81 - 0x9C]
    547  */
    548 static int is_alu_trans_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
    549 {
    550 
    551 	switch (bc->chip_class) {
    552 	case R600:
    553 		if (alu->is_op3)
    554 			return alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT;
    555 		else
    556 			return is_opcode_in_range(alu->inst,
    557 					V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT,
    558 					V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT);
    559 	case R700:
    560 		if (alu->is_op3)
    561 			return alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT;
    562 		else
    563 			return is_opcode_in_range(alu->inst,
    564 						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT,
    565 						V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS) ||
    566 					is_opcode_in_range(alu->inst,
    567 							V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT,
    568 							V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT);
    569 	case EVERGREEN:
    570 		if (alu->is_op3)
    571 			return alu->inst == EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT;
    572 		else
    573 			return is_opcode_in_range(alu->inst,
    574 					EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE,
    575 					EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT);
    576 	case CAYMAN:
    577 	default:
    578 		assert(0);
    579 		return 0;
    580 	}
    581 }
    582 
    583 /* alu instructions that can execute on any unit */
    584 static int is_alu_any_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
    585 {
    586 	return !is_alu_vec_unit_inst(bc, alu) &&
    587 		!is_alu_trans_unit_inst(bc, alu);
    588 }
    589 
    590 static int is_nop_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
    591 {
    592 	switch (bc->chip_class) {
    593 	case R600:
    594 	case R700:
    595 		return (!alu->is_op3 && alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP);
    596 	case EVERGREEN:
    597 	case CAYMAN:
    598 	default:
    599 		return (!alu->is_op3 && alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP);
    600 	}
    601 }
    602 
    603 static int assign_alu_units(struct r600_bytecode *bc, struct r600_bytecode_alu *alu_first,
    604 			    struct r600_bytecode_alu *assignment[5])
    605 {
    606 	struct r600_bytecode_alu *alu;
    607 	unsigned i, chan, trans;
    608 	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
    609 
    610 	for (i = 0; i < max_slots; i++)
    611 		assignment[i] = NULL;
    612 
    613 	for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bytecode_alu, alu->list.next, list)) {
    614 		chan = alu->dst.chan;
    615 		if (max_slots == 4)
    616 			trans = 0;
    617 		else if (is_alu_trans_unit_inst(bc, alu))
    618 			trans = 1;
    619 		else if (is_alu_vec_unit_inst(bc, alu))
    620 			trans = 0;
    621 		else if (assignment[chan])
    622 			trans = 1; /* Assume ALU_INST_PREFER_VECTOR. */
    623 		else
    624 			trans = 0;
    625 
    626 		if (trans) {
    627 			if (assignment[4]) {
    628 				assert(0); /* ALU.Trans has already been allocated. */
    629 				return -1;
    630 			}
    631 			assignment[4] = alu;
    632 		} else {
    633 			if (assignment[chan]) {
    634 				assert(0); /* ALU.chan has already been allocated. */
    635 				return -1;
    636 			}
    637 			assignment[chan] = alu;
    638 		}
    639 
    640 		if (alu->last)
    641 			break;
    642 	}
    643 	return 0;
    644 }
    645 
/*
 * Read-port reservation state, cleared by init_bank_swizzle() and filled in
 * by reserve_gpr() and reserve_cfile().  An entry of -1 means "unused".
 */
struct alu_bank_swizzle {
	/* GPR selector reserved for each [cycle][channel] pair. */
	int	hw_gpr[NUM_OF_CYCLES][NUM_OF_COMPONENTS];
	/* Constant selector held by each constant read port. */
	int	hw_cfile_addr[4];
	/* Element recorded alongside hw_cfile_addr (the channel, halved on R700 and later). */
	int	hw_cfile_elem[4];
};
    651 
/*
 * Cycle used by each source operand for a given vector bank swizzle:
 * row = bank swizzle mode, column = source operand index, value = cycle
 * index passed to reserve_gpr().
 */
static const unsigned cycle_for_bank_swizzle_vec[][3] = {
	[SQ_ALU_VEC_012] = { 0, 1, 2 },
	[SQ_ALU_VEC_021] = { 0, 2, 1 },
	[SQ_ALU_VEC_120] = { 1, 2, 0 },
	[SQ_ALU_VEC_102] = { 1, 0, 2 },
	[SQ_ALU_VEC_201] = { 2, 0, 1 },
	[SQ_ALU_VEC_210] = { 2, 1, 0 }
};
    660 
/*
 * Per-operand table for the SQ_ALU_SCL_* bank swizzle modes.
 * NOTE(review): presumably the scalar counterpart of
 * cycle_for_bank_swizzle_vec (row = mode, column = source index,
 * value = cycle); its user is outside this section -- confirm.
 */
static const unsigned cycle_for_bank_swizzle_scl[][3] = {
	[SQ_ALU_SCL_210] = { 2, 1, 0 },
	[SQ_ALU_SCL_122] = { 1, 2, 2 },
	[SQ_ALU_SCL_212] = { 2, 1, 2 },
	[SQ_ALU_SCL_221] = { 2, 2, 1 }
};
    667 
    668 static void init_bank_swizzle(struct alu_bank_swizzle *bs)
    669 {
    670 	int i, cycle, component;
    671 	/* set up gpr use */
    672 	for (cycle = 0; cycle < NUM_OF_CYCLES; cycle++)
    673 		for (component = 0; component < NUM_OF_COMPONENTS; component++)
    674 			 bs->hw_gpr[cycle][component] = -1;
    675 	for (i = 0; i < 4; i++)
    676 		bs->hw_cfile_addr[i] = -1;
    677 	for (i = 0; i < 4; i++)
    678 		bs->hw_cfile_elem[i] = -1;
    679 }
    680 
    681 static int reserve_gpr(struct alu_bank_swizzle *bs, unsigned sel, unsigned chan, unsigned cycle)
    682 {
    683 	if (bs->hw_gpr[cycle][chan] == -1)
    684 		bs->hw_gpr[cycle][chan] = sel;
    685 	else if (bs->hw_gpr[cycle][chan] != (int)sel) {
    686 		/* Another scalar operation has already used the GPR read port for the channel. */
    687 		return -1;
    688 	}
    689 	return 0;
    690 }
    691 
    692 static int reserve_cfile(struct r600_bytecode *bc, struct alu_bank_swizzle *bs, unsigned sel, unsigned chan)
    693 {
    694 	int res, num_res = 4;
    695 	if (bc->chip_class >= R700) {
    696 		num_res = 2;
    697 		chan /= 2;
    698 	}
    699 	for (res = 0; res < num_res; ++res) {
    700 		if (bs->hw_cfile_addr[res] == -1) {
    701 			bs->hw_cfile_addr[res] = sel;
    702 			bs->hw_cfile_elem[res] = chan;
    703 			return 0;
    704 		} else if (bs->hw_cfile_addr[res] == sel &&
    705 			bs->hw_cfile_elem[res] == chan)
    706 			return 0; /* Read for this scalar element already reserved, nothing to do here. */
    707 	}
    708 	/* All cfile read ports are used, cannot reference vector element. */
    709 	return -1;
    710 }
    711 
    712 static int is_gpr(unsigned sel)
    713 {
    714 	return (sel >= 0 && sel <= 127);
    715 }
    716 
    717 /* CB constants start at 512, and get translated to a kcache index when ALU
    718  * clauses are constructed. Note that we handle kcache constants the same way
    719  * as (the now gone) cfile constants, is that really required? */
    720 static int is_cfile(unsigned sel)
    721 {
    722 	return (sel > 255 && sel < 512) ||
    723 		(sel > 511 && sel < 4607) || /* Kcache before translation. */
    724 		(sel > 127 && sel < 192); /* Kcache after translation. */
    725 }
    726 
    727 static int is_const(int sel)
    728 {
    729 	return is_cfile(sel) ||
    730 		(sel >= V_SQ_ALU_SRC_0 &&
    731 		sel <= V_SQ_ALU_SRC_LITERAL);
    732 }
    733 
    734 static int check_vector(struct r600_bytecode *bc, struct r600_bytecode_alu *alu,
    735 			struct alu_bank_swizzle *bs, int bank_swizzle)
    736 {
    737 	int r, src, num_src, sel, elem, cycle;
    738 
    739 	num_src = r600_bytecode_get_num_operands(bc, alu);
    740 	for (src = 0; src < num_src; src++) {
    741 		sel = alu->src[src].sel;
    742 		elem = alu->src[src].chan;
    743 		if (is_gpr(sel)) {
    744 			cycle = cycle_for_bank_swizzle_vec[bank_swizzle][src];
    745 			if (src == 1 && sel == alu->src[0].sel && elem == alu->src[0].chan)
    746 				/* Nothing to do; special-case optimization,
    747 				 * second source uses first sources reservation. */
    748 				continue;
    749 			else {
    750 				r = reserve_gpr(bs, sel, elem, cycle);
    751 				if (r)
    752 					return r;
    753 			}
    754 		} else if (is_cfile(sel)) {
    755 			r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem);
    756 			if (r)
    757 				return r;
    758 		}
    759 		/* No restrictions on PV, PS, literal or special constants. */
    760 	}
    761 	return 0;
    762 }
    763 
    764 static int check_scalar(struct r600_bytecode *bc, struct r600_bytecode_alu *alu,
    765 			struct alu_bank_swizzle *bs, int bank_swizzle)
    766 {
    767 	int r, src, num_src, const_count, sel, elem, cycle;
    768 
    769 	num_src = r600_bytecode_get_num_operands(bc, alu);
    770 	for (const_count = 0, src = 0; src < num_src; ++src) {
    771 		sel = alu->src[src].sel;
    772 		elem = alu->src[src].chan;
    773 		if (is_const(sel)) { /* Any constant, including literal and inline constants. */
    774 			if (const_count >= 2)
    775 				/* More than two references to a constant in
    776 				 * transcendental operation. */
    777 				return -1;
    778 			else
    779 				const_count++;
    780 		}
    781 		if (is_cfile(sel)) {
    782 			r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem);
    783 			if (r)
    784 				return r;
    785 		}
    786 	}
    787 	for (src = 0; src < num_src; ++src) {
    788 		sel = alu->src[src].sel;
    789 		elem = alu->src[src].chan;
    790 		if (is_gpr(sel)) {
    791 			cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
    792 			if (cycle < const_count)
    793 				/* Cycle for GPR load conflicts with
    794 				 * constant load in transcendental operation. */
    795 				return -1;
    796 			r = reserve_gpr(bs, sel, elem, cycle);
    797 			if (r)
    798 				return r;
    799 		}
    800 		/* PV PS restrictions */
    801 		if (const_count && (sel == 254 || sel == 255)) {
    802 			cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
    803 			if (cycle < const_count)
    804 				return -1;
    805 		}
    806 	}
    807 	return 0;
    808 }
    809 
/*
 * Pick a bank swizzle for every instruction of one ALU group.
 *
 * slots[0..3] are validated with check_vector() and slots[4] with
 * check_scalar(); on CAYMAN only four slots are considered. Instructions
 * with a non-zero bank_swizzle_force keep that value, while for the others
 * combinations are tried until the checks accept one.
 *
 * Returns 0 once a working combination has been stored in
 * slots[i]->bank_swizzle (or immediately when no present slot is unforced),
 * and -1 if no combination was found.
 */
static int check_and_set_bank_swizzle(struct r600_bytecode *bc,
				      struct r600_bytecode_alu *slots[5])
{
	struct alu_bank_swizzle bs;
	int bank_swizzle[5];
	int i, r = 0, forced = 1;
	boolean scalar_only = bc->chip_class == CAYMAN ? false : true;
	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;

	/* Apply forced swizzles, and find out whether anything is left to
	 * search and whether any of slots 0..3 is occupied. */
	for (i = 0; i < max_slots; i++) {
		if (slots[i]) {
			if (slots[i]->bank_swizzle_force) {
				slots[i]->bank_swizzle = slots[i]->bank_swizzle_force;
			} else {
				forced = 0;
			}
		}

		if (i < 4 && slots[i])
			scalar_only = false;
	}
	if (forced)
		return 0;

	/* Just check every possible combination of bank swizzle.
	 * Not very efficient, but works on the first try in most of the cases. */
	for (i = 0; i < 4; i++)
		if (!slots[i] || !slots[i]->bank_swizzle_force)
			bank_swizzle[i] = SQ_ALU_VEC_012;
		else
			bank_swizzle[i] = slots[i]->bank_swizzle;

	bank_swizzle[4] = SQ_ALU_SCL_210;
	while(bank_swizzle[4] <= SQ_ALU_SCL_221) {

		/* With four slots, give up as soon as any candidate equals
		 * SQ_ALU_VEC_210. */
		if (max_slots == 4) {
			for (i = 0; i < max_slots; i++) {
				if (bank_swizzle[i] == SQ_ALU_VEC_210)
				  return -1;
			}
		}
		/* Start every attempt from an empty reservation table. */
		init_bank_swizzle(&bs);
		if (scalar_only == false) {
			for (i = 0; i < 4; i++) {
				if (slots[i]) {
					r = check_vector(bc, slots[i], &bs, bank_swizzle[i]);
					if (r)
						break;
				}
			}
		} else
			r = 0;

		if (!r && slots[4] && max_slots == 5) {
			r = check_scalar(bc, slots[4], &bs, bank_swizzle[4]);
		}
		if (!r) {
			/* Every check passed: commit this combination. */
			for (i = 0; i < max_slots; i++) {
				if (slots[i])
					slots[i]->bank_swizzle = bank_swizzle[i];
			}
			return 0;
		}

		/* Advance to the next combination. */
		if (scalar_only) {
			bank_swizzle[4]++;
		} else {
			/* Odometer-style step: bump the first unforced entry; if it
			 * passes SQ_ALU_VEC_210, wrap it to SQ_ALU_VEC_012 and carry
			 * into the next one. With five slots entry 4 takes part in
			 * the carry chain, and the while condition ends the search
			 * once it exceeds SQ_ALU_SCL_221. */
			for (i = 0; i < max_slots; i++) {
				if (!slots[i] || !slots[i]->bank_swizzle_force) {
					bank_swizzle[i]++;
					if (bank_swizzle[i] <= SQ_ALU_VEC_210)
						break;
					else
						bank_swizzle[i] = SQ_ALU_VEC_012;
				}
			}
		}
	}

	/* Couldn't find a working swizzle. */
	return -1;
}
    892 
    893 static int replace_gpr_with_pv_ps(struct r600_bytecode *bc,
    894 				  struct r600_bytecode_alu *slots[5], struct r600_bytecode_alu *alu_prev)
    895 {
    896 	struct r600_bytecode_alu *prev[5];
    897 	int gpr[5], chan[5];
    898 	int i, j, r, src, num_src;
    899 	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
    900 
    901 	r = assign_alu_units(bc, alu_prev, prev);
    902 	if (r)
    903 		return r;
    904 
    905 	for (i = 0; i < max_slots; ++i) {
    906 		if (prev[i] && (prev[i]->dst.write || prev[i]->is_op3) && !prev[i]->dst.rel) {
    907 			gpr[i] = prev[i]->dst.sel;
    908 			/* cube writes more than PV.X */
    909 			if (!is_alu_cube_inst(bc, prev[i]) && is_alu_reduction_inst(bc, prev[i]))
    910 				chan[i] = 0;
    911 			else
    912 				chan[i] = prev[i]->dst.chan;
    913 		} else
    914 			gpr[i] = -1;
    915 	}
    916 
    917 	for (i = 0; i < max_slots; ++i) {
    918 		struct r600_bytecode_alu *alu = slots[i];
    919 		if(!alu)
    920 			continue;
    921 
    922 		num_src = r600_bytecode_get_num_operands(bc, alu);
    923 		for (src = 0; src < num_src; ++src) {
    924 			if (!is_gpr(alu->src[src].sel) || alu->src[src].rel)
    925 				continue;
    926 
    927 			if (bc->chip_class < CAYMAN) {
    928 				if (alu->src[src].sel == gpr[4] &&
    929 				    alu->src[src].chan == chan[4] &&
    930 				    alu_prev->pred_sel == alu->pred_sel) {
    931 					alu->src[src].sel = V_SQ_ALU_SRC_PS;
    932 					alu->src[src].chan = 0;
    933 					continue;
    934 				}
    935 			}
    936 
    937 			for (j = 0; j < 4; ++j) {
    938 				if (alu->src[src].sel == gpr[j] &&
    939 					alu->src[src].chan == j &&
    940 				      alu_prev->pred_sel == alu->pred_sel) {
    941 					alu->src[src].sel = V_SQ_ALU_SRC_PV;
    942 					alu->src[src].chan = chan[j];
    943 					break;
    944 				}
    945 			}
    946 		}
    947 	}
    948 
    949 	return 0;
    950 }
    951 
    952 void r600_bytecode_special_constants(uint32_t value, unsigned *sel, unsigned *neg)
    953 {
    954 	switch(value) {
    955 	case 0:
    956 		*sel = V_SQ_ALU_SRC_0;
    957 		break;
    958 	case 1:
    959 		*sel = V_SQ_ALU_SRC_1_INT;
    960 		break;
    961 	case -1:
    962 		*sel = V_SQ_ALU_SRC_M_1_INT;
    963 		break;
    964 	case 0x3F800000: /* 1.0f */
    965 		*sel = V_SQ_ALU_SRC_1;
    966 		break;
    967 	case 0x3F000000: /* 0.5f */
    968 		*sel = V_SQ_ALU_SRC_0_5;
    969 		break;
    970 	case 0xBF800000: /* -1.0f */
    971 		*sel = V_SQ_ALU_SRC_1;
    972 		*neg ^= 1;
    973 		break;
    974 	case 0xBF000000: /* -0.5f */
    975 		*sel = V_SQ_ALU_SRC_0_5;
    976 		*neg ^= 1;
    977 		break;
    978 	default:
    979 		*sel = V_SQ_ALU_SRC_LITERAL;
    980 		break;
    981 	}
    982 }
    983 
    984 /* compute how many literal are needed */
    985 static int r600_bytecode_alu_nliterals(struct r600_bytecode *bc, struct r600_bytecode_alu *alu,
    986 				 uint32_t literal[4], unsigned *nliteral)
    987 {
    988 	unsigned num_src = r600_bytecode_get_num_operands(bc, alu);
    989 	unsigned i, j;
    990 
    991 	for (i = 0; i < num_src; ++i) {
    992 		if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
    993 			uint32_t value = alu->src[i].value;
    994 			unsigned found = 0;
    995 			for (j = 0; j < *nliteral; ++j) {
    996 				if (literal[j] == value) {
    997 					found = 1;
    998 					break;
    999 				}
   1000 			}
   1001 			if (!found) {
   1002 				if (*nliteral >= 4)
   1003 					return -EINVAL;
   1004 				literal[(*nliteral)++] = value;
   1005 			}
   1006 		}
   1007 	}
   1008 	return 0;
   1009 }
   1010 
   1011 static void r600_bytecode_alu_adjust_literals(struct r600_bytecode *bc,
   1012 					struct r600_bytecode_alu *alu,
   1013 					uint32_t literal[4], unsigned nliteral)
   1014 {
   1015 	unsigned num_src = r600_bytecode_get_num_operands(bc, alu);
   1016 	unsigned i, j;
   1017 
   1018 	for (i = 0; i < num_src; ++i) {
   1019 		if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
   1020 			uint32_t value = alu->src[i].value;
   1021 			for (j = 0; j < nliteral; ++j) {
   1022 				if (literal[j] == value) {
   1023 					alu->src[i].chan = j;
   1024 					break;
   1025 				}
   1026 			}
   1027 		}
   1028 	}
   1029 }
   1030 
/*
 * Try to merge the current instruction group (`slots`) with the previous
 * group of the last CF clause (starting at `alu_prev`).
 *
 * The attempt is abandoned, returning 0 without re-linking anything, as soon
 * as one of the checks below fails: predicated or "once" instructions, too
 * many literals, a MOVA combined with relative addressing, two instructions
 * competing for one slot with no way to move either to slot 4, a NOP in the
 * current group, a source that reads what the previous group writes, or no
 * valid bank swizzle for the combined group.
 *
 * On success the merged instructions are re-linked at the tail of
 * bc->cf_last->alu in slot order, slots[] is overwritten with the merged
 * group, and the clause's ndw and group-head pointers are updated.
 *
 * Returns 0 whether or not the groups were merged; a non-zero value is only
 * propagated from assign_alu_units().
 */
static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu *slots[5],
			     struct r600_bytecode_alu *alu_prev)
{
	struct r600_bytecode_alu *prev[5];
	struct r600_bytecode_alu *result[5] = { NULL };

	/* literal[] collects the literals of both groups, prev_literal[] only
	 * those of the previous group. */
	uint32_t literal[4], prev_literal[4];
	unsigned nliteral = 0, prev_nliteral = 0;

	int i, j, r, src, num_src;
	int num_once_inst = 0;
	int have_mova = 0, have_rel = 0;
	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;

	r = assign_alu_units(bc, alu_prev, prev);
	if (r)
		return r;

	/* Never merge when either group holds a predicated instruction or one
	 * for which is_alu_once_inst() is true. */
	for (i = 0; i < max_slots; ++i) {
		if (prev[i]) {
		      if (prev[i]->pred_sel)
			      return 0;
		      if (is_alu_once_inst(bc, prev[i]))
			      return 0;
		}
		if (slots[i]) {
			if (slots[i]->pred_sel)
				return 0;
			if (is_alu_once_inst(bc, slots[i]))
				return 0;
		}
	}

	for (i = 0; i < max_slots; ++i) {
		struct r600_bytecode_alu *alu;

		if (num_once_inst > 0)
		   return 0;

		/* check number of literals */
		if (prev[i]) {
			if (r600_bytecode_alu_nliterals(bc, prev[i], literal, &nliteral))
				return 0;
			if (r600_bytecode_alu_nliterals(bc, prev[i], prev_literal, &prev_nliteral))
				return 0;
			if (is_alu_mova_inst(bc, prev[i])) {
				if (have_rel)
					return 0;
				have_mova = 1;
			}
			num_once_inst += is_alu_once_inst(bc, prev[i]);
		}
		if (slots[i] && r600_bytecode_alu_nliterals(bc, slots[i], literal, &nliteral))
			return 0;

		/* Let's check used slots. */
		if (prev[i] && !slots[i]) {
			result[i] = prev[i];
			continue;
		} else if (prev[i] && slots[i]) {
			if (max_slots == 5 && result[4] == NULL && prev[4] == NULL && slots[4] == NULL) {
				/* Trans unit is still free try to use it. */
				if (is_alu_any_unit_inst(bc, slots[i])) {
					result[i] = prev[i];
					result[4] = slots[i];
				} else if (is_alu_any_unit_inst(bc, prev[i])) {
					/* Both instructions write the same GPR: give up. */
					if (slots[i]->dst.sel == prev[i]->dst.sel &&
						(slots[i]->dst.write == 1 || slots[i]->is_op3) &&
						(prev[i]->dst.write == 1 || prev[i]->is_op3))
						return 0;

					result[i] = slots[i];
					result[4] = prev[i];
				} else
					return 0;
			} else
				return 0;
		} else if(!slots[i]) {
			continue;
		} else {
			/* Only the current group uses this slot; refuse if it writes
			 * the same GPR channel as the previous group's slot 4. */
			if (max_slots == 5 && slots[i] && prev[4] &&
					slots[i]->dst.sel == prev[4]->dst.sel &&
					slots[i]->dst.chan == prev[4]->dst.chan &&
					(slots[i]->dst.write == 1 || slots[i]->is_op3) &&
					(prev[4]->dst.write == 1 || prev[4]->is_op3))
				return 0;

			result[i] = slots[i];
		}

		alu = slots[i];
		num_once_inst += is_alu_once_inst(bc, alu);

		/* don't reschedule NOPs */
		if (is_nop_inst(bc, alu))
			return 0;

		/* Let's check dst gpr. */
		if (alu->dst.rel) {
			if (have_mova)
				return 0;
			have_rel = 1;
		}

		/* Let's check source gprs */
		num_src = r600_bytecode_get_num_operands(bc, alu);
		for (src = 0; src < num_src; ++src) {
			if (alu->src[src].rel) {
				if (have_mova)
					return 0;
				have_rel = 1;
			}

			/* Constants don't matter. */
			if (!is_gpr(alu->src[src].sel))
				continue;

			for (j = 0; j < max_slots; ++j) {
				if (!prev[j] || !(prev[j]->dst.write || prev[j]->is_op3))
					continue;

				/* If it's relative then we can't determine which gpr is really used. */
				if (prev[j]->dst.chan == alu->src[src].chan &&
					(prev[j]->dst.sel == alu->src[src].sel ||
					prev[j]->dst.rel || alu->src[src].rel))
					return 0;
			}
		}
	}

	/* more than one PRED_ or KILL_ ? */
	if (num_once_inst > 1)
		return 0;

	/* check if the result can still be swizzled */
	r = check_and_set_bank_swizzle(bc, result);
	if (r)
		return 0;

	/* looks like everything worked out right, apply the changes */

	/* undo adding previous literals */
	bc->cf_last->ndw -= align(prev_nliteral, 2);

	/* sort instructions */
	for (i = 0; i < max_slots; ++i) {
		slots[i] = result[i];
		if (result[i]) {
			LIST_DEL(&result[i]->list);
			result[i]->last = 0;
			LIST_ADDTAIL(&result[i]->list, &bc->cf_last->alu);
		}
	}

	/* determine new last instruction */
	LIST_ENTRY(struct r600_bytecode_alu, bc->cf_last->alu.prev, list)->last = 1;

	/* determine new first instruction */
	for (i = 0; i < max_slots; ++i) {
		if (result[i]) {
			bc->cf_last->curr_bs_head = result[i];
			break;
		}
	}

	/* The previous group is now part of the current one, so the group
	 * before it becomes the new "previous" group. */
	bc->cf_last->prev_bs_head = bc->cf_last->prev2_bs_head;
	bc->cf_last->prev2_bs_head = NULL;

	return 0;
}
   1201 
   1202 /* we'll keep kcache sets sorted by bank & addr */
   1203 static int r600_bytecode_alloc_kcache_line(struct r600_bytecode *bc,
   1204 		struct r600_bytecode_kcache *kcache,
   1205 		unsigned bank, unsigned line)
   1206 {
   1207 	int i, kcache_banks = bc->chip_class >= EVERGREEN ? 4 : 2;
   1208 
   1209 	for (i = 0; i < kcache_banks; i++) {
   1210 		if (kcache[i].mode) {
   1211 			int d;
   1212 
   1213 			if (kcache[i].bank < bank)
   1214 				continue;
   1215 
   1216 			if ((kcache[i].bank == bank && kcache[i].addr > line+1) ||
   1217 					kcache[i].bank > bank) {
   1218 				/* try to insert new line */
   1219 				if (kcache[kcache_banks-1].mode) {
   1220 					/* all sets are in use */
   1221 					return -ENOMEM;
   1222 				}
   1223 
   1224 				memmove(&kcache[i+1],&kcache[i], (kcache_banks-i-1)*sizeof(struct r600_bytecode_kcache));
   1225 				kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
   1226 				kcache[i].bank = bank;
   1227 				kcache[i].addr = line;
   1228 				return 0;
   1229 			}
   1230 
   1231 			d = line - kcache[i].addr;
   1232 
   1233 			if (d == -1) {
   1234 				kcache[i].addr--;
   1235 				if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_2) {
   1236 					/* we are prepending the line to the current set,
   1237 					 * discarding the existing second line,
   1238 					 * so we'll have to insert line+2 after it */
   1239 					line += 2;
   1240 					continue;
   1241 				} else if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_1) {
   1242 					kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
   1243 					return 0;
   1244 				} else {
   1245 					/* V_SQ_CF_KCACHE_LOCK_LOOP_INDEX is not supported */
   1246 					return -ENOMEM;
   1247 				}
   1248 			} else if (d == 1) {
   1249 				kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
   1250 				return 0;
   1251 			} else if (d == 0)
   1252 				return 0;
   1253 		} else { /* free kcache set - use it */
   1254 			kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
   1255 			kcache[i].bank = bank;
   1256 			kcache[i].addr = line;
   1257 			return 0;
   1258 		}
   1259 	}
   1260 	return -ENOMEM;
   1261 }
   1262 
   1263 static int r600_bytecode_alloc_inst_kcache_lines(struct r600_bytecode *bc,
   1264 		struct r600_bytecode_kcache *kcache,
   1265 		struct r600_bytecode_alu *alu)
   1266 {
   1267 	int i, r;
   1268 
   1269 	for (i = 0; i < 3; i++) {
   1270 		unsigned bank, line, sel = alu->src[i].sel;
   1271 
   1272 		if (sel < 512)
   1273 			continue;
   1274 
   1275 		bank = alu->src[i].kc_bank;
   1276 		line = (sel-512)>>4;
   1277 
   1278 		if ((r = r600_bytecode_alloc_kcache_line(bc, kcache, bank, line)))
   1279 			return r;
   1280 	}
   1281 	return 0;
   1282 }
   1283 
   1284 static int r600_bytecode_assign_kcache_banks(struct r600_bytecode *bc,
   1285 		struct r600_bytecode_alu *alu,
   1286 		struct r600_bytecode_kcache * kcache)
   1287 {
   1288 	int i, j;
   1289 
   1290 	/* Alter the src operands to refer to the kcache. */
   1291 	for (i = 0; i < 3; ++i) {
   1292 		static const unsigned int base[] = {128, 160, 256, 288};
   1293 		unsigned int line, sel = alu->src[i].sel, found = 0;
   1294 
   1295 		if (sel < 512)
   1296 			continue;
   1297 
   1298 		sel -= 512;
   1299 		line = sel>>4;
   1300 
   1301 		for (j = 0; j < 4 && !found; ++j) {
   1302 			switch (kcache[j].mode) {
   1303 			case V_SQ_CF_KCACHE_NOP:
   1304 			case V_SQ_CF_KCACHE_LOCK_LOOP_INDEX:
   1305 				R600_ERR("unexpected kcache line mode\n");
   1306 				return -ENOMEM;
   1307 			default:
   1308 				if (kcache[j].bank == alu->src[i].kc_bank &&
   1309 						kcache[j].addr <= line &&
   1310 						line < kcache[j].addr + kcache[j].mode) {
   1311 					alu->src[i].sel = sel - (kcache[j].addr<<4);
   1312 					alu->src[i].sel += base[j];
   1313 					found=1;
   1314 			    }
   1315 			}
   1316 		}
   1317 	}
   1318 	return 0;
   1319 }
   1320 
   1321 static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, int type)
   1322 {
   1323 	struct r600_bytecode_kcache kcache_sets[4];
   1324 	struct r600_bytecode_kcache *kcache = kcache_sets;
   1325 	int r;
   1326 
   1327 	memcpy(kcache, bc->cf_last->kcache, 4 * sizeof(struct r600_bytecode_kcache));
   1328 
   1329 	if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
   1330 		/* can't alloc, need to start new clause */
   1331 		if ((r = r600_bytecode_add_cf(bc))) {
   1332 			return r;
   1333 		}
   1334 		bc->cf_last->inst = type;
   1335 
   1336 		/* retry with the new clause */
   1337 		kcache = bc->cf_last->kcache;
   1338 		if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
   1339 			/* can't alloc again- should never happen */
   1340 			return r;
   1341 		}
   1342 	} else {
   1343 		/* update kcache sets */
   1344 		memcpy(bc->cf_last->kcache, kcache, 4 * sizeof(struct r600_bytecode_kcache));
   1345 	}
   1346 
   1347 	/* if we actually used more than 2 kcache sets - use ALU_EXTENDED on eg+ */
   1348 	if (kcache[2].mode != V_SQ_CF_KCACHE_NOP) {
   1349 		if (bc->chip_class < EVERGREEN)
   1350 			return -ENOMEM;
   1351 		bc->cf_last->eg_alu_extended = 1;
   1352 	}
   1353 
   1354 	return 0;
   1355 }
   1356 
   1357 static int insert_nop_r6xx(struct r600_bytecode *bc)
   1358 {
   1359 	struct r600_bytecode_alu alu;
   1360 	int r, i;
   1361 
   1362 	for (i = 0; i < 4; i++) {
   1363 		memset(&alu, 0, sizeof(alu));
   1364 		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP;
   1365 		alu.src[0].chan = i;
   1366 		alu.dst.chan = i;
   1367 		alu.last = (i == 3);
   1368 		r = r600_bytecode_add_alu(bc, &alu);
   1369 		if (r)
   1370 			return r;
   1371 	}
   1372 	return 0;
   1373 }
   1374 
   1375 /* load AR register from gpr (bc->ar_reg) with MOVA_INT */
   1376 static int load_ar_r6xx(struct r600_bytecode *bc)
   1377 {
   1378 	struct r600_bytecode_alu alu;
   1379 	int r;
   1380 
   1381 	if (bc->ar_loaded)
   1382 		return 0;
   1383 
   1384 	/* hack to avoid making MOVA the last instruction in the clause */
   1385 	if ((bc->cf_last->ndw>>1) >= 110)
   1386 		bc->force_add_cf = 1;
   1387 
   1388 	memset(&alu, 0, sizeof(alu));
   1389 	alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT;
   1390 	alu.src[0].sel = bc->ar_reg;
   1391 	alu.last = 1;
   1392 	alu.index_mode = INDEX_MODE_LOOP;
   1393 	r = r600_bytecode_add_alu(bc, &alu);
   1394 	if (r)
   1395 		return r;
   1396 
   1397 	/* no requirement to set uses waterfall on MOVA_GPR_INT */
   1398 	bc->ar_loaded = 1;
   1399 	return 0;
   1400 }
   1401 
   1402 /* load AR register from gpr (bc->ar_reg) with MOVA_INT */
   1403 static int load_ar(struct r600_bytecode *bc)
   1404 {
   1405 	struct r600_bytecode_alu alu;
   1406 	int r;
   1407 
   1408 	if (bc->ar_handling)
   1409 		return load_ar_r6xx(bc);
   1410 
   1411 	if (bc->ar_loaded)
   1412 		return 0;
   1413 
   1414 	/* hack to avoid making MOVA the last instruction in the clause */
   1415 	if ((bc->cf_last->ndw>>1) >= 110)
   1416 		bc->force_add_cf = 1;
   1417 
   1418 	memset(&alu, 0, sizeof(alu));
   1419 	alu.inst = BC_INST(bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
   1420 	alu.src[0].sel = bc->ar_reg;
   1421 	alu.last = 1;
   1422 	r = r600_bytecode_add_alu(bc, &alu);
   1423 	if (r)
   1424 		return r;
   1425 
   1426 	bc->cf_last->r6xx_uses_waterfall = 1;
   1427 	bc->ar_loaded = 1;
   1428 	return 0;
   1429 }
   1430 
   1431 int r600_bytecode_add_alu_type(struct r600_bytecode *bc, const struct r600_bytecode_alu *alu, int type)
   1432 {
   1433 	struct r600_bytecode_alu *nalu = r600_bytecode_alu();
   1434 	struct r600_bytecode_alu *lalu;
   1435 	int i, r;
   1436 
   1437 	if (nalu == NULL)
   1438 		return -ENOMEM;
   1439 	memcpy(nalu, alu, sizeof(struct r600_bytecode_alu));
   1440 
   1441 	if (bc->cf_last != NULL && bc->cf_last->inst != type) {
   1442 		/* check if we could add it anyway */
   1443 		if (bc->cf_last->inst == BC_INST(bc, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU) &&
   1444 			type == BC_INST(bc, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE)) {
   1445 			LIST_FOR_EACH_ENTRY(lalu, &bc->cf_last->alu, list) {
   1446 				if (lalu->execute_mask) {
   1447 					bc->force_add_cf = 1;
   1448 					break;
   1449 				}
   1450 			}
   1451 		} else
   1452 			bc->force_add_cf = 1;
   1453 	}
   1454 
   1455 	/* cf can contains only alu or only vtx or only tex */
   1456 	if (bc->cf_last == NULL || bc->force_add_cf) {
   1457 		r = r600_bytecode_add_cf(bc);
   1458 		if (r) {
   1459 			free(nalu);
   1460 			return r;
   1461 		}
   1462 	}
   1463 	bc->cf_last->inst = type;
   1464 
   1465 	/* Check AR usage and load it if required */
   1466 	for (i = 0; i < 3; i++)
   1467 		if (nalu->src[i].rel && !bc->ar_loaded)
   1468 			load_ar(bc);
   1469 
   1470 	if (nalu->dst.rel && !bc->ar_loaded)
   1471 		load_ar(bc);
   1472 
   1473 	/* Setup the kcache for this ALU instruction. This will start a new
   1474 	 * ALU clause if needed. */
   1475 	if ((r = r600_bytecode_alloc_kcache_lines(bc, nalu, type))) {
   1476 		free(nalu);
   1477 		return r;
   1478 	}
   1479 
   1480 	if (!bc->cf_last->curr_bs_head) {
   1481 		bc->cf_last->curr_bs_head = nalu;
   1482 	}
   1483 	/* number of gpr == the last gpr used in any alu */
   1484 	for (i = 0; i < 3; i++) {
   1485 		if (nalu->src[i].sel >= bc->ngpr && nalu->src[i].sel < 128) {
   1486 			bc->ngpr = nalu->src[i].sel + 1;
   1487 		}
   1488 		if (nalu->src[i].sel == V_SQ_ALU_SRC_LITERAL)
   1489 			r600_bytecode_special_constants(nalu->src[i].value,
   1490 				&nalu->src[i].sel, &nalu->src[i].neg);
   1491 	}
   1492 	if (nalu->dst.sel >= bc->ngpr) {
   1493 		bc->ngpr = nalu->dst.sel + 1;
   1494 	}
   1495 	LIST_ADDTAIL(&nalu->list, &bc->cf_last->alu);
   1496 	/* each alu use 2 dwords */
   1497 	bc->cf_last->ndw += 2;
   1498 	bc->ndw += 2;
   1499 
   1500 	/* process cur ALU instructions for bank swizzle */
   1501 	if (nalu->last) {
   1502 		uint32_t literal[4];
   1503 		unsigned nliteral;
   1504 		struct r600_bytecode_alu *slots[5];
   1505 		int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
   1506 		r = assign_alu_units(bc, bc->cf_last->curr_bs_head, slots);
   1507 		if (r)
   1508 			return r;
   1509 
   1510 		if (bc->cf_last->prev_bs_head) {
   1511 			r = merge_inst_groups(bc, slots, bc->cf_last->prev_bs_head);
   1512 			if (r)
   1513 				return r;
   1514 		}
   1515 
   1516 		if (bc->cf_last->prev_bs_head) {
   1517 			r = replace_gpr_with_pv_ps(bc, slots, bc->cf_last->prev_bs_head);
   1518 			if (r)
   1519 				return r;
   1520 		}
   1521 
   1522 		r = check_and_set_bank_swizzle(bc, slots);
   1523 		if (r)
   1524 			return r;
   1525 
   1526 		for (i = 0, nliteral = 0; i < max_slots; i++) {
   1527 			if (slots[i]) {
   1528 				r = r600_bytecode_alu_nliterals(bc, slots[i], literal, &nliteral);
   1529 				if (r)
   1530 					return r;
   1531 			}
   1532 		}
   1533 		bc->cf_last->ndw += align(nliteral, 2);
   1534 
   1535 		/* at most 128 slots, one add alu can add 5 slots + 4 constants(2 slots)
   1536 		 * worst case */
   1537 		if ((bc->cf_last->ndw >> 1) >= 120) {
   1538 			bc->force_add_cf = 1;
   1539 		}
   1540 
   1541 		bc->cf_last->prev2_bs_head = bc->cf_last->prev_bs_head;
   1542 		bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head;
   1543 		bc->cf_last->curr_bs_head = NULL;
   1544 	}
   1545 
   1546 	if (nalu->dst.rel && bc->r6xx_nop_after_rel_dst)
   1547 		insert_nop_r6xx(bc);
   1548 
   1549 	return 0;
   1550 }
   1551 
   1552 int r600_bytecode_add_alu(struct r600_bytecode *bc, const struct r600_bytecode_alu *alu)
   1553 {
   1554 	return r600_bytecode_add_alu_type(bc, alu, BC_INST(bc, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU));
   1555 }
   1556 
   1557 static unsigned r600_bytecode_num_tex_and_vtx_instructions(const struct r600_bytecode *bc)
   1558 {
   1559 	switch (bc->chip_class) {
   1560 	case R600:
   1561 		return 8;
   1562 
   1563 	case R700:
   1564 	case EVERGREEN:
   1565 	case CAYMAN:
   1566 		return 16;
   1567 
   1568 	default:
   1569 		R600_ERR("Unknown chip class %d.\n", bc->chip_class);
   1570 		return 8;
   1571 	}
   1572 }
   1573 
   1574 static inline boolean last_inst_was_not_vtx_fetch(struct r600_bytecode *bc)
   1575 {
   1576 	switch (bc->chip_class) {
   1577 	case R700:
   1578 	case R600:
   1579 		return bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX &&
   1580 		       bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC;
   1581 	case EVERGREEN:
   1582 		return bc->cf_last->inst != EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX;
   1583 	case CAYMAN:
   1584 		return bc->cf_last->inst != CM_V_SQ_CF_WORD1_SQ_CF_INST_TC;
   1585 	default:
   1586 		R600_ERR("Unknown chip class %d.\n", bc->chip_class);
   1587 		return FALSE;
   1588 	}
   1589 }
   1590 
   1591 int r600_bytecode_add_vtx(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx)
   1592 {
   1593 	struct r600_bytecode_vtx *nvtx = r600_bytecode_vtx();
   1594 	int r;
   1595 
   1596 	if (nvtx == NULL)
   1597 		return -ENOMEM;
   1598 	memcpy(nvtx, vtx, sizeof(struct r600_bytecode_vtx));
   1599 
   1600 	/* cf can contains only alu or only vtx or only tex */
   1601 	if (bc->cf_last == NULL ||
   1602 	    last_inst_was_not_vtx_fetch(bc) ||
   1603 	    bc->force_add_cf) {
   1604 		r = r600_bytecode_add_cf(bc);
   1605 		if (r) {
   1606 			free(nvtx);
   1607 			return r;
   1608 		}
   1609 		switch (bc->chip_class) {
   1610 		case R600:
   1611 		case R700:
   1612 			bc->cf_last->inst = V_SQ_CF_WORD1_SQ_CF_INST_VTX;
   1613 			break;
   1614 		case EVERGREEN:
   1615 			bc->cf_last->inst = EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX;
   1616 			break;
   1617 		case CAYMAN:
   1618 			bc->cf_last->inst = CM_V_SQ_CF_WORD1_SQ_CF_INST_TC;
   1619 			break;
   1620 		default:
   1621 			R600_ERR("Unknown chip class %d.\n", bc->chip_class);
   1622 			return -EINVAL;
   1623 		}
   1624 	}
   1625 	LIST_ADDTAIL(&nvtx->list, &bc->cf_last->vtx);
   1626 	/* each fetch use 4 dwords */
   1627 	bc->cf_last->ndw += 4;
   1628 	bc->ndw += 4;
   1629 	if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc))
   1630 		bc->force_add_cf = 1;
   1631 
   1632 	bc->ngpr = MAX2(bc->ngpr, vtx->src_gpr + 1);
   1633 	bc->ngpr = MAX2(bc->ngpr, vtx->dst_gpr + 1);
   1634 
   1635 	return 0;
   1636 }
   1637 
   1638 int r600_bytecode_add_tex(struct r600_bytecode *bc, const struct r600_bytecode_tex *tex)
   1639 {
   1640 	struct r600_bytecode_tex *ntex = r600_bytecode_tex();
   1641 	int r;
   1642 
   1643 	if (ntex == NULL)
   1644 		return -ENOMEM;
   1645 	memcpy(ntex, tex, sizeof(struct r600_bytecode_tex));
   1646 
   1647 	/* we can't fetch data und use it as texture lookup address in the same TEX clause */
   1648 	if (bc->cf_last != NULL &&
   1649 		bc->cf_last->inst == BC_INST(bc, V_SQ_CF_WORD1_SQ_CF_INST_TEX)) {
   1650 		struct r600_bytecode_tex *ttex;
   1651 		LIST_FOR_EACH_ENTRY(ttex, &bc->cf_last->tex, list) {
   1652 			if (ttex->dst_gpr == ntex->src_gpr) {
   1653 				bc->force_add_cf = 1;
   1654 				break;
   1655 			}
   1656 		}
   1657 		/* slight hack to make gradients always go into same cf */
   1658 		if (ntex->inst == SQ_TEX_INST_SET_GRADIENTS_H)
   1659 			bc->force_add_cf = 1;
   1660 	}
   1661 
   1662 	/* cf can contains only alu or only vtx or only tex */
   1663 	if (bc->cf_last == NULL ||
   1664 		bc->cf_last->inst != BC_INST(bc, V_SQ_CF_WORD1_SQ_CF_INST_TEX) ||
   1665 	        bc->force_add_cf) {
   1666 		r = r600_bytecode_add_cf(bc);
   1667 		if (r) {
   1668 			free(ntex);
   1669 			return r;
   1670 		}
   1671 		bc->cf_last->inst = BC_INST(bc, V_SQ_CF_WORD1_SQ_CF_INST_TEX);
   1672 	}
   1673 	if (ntex->src_gpr >= bc->ngpr) {
   1674 		bc->ngpr = ntex->src_gpr + 1;
   1675 	}
   1676 	if (ntex->dst_gpr >= bc->ngpr) {
   1677 		bc->ngpr = ntex->dst_gpr + 1;
   1678 	}
   1679 	LIST_ADDTAIL(&ntex->list, &bc->cf_last->tex);
   1680 	/* each texture fetch use 4 dwords */
   1681 	bc->cf_last->ndw += 4;
   1682 	bc->ndw += 4;
   1683 	if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc))
   1684 		bc->force_add_cf = 1;
   1685 	return 0;
   1686 }
   1687 
   1688 int r600_bytecode_add_cfinst(struct r600_bytecode *bc, int inst)
   1689 {
   1690 	int r;
   1691 	r = r600_bytecode_add_cf(bc);
   1692 	if (r)
   1693 		return r;
   1694 
   1695 	bc->cf_last->cond = V_SQ_CF_COND_ACTIVE;
   1696 	bc->cf_last->inst = inst;
   1697 	return 0;
   1698 }
   1699 
   1700 int cm_bytecode_add_cf_end(struct r600_bytecode *bc)
   1701 {
   1702 	return r600_bytecode_add_cfinst(bc, CM_V_SQ_CF_WORD1_SQ_CF_INST_END);
   1703 }
   1704 
   1705 /* common to all 3 families */
        /*
         * Encode one vertex fetch instruction into bc->bytecode.
         *
         * Writes four consecutive dwords starting at index `id`:
         *   word0: buffer id, fetch type, source GPR and source X select
         *          (plus the mega fetch count on chips older than CAYMAN),
         *   word1: destination selects, format fields and destination GPR,
         *   word2: offset and endian swap (plus MEGA_FETCH(1) before CAYMAN),
         *   word3: zero.
         *
         * No bounds checking is done here; the caller must make sure that
         * bc->bytecode has room for id..id+3. Always returns 0.
         */
   1706 static int r600_bytecode_vtx_build(struct r600_bytecode *bc, struct r600_bytecode_vtx *vtx, unsigned id)
   1707 {
   1708 	bc->bytecode[id] = S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) |
   1709 			S_SQ_VTX_WORD0_FETCH_TYPE(vtx->fetch_type) |
   1710 			S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) |
   1711 			S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x);
        	/* the mega fetch fields are only emitted for pre-CAYMAN chips */
   1712 	if (bc->chip_class < CAYMAN)
   1713 		bc->bytecode[id] |= S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(vtx->mega_fetch_count);
   1714 	id++;
   1715 	bc->bytecode[id++] = S_SQ_VTX_WORD1_DST_SEL_X(vtx->dst_sel_x) |
   1716 				S_SQ_VTX_WORD1_DST_SEL_Y(vtx->dst_sel_y) |
   1717 				S_SQ_VTX_WORD1_DST_SEL_Z(vtx->dst_sel_z) |
   1718 				S_SQ_VTX_WORD1_DST_SEL_W(vtx->dst_sel_w) |
   1719 				S_SQ_VTX_WORD1_USE_CONST_FIELDS(vtx->use_const_fields) |
   1720 				S_SQ_VTX_WORD1_DATA_FORMAT(vtx->data_format) |
   1721 				S_SQ_VTX_WORD1_NUM_FORMAT_ALL(vtx->num_format_all) |
   1722 				S_SQ_VTX_WORD1_FORMAT_COMP_ALL(vtx->format_comp_all) |
   1723 				S_SQ_VTX_WORD1_SRF_MODE_ALL(vtx->srf_mode_all) |
   1724 				S_SQ_VTX_WORD1_GPR_DST_GPR(vtx->dst_gpr);
   1725 	bc->bytecode[id] = S_SQ_VTX_WORD2_OFFSET(vtx->offset)|
   1726 				S_SQ_VTX_WORD2_ENDIAN_SWAP(vtx->endian);
   1727 	if (bc->chip_class < CAYMAN)
   1728 		bc->bytecode[id] |= S_SQ_VTX_WORD2_MEGA_FETCH(1);
   1729 	id++;
        	/* fourth dword of the fetch instruction is always written as zero */
   1730 	bc->bytecode[id++] = 0;
   1731 	return 0;
   1732 }
   1733 
   1734 /* common to all 3 families */
        /*
         * Encode one texture fetch instruction into bc->bytecode.
         *
         * Writes four consecutive dwords starting at index `id`:
         *   word0: texture opcode, resource id, source GPR and its relative flag,
         *   word1: destination GPR/relative flag, destination selects, LOD bias
         *          and the per-coordinate type flags,
         *   word2: X/Y/Z offsets, sampler id and source selects,
         *   word3: zero.
         *
         * `bc` is only used for its bytecode array; the encoding does not depend
         * on the chip class. No bounds checking is done. Always returns 0.
         */
   1735 static int r600_bytecode_tex_build(struct r600_bytecode *bc, struct r600_bytecode_tex *tex, unsigned id)
   1736 {
   1737 	bc->bytecode[id++] = S_SQ_TEX_WORD0_TEX_INST(tex->inst) |
   1738 				S_SQ_TEX_WORD0_RESOURCE_ID(tex->resource_id) |
   1739 				S_SQ_TEX_WORD0_SRC_GPR(tex->src_gpr) |
   1740 				S_SQ_TEX_WORD0_SRC_REL(tex->src_rel);
   1741 	bc->bytecode[id++] = S_SQ_TEX_WORD1_DST_GPR(tex->dst_gpr) |
   1742 				S_SQ_TEX_WORD1_DST_REL(tex->dst_rel) |
   1743 				S_SQ_TEX_WORD1_DST_SEL_X(tex->dst_sel_x) |
   1744 				S_SQ_TEX_WORD1_DST_SEL_Y(tex->dst_sel_y) |
   1745 				S_SQ_TEX_WORD1_DST_SEL_Z(tex->dst_sel_z) |
   1746 				S_SQ_TEX_WORD1_DST_SEL_W(tex->dst_sel_w) |
   1747 				S_SQ_TEX_WORD1_LOD_BIAS(tex->lod_bias) |
   1748 				S_SQ_TEX_WORD1_COORD_TYPE_X(tex->coord_type_x) |
   1749 				S_SQ_TEX_WORD1_COORD_TYPE_Y(tex->coord_type_y) |
   1750 				S_SQ_TEX_WORD1_COORD_TYPE_Z(tex->coord_type_z) |
   1751 				S_SQ_TEX_WORD1_COORD_TYPE_W(tex->coord_type_w);
   1752 	bc->bytecode[id++] = S_SQ_TEX_WORD2_OFFSET_X(tex->offset_x) |
   1753 				S_SQ_TEX_WORD2_OFFSET_Y(tex->offset_y) |
   1754 				S_SQ_TEX_WORD2_OFFSET_Z(tex->offset_z) |
   1755 				S_SQ_TEX_WORD2_SAMPLER_ID(tex->sampler_id) |
   1756 				S_SQ_TEX_WORD2_SRC_SEL_X(tex->src_sel_x) |
   1757 				S_SQ_TEX_WORD2_SRC_SEL_Y(tex->src_sel_y) |
   1758 				S_SQ_TEX_WORD2_SRC_SEL_Z(tex->src_sel_z) |
   1759 				S_SQ_TEX_WORD2_SRC_SEL_W(tex->src_sel_w);
        	/* fourth dword of the fetch instruction is always written as zero */
   1760 	bc->bytecode[id++] = 0;
   1761 	return 0;
   1762 }
   1763 
   1764 /* r600 only, r700/eg bits in r700_asm.c */
        /*
         * Encode one ALU instruction into bc->bytecode using the R600 layout.
         *
         * Writes two consecutive dwords starting at index `id`:
         *   word0: select/relative/channel/negate for sources 0 and 1, index
         *          mode, predicate select and the "last" flag,
         *   word1: destination and clamp, bank swizzle and the opcode; the
         *          remaining fields depend on alu->is_op3:
         *            - OP3: select/relative/channel/negate for source 2,
         *            - OP2: source abs flags, write mask, output modifier and
         *              the execute-mask / predicate update flags.
         *
         * Literal constants are not written here. No bounds checking is done.
         * Always returns 0.
         */
   1765 static int r600_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, unsigned id)
   1766 {
   1767 	/* don't replace gpr by pv or ps for destination register */
   1768 	bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) |
   1769 				S_SQ_ALU_WORD0_SRC0_REL(alu->src[0].rel) |
   1770 				S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) |
   1771 				S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) |
   1772 				S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) |
   1773 				S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) |
   1774 				S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
   1775 				S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
   1776 				S_SQ_ALU_WORD0_INDEX_MODE(alu->index_mode) |
   1777 				S_SQ_ALU_WORD0_PRED_SEL(alu->pred_sel) |
   1778 				S_SQ_ALU_WORD0_LAST(alu->last);
   1779 
        	/* three-operand form: word1 carries source 2 instead of the OP2-only
        	 * abs / write mask / omod / update fields */
   1780 	if (alu->is_op3) {
   1781 		bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
   1782 					S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
   1783 					S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
   1784 					S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
   1785 					S_SQ_ALU_WORD1_OP3_SRC2_SEL(alu->src[2].sel) |
   1786 					S_SQ_ALU_WORD1_OP3_SRC2_REL(alu->src[2].rel) |
   1787 					S_SQ_ALU_WORD1_OP3_SRC2_CHAN(alu->src[2].chan) |
   1788 					S_SQ_ALU_WORD1_OP3_SRC2_NEG(alu->src[2].neg) |
   1789 					S_SQ_ALU_WORD1_OP3_ALU_INST(alu->inst) |
   1790 					S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle);
   1791 	} else {
   1792 		bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
   1793 					S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
   1794 					S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
   1795 					S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
   1796 					S_SQ_ALU_WORD1_OP2_SRC0_ABS(alu->src[0].abs) |
   1797 					S_SQ_ALU_WORD1_OP2_SRC1_ABS(alu->src[1].abs) |
   1798 					S_SQ_ALU_WORD1_OP2_WRITE_MASK(alu->dst.write) |
   1799 					S_SQ_ALU_WORD1_OP2_OMOD(alu->omod) |
   1800 					S_SQ_ALU_WORD1_OP2_ALU_INST(alu->inst) |
   1801 					S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle) |
   1802 					S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(alu->execute_mask) |
   1803 					S_SQ_ALU_WORD1_OP2_UPDATE_PRED(alu->update_pred);
   1804 	}
   1805 	return 0;
   1806 }
   1807 
        /*
         * Encode the two-dword CF instruction that introduces a fetch clause.
         *
         * `bytecode` points at the first of the two dwords to write:
         *   word0: clause address, encoded as cf->addr >> 1,
         *   word1: cf->inst OR'ed with BARRIER(1) and COUNT, where COUNT is the
         *          number of 4-dword fetch instructions in the clause minus one.
         *
         * NOTE(review): cf->addr >> 1 presumably converts a dword index into
         * the 64-bit units the hardware field expects - confirm against the ISA
         * documentation.
         */
   1808 static void r600_bytecode_cf_vtx_build(uint32_t *bytecode, const struct r600_bytecode_cf *cf)
   1809 {
   1810 	*bytecode++ = S_SQ_CF_WORD0_ADDR(cf->addr >> 1);
   1811 	*bytecode++ = cf->inst |
   1812 			S_SQ_CF_WORD1_BARRIER(1) |
   1813 			S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1);
   1814 }
   1815 
   1816 /* common for r600/r700 - eg in eg_asm.c */
        /*
         * Encode one CF instruction into bc->bytecode.
         *
         * The two dwords are written at index cf->id and cf->id + 1. The layout
         * is chosen from cf->inst:
         *   - ALU clauses: clause address, kcache mode/bank/addr for sets 0 and
         *     1, barrier, the waterfall flag (R600 only) and the count,
         *   - TEX/VTX clauses: delegated to r700_bytecode_cf_vtx_build() on R700
         *     and to r600_bytecode_cf_vtx_build() otherwise,
         *   - EXPORT / EXPORT_DONE: GPR, element size, array base, type, burst
         *     count, swizzles, barrier and end-of-program,
         *   - MEM_STREAM0..3: like exports, but with buffer array size and
         *     component mask instead of swizzles,
         *   - flow control (JUMP, ELSE, POP, LOOP_*, CALL_FS, RETURN): target
         *     address from cf->cf_addr, barrier, condition and pop count.
         *
         * Returns 0 on success, or -EINVAL (after logging through R600_ERR) when
         * cf->inst is none of the opcodes above.
         */
   1817 static int r600_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf)
   1818 {
   1819 	unsigned id = cf->id;
   1820 
   1821 	switch (cf->inst) {
   1822 	case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU:
   1823 	case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
   1824 	case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
   1825 	case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
   1826 		bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
   1827 			S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
   1828 			S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
   1829 			S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache[1].bank);
   1830 
        		/* COUNT is ndw / 2 - 1; the waterfall flag is only forwarded on
        		 * R600 and forced to 0 on every other chip class */
   1831 		bc->bytecode[id++] = cf->inst |
   1832 			S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache[1].mode) |
   1833 			S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache[0].addr) |
   1834 			S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache[1].addr) |
   1835 					S_SQ_CF_ALU_WORD1_BARRIER(1) |
   1836 					S_SQ_CF_ALU_WORD1_USES_WATERFALL(bc->chip_class == R600 ? cf->r6xx_uses_waterfall : 0) |
   1837 					S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1);
   1838 		break;
   1839 	case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
   1840 	case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
   1841 	case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
        		/* fetch clause headers have a chip specific encoder */
   1842 		if (bc->chip_class == R700)
   1843 			r700_bytecode_cf_vtx_build(&bc->bytecode[id], cf);
   1844 		else
   1845 			r600_bytecode_cf_vtx_build(&bc->bytecode[id], cf);
   1846 		break;
   1847 	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
   1848 	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
   1849 		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
   1850 			S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
   1851 			S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
   1852 			S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
        		/* the burst count field is encoded as burst_count - 1; the opcode
        		 * is taken from cf->output.inst rather than cf->inst */
   1853 		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
   1854 			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(cf->output.swizzle_x) |
   1855 			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(cf->output.swizzle_y) |
   1856 			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(cf->output.swizzle_z) |
   1857 			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(cf->output.swizzle_w) |
   1858 			S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->output.barrier) |
   1859 			cf->output.inst |
   1860 			S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program);
   1861 		break;
   1862 	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
   1863 	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
   1864 	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
   1865 	case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
   1866 		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
   1867 			S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
   1868 			S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
   1869 			S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
   1870 		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
   1871 			S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->output.barrier) |
   1872 			cf->output.inst |
   1873 			S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program) |
   1874 			S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(cf->output.array_size) |
   1875 			S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(cf->output.comp_mask);
   1876 		break;
   1877 	case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
   1878 	case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
   1879 	case V_SQ_CF_WORD1_SQ_CF_INST_POP:
   1880 	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
   1881 	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_DX10:
   1882 	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
   1883 	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
   1884 	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
   1885 	case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
   1886 	case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
        		/* flow control uses cf->cf_addr (the branch target), not the
        		 * clause address cf->addr used by the cases above */
   1887 		bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->cf_addr >> 1);
   1888 		bc->bytecode[id++] = cf->inst |
   1889 					S_SQ_CF_WORD1_BARRIER(1) |
   1890 			                S_SQ_CF_WORD1_COND(cf->cond) |
   1891 			                S_SQ_CF_WORD1_POP_COUNT(cf->pop_count);
   1892 
   1893 		break;
   1894 	default:
   1895 		R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
   1896 		return -EINVAL;
   1897 	}
   1898 	return 0;
   1899 }
   1900 
   1901 int r600_bytecode_build(struct r600_bytecode *bc)
   1902 {
   1903 	struct r600_bytecode_cf *cf;
   1904 	struct r600_bytecode_alu *alu;
   1905 	struct r600_bytecode_vtx *vtx;
   1906 	struct r600_bytecode_tex *tex;
   1907 	uint32_t literal[4];
   1908 	unsigned nliteral;
   1909 	unsigned addr;
   1910 	int i, r;
   1911 
   1912 	if (bc->callstack[0].max > 0)
   1913 		bc->nstack = ((bc->callstack[0].max + 3) >> 2) + 2;
   1914 	if (bc->type == TGSI_PROCESSOR_VERTEX && !bc->nstack) {
   1915 		bc->nstack = 1;
   1916 	}
   1917 
   1918 	/* first path compute addr of each CF block */
   1919 	/* addr start after all the CF instructions */
   1920 	addr = bc->cf_last->id + 2;
   1921 	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
   1922 		if (bc->chip_class >= EVERGREEN) {
   1923 			switch (cf->inst) {
   1924 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX:
   1925 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX:
   1926 				/* fetch node need to be 16 bytes aligned*/
   1927 				addr += 3;
   1928 				addr &= 0xFFFFFFFCUL;
   1929 				break;
   1930 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU:
   1931 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
   1932 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
   1933 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
   1934 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
   1935 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
   1936 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0:
   1937 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1:
   1938 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2:
   1939 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3:
   1940 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0:
   1941 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1:
   1942 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2:
   1943 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3:
   1944 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0:
   1945 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1:
   1946 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2:
   1947 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3:
   1948 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0:
   1949 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1:
   1950 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2:
   1951 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3:
   1952 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
   1953 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
   1954 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_POP:
   1955 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
   1956 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_DX10:
   1957 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
   1958 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
   1959 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
   1960 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
   1961 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
   1962 			case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
   1963 			case CF_NATIVE:
   1964 				break;
   1965 			default:
   1966 				R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
   1967 				return -EINVAL;
   1968 			}
   1969 		} else {
   1970 			switch (cf->inst) {
   1971 			case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
   1972 			case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
   1973 			case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
   1974 				/* fetch node need to be 16 bytes aligned*/
   1975 				addr += 3;
   1976 				addr &= 0xFFFFFFFCUL;
   1977 				break;
   1978 			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU:
   1979 			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
   1980 			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
   1981 			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
   1982 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
   1983 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
   1984 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
   1985 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
   1986 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
   1987 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
   1988 			case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
   1989 			case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
   1990 			case V_SQ_CF_WORD1_SQ_CF_INST_POP:
   1991 			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_DX10:
   1992 			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
   1993 			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
   1994 			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
   1995 			case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
   1996 			case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
   1997 				break;
   1998 			default:
   1999 				R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
   2000 				return -EINVAL;
   2001 			}
   2002 		}
   2003 		cf->addr = addr;
   2004 		addr += cf->ndw;
   2005 		bc->ndw = cf->addr + cf->ndw;
   2006 	}
   2007 	free(bc->bytecode);
   2008 	bc->bytecode = calloc(1, bc->ndw * 4);
   2009 	if (bc->bytecode == NULL)
   2010 		return -ENOMEM;
   2011 	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
   2012 		addr = cf->addr;
   2013 		if (bc->chip_class >= EVERGREEN) {
   2014 			r = eg_bytecode_cf_build(bc, cf);
   2015 			if (r)
   2016 				return r;
   2017 
   2018 			switch (cf->inst) {
   2019 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU:
   2020 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
   2021 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
   2022 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
   2023 				nliteral = 0;
   2024 				memset(literal, 0, sizeof(literal));
   2025 				LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
   2026 					r = r600_bytecode_alu_nliterals(bc, alu, literal, &nliteral);
   2027 					if (r)
   2028 						return r;
   2029 					r600_bytecode_alu_adjust_literals(bc, alu, literal, nliteral);
   2030 					r600_bytecode_assign_kcache_banks(bc, alu, cf->kcache);
   2031 
   2032 					switch(bc->chip_class) {
   2033 					case EVERGREEN: /* eg alu is same encoding as r700 */
   2034 					case CAYMAN:
   2035 						r = r700_bytecode_alu_build(bc, alu, addr);
   2036 						break;
   2037 					default:
   2038 						R600_ERR("unknown chip class %d.\n", bc->chip_class);
   2039 						return -EINVAL;
   2040 					}
   2041 					if (r)
   2042 						return r;
   2043 					addr += 2;
   2044 					if (alu->last) {
   2045 						for (i = 0; i < align(nliteral, 2); ++i) {
   2046 							bc->bytecode[addr++] = literal[i];
   2047 						}
   2048 						nliteral = 0;
   2049 						memset(literal, 0, sizeof(literal));
   2050 					}
   2051 				}
   2052 				break;
   2053 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX:
   2054 				LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
   2055 					r = r600_bytecode_vtx_build(bc, vtx, addr);
   2056 					if (r)
   2057 						return r;
   2058 					addr += 4;
   2059 				}
   2060 				break;
   2061 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX:
   2062 				LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
   2063 					assert(bc->chip_class >= EVERGREEN);
   2064 					r = r600_bytecode_vtx_build(bc, vtx, addr);
   2065 					if (r)
   2066 						return r;
   2067 					addr += 4;
   2068 				}
   2069 				LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
   2070 					r = r600_bytecode_tex_build(bc, tex, addr);
   2071 					if (r)
   2072 						return r;
   2073 					addr += 4;
   2074 				}
   2075 				break;
   2076 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
   2077 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
   2078 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0:
   2079 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1:
   2080 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2:
   2081 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3:
   2082 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0:
   2083 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1:
   2084 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2:
   2085 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3:
   2086 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0:
   2087 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1:
   2088 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2:
   2089 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3:
   2090 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0:
   2091 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1:
   2092 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2:
   2093 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3:
   2094 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_DX10:
   2095 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
   2096 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
   2097 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
   2098 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
   2099 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
   2100 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
   2101 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_POP:
   2102 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
   2103 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
   2104 			case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
   2105 				break;
   2106 			case CF_NATIVE:
   2107 				break;
   2108 			default:
   2109 				R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
   2110 				return -EINVAL;
   2111 			}
   2112 		} else {
   2113 			r = r600_bytecode_cf_build(bc, cf);
   2114 			if (r)
   2115 				return r;
   2116 
   2117 			switch (cf->inst) {
   2118 			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU:
   2119 			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
   2120 			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
   2121 			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
   2122 				nliteral = 0;
   2123 				memset(literal, 0, sizeof(literal));
   2124 				LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
   2125 					r = r600_bytecode_alu_nliterals(bc, alu, literal, &nliteral);
   2126 					if (r)
   2127 						return r;
   2128 					r600_bytecode_alu_adjust_literals(bc, alu, literal, nliteral);
   2129 					r600_bytecode_assign_kcache_banks(bc, alu, cf->kcache);
   2130 
   2131 					switch(bc->chip_class) {
   2132 					case R600:
   2133 						r = r600_bytecode_alu_build(bc, alu, addr);
   2134 						break;
   2135 					case R700:
   2136 						r = r700_bytecode_alu_build(bc, alu, addr);
   2137 						break;
   2138 					default:
   2139 						R600_ERR("unknown chip class %d.\n", bc->chip_class);
   2140 						return -EINVAL;
   2141 					}
   2142 					if (r)
   2143 						return r;
   2144 					addr += 2;
   2145 					if (alu->last) {
   2146 						for (i = 0; i < align(nliteral, 2); ++i) {
   2147 							bc->bytecode[addr++] = literal[i];
   2148 						}
   2149 						nliteral = 0;
   2150 						memset(literal, 0, sizeof(literal));
   2151 					}
   2152 				}
   2153 				break;
   2154 			case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
   2155 			case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
   2156 				LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
   2157 					r = r600_bytecode_vtx_build(bc, vtx, addr);
   2158 					if (r)
   2159 						return r;
   2160 					addr += 4;
   2161 				}
   2162 				break;
   2163 			case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
   2164 				LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
   2165 					r = r600_bytecode_tex_build(bc, tex, addr);
   2166 					if (r)
   2167 						return r;
   2168 					addr += 4;
   2169 				}
   2170 				break;
   2171 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
   2172 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
   2173 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
   2174 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
   2175 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
   2176 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
   2177 			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
   2178 			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_DX10:
   2179 			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
   2180 			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
   2181 			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
   2182 			case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
   2183 			case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
   2184 			case V_SQ_CF_WORD1_SQ_CF_INST_POP:
   2185 			case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
   2186 			case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
   2187 				break;
   2188 			default:
   2189 				R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
   2190 				return -EINVAL;
   2191 			}
   2192 		}
   2193 	}
   2194 	return 0;
   2195 }
   2196 
   2197 void r600_bytecode_clear(struct r600_bytecode *bc)
   2198 {
   2199 	struct r600_bytecode_cf *cf = NULL, *next_cf;
   2200 
   2201 	free(bc->bytecode);
   2202 	bc->bytecode = NULL;
   2203 
   2204 	LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) {
   2205 		struct r600_bytecode_alu *alu = NULL, *next_alu;
   2206 		struct r600_bytecode_tex *tex = NULL, *next_tex;
   2207 		struct r600_bytecode_tex *vtx = NULL, *next_vtx;
   2208 
   2209 		LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) {
   2210 			free(alu);
   2211 		}
   2212 
   2213 		LIST_INITHEAD(&cf->alu);
   2214 
   2215 		LIST_FOR_EACH_ENTRY_SAFE(tex, next_tex, &cf->tex, list) {
   2216 			free(tex);
   2217 		}
   2218 
   2219 		LIST_INITHEAD(&cf->tex);
   2220 
   2221 		LIST_FOR_EACH_ENTRY_SAFE(vtx, next_vtx, &cf->vtx, list) {
   2222 			free(vtx);
   2223 		}
   2224 
   2225 		LIST_INITHEAD(&cf->vtx);
   2226 
   2227 		free(cf);
   2228 	}
   2229 
   2230 	LIST_INITHEAD(&cf->list);
   2231 }
   2232 
   2233 void r600_bytecode_dump(struct r600_bytecode *bc)
   2234 {
   2235 	struct r600_bytecode_cf *cf = NULL;
   2236 	struct r600_bytecode_alu *alu = NULL;
   2237 	struct r600_bytecode_vtx *vtx = NULL;
   2238 	struct r600_bytecode_tex *tex = NULL;
   2239 
   2240 	unsigned i, id;
   2241 	uint32_t literal[4];
   2242 	unsigned nliteral;
   2243 	char chip = '6';
   2244 
   2245 	switch (bc->chip_class) {
   2246 	case R700:
   2247 		chip = '7';
   2248 		break;
   2249 	case EVERGREEN:
   2250 		chip = 'E';
   2251 		break;
   2252 	case CAYMAN:
   2253 		chip = 'C';
   2254 		break;
   2255 	case R600:
   2256 	default:
   2257 		chip = '6';
   2258 		break;
   2259 	}
   2260 	fprintf(stderr, "bytecode %d dw -- %d gprs ---------------------\n", bc->ndw, bc->ngpr);
   2261 	fprintf(stderr, "     %c\n", chip);
   2262 
   2263 	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
   2264 		id = cf->id;
   2265 
   2266 		if (bc->chip_class >= EVERGREEN) {
   2267 			switch (cf->inst) {
   2268 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU:
   2269 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
   2270 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
   2271 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
   2272 				if (cf->eg_alu_extended) {
   2273 					fprintf(stderr, "%04d %08X ALU_EXT0 ", id, bc->bytecode[id]);
   2274 					fprintf(stderr, "KCACHE_BANK2:%X ", cf->kcache[2].bank);
   2275 					fprintf(stderr, "KCACHE_BANK3:%X ", cf->kcache[3].bank);
   2276 					fprintf(stderr, "KCACHE_MODE2:%X\n", cf->kcache[2].mode);
   2277 					id++;
   2278 					fprintf(stderr, "%04d %08X ALU_EXT1 ", id, bc->bytecode[id]);
   2279 					fprintf(stderr, "KCACHE_MODE3:%X ", cf->kcache[3].mode);
   2280 					fprintf(stderr, "KCACHE_ADDR2:%X ", cf->kcache[2].addr);
   2281 					fprintf(stderr, "KCACHE_ADDR3:%X\n", cf->kcache[3].addr);
   2282 					id++;
   2283 				}
   2284 
   2285 				fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
   2286 				fprintf(stderr, "ADDR:%d ", cf->addr);
   2287 				fprintf(stderr, "KCACHE_MODE0:%X ", cf->kcache[0].mode);
   2288 				fprintf(stderr, "KCACHE_BANK0:%X ", cf->kcache[0].bank);
   2289 				fprintf(stderr, "KCACHE_BANK1:%X\n", cf->kcache[1].bank);
   2290 				id++;
   2291 				fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
   2292 				fprintf(stderr, "INST:0x%x ", EG_G_SQ_CF_ALU_WORD1_CF_INST(cf->inst));
   2293 				fprintf(stderr, "KCACHE_MODE1:%X ", cf->kcache[1].mode);
   2294 				fprintf(stderr, "KCACHE_ADDR0:%X ", cf->kcache[0].addr);
   2295 				fprintf(stderr, "KCACHE_ADDR1:%X ", cf->kcache[1].addr);
   2296 				fprintf(stderr, "COUNT:%d\n", cf->ndw / 2);
   2297 				break;
   2298 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX:
   2299 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX:
   2300 				fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
   2301 				fprintf(stderr, "ADDR:%d\n", cf->addr);
   2302 				id++;
   2303 				fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
   2304 				fprintf(stderr, "INST:0x%x ", EG_G_SQ_CF_WORD1_CF_INST(cf->inst));
   2305 				fprintf(stderr, "COUNT:%d\n", cf->ndw / 4);
   2306 				break;
   2307 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
   2308 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
   2309 				fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
   2310 				fprintf(stderr, "GPR:%X ", cf->output.gpr);
   2311 				fprintf(stderr, "ELEM_SIZE:%X ", cf->output.elem_size);
   2312 				fprintf(stderr, "ARRAY_BASE:%X ", cf->output.array_base);
   2313 				fprintf(stderr, "TYPE:%X\n", cf->output.type);
   2314 				id++;
   2315 				fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
   2316 				fprintf(stderr, "SWIZ_X:%X ", cf->output.swizzle_x);
   2317 				fprintf(stderr, "SWIZ_Y:%X ", cf->output.swizzle_y);
   2318 				fprintf(stderr, "SWIZ_Z:%X ", cf->output.swizzle_z);
   2319 				fprintf(stderr, "SWIZ_W:%X ", cf->output.swizzle_w);
   2320 				fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
   2321 				fprintf(stderr, "INST:0x%x ", EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->output.inst));
   2322 				fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
   2323 				fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
   2324 				break;
   2325 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0:
   2326 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1:
   2327 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2:
   2328 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3:
   2329 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0:
   2330 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1:
   2331 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2:
   2332 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3:
   2333 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0:
   2334 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1:
   2335 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2:
   2336 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3:
   2337 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0:
   2338 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1:
   2339 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2:
   2340 			case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3:
   2341 				fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i_BUF%i ", id, bc->bytecode[id],
   2342 					(EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
   2343 					 EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) / 4,
   2344 					(EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
   2345 					 EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) % 4);
   2346 				fprintf(stderr, "GPR:%X ", cf->output.gpr);
   2347 				fprintf(stderr, "ELEM_SIZE:%i ", cf->output.elem_size);
   2348 				fprintf(stderr, "ARRAY_BASE:%i ", cf->output.array_base);
   2349 				fprintf(stderr, "TYPE:%X\n", cf->output.type);
   2350 				id++;
   2351 				fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i_BUF%i ", id, bc->bytecode[id],
   2352 					(EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
   2353 					 EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) / 4,
   2354 					(EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
   2355 					 EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) % 4);
   2356 				fprintf(stderr, "ARRAY_SIZE:%i ", cf->output.array_size);
   2357 				fprintf(stderr, "COMP_MASK:%X ", cf->output.comp_mask);
   2358 				fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
   2359 				fprintf(stderr, "INST:%d ", cf->output.inst);
   2360 				fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
   2361 				fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
   2362 				break;
   2363 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
   2364 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
   2365 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_POP:
   2366 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
   2367 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_DX10:
   2368 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
   2369 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
   2370 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
   2371 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
   2372 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
   2373 			case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
   2374 				fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
   2375 				fprintf(stderr, "ADDR:%d\n", cf->cf_addr);
   2376 				id++;
   2377 				fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
   2378 				fprintf(stderr, "INST:0x%x ", EG_G_SQ_CF_WORD1_CF_INST(cf->inst));
   2379 				fprintf(stderr, "COND:%X ", cf->cond);
   2380 				fprintf(stderr, "POP_COUNT:%X\n", cf->pop_count);
   2381 				break;
   2382 			case CF_NATIVE:
   2383 				fprintf(stderr, "%04d %08X CF NATIVE\n", id, bc->bytecode[id]);
   2384 				fprintf(stderr, "%04d %08X CF NATIVE\n", id + 1, bc->bytecode[id + 1]);
   2385 				break;
   2386 			default:
   2387 				R600_ERR("Unknown instruction %0x\n", cf->inst);
   2388 			}
   2389 		} else {
   2390 			switch (cf->inst) {
   2391 			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU:
   2392 			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
   2393 			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
   2394 			case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
   2395 				fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
   2396 				fprintf(stderr, "ADDR:%d ", cf->addr);
   2397 				fprintf(stderr, "KCACHE_MODE0:%X ", cf->kcache[0].mode);
   2398 				fprintf(stderr, "KCACHE_BANK0:%X ", cf->kcache[0].bank);
   2399 				fprintf(stderr, "KCACHE_BANK1:%X\n", cf->kcache[1].bank);
   2400 				id++;
   2401 				fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
   2402 				fprintf(stderr, "INST:0x%x ", R600_G_SQ_CF_ALU_WORD1_CF_INST(cf->inst));
   2403 				fprintf(stderr, "KCACHE_MODE1:%X ", cf->kcache[1].mode);
   2404 				fprintf(stderr, "KCACHE_ADDR0:%X ", cf->kcache[0].addr);
   2405 				fprintf(stderr, "KCACHE_ADDR1:%X ", cf->kcache[1].addr);
   2406 				fprintf(stderr, "COUNT:%d\n", cf->ndw / 2);
   2407 				break;
   2408 			case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
   2409 			case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
   2410 			case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
   2411 				fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
   2412 				fprintf(stderr, "ADDR:%d\n", cf->addr);
   2413 				id++;
   2414 				fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
   2415 				fprintf(stderr, "INST:0x%x ", R600_G_SQ_CF_WORD1_CF_INST(cf->inst));
   2416 				fprintf(stderr, "COUNT:%d\n", cf->ndw / 4);
   2417 				break;
   2418 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
   2419 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
   2420 				fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
   2421 				fprintf(stderr, "GPR:%X ", cf->output.gpr);
   2422 				fprintf(stderr, "ELEM_SIZE:%X ", cf->output.elem_size);
   2423 				fprintf(stderr, "ARRAY_BASE:%X ", cf->output.array_base);
   2424 				fprintf(stderr, "TYPE:%X\n", cf->output.type);
   2425 				id++;
   2426 				fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
   2427 				fprintf(stderr, "SWIZ_X:%X ", cf->output.swizzle_x);
   2428 				fprintf(stderr, "SWIZ_Y:%X ", cf->output.swizzle_y);
   2429 				fprintf(stderr, "SWIZ_Z:%X ", cf->output.swizzle_z);
   2430 				fprintf(stderr, "SWIZ_W:%X ", cf->output.swizzle_w);
   2431 				fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
   2432 				fprintf(stderr, "INST:0x%x ", R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->output.inst));
   2433 				fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
   2434 				fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
   2435 				break;
   2436 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
   2437 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
   2438 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
   2439 			case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
   2440 				fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i ", id, bc->bytecode[id],
   2441 					R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
   2442 					R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0));
   2443 				fprintf(stderr, "GPR:%X ", cf->output.gpr);
   2444 				fprintf(stderr, "ELEM_SIZE:%i ", cf->output.elem_size);
   2445 				fprintf(stderr, "ARRAY_BASE:%i ", cf->output.array_base);
   2446 				fprintf(stderr, "TYPE:%X\n", cf->output.type);
   2447 				id++;
   2448 				fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i ", id, bc->bytecode[id],
   2449 					R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
   2450 					R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0));
   2451 				fprintf(stderr, "ARRAY_SIZE:%i ", cf->output.array_size);
   2452 				fprintf(stderr, "COMP_MASK:%X ", cf->output.comp_mask);
   2453 				fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
   2454 				fprintf(stderr, "INST:%d ", cf->output.inst);
   2455 				fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
   2456 				fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
   2457 				break;
   2458 			case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
   2459 			case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
   2460 			case V_SQ_CF_WORD1_SQ_CF_INST_POP:
   2461 			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
   2462 			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_DX10:
   2463 			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
   2464 			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
   2465 			case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
   2466 			case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
   2467 			case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
   2468 				fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
   2469 				fprintf(stderr, "ADDR:%d\n", cf->cf_addr);
   2470 				id++;
   2471 				fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
   2472 				fprintf(stderr, "INST:0x%x ", R600_G_SQ_CF_WORD1_CF_INST(cf->inst));
   2473 				fprintf(stderr, "COND:%X ", cf->cond);
   2474 				fprintf(stderr, "POP_COUNT:%X\n", cf->pop_count);
   2475 				break;
   2476 			default:
   2477 				R600_ERR("Unknown instruction %0x\n", cf->inst);
   2478 			}
   2479 		}
   2480 
   2481 		id = cf->addr;
   2482 		nliteral = 0;
   2483 		LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
   2484 			r600_bytecode_alu_nliterals(bc, alu, literal, &nliteral);
   2485 
   2486 			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
   2487 			fprintf(stderr, "SRC0(SEL:%d ", alu->src[0].sel);
   2488 			fprintf(stderr, "REL:%d ", alu->src[0].rel);
   2489 			fprintf(stderr, "CHAN:%d ", alu->src[0].chan);
   2490 			fprintf(stderr, "NEG:%d) ", alu->src[0].neg);
   2491 			fprintf(stderr, "SRC1(SEL:%d ", alu->src[1].sel);
   2492 			fprintf(stderr, "REL:%d ", alu->src[1].rel);
   2493 			fprintf(stderr, "CHAN:%d ", alu->src[1].chan);
   2494 			fprintf(stderr, "NEG:%d ", alu->src[1].neg);
   2495 			fprintf(stderr, "IM:%d) ", alu->index_mode);
   2496 			fprintf(stderr, "PRED_SEL:%d ", alu->pred_sel);
   2497 			fprintf(stderr, "LAST:%d)\n", alu->last);
   2498 			id++;
   2499 			fprintf(stderr, "%04d %08X %c ", id, bc->bytecode[id], alu->last ? '*' : ' ');
   2500 			fprintf(stderr, "INST:0x%x ", alu->inst);
   2501 			fprintf(stderr, "DST(SEL:%d ", alu->dst.sel);
   2502 			fprintf(stderr, "CHAN:%d ", alu->dst.chan);
   2503 			fprintf(stderr, "REL:%d ", alu->dst.rel);
   2504 			fprintf(stderr, "CLAMP:%d) ", alu->dst.clamp);
   2505 			fprintf(stderr, "BANK_SWIZZLE:%d ", alu->bank_swizzle);
   2506 			if (alu->is_op3) {
   2507 				fprintf(stderr, "SRC2(SEL:%d ", alu->src[2].sel);
   2508 				fprintf(stderr, "REL:%d ", alu->src[2].rel);
   2509 				fprintf(stderr, "CHAN:%d ", alu->src[2].chan);
   2510 				fprintf(stderr, "NEG:%d)\n", alu->src[2].neg);
   2511 			} else {
   2512 				fprintf(stderr, "SRC0_ABS:%d ", alu->src[0].abs);
   2513 				fprintf(stderr, "SRC1_ABS:%d ", alu->src[1].abs);
   2514 				fprintf(stderr, "WRITE_MASK:%d ", alu->dst.write);
   2515 				fprintf(stderr, "OMOD:%d ", alu->omod);
   2516 				fprintf(stderr, "EXECUTE_MASK:%d ", alu->execute_mask);
   2517 				fprintf(stderr, "UPDATE_PRED:%d\n", alu->update_pred);
   2518 			}
   2519 
   2520 			id++;
   2521 			if (alu->last) {
   2522 				for (i = 0; i < nliteral; i++, id++) {
   2523 					float *f = (float*)(bc->bytecode + id);
   2524 					fprintf(stderr, "%04d %08X\t%f (%d)\n", id, bc->bytecode[id], *f,
   2525 							*(bc->bytecode + id));
   2526 				}
   2527 				id += nliteral & 1;
   2528 				nliteral = 0;
   2529 			}
   2530 		}
   2531 
   2532 		LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
   2533 			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
   2534 			fprintf(stderr, "INST:0x%x ", tex->inst);
   2535 			fprintf(stderr, "RESOURCE_ID:%d ", tex->resource_id);
   2536 			fprintf(stderr, "SRC(GPR:%d ", tex->src_gpr);
   2537 			fprintf(stderr, "REL:%d)\n", tex->src_rel);
   2538 			id++;
   2539 			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
   2540 			fprintf(stderr, "DST(GPR:%d ", tex->dst_gpr);
   2541 			fprintf(stderr, "REL:%d ", tex->dst_rel);
   2542 			fprintf(stderr, "SEL_X:%d ", tex->dst_sel_x);
   2543 			fprintf(stderr, "SEL_Y:%d ", tex->dst_sel_y);
   2544 			fprintf(stderr, "SEL_Z:%d ", tex->dst_sel_z);
   2545 			fprintf(stderr, "SEL_W:%d) ", tex->dst_sel_w);
   2546 			fprintf(stderr, "LOD_BIAS:%d ", tex->lod_bias);
   2547 			fprintf(stderr, "COORD_TYPE_X:%d ", tex->coord_type_x);
   2548 			fprintf(stderr, "COORD_TYPE_Y:%d ", tex->coord_type_y);
   2549 			fprintf(stderr, "COORD_TYPE_Z:%d ", tex->coord_type_z);
   2550 			fprintf(stderr, "COORD_TYPE_W:%d\n", tex->coord_type_w);
   2551 			id++;
   2552 			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
   2553 			fprintf(stderr, "OFFSET_X:%d ", tex->offset_x);
   2554 			fprintf(stderr, "OFFSET_Y:%d ", tex->offset_y);
   2555 			fprintf(stderr, "OFFSET_Z:%d ", tex->offset_z);
   2556 			fprintf(stderr, "SAMPLER_ID:%d ", tex->sampler_id);
   2557 			fprintf(stderr, "SRC(SEL_X:%d ", tex->src_sel_x);
   2558 			fprintf(stderr, "SEL_Y:%d ", tex->src_sel_y);
   2559 			fprintf(stderr, "SEL_Z:%d ", tex->src_sel_z);
   2560 			fprintf(stderr, "SEL_W:%d)\n", tex->src_sel_w);
   2561 			id++;
   2562 			fprintf(stderr, "%04d %08X   \n", id, bc->bytecode[id]);
   2563 			id++;
   2564 		}
   2565 
   2566 		LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
   2567 			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
   2568 			fprintf(stderr, "INST:%d ", vtx->inst);
   2569 			fprintf(stderr, "FETCH_TYPE:%d ", vtx->fetch_type);
   2570 			fprintf(stderr, "BUFFER_ID:%d\n", vtx->buffer_id);
   2571 			id++;
   2572 			/* This assumes that no semantic fetches exist */
   2573 			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
   2574 			fprintf(stderr, "SRC(GPR:%d ", vtx->src_gpr);
   2575 			fprintf(stderr, "SEL_X:%d) ", vtx->src_sel_x);
   2576 			if (bc->chip_class < CAYMAN)
   2577 				fprintf(stderr, "MEGA_FETCH_COUNT:%d ", vtx->mega_fetch_count);
   2578 			else
   2579 				fprintf(stderr, "SEL_Y:%d) ", 0);
   2580 			fprintf(stderr, "DST(GPR:%d ", vtx->dst_gpr);
   2581 			fprintf(stderr, "SEL_X:%d ", vtx->dst_sel_x);
   2582 			fprintf(stderr, "SEL_Y:%d ", vtx->dst_sel_y);
   2583 			fprintf(stderr, "SEL_Z:%d ", vtx->dst_sel_z);
   2584 			fprintf(stderr, "SEL_W:%d) ", vtx->dst_sel_w);
   2585 			fprintf(stderr, "USE_CONST_FIELDS:%d ", vtx->use_const_fields);
   2586 			fprintf(stderr, "FORMAT(DATA:%d ", vtx->data_format);
   2587 			fprintf(stderr, "NUM:%d ", vtx->num_format_all);
   2588 			fprintf(stderr, "COMP:%d ", vtx->format_comp_all);
   2589 			fprintf(stderr, "MODE:%d)\n", vtx->srf_mode_all);
   2590 			id++;
   2591 			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
   2592 			fprintf(stderr, "ENDIAN:%d ", vtx->endian);
   2593 			fprintf(stderr, "OFFSET:%d\n", vtx->offset);
   2594 			/* XXX */
   2595 			id++;
   2596 			fprintf(stderr, "%04d %08X   \n", id, bc->bytecode[id]);
   2597 			id++;
   2598 		}
   2599 	}
   2600 
   2601 	fprintf(stderr, "--------------------------------------\n");
   2602 }
   2603 
   2604 static void r600_vertex_data_type(enum pipe_format pformat,
   2605 				  unsigned *format,
   2606 				  unsigned *num_format, unsigned *format_comp, unsigned *endian)
   2607 {
   2608 	const struct util_format_description *desc;
   2609 	unsigned i;
   2610 
   2611 	*format = 0;
   2612 	*num_format = 0;
   2613 	*format_comp = 0;
   2614 	*endian = ENDIAN_NONE;
   2615 
   2616 	desc = util_format_description(pformat);
   2617 	if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
   2618 		goto out_unknown;
   2619 	}
   2620 
   2621 	/* Find the first non-VOID channel. */
   2622 	for (i = 0; i < 4; i++) {
   2623 		if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
   2624 			break;
   2625 		}
   2626 	}
   2627 
   2628 	*endian = r600_endian_swap(desc->channel[i].size);
   2629 
   2630 	switch (desc->channel[i].type) {
   2631 	/* Half-floats, floats, ints */
   2632 	case UTIL_FORMAT_TYPE_FLOAT:
   2633 		switch (desc->channel[i].size) {
   2634 		case 16:
   2635 			switch (desc->nr_channels) {
   2636 			case 1:
   2637 				*format = FMT_16_FLOAT;
   2638 				break;
   2639 			case 2:
   2640 				*format = FMT_16_16_FLOAT;
   2641 				break;
   2642 			case 3:
   2643 			case 4:
   2644 				*format = FMT_16_16_16_16_FLOAT;
   2645 				break;
   2646 			}
   2647 			break;
   2648 		case 32:
   2649 			switch (desc->nr_channels) {
   2650 			case 1:
   2651 				*format = FMT_32_FLOAT;
   2652 				break;
   2653 			case 2:
   2654 				*format = FMT_32_32_FLOAT;
   2655 				break;
   2656 			case 3:
   2657 				*format = FMT_32_32_32_FLOAT;
   2658 				break;
   2659 			case 4:
   2660 				*format = FMT_32_32_32_32_FLOAT;
   2661 				break;
   2662 			}
   2663 			break;
   2664 		default:
   2665 			goto out_unknown;
   2666 		}
   2667 		break;
   2668 		/* Unsigned ints */
   2669 	case UTIL_FORMAT_TYPE_UNSIGNED:
   2670 		/* Signed ints */
   2671 	case UTIL_FORMAT_TYPE_SIGNED:
   2672 		switch (desc->channel[i].size) {
   2673 		case 8:
   2674 			switch (desc->nr_channels) {
   2675 			case 1:
   2676 				*format = FMT_8;
   2677 				break;
   2678 			case 2:
   2679 				*format = FMT_8_8;
   2680 				break;
   2681 			case 3:
   2682 			case 4:
   2683 				*format = FMT_8_8_8_8;
   2684 				break;
   2685 			}
   2686 			break;
   2687 		case 10:
   2688 			if (desc->nr_channels != 4)
   2689 				goto out_unknown;
   2690 
   2691 			*format = FMT_2_10_10_10;
   2692 			break;
   2693 		case 16:
   2694 			switch (desc->nr_channels) {
   2695 			case 1:
   2696 				*format = FMT_16;
   2697 				break;
   2698 			case 2:
   2699 				*format = FMT_16_16;
   2700 				break;
   2701 			case 3:
   2702 			case 4:
   2703 				*format = FMT_16_16_16_16;
   2704 				break;
   2705 			}
   2706 			break;
   2707 		case 32:
   2708 			switch (desc->nr_channels) {
   2709 			case 1:
   2710 				*format = FMT_32;
   2711 				break;
   2712 			case 2:
   2713 				*format = FMT_32_32;
   2714 				break;
   2715 			case 3:
   2716 				*format = FMT_32_32_32;
   2717 				break;
   2718 			case 4:
   2719 				*format = FMT_32_32_32_32;
   2720 				break;
   2721 			}
   2722 			break;
   2723 		default:
   2724 			goto out_unknown;
   2725 		}
   2726 		break;
   2727 	default:
   2728 		goto out_unknown;
   2729 	}
   2730 
   2731 	if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
   2732 		*format_comp = 1;
   2733 	}
   2734 
   2735 	*num_format = 0;
   2736 	if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
   2737 	    desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
   2738 		if (!desc->channel[i].normalized) {
   2739 			if (desc->channel[i].pure_integer)
   2740 				*num_format = 1;
   2741 			else
   2742 				*num_format = 2;
   2743 		}
   2744 	}
   2745 	return;
   2746 out_unknown:
   2747 	R600_ERR("unsupported vertex format %s\n", util_format_name(pformat));
   2748 }
   2749 
   2750 int r600_vertex_elements_build_fetch_shader(struct r600_context *rctx, struct r600_vertex_element *ve)
   2751 {
   2752 	static int dump_shaders = -1;
   2753 
   2754 	struct r600_bytecode bc;
   2755 	struct r600_bytecode_vtx vtx;
   2756 	struct pipe_vertex_element *elements = ve->elements;
   2757 	const struct util_format_description *desc;
   2758 	unsigned fetch_resource_start = rctx->chip_class >= EVERGREEN ? 0 : 160;
   2759 	unsigned format, num_format, format_comp, endian;
   2760 	uint32_t *bytecode;
   2761 	int i, j, r;
   2762 
   2763 	memset(&bc, 0, sizeof(bc));
   2764 	r600_bytecode_init(&bc, rctx->chip_class, rctx->family);
   2765 
   2766 	for (i = 0; i < ve->count; i++) {
   2767 		if (elements[i].instance_divisor > 1) {
   2768 			if (rctx->chip_class == CAYMAN) {
   2769 				for (j = 0; j < 4; j++) {
   2770 					struct r600_bytecode_alu alu;
   2771 					memset(&alu, 0, sizeof(alu));
   2772 					alu.inst = BC_INST(&bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
   2773 					alu.src[0].sel = 0;
   2774 					alu.src[0].chan = 3;
   2775 					alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   2776 					alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
   2777 					alu.dst.sel = i + 1;
   2778 					alu.dst.chan = j;
   2779 					alu.dst.write = j == 3;
   2780 					alu.last = j == 3;
   2781 					if ((r = r600_bytecode_add_alu(&bc, &alu))) {
   2782 						r600_bytecode_clear(&bc);
   2783 						return r;
   2784 					}
   2785 				}
   2786 			} else {
   2787 				struct r600_bytecode_alu alu;
   2788 				memset(&alu, 0, sizeof(alu));
   2789 				alu.inst = BC_INST(&bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
   2790 				alu.src[0].sel = 0;
   2791 				alu.src[0].chan = 3;
   2792 				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   2793 				alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
   2794 				alu.dst.sel = i + 1;
   2795 				alu.dst.chan = 3;
   2796 				alu.dst.write = 1;
   2797 				alu.last = 1;
   2798 				if ((r = r600_bytecode_add_alu(&bc, &alu))) {
   2799 					r600_bytecode_clear(&bc);
   2800 					return r;
   2801 				}
   2802 			}
   2803 		}
   2804 	}
   2805 
   2806 	for (i = 0; i < ve->count; i++) {
   2807 		r600_vertex_data_type(ve->elements[i].src_format,
   2808 				      &format, &num_format, &format_comp, &endian);
   2809 
   2810 		desc = util_format_description(ve->elements[i].src_format);
   2811 		if (desc == NULL) {
   2812 			r600_bytecode_clear(&bc);
   2813 			R600_ERR("unknown format %d\n", ve->elements[i].src_format);
   2814 			return -EINVAL;
   2815 		}
   2816 
   2817 		if (elements[i].src_offset > 65535) {
   2818 			r600_bytecode_clear(&bc);
   2819 			R600_ERR("too big src_offset: %u\n", elements[i].src_offset);
   2820 			return -EINVAL;
   2821 		}
   2822 
   2823 		memset(&vtx, 0, sizeof(vtx));
   2824 		vtx.buffer_id = elements[i].vertex_buffer_index + fetch_resource_start;
   2825 		vtx.fetch_type = elements[i].instance_divisor ? 1 : 0;
   2826 		vtx.src_gpr = elements[i].instance_divisor > 1 ? i + 1 : 0;
   2827 		vtx.src_sel_x = elements[i].instance_divisor ? 3 : 0;
   2828 		vtx.mega_fetch_count = 0x1F;
   2829 		vtx.dst_gpr = i + 1;
   2830 		vtx.dst_sel_x = desc->swizzle[0];
   2831 		vtx.dst_sel_y = desc->swizzle[1];
   2832 		vtx.dst_sel_z = desc->swizzle[2];
   2833 		vtx.dst_sel_w = desc->swizzle[3];
   2834 		vtx.data_format = format;
   2835 		vtx.num_format_all = num_format;
   2836 		vtx.format_comp_all = format_comp;
   2837 		vtx.srf_mode_all = 1;
   2838 		vtx.offset = elements[i].src_offset;
   2839 		vtx.endian = endian;
   2840 
   2841 		if ((r = r600_bytecode_add_vtx(&bc, &vtx))) {
   2842 			r600_bytecode_clear(&bc);
   2843 			return r;
   2844 		}
   2845 	}
   2846 
   2847 	r600_bytecode_add_cfinst(&bc, BC_INST(&bc, V_SQ_CF_WORD1_SQ_CF_INST_RETURN));
   2848 
   2849 	if ((r = r600_bytecode_build(&bc))) {
   2850 		r600_bytecode_clear(&bc);
   2851 		return r;
   2852 	}
   2853 
   2854 	if (dump_shaders == -1)
   2855 		dump_shaders = debug_get_bool_option("R600_DUMP_SHADERS", FALSE);
   2856 
   2857 	if (dump_shaders) {
   2858 		fprintf(stderr, "--------------------------------------------------------------\n");
   2859 		r600_bytecode_dump(&bc);
   2860 		fprintf(stderr, "______________________________________________________________\n");
   2861 	}
   2862 
   2863 	ve->fs_size = bc.ndw*4;
   2864 
   2865 	ve->fetch_shader = (struct r600_resource*)
   2866 			pipe_buffer_create(rctx->context.screen,
   2867 					   PIPE_BIND_CUSTOM,
   2868 					   PIPE_USAGE_IMMUTABLE, ve->fs_size);
   2869 	if (ve->fetch_shader == NULL) {
   2870 		r600_bytecode_clear(&bc);
   2871 		return -ENOMEM;
   2872 	}
   2873 
   2874 	bytecode = rctx->ws->buffer_map(ve->fetch_shader->cs_buf, rctx->cs, PIPE_TRANSFER_WRITE);
   2875 	if (bytecode == NULL) {
   2876 		r600_bytecode_clear(&bc);
   2877 		pipe_resource_reference((struct pipe_resource**)&ve->fetch_shader, NULL);
   2878 		return -ENOMEM;
   2879 	}
   2880 
   2881 	if (R600_BIG_ENDIAN) {
   2882 		for (i = 0; i < ve->fs_size / 4; ++i) {
   2883 			bytecode[i] = bswap_32(bc.bytecode[i]);
   2884 		}
   2885 	} else {
   2886 		memcpy(bytecode, bc.bytecode, ve->fs_size);
   2887 	}
   2888 
   2889 	rctx->ws->buffer_unmap(ve->fetch_shader->cs_buf);
   2890 	r600_bytecode_clear(&bc);
   2891 
   2892 	if (rctx->chip_class >= EVERGREEN)
   2893 		evergreen_fetch_shader(&rctx->context, ve);
   2894 	else
   2895 		r600_fetch_shader(&rctx->context, ve);
   2896 
   2897 	return 0;
   2898 }
   2899