1 /************************************************************************** 2 * 3 * Copyright 2009 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * The above copyright notice and this permission notice (including the 15 * next paragraph) shall be included in all copies or substantial portions 16 * of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 * 26 **************************************************************************/ 27 28 /** 29 * @file 30 * Texture sampling -- common code. 
31 * 32 * @author Jose Fonseca <jfonseca (at) vmware.com> 33 */ 34 35 #include "pipe/p_defines.h" 36 #include "pipe/p_state.h" 37 #include "util/u_format.h" 38 #include "util/u_math.h" 39 #include "util/u_cpu_detect.h" 40 #include "lp_bld_arit.h" 41 #include "lp_bld_const.h" 42 #include "lp_bld_debug.h" 43 #include "lp_bld_printf.h" 44 #include "lp_bld_flow.h" 45 #include "lp_bld_sample.h" 46 #include "lp_bld_swizzle.h" 47 #include "lp_bld_type.h" 48 #include "lp_bld_logic.h" 49 #include "lp_bld_pack.h" 50 #include "lp_bld_quad.h" 51 #include "lp_bld_bitarit.h" 52 53 54 /* 55 * Bri-linear factor. Should be greater than one. 56 */ 57 #define BRILINEAR_FACTOR 2 58 59 /** 60 * Does the given texture wrap mode allow sampling the texture border color? 61 * XXX maybe move this into gallium util code. 62 */ 63 boolean 64 lp_sampler_wrap_mode_uses_border_color(unsigned mode, 65 unsigned min_img_filter, 66 unsigned mag_img_filter) 67 { 68 switch (mode) { 69 case PIPE_TEX_WRAP_REPEAT: 70 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: 71 case PIPE_TEX_WRAP_MIRROR_REPEAT: 72 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: 73 return FALSE; 74 case PIPE_TEX_WRAP_CLAMP: 75 case PIPE_TEX_WRAP_MIRROR_CLAMP: 76 if (min_img_filter == PIPE_TEX_FILTER_NEAREST && 77 mag_img_filter == PIPE_TEX_FILTER_NEAREST) { 78 return FALSE; 79 } else { 80 return TRUE; 81 } 82 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: 83 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: 84 return TRUE; 85 default: 86 assert(0 && "unexpected wrap mode"); 87 return FALSE; 88 } 89 } 90 91 92 /** 93 * Initialize lp_sampler_static_texture_state object with the gallium 94 * texture/sampler_view state (this contains the parts which are 95 * considered static). 
96 */ 97 void 98 lp_sampler_static_texture_state(struct lp_static_texture_state *state, 99 const struct pipe_sampler_view *view) 100 { 101 const struct pipe_resource *texture; 102 103 memset(state, 0, sizeof *state); 104 105 if (!view || !view->texture) 106 return; 107 108 texture = view->texture; 109 110 state->format = view->format; 111 state->swizzle_r = view->swizzle_r; 112 state->swizzle_g = view->swizzle_g; 113 state->swizzle_b = view->swizzle_b; 114 state->swizzle_a = view->swizzle_a; 115 116 state->target = view->target; 117 state->pot_width = util_is_power_of_two(texture->width0); 118 state->pot_height = util_is_power_of_two(texture->height0); 119 state->pot_depth = util_is_power_of_two(texture->depth0); 120 state->level_zero_only = !view->u.tex.last_level; 121 122 /* 123 * the layer / element / level parameters are all either dynamic 124 * state or handled transparently wrt execution. 125 */ 126 } 127 128 129 /** 130 * Initialize lp_sampler_static_sampler_state object with the gallium sampler 131 * state (this contains the parts which are considered static). 132 */ 133 void 134 lp_sampler_static_sampler_state(struct lp_static_sampler_state *state, 135 const struct pipe_sampler_state *sampler) 136 { 137 memset(state, 0, sizeof *state); 138 139 if (!sampler) 140 return; 141 142 /* 143 * We don't copy sampler state over unless it is actually enabled, to avoid 144 * spurious recompiles, as the sampler static state is part of the shader 145 * key. 146 * 147 * Ideally the state tracker or cso_cache module would make all state 148 * canonical, but until that happens it's better to be safe than sorry here. 149 * 150 * XXX: Actually there's much more than can be done here, especially 151 * regarding 1D/2D/3D/CUBE textures, wrap modes, etc. 
152 */ 153 154 state->wrap_s = sampler->wrap_s; 155 state->wrap_t = sampler->wrap_t; 156 state->wrap_r = sampler->wrap_r; 157 state->min_img_filter = sampler->min_img_filter; 158 state->mag_img_filter = sampler->mag_img_filter; 159 state->seamless_cube_map = sampler->seamless_cube_map; 160 161 if (sampler->max_lod > 0.0f) { 162 state->min_mip_filter = sampler->min_mip_filter; 163 } else { 164 state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE; 165 } 166 167 if (state->min_mip_filter != PIPE_TEX_MIPFILTER_NONE || 168 state->min_img_filter != state->mag_img_filter) { 169 if (sampler->lod_bias != 0.0f) { 170 state->lod_bias_non_zero = 1; 171 } 172 173 /* If min_lod == max_lod we can greatly simplify mipmap selection. 174 * This is a case that occurs during automatic mipmap generation. 175 */ 176 if (sampler->min_lod == sampler->max_lod) { 177 state->min_max_lod_equal = 1; 178 } else { 179 if (sampler->min_lod > 0.0f) { 180 state->apply_min_lod = 1; 181 } 182 183 /* 184 * XXX this won't do anything with the mesa state tracker which always 185 * sets max_lod to not more than actually present mip maps... 186 */ 187 if (sampler->max_lod < (PIPE_MAX_TEXTURE_LEVELS - 1)) { 188 state->apply_max_lod = 1; 189 } 190 } 191 } 192 193 state->compare_mode = sampler->compare_mode; 194 if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) { 195 state->compare_func = sampler->compare_func; 196 } 197 198 state->normalized_coords = sampler->normalized_coords; 199 } 200 201 202 /** 203 * Generate code to compute coordinate gradient (rho). 204 * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y 205 * 206 * The resulting rho has bld->levelf format (per quad or per element). 
 */
static LLVMValueRef
lp_build_rho(struct lp_build_sample_context *bld,
             unsigned texture_unit,
             LLVMValueRef s,
             LLVMValueRef t,
             LLVMValueRef r,
             LLVMValueRef cube_rho,
             const struct lp_derivatives *derivs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   struct lp_build_context *int_size_bld = &bld->int_size_in_bld;
   struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
   struct lp_build_context *float_bld = &bld->float_bld;
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *rho_bld = &bld->lodf_bld;
   const unsigned dims = bld->dims;
   LLVMValueRef ddx_ddy[2] = {NULL};
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
   LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
   LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0);
   LLVMValueRef rho_vec;
   LLVMValueRef int_size, float_size;
   LLVMValueRef rho;
   LLVMValueRef first_level, first_level_vec;
   unsigned length = coord_bld->type.length;
   unsigned num_quads = length / 4;
   /* rho result vector is narrower than the coords when it is per-quad */
   boolean rho_per_quad = rho_bld->type.length != length;
   boolean no_rho_opt = (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1);
   unsigned i;
   LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
   LLVMValueRef rho_xvec, rho_yvec;

   /* Note that all simplified calculations will only work for isotropic filtering */

   /*
    * rho calcs are always per quad except for explicit derivs (excluding
    * the messy cube maps for now) when requested.
    */

   /* Size of the base (first) mip level, as float, for scaling the derivatives. */
   first_level = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm,
                                                 bld->context_ptr, texture_unit);
   first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level);
   int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec, TRUE);
   float_size = lp_build_int_to_float(float_size_bld, int_size);

   if (cube_rho) {
      LLVMValueRef cubesize;
      LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);

      /*
       * Cube map code did already everything except size mul and per-quad extraction.
       * Luckily cube maps are always quadratic!
       */
      if (rho_per_quad) {
         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                         rho_bld->type, cube_rho, 0);
      }
      else {
         rho = lp_build_swizzle_scalar_aos(coord_bld, cube_rho, 0, 4);
      }
      /* Could optimize this for single quad just skip the broadcast */
      cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                            rho_bld->type, float_size, index0);
      /* skipping sqrt hence returning rho squared */
      cubesize = lp_build_mul(rho_bld, cubesize, cubesize);
      rho = lp_build_mul(rho_bld, cubesize, rho);
   }
   else if (derivs) {
      /* Explicit derivatives supplied by the shader (e.g. TXD). */
      LLVMValueRef ddmax[3], ddx[3], ddy[3];
      for (i = 0; i < dims; i++) {
         LLVMValueRef floatdim;
         LLVMValueRef indexi = lp_build_const_int32(gallivm, i);

         floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                               coord_bld->type, float_size, indexi);

         /*
          * note that for rho_per_quad case could reduce math (at some shuffle
          * cost), but for now use same code to per-pixel lod case.
          */
         if (no_rho_opt) {
            /* Accurate path: accumulate squared, texel-space derivatives. */
            ddx[i] = lp_build_mul(coord_bld, floatdim, derivs->ddx[i]);
            ddy[i] = lp_build_mul(coord_bld, floatdim, derivs->ddy[i]);
            ddx[i] = lp_build_mul(coord_bld, ddx[i], ddx[i]);
            ddy[i] = lp_build_mul(coord_bld, ddy[i], ddy[i]);
         }
         else {
            /* Approximate path: max(|ddx|, |ddy|) per dimension. */
            LLVMValueRef tmpx, tmpy;
            tmpx = lp_build_abs(coord_bld, derivs->ddx[i]);
            tmpy = lp_build_abs(coord_bld, derivs->ddy[i]);
            ddmax[i] = lp_build_max(coord_bld, tmpx, tmpy);
            ddmax[i] = lp_build_mul(coord_bld, floatdim, ddmax[i]);
         }
      }
      if (no_rho_opt) {
         rho_xvec = lp_build_add(coord_bld, ddx[0], ddx[1]);
         rho_yvec = lp_build_add(coord_bld, ddy[0], ddy[1]);
         if (dims > 2) {
            rho_xvec = lp_build_add(coord_bld, rho_xvec, ddx[2]);
            rho_yvec = lp_build_add(coord_bld, rho_yvec, ddy[2]);
         }
         rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);
         /* skipping sqrt hence returning rho squared */
      }
      else {
         rho = ddmax[0];
         if (dims > 1) {
            rho = lp_build_max(coord_bld, rho, ddmax[1]);
            if (dims > 2) {
               rho = lp_build_max(coord_bld, rho, ddmax[2]);
            }
         }
      }
      if (rho_per_quad) {
         /*
          * rho_vec contains per-pixel rho, convert to scalar per quad.
          */
         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                         rho_bld->type, rho, 0);
      }
   }
   else {
      /*
       * Implicit derivatives, computed from the packed quad layout.
       * This looks all a bit complex, but it's not that bad
       * (the shuffle code makes it look worse than it is).
       * Still, might not be ideal for all cases.
       */
      static const unsigned char swizzle0[] = { /* no-op swizzle */
         0, LP_BLD_SWIZZLE_DONTCARE,
         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
      };
      static const unsigned char swizzle1[] = {
         1, LP_BLD_SWIZZLE_DONTCARE,
         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
      };
      static const unsigned char swizzle2[] = {
         2, LP_BLD_SWIZZLE_DONTCARE,
         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
      };

      if (dims < 2) {
         ddx_ddy[0] = lp_build_packed_ddx_ddy_onecoord(coord_bld, s);
      }
      else if (dims >= 2) {
         ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
         if (dims > 2) {
            ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
         }
      }

      if (no_rho_opt) {
         static const unsigned char swizzle01[] = { /* no-op swizzle */
            0, 1,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         static const unsigned char swizzle23[] = {
            2, 3,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         LLVMValueRef ddx_ddys, ddx_ddyt, floatdim, shuffles[LP_MAX_VECTOR_LENGTH / 4];

         /* Replicate width/height across each quad's s/t derivative slots. */
         for (i = 0; i < num_quads; i++) {
            shuffles[i*4+0] = shuffles[i*4+1] = index0;
            shuffles[i*4+2] = shuffles[i*4+3] = index1;
         }
         floatdim = LLVMBuildShuffleVector(builder, float_size, float_size,
                                           LLVMConstVector(shuffles, length), "");
         ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], floatdim);
         ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
         ddx_ddys = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
         ddx_ddyt = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
         rho_vec = lp_build_add(coord_bld, ddx_ddys, ddx_ddyt);

         if (dims > 2) {
            static const unsigned char swizzle02[] = {
               0, 2,
               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
            };
            floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                                  coord_bld->type, float_size, index2);
            ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], floatdim);
            ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
            ddx_ddy[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
            rho_vec = lp_build_add(coord_bld, rho_vec, ddx_ddy[1]);
         }

         rho_xvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
         rho_yvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
         rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);

         if (rho_per_quad) {
            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                            rho_bld->type, rho, 0);
         }
         else {
            rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
         }
         /* skipping sqrt hence returning rho squared */
      }
      else {
         ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
         if (dims > 2) {
            ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
         }
         else {
            ddx_ddy[1] = NULL; /* silence compiler warning */
         }

         if (dims < 2) {
            rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle0);
            rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle2);
         }
         else if (dims == 2) {
            static const unsigned char swizzle02[] = {
               0, 2,
               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
            };
            static const unsigned char swizzle13[] = {
               1, 3,
               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
            };
            rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle02);
            rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle13);
         }
         else {
            /* dims == 3: gather x-derivs and y-derivs from both packed vectors */
            LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH];
            LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH];
            assert(dims == 3);
            for (i = 0; i < num_quads; i++) {
               shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i);
               shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2);
               shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i);
               shuffles1[4*i + 3] = i32undef;
               shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1);
               shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 3);
               shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 2);
               shuffles2[4*i + 3] = i32undef;
            }
            rho_xvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
                                              LLVMConstVector(shuffles1, length), "");
            rho_yvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
                                              LLVMConstVector(shuffles2, length), "");
         }

         rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);

         if (bld->coord_type.length > 4) {
            /* expand size to each quad */
            if (dims > 1) {
               /* could use some broadcast_vector helper for this? */
               LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4];
               for (i = 0; i < num_quads; i++) {
                  src[i] = float_size;
               }
               float_size = lp_build_concat(bld->gallivm, src, float_size_bld->type, num_quads);
            }
            else {
               float_size = lp_build_broadcast_scalar(coord_bld, float_size);
            }
            rho_vec = lp_build_mul(coord_bld, rho_vec, float_size);

            if (dims <= 1) {
               rho = rho_vec;
            }
            else {
               if (dims >= 2) {
                  LLVMValueRef rho_s, rho_t, rho_r;

                  rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
                  rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);

                  rho = lp_build_max(coord_bld, rho_s, rho_t);

                  if (dims >= 3) {
                     rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2);
                     rho = lp_build_max(coord_bld, rho, rho_r);
                  }
               }
            }
            if (rho_per_quad) {
               rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                               rho_bld->type, rho, 0);
            }
            else {
               rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
            }
         }
         else {
            /* Single-quad path: work on scalars extracted from the vector. */
            if (dims <= 1) {
               rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, "");
            }
            rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);

            if (dims <= 1) {
               rho = rho_vec;
            }
            else {
               if (dims >= 2) {
                  LLVMValueRef rho_s, rho_t, rho_r;

                  rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
                  rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");

                  rho = lp_build_max(float_bld, rho_s, rho_t);

                  if (dims >= 3) {
                     rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
                     rho = lp_build_max(float_bld, rho, rho_r);
                  }
               }
            }
            if (!rho_per_quad) {
               rho = lp_build_broadcast_scalar(rho_bld, rho);
            }
         }
      }
   }

   return rho;
}


/*
 * Bri-linear lod computation
 *
 * Use a piece-wise linear approximation of log2 such that:
 * - round to nearest, for values in the neighborhood of -1, 0, 1, 2, etc.
 * - linear approximation for values in the neighborhood of 0.5, 1.5., etc,
 *   with the steepness specified in 'factor'
 * - exact result for 0.5, 1.5, etc.
 *
 *
 *   1.0 -              /----*
 *                     /
 *                    /
 *                   /
 *   0.5 -          *
 *                 /
 *                /
 *               /
 *   0.0 - *----/
 *
 *         |                 |
 *        2^0               2^1
 *
 * This is a technique also commonly used in hardware:
 * - http://ixbtlabs.com/articles2/gffx/nv40-rx800-3.html
 *
 * TODO: For correctness, this should only be applied when texture is known to
 * have regular mipmaps, i.e., mipmaps derived from the base level.
 *
 * TODO: This could be done in fixed point, where applicable.
 */
static void
lp_build_brilinear_lod(struct lp_build_context *bld,
                       LLVMValueRef lod,
                       double factor,
                       LLVMValueRef *out_lod_ipart,
                       LLVMValueRef *out_lod_fpart)
{
   LLVMValueRef lod_fpart;
   /* Offsets shift the piecewise segments so integer crossings stay exact. */
   double pre_offset = (factor - 0.5)/factor - 0.5;
   double post_offset = 1 - factor;

   if (0) {
      /* debug-only: dump the incoming lod */
      lp_build_printf(bld->gallivm, "lod = %f\n", lod);
   }

   lod = lp_build_add(bld, lod,
                      lp_build_const_vec(bld->gallivm, bld->type, pre_offset));

   /* Split into integer mip level and fractional interpolation weight. */
   lp_build_ifloor_fract(bld, lod, out_lod_ipart, &lod_fpart);

   /* fpart = fpart * factor + post_offset (steepened linear segment) */
   lod_fpart = lp_build_mad(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, factor),
                            lp_build_const_vec(bld->gallivm, bld->type, post_offset));

   /*
    * It's not necessary to clamp lod_fpart since:
    * - the above expression will never produce numbers greater than one.
    * - the mip filtering branch is only taken if lod_fpart is positive
    */

   *out_lod_fpart = lod_fpart;

   if (0) {
      /* debug-only: dump the split results */
      lp_build_printf(bld->gallivm, "lod_ipart = %i\n", *out_lod_ipart);
      lp_build_printf(bld->gallivm, "lod_fpart = %f\n\n", *out_lod_fpart);
   }
}


/*
 * Combined log2 and brilinear lod computation.
 *
 * It's in all identical to calling lp_build_fast_log2() and
 * lp_build_brilinear_lod() above, but by combining we can compute the integer
 * and fractional part independently.
 */
static void
lp_build_brilinear_rho(struct lp_build_context *bld,
                       LLVMValueRef rho,
                       double factor,
                       LLVMValueRef *out_lod_ipart,
                       LLVMValueRef *out_lod_fpart)
{
   LLVMValueRef lod_ipart;
   LLVMValueRef lod_fpart;

   const double pre_factor = (2*factor - 0.5)/(M_SQRT2*factor);
   const double post_offset = 1 - 2*factor;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, rho));

   /*
    * The pre factor will make the intersections with the exact powers of two
    * happen precisely where we want them to be, which means that the integer
    * part will not need any post adjustments.
    */
   rho = lp_build_mul(bld, rho,
                      lp_build_const_vec(bld->gallivm, bld->type, pre_factor));

   /* ipart = ifloor(log2(rho)) -- taken straight from the float exponent */
   lod_ipart = lp_build_extract_exponent(bld, rho, 0);

   /* fpart = rho / 2**ipart -- the float mantissa, in [1, 2) */
   lod_fpart = lp_build_extract_mantissa(bld, rho);

   /* fpart = fpart * factor + post_offset (linear segment of brilinear) */
   lod_fpart = lp_build_mad(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, factor),
                            lp_build_const_vec(bld->gallivm, bld->type, post_offset));

   /*
    * Like lp_build_brilinear_lod, it's not necessary to clamp lod_fpart since:
    * - the above expression will never produce numbers greater than one.
    * - the mip filtering branch is only taken if lod_fpart is positive
    */

   *out_lod_ipart = lod_ipart;
   *out_lod_fpart = lod_fpart;
}


/**
 * Fast implementation of iround(log2(sqrt(x))), based on
 * log2(x^n) == n*log2(x).
 *
 * Gives accurate results all the time.
 * (Could be trivially extended to handle other power-of-two roots.)
 */
static LLVMValueRef
lp_build_ilog2_sqrt(struct lp_build_context *bld,
                    LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef ipart;
   struct lp_type i_type = lp_int_type(bld->type);
   LLVMValueRef one = lp_build_const_int_vec(bld->gallivm, i_type, 1);

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* ipart = log2(x) + 0.5 = 0.5*(log2(x^2) + 1.0) */
   ipart = lp_build_extract_exponent(bld, x, 1);
   /* arithmetic shift right by 1 == divide by 2, preserving the sign */
   ipart = LLVMBuildAShr(builder, ipart, one, "");

   return ipart;
}


/**
 * Generate code to compute texture level of detail (lambda).
 * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
 * \param lod_bias  optional float vector with the shader lod bias
 * \param explicit_lod  optional float vector with the explicit lod
 * \param cube_rho  rho calculated by cube coord mapping (optional)
 * \param out_lod_ipart  integer part of lod
 * \param out_lod_fpart  float part of lod (never larger than 1 but may be negative)
 * \param out_lod_positive  (mask) if lod is positive (i.e. texture is minified)
 *
 * The resulting lod can be scalar per quad or be per element.
 */
void
lp_build_lod_selector(struct lp_build_sample_context *bld,
                      unsigned texture_unit,
                      unsigned sampler_unit,
                      LLVMValueRef s,
                      LLVMValueRef t,
                      LLVMValueRef r,
                      LLVMValueRef cube_rho,
                      const struct lp_derivatives *derivs,
                      LLVMValueRef lod_bias, /* optional */
                      LLVMValueRef explicit_lod, /* optional */
                      unsigned mip_filter,
                      LLVMValueRef *out_lod_ipart,
                      LLVMValueRef *out_lod_fpart,
                      LLVMValueRef *out_lod_positive)

{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_sampler_dynamic_state *dynamic_state = bld->dynamic_state;
   struct lp_build_context *lodf_bld = &bld->lodf_bld;
   LLVMValueRef lod;

   /* Default all outputs so early returns below always leave them defined. */
   *out_lod_ipart = bld->lodi_bld.zero;
   *out_lod_positive = bld->lodi_bld.zero;
   *out_lod_fpart = lodf_bld->zero;

   /*
    * For determining min/mag, we follow GL 4.1 spec, 3.9.12 Texture Magnification:
    * "Implementations may either unconditionally assume c = 0 for the minification
    * vs. magnification switch-over point, or may choose to make c depend on the
    * combination of minification and magnification modes as follows: if the
    * magnification filter is given by LINEAR and the minification filter is given
    * by NEAREST_MIPMAP_NEAREST or NEAREST_MIPMAP_LINEAR, then c = 0.5. This is
    * done to ensure that a minified texture does not appear "sharper" than a
    * magnified texture. Otherwise c = 0." 
    * And 3.9.11 Texture Minification:
    * "If lod is less than or equal to the constant c (see section 3.9.12) the
    * texture is said to be magnified; if it is greater, the texture is minified."
    * So, using 0 as switchover point always, and using magnification for lod == 0.
    * Note that the always c = 0 behavior is new (first appearing in GL 3.1 spec),
    * old GL versions required 0.5 for the modes listed above.
    * I have no clue about the (undocumented) wishes of d3d9/d3d10 here!
    */

   if (bld->static_sampler_state->min_max_lod_equal) {
      /* User is forcing sampling from a particular mipmap level.
       * This is hit during mipmap generation.
       */
      LLVMValueRef min_lod =
         dynamic_state->min_lod(dynamic_state, bld->gallivm,
                                bld->context_ptr, sampler_unit);

      lod = lp_build_broadcast_scalar(lodf_bld, min_lod);
   }
   else {
      if (explicit_lod) {
         /* Shader-supplied lod; pack per-pixel values to per-quad if needed. */
         if (bld->num_lods != bld->coord_type.length)
            lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
                                            lodf_bld->type, explicit_lod, 0);
         else
            lod = explicit_lod;
      }
      else {
         LLVMValueRef rho;
         /* lp_build_rho returns rho^2 in these cases (it skips the sqrt) */
         boolean rho_squared = ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) &&
                                (bld->dims > 1)) || cube_rho;

         rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs);

         /*
          * Compute lod = log2(rho)
          */

         if (!lod_bias &&
             !bld->static_sampler_state->lod_bias_non_zero &&
             !bld->static_sampler_state->apply_max_lod &&
             !bld->static_sampler_state->apply_min_lod) {
            /*
             * Special case when there are no post-log2 adjustments, which
             * saves instructions but keeping the integer and fractional lod
             * computations separate from the start.
             */

            if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
                mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
               /*
                * Don't actually need both values all the time, lod_ipart is
                * needed for nearest mipfilter, lod_positive if min != mag.
                */
               if (rho_squared) {
                  *out_lod_ipart = lp_build_ilog2_sqrt(lodf_bld, rho);
               }
               else {
                  *out_lod_ipart = lp_build_ilog2(lodf_bld, rho);
               }
               *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
                                                rho, lodf_bld->one);
               return;
            }
            if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
                !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR) &&
                !rho_squared) {
               /*
                * This can't work if rho is squared. Not sure if it could be
                * fixed while keeping it worthwile, could also do sqrt here
                * but brilinear and no_rho_opt seems like a combination not
                * making much sense anyway so just use ordinary path below.
                */
               lp_build_brilinear_rho(lodf_bld, rho, BRILINEAR_FACTOR,
                                      out_lod_ipart, out_lod_fpart);
               *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
                                                rho, lodf_bld->one);
               return;
            }
         }

         if (0) {
            /* accurate but slower log2; kept for debugging comparisons */
            lod = lp_build_log2(lodf_bld, rho);
         }
         else {
            lod = lp_build_fast_log2(lodf_bld, rho);
         }
         if (rho_squared) {
            /* log2(x^2) == 0.5*log2(x) */
            lod = lp_build_mul(lodf_bld, lod,
                               lp_build_const_vec(bld->gallivm, lodf_bld->type, 0.5F));
         }

         /* add shader lod bias */
         if (lod_bias) {
            if (bld->num_lods != bld->coord_type.length)
               lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
                                                    lodf_bld->type, lod_bias, 0);
            lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
         }
      }

      /* add sampler lod bias */
      if (bld->static_sampler_state->lod_bias_non_zero) {
         LLVMValueRef sampler_lod_bias =
            dynamic_state->lod_bias(dynamic_state, bld->gallivm,
                                    bld->context_ptr, sampler_unit);
         sampler_lod_bias = lp_build_broadcast_scalar(lodf_bld,
                                                      sampler_lod_bias);
         lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias");
      }

      /* clamp lod */
      if (bld->static_sampler_state->apply_max_lod) {
         LLVMValueRef max_lod =
            dynamic_state->max_lod(dynamic_state, bld->gallivm,
                                   bld->context_ptr, sampler_unit);
         max_lod = lp_build_broadcast_scalar(lodf_bld, max_lod);

         lod = lp_build_min(lodf_bld, lod, max_lod);
      }
      if (bld->static_sampler_state->apply_min_lod) {
         LLVMValueRef min_lod =
            dynamic_state->min_lod(dynamic_state, bld->gallivm,
                                   bld->context_ptr, sampler_unit);
         min_lod = lp_build_broadcast_scalar(lodf_bld, min_lod);

         lod = lp_build_max(lodf_bld, lod, min_lod);
      }
   }

   *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
                                    lod, lodf_bld->zero);

   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
         lp_build_brilinear_lod(lodf_bld, lod, BRILINEAR_FACTOR,
                                out_lod_ipart, out_lod_fpart);
      }
      else {
         lp_build_ifloor_fract(lodf_bld, lod, out_lod_ipart, out_lod_fpart);
      }

      lp_build_name(*out_lod_fpart, "lod_fpart");
   }
   else {
      /* nearest/none mip filter only needs the rounded integer lod */
      *out_lod_ipart = lp_build_iround(lodf_bld, lod);
   }

   lp_build_name(*out_lod_ipart, "lod_ipart");

   return;
}


/**
 * For PIPE_TEX_MIPFILTER_NEAREST, convert int part of lod
 * to actual mip level.
 * Note: this is all scalar per quad code.
 * \param lod_ipart  int texture level of detail
 * \param level_out  returns integer
 * \param out_of_bounds  returns per coord out_of_bounds mask if provided
 */
void
lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
                           unsigned texture_unit,
                           LLVMValueRef lod_ipart,
                           LLVMValueRef *level_out,
                           LLVMValueRef *out_of_bounds)
{
   struct lp_build_context *leveli_bld = &bld->leveli_bld;
   struct lp_sampler_dynamic_state *dynamic_state = bld->dynamic_state;
   LLVMValueRef first_level, last_level, level;

   first_level = dynamic_state->first_level(dynamic_state, bld->gallivm,
                                            bld->context_ptr, texture_unit);
   last_level = dynamic_state->last_level(dynamic_state, bld->gallivm,
                                          bld->context_ptr, texture_unit);
   first_level = lp_build_broadcast_scalar(leveli_bld, first_level);
   last_level = lp_build_broadcast_scalar(leveli_bld, last_level);

   /* lod is relative to first_level, bias it to an absolute level index */
   level = lp_build_add(leveli_bld, lod_ipart, first_level);

   if (out_of_bounds) {
      LLVMValueRef out, out1;
      /* out-of-bounds = (level < first_level) || (level > last_level) */
      out = lp_build_cmp(leveli_bld, PIPE_FUNC_LESS, level, first_level);
      out1 = lp_build_cmp(leveli_bld, PIPE_FUNC_GREATER, level, last_level);
      out = lp_build_or(leveli_bld, out, out1);
      /* Expand the per-level mask to match the int coord vector layout. */
      if (bld->num_mips == bld->coord_bld.type.length) {
         *out_of_bounds = out;
      }
      else if (bld->num_mips == 1) {
         *out_of_bounds = lp_build_broadcast_scalar(&bld->int_coord_bld, out);
      }
      else {
         assert(bld->num_mips == bld->coord_bld.type.length / 4);
         *out_of_bounds = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
                                                                leveli_bld->type,
                                                                bld->int_coord_bld.type,
                                                                out);
      }
      /* zero out the level where out of bounds (caller handles the mask) */
      level = lp_build_andnot(&bld->int_coord_bld, level, *out_of_bounds);
      *level_out = level;
   }
   else {
      /* clamp level to legal range of levels */
      *level_out = lp_build_clamp(leveli_bld, level, first_level, last_level);

   }
}


/**
 * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad (or per element) int LOD(s)
 * to two (per-quad) (adjacent) mipmap level indexes, and fix up float lod
 * part accordingly.
 * Later, we'll sample from those two mipmap levels and interpolate between them.
 */
void
lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                           unsigned texture_unit,
                           LLVMValueRef lod_ipart,
                           LLVMValueRef *lod_fpart_inout,
                           LLVMValueRef *level0_out,
                           LLVMValueRef *level1_out)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_sampler_dynamic_state *dynamic_state = bld->dynamic_state;
   struct lp_build_context *leveli_bld = &bld->leveli_bld;
   struct lp_build_context *levelf_bld = &bld->levelf_bld;
   LLVMValueRef first_level, last_level;
   LLVMValueRef clamp_min;
   LLVMValueRef clamp_max;

   assert(bld->num_lods == bld->num_mips);

   first_level = dynamic_state->first_level(dynamic_state, bld->gallivm,
                                            bld->context_ptr, texture_unit);
   last_level = dynamic_state->last_level(dynamic_state, bld->gallivm,
                                          bld->context_ptr, texture_unit);
   first_level = lp_build_broadcast_scalar(leveli_bld, first_level);
   last_level = lp_build_broadcast_scalar(leveli_bld, last_level);

   /* level0 = first_level + lod_ipart, level1 = level0 + 1 */
   *level0_out = lp_build_add(leveli_bld, lod_ipart,
first_level); 974 *level1_out = lp_build_add(leveli_bld, *level0_out, leveli_bld->one); 975 976 /* 977 * Clamp both *level0_out and *level1_out to [first_level, last_level], with 978 * the minimum number of comparisons, and zeroing lod_fpart in the extreme 979 * ends in the process. 980 */ 981 982 /* *level0_out < first_level */ 983 clamp_min = LLVMBuildICmp(builder, LLVMIntSLT, 984 *level0_out, first_level, 985 "clamp_lod_to_first"); 986 987 *level0_out = LLVMBuildSelect(builder, clamp_min, 988 first_level, *level0_out, ""); 989 990 *level1_out = LLVMBuildSelect(builder, clamp_min, 991 first_level, *level1_out, ""); 992 993 *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min, 994 levelf_bld->zero, *lod_fpart_inout, ""); 995 996 /* *level0_out >= last_level */ 997 clamp_max = LLVMBuildICmp(builder, LLVMIntSGE, 998 *level0_out, last_level, 999 "clamp_lod_to_last"); 1000 1001 *level0_out = LLVMBuildSelect(builder, clamp_max, 1002 last_level, *level0_out, ""); 1003 1004 *level1_out = LLVMBuildSelect(builder, clamp_max, 1005 last_level, *level1_out, ""); 1006 1007 *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max, 1008 levelf_bld->zero, *lod_fpart_inout, ""); 1009 1010 lp_build_name(*level0_out, "texture%u_miplevel0", texture_unit); 1011 lp_build_name(*level1_out, "texture%u_miplevel1", texture_unit); 1012 lp_build_name(*lod_fpart_inout, "texture%u_mipweight", texture_unit); 1013 } 1014 1015 1016 /** 1017 * Return pointer to a single mipmap level. 
 * \param level  integer mipmap level (scalar)
 */
LLVMValueRef
lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
                          LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], data_ptr, mip_offset;

   /* load mip_offsets[level], then offset base_ptr by that many bytes */
   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   indexes[1] = level;
   mip_offset = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
   mip_offset = LLVMBuildLoad(builder, mip_offset, "");
   data_ptr = LLVMBuildGEP(builder, bld->base_ptr, &mip_offset, 1, "");
   return data_ptr;
}

/**
 * Return (per-pixel) offsets to mip levels.
 * \param level  integer mipmap level; scalar if num_mips == 1, otherwise a
 *               vector with one level per quad or per element
 */
LLVMValueRef
lp_build_get_mip_offsets(struct lp_build_sample_context *bld,
                         LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], offsets, offset1;

   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   if (bld->num_mips == 1) {
      /* one level for all lanes: load once and broadcast */
      indexes[1] = level;
      offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
      offset1 = LLVMBuildLoad(builder, offset1, "");
      offsets = lp_build_broadcast_scalar(&bld->int_coord_bld, offset1);
   }
   else if (bld->num_mips == bld->coord_bld.type.length / 4) {
      /* one level per quad: gather into element 4*i, then replicate per quad */
      unsigned i;

      offsets = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_mips; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
         offset1 = LLVMBuildLoad(builder, offset1, "");
         offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexo, "");
      }
      offsets = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, offsets, 0, 4);
   }
   else {
      /* fully per-element levels: scalar gather loop */
      unsigned i;

      assert (bld->num_mips == bld->coord_bld.type.length);

      offsets = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_mips; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
         offset1 = LLVMBuildLoad(builder, offset1, "");
         offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexi, "");
      }
   }
   return offsets;
}


/**
 * Codegen equivalent for u_minify().
 * @param lod_scalar  if lod is a (broadcasted) scalar
 * Return max(1, base_size >> level);
 */
LLVMValueRef
lp_build_minify(struct lp_build_context *bld,
                LLVMValueRef base_size,
                LLVMValueRef level,
                boolean lod_scalar)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   assert(lp_check_value(bld->type, base_size));
   assert(lp_check_value(bld->type, level));

   if (level == bld->zero) {
      /* if we're using mipmap level zero, no minification is needed */
      return base_size;
   }
   else {
      LLVMValueRef size;
      assert(bld->type.sign);
      if (lod_scalar ||
         (util_cpu_caps.has_avx2 || !util_cpu_caps.has_sse)) {
         /* straightforward path: vector shift + max */
         size = LLVMBuildLShr(builder, base_size, level, "minify");
         size = lp_build_max(bld, size, bld->one);
      }
      else {
         /*
          * emulate shift with float mul, since intel "forgot" shifts with
          * per-element shift count until avx2, which results in terrible
          * scalar extraction (both count and value), scalar shift,
          * vector reinsertion. Should not be an issue on any non-x86 cpu
          * with a vector instruction set.
          * On cpus with AMD's XOP this should also be unnecessary but I'm
          * not sure if llvm would emit this with current flags.
          */
         LLVMValueRef const127, const23, lf;
         struct lp_type ftype;
         struct lp_build_context fbld;
         ftype = lp_type_float_vec(32, bld->type.length * bld->type.width);
         lp_build_context_init(&fbld, bld->gallivm, ftype);
         /* 127 = IEEE-754 single precision exponent bias, 23 = mantissa bits */
         const127 = lp_build_const_int_vec(bld->gallivm, bld->type, 127);
         const23 = lp_build_const_int_vec(bld->gallivm, bld->type, 23);

         /* calculate 2^(-level) float by building its bit pattern directly */
         lf = lp_build_sub(bld, const127, level);
         lf = lp_build_shl(bld, lf, const23);
         lf = LLVMBuildBitCast(builder, lf, fbld.vec_type, "");

         /* finish shift operation by doing float mul */
         base_size = lp_build_int_to_float(&fbld, base_size);
         size = lp_build_mul(&fbld, base_size, lf);
         /*
          * do the max also with floats because
          * a) non-emulated int max requires sse41
          *    (this is actually a lie as we could cast to 16bit values
          *    as 16bit is sufficient and 16bit int max is sse2)
          * b) with avx we can do int max 4-wide but float max 8-wide
          */
         size = lp_build_max(&fbld, size, fbld.one);
         size = lp_build_itrunc(&fbld, size);
      }
      return size;
   }
}


/**
 * Dereference stride_array[mipmap_level] array to get a stride.
 * Return stride as a vector.
 */
static LLVMValueRef
lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
                              LLVMValueRef stride_array, LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], stride, stride1;
   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   if (bld->num_mips == 1) {
      /* single level: load stride once and broadcast */
      indexes[1] = level;
      stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
      stride1 = LLVMBuildLoad(builder, stride1, "");
      stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride1);
   }
   else if (bld->num_mips == bld->coord_bld.type.length / 4) {
      /* per-quad levels: gather then replicate each value across its quad */
      LLVMValueRef stride1;
      unsigned i;

      stride = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_mips; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
         stride1 = LLVMBuildLoad(builder, stride1, "");
         stride = LLVMBuildInsertElement(builder, stride, stride1, indexo, "");
      }
      stride = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0, 4);
   }
   else {
      /* per-element levels: scalar gather loop */
      LLVMValueRef stride1;
      unsigned i;

      assert (bld->num_mips == bld->coord_bld.type.length);

      stride = bld->int_coord_bld.undef;
      for (i = 0; i < bld->coord_bld.type.length; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
         stride1 = LLVMBuildLoad(builder, stride1, "");
         stride = LLVMBuildInsertElement(builder, stride, stride1, indexi, "");
      }
   }
   return stride;
}


/**
 * When sampling a mipmap, we need to compute the width, height, depth
 * of the source levels from the level
 * indexes.  This helper function
 * does that.
 *
 * \param ilevel          integer mip level(s); scalar if num_mips == 1
 * \param out_size        receives the minified size vector
 * \param row_stride_vec  receives row stride vector (only written if dims >= 2)
 * \param img_stride_vec  receives image stride vector (only written for 3D /
 *                        layered targets)
 */
void
lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
                            LLVMValueRef ilevel,
                            LLVMValueRef *out_size,
                            LLVMValueRef *row_stride_vec,
                            LLVMValueRef *img_stride_vec)
{
   const unsigned dims = bld->dims;
   LLVMValueRef ilevel_vec;

   /*
    * Compute width, height, depth at mipmap level 'ilevel'
    */
   if (bld->num_mips == 1) {
      ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);
      *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec, TRUE);
   }
   else {
      LLVMValueRef int_size_vec;
      LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
      unsigned num_quads = bld->coord_bld.type.length / 4;
      unsigned i;

      if (bld->num_mips == num_quads) {
         /*
          * XXX: this should be #ifndef SANE_INSTRUCTION_SET.
          * intel "forgot" the variable shift count instruction until avx2.
          * A harmless 8x32 shift gets translated into 32 instructions
          * (16 extracts, 8 scalar shifts, 8 inserts), llvm is apparently
          * unable to recognize if there are really just 2 different shift
          * count values. So do the shift 4-wide before expansion.
          */
         struct lp_build_context bld4;
         struct lp_type type4;

         type4 = bld->int_coord_bld.type;
         type4.length = 4;

         lp_build_context_init(&bld4, bld->gallivm, type4);

         if (bld->dims == 1) {
            assert(bld->int_size_in_bld.type.length == 1);
            int_size_vec = lp_build_broadcast_scalar(&bld4,
                                                     bld->int_size);
         }
         else {
            assert(bld->int_size_in_bld.type.length == 4);
            int_size_vec = bld->int_size;
         }

         /* minify the base size 4-wide once per quad, one level per quad */
         for (i = 0; i < num_quads; i++) {
            LLVMValueRef ileveli;
            LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);

            ileveli = lp_build_extract_broadcast(bld->gallivm,
                                                 bld->leveli_bld.type,
                                                 bld4.type,
                                                 ilevel,
                                                 indexi);
            tmp[i] = lp_build_minify(&bld4, int_size_vec, ileveli, TRUE);
         }
         /*
          * out_size is [w0, h0, d0, _, w1, h1, d1, _, ...] vector for dims > 1,
          * [w0, w0, w0, w0, w1, w1, w1, w1, ...] otherwise.
          */
         *out_size = lp_build_concat(bld->gallivm,
                                     tmp,
                                     bld4.type,
                                     num_quads);
      }
      else {
        /* FIXME: this is terrible and results in _huge_ vector
         * (for the dims > 1 case).
         * Should refactor this (together with extract_image_sizes) and do
         * something more useful. Could for instance if we have width,height
         * with 4-wide vector pack all elements into a 8xi16 vector
         * (on which we can still do useful math) instead of using a 16xi32
         * vector.
         * For dims == 1 this will create [w0, w1, w2, w3, ...] vector.
         * For dims > 1 this will create [w0, h0, d0, _, w1, h1, d1, _, ...] vector.
         */
         assert(bld->num_mips == bld->coord_bld.type.length);
         if (bld->dims == 1) {
            assert(bld->int_size_in_bld.type.length == 1);
            int_size_vec = lp_build_broadcast_scalar(&bld->int_coord_bld,
                                                     bld->int_size);
            *out_size = lp_build_minify(&bld->int_coord_bld, int_size_vec, ilevel, FALSE);
         }
         else {
            LLVMValueRef ilevel1;
            for (i = 0; i < bld->num_mips; i++) {
               LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
               ilevel1 = lp_build_extract_broadcast(bld->gallivm, bld->int_coord_type,
                                                    bld->int_size_in_bld.type, ilevel, indexi);
               tmp[i] = bld->int_size;
               tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i], ilevel1, TRUE);
            }
            *out_size = lp_build_concat(bld->gallivm, tmp,
                                        bld->int_size_in_bld.type,
                                        bld->num_mips);
         }
      }
   }

   if (dims >= 2) {
      *row_stride_vec = lp_build_get_level_stride_vec(bld,
                                                      bld->row_stride_array,
                                                      ilevel);
   }
   if (dims == 3 || has_layer_coord(bld->static_texture_state->target)) {
      *img_stride_vec = lp_build_get_level_stride_vec(bld,
                                                      bld->img_stride_array,
                                                      ilevel);
   }
}


/**
 * Extract and broadcast texture size.
 *
 * @param size_bld    build context matching the size vector's type (either
 *                    bld->int_size_bld or bld->float_size_bld)
 * @param coord_type  type of the texture coord vector (either
 *                    bld->int_coord_type or bld->coord_type)
 * @param size        vector with the texture size (width, height, depth)
 */
void
lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
                             struct lp_build_context *size_bld,
                             struct lp_type coord_type,
                             LLVMValueRef size,
                             LLVMValueRef *out_width,
                             LLVMValueRef *out_height,
                             LLVMValueRef *out_depth)
{
   const unsigned dims = bld->dims;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   struct lp_type size_type = size_bld->type;

   if (bld->num_mips == 1) {
      /* single mip: size is [w, h, d]; broadcast each component to coords */
      *out_width = lp_build_extract_broadcast(bld->gallivm,
                                              size_type,
                                              coord_type,
                                              size,
                                              LLVMConstInt(i32t, 0, 0));
      if (dims >= 2) {
         *out_height = lp_build_extract_broadcast(bld->gallivm,
                                                  size_type,
                                                  coord_type,
                                                  size,
                                                  LLVMConstInt(i32t, 1, 0));
         if (dims == 3) {
            *out_depth = lp_build_extract_broadcast(bld->gallivm,
                                                    size_type,
                                                    coord_type,
                                                    size,
                                                    LLVMConstInt(i32t, 2, 0));
         }
      }
   }
   else {
      unsigned num_quads = bld->coord_bld.type.length / 4;

      if (dims == 1) {
         /* per-mip 1D: size is already the width vector */
         *out_width = size;
      }
      else if (bld->num_mips == num_quads) {
         /* per-quad: size is [w0,h0,d0,_, w1,h1,d1,_, ...], swizzle out lanes */
         *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0, 4);
         if (dims >= 2) {
            *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1, 4);
            if (dims == 3) {
               *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2, 4);
            }
         }
      }
      else {
         /* per-element: pack the i-th AOS component of each element */
         assert(bld->num_mips == bld->coord_type.length);
         *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                coord_type, size, 0);
         if (dims >= 2) {
            *out_height = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                    coord_type, size, 1);
            if (dims == 3) {
               *out_depth = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                      coord_type, size, 2);
            }
         }
      }
   }
}


/**
 * Unnormalize coords.
 *
 * @param flt_size  vector with the float texture size (width, height, depth)
 */
void
lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
                             LLVMValueRef flt_size,
                             LLVMValueRef *s,
                             LLVMValueRef *t,
                             LLVMValueRef *r)
{
   const unsigned dims = bld->dims;
   LLVMValueRef width;
   LLVMValueRef height;
   LLVMValueRef depth;

   lp_build_extract_image_sizes(bld,
                                &bld->float_size_bld,
                                bld->coord_type,
                                flt_size,
                                &width,
                                &height,
                                &depth);

   /* s = s * width, t = t * height */
   *s = lp_build_mul(&bld->coord_bld, *s, width);
   if (dims >= 2) {
      *t = lp_build_mul(&bld->coord_bld, *t, height);
      if (dims >= 3) {
         *r = lp_build_mul(&bld->coord_bld, *r, depth);
      }
   }
}

/**
 * Generate new coords and faces for cubemap texels falling off the face.
 *
 * @param face          face (center) of the pixel
 * @param x0            lower x coord
 * @param x1            higher x coord (must be x0 + 1)
 * @param y0            lower y coord
 * @param y1            higher y coord (must be y0 + 1)
 * @param max_coord     texture cube (level) size - 1
 * @param next_faces    new face values when falling off
 * @param next_xcoords  new x coord values when falling off
 * @param next_ycoords  new y coord values when falling off
 *
 * The arrays hold the new values when under/overflow of
 * lower x, higher x, lower y, higher y coord would occur (in this order).
 * next_xcoords/next_ycoords have two entries each (for both new lower and
 * higher coord).
 */
void
lp_build_cube_new_coords(struct lp_build_context *ivec_bld,
                         LLVMValueRef face,
                         LLVMValueRef x0,
                         LLVMValueRef x1,
                         LLVMValueRef y0,
                         LLVMValueRef y1,
                         LLVMValueRef max_coord,
                         LLVMValueRef next_faces[4],
                         LLVMValueRef next_xcoords[4][2],
                         LLVMValueRef next_ycoords[4][2])
{
   /*
    * Lookup tables aren't nice for simd code hence try some logic here.
    * (Note that while it would not be necessary to do per-sample (4) lookups
    * when using a LUT as it's impossible that texels fall off of positive
    * and negative edges simultaneously, it would however be necessary to
    * do 2 lookups for corner handling as in this case texels both fall off
    * of x and y axes.)
    */
   /*
    * Next faces (for face 012345):
    * x < 0.0  : 451110
    * x >= 1.0 : 540001
    * y < 0.0  : 225422
    * y >= 1.0 : 334533
    * Hence nfx+ (and nfy+) == nfx- (nfy-) xor 1
    * nfx-: face > 1 ? (face == 5 ? 0 : 1) : (4 + face & 1)
    * nfy+: face & ~4 > 1 ? face + 2 : 3;
    * This could also use pshufb instead, but would need (manually coded)
    * ssse3 intrinsic (llvm won't do non-constant shuffles).
    */
   struct gallivm_state *gallivm = ivec_bld->gallivm;
   LLVMValueRef sel, sel_f2345, sel_f23, sel_f2, tmpsel, tmp;
   LLVMValueRef faceand1, sel_fand1, maxmx0, maxmx1, maxmy0, maxmy1;
   LLVMValueRef c2 = lp_build_const_int_vec(gallivm, ivec_bld->type, 2);
   LLVMValueRef c3 = lp_build_const_int_vec(gallivm, ivec_bld->type, 3);
   LLVMValueRef c4 = lp_build_const_int_vec(gallivm, ivec_bld->type, 4);
   LLVMValueRef c5 = lp_build_const_int_vec(gallivm, ivec_bld->type, 5);

   /* next face when falling off lower x (next_faces[0]), per table above */
   sel = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, face, c5);
   tmpsel = lp_build_select(ivec_bld, sel, ivec_bld->zero, ivec_bld->one);
   sel_f2345 = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, face, ivec_bld->one);
   faceand1 = lp_build_and(ivec_bld, face, ivec_bld->one);
   tmp = lp_build_add(ivec_bld, faceand1, c4);
   next_faces[0] = lp_build_select(ivec_bld, sel_f2345, tmpsel, tmp);
   next_faces[1] = lp_build_xor(ivec_bld, next_faces[0], ivec_bld->one);

   /* next face when falling off higher y (next_faces[3]) */
   tmp = lp_build_andnot(ivec_bld, face, c4);
   sel_f23 = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, tmp, ivec_bld->one);
   tmp = lp_build_add(ivec_bld, face, c2);
   next_faces[3] = lp_build_select(ivec_bld, sel_f23, tmp, c3);
   next_faces[2] = lp_build_xor(ivec_bld, next_faces[3], ivec_bld->one);

   /*
    * new xcoords (for face 012345):
    * x < 0.0  : max   max   t     max-t max  max
    * x >= 1.0 : 0     0     max-t t     0    0
    * y < 0.0  : max   0     max-s s     s    max-s
    * y >= 1.0 : max   0     s     max-s s    max-s
    *
    * ncx[1] = face & ~4 > 1 ? (face == 2 ? max-t : t) : 0
    * ncx[0] = max - ncx[1]
    * ncx[3] = face > 1 ? (face & 1 ? max-s : s) : (face & 1) ? 0 : max
    * ncx[2] = face & ~4 > 1 ? max - ncx[3] : ncx[3]
    */
   sel_f2 = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, face, c2);
   maxmy0 = lp_build_sub(ivec_bld, max_coord, y0);
   tmp = lp_build_select(ivec_bld, sel_f2, maxmy0, y0);
   next_xcoords[1][0] = lp_build_select(ivec_bld, sel_f23, tmp, ivec_bld->zero);
   next_xcoords[0][0] = lp_build_sub(ivec_bld, max_coord, next_xcoords[1][0]);
   maxmy1 = lp_build_sub(ivec_bld, max_coord, y1);
   tmp = lp_build_select(ivec_bld, sel_f2, maxmy1, y1);
   next_xcoords[1][1] = lp_build_select(ivec_bld, sel_f23, tmp, ivec_bld->zero);
   next_xcoords[0][1] = lp_build_sub(ivec_bld, max_coord, next_xcoords[1][1]);

   sel_fand1 = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, faceand1, ivec_bld->one);

   tmpsel = lp_build_select(ivec_bld, sel_fand1, ivec_bld->zero, max_coord);
   maxmx0 = lp_build_sub(ivec_bld, max_coord, x0);
   tmp = lp_build_select(ivec_bld, sel_fand1, maxmx0, x0);
   next_xcoords[3][0] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
   tmp = lp_build_sub(ivec_bld, max_coord, next_xcoords[3][0]);
   next_xcoords[2][0] = lp_build_select(ivec_bld, sel_f23, tmp, next_xcoords[3][0]);
   maxmx1 = lp_build_sub(ivec_bld, max_coord, x1);
   tmp = lp_build_select(ivec_bld, sel_fand1, maxmx1, x1);
   next_xcoords[3][1] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
   tmp = lp_build_sub(ivec_bld, max_coord, next_xcoords[3][1]);
   next_xcoords[2][1] = lp_build_select(ivec_bld, sel_f23, tmp, next_xcoords[3][1]);

   /*
    * new ycoords (for face 012345):
    * x < 0.0  : t     t     0     max   t    t
    * x >= 1.0 : t     t     0     max   t    t
    * y < 0.0  : max-s s     0     max   max  0
    * y >= 1.0 : s     max-s 0     max   0    max
    *
    * ncy[0] = face & ~4 > 1 ? (face == 2 ? 0 : max) : t
    * ncy[1] = ncy[0]
    * ncy[3] = face > 1 ? (face & 1 ? max : 0) : (face & 1) ? max-s : max
    * ncy[2] = face & ~4 > 1 ? max - ncy[3] : ncy[3]
    */
   tmp = lp_build_select(ivec_bld, sel_f2, ivec_bld->zero, max_coord);
   next_ycoords[0][0] = lp_build_select(ivec_bld, sel_f23, tmp, y0);
   next_ycoords[1][0] = next_ycoords[0][0];
   next_ycoords[0][1] = lp_build_select(ivec_bld, sel_f23, tmp, y1);
   next_ycoords[1][1] = next_ycoords[0][1];

   tmpsel = lp_build_select(ivec_bld, sel_fand1, maxmx0, x0);
   tmp = lp_build_select(ivec_bld, sel_fand1, max_coord, ivec_bld->zero);
   next_ycoords[3][0] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
   tmp = lp_build_sub(ivec_bld, max_coord, next_ycoords[3][0]);
   next_ycoords[2][0] = lp_build_select(ivec_bld, sel_f23, next_ycoords[3][0], tmp);
   tmpsel = lp_build_select(ivec_bld, sel_fand1, maxmx1, x1);
   tmp = lp_build_select(ivec_bld, sel_fand1, max_coord, ivec_bld->zero);
   next_ycoords[3][1] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
   tmp = lp_build_sub(ivec_bld, max_coord, next_ycoords[3][1]);
   next_ycoords[2][1] = lp_build_select(ivec_bld, sel_f23, next_ycoords[3][1], tmp);
}


/** Helper used by lp_build_cube_lookup() */
static LLVMValueRef
lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord)
{
   /* ima = +0.5 / abs(coord); */
   LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
   LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
   LLVMValueRef ima = lp_build_div(coord_bld, posHalf, absCoord);
   return ima;
}


/** Helper for doing 3-wise selection.
 * Returns sel1 ? val2 : (sel0 ? val0 : val1).
1589 */ 1590 static LLVMValueRef 1591 lp_build_select3(struct lp_build_context *sel_bld, 1592 LLVMValueRef sel0, 1593 LLVMValueRef sel1, 1594 LLVMValueRef val0, 1595 LLVMValueRef val1, 1596 LLVMValueRef val2) 1597 { 1598 LLVMValueRef tmp; 1599 tmp = lp_build_select(sel_bld, sel0, val0, val1); 1600 return lp_build_select(sel_bld, sel1, val2, tmp); 1601 } 1602 1603 1604 /** 1605 * Generate code to do cube face selection and compute per-face texcoords. 1606 */ 1607 void 1608 lp_build_cube_lookup(struct lp_build_sample_context *bld, 1609 LLVMValueRef *coords, 1610 const struct lp_derivatives *derivs_in, /* optional */ 1611 LLVMValueRef *rho, 1612 struct lp_derivatives *derivs_out, /* optional */ 1613 boolean need_derivs) 1614 { 1615 struct lp_build_context *coord_bld = &bld->coord_bld; 1616 LLVMBuilderRef builder = bld->gallivm->builder; 1617 struct gallivm_state *gallivm = bld->gallivm; 1618 LLVMValueRef si, ti, ri; 1619 1620 /* 1621 * Do per-pixel face selection. We cannot however (as we used to do) 1622 * simply calculate the derivs afterwards (which is very bogus for 1623 * explicit derivs btw) because the values would be "random" when 1624 * not all pixels lie on the same face. So what we do here is just 1625 * calculate the derivatives after scaling the coords by the absolute 1626 * value of the inverse major axis, and essentially do rho calculation 1627 * steps as if it were a 3d texture. This is perfect if all pixels hit 1628 * the same face, but not so great at edges, I believe the max error 1629 * should be sqrt(2) with no_rho_approx or 2 otherwise (essentially measuring 1630 * the 3d distance between 2 points on the cube instead of measuring up/down 1631 * the edge). Still this is possibly a win over just selecting the same face 1632 * for all pixels. Unfortunately, something like that doesn't work for 1633 * explicit derivatives. 
1634 */ 1635 struct lp_build_context *cint_bld = &bld->int_coord_bld; 1636 struct lp_type intctype = cint_bld->type; 1637 LLVMTypeRef coord_vec_type = coord_bld->vec_type; 1638 LLVMTypeRef cint_vec_type = cint_bld->vec_type; 1639 LLVMValueRef as, at, ar, face, face_s, face_t; 1640 LLVMValueRef as_ge_at, maxasat, ar_ge_as_at; 1641 LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz; 1642 LLVMValueRef tnegi, rnegi; 1643 LLVMValueRef ma, mai, signma, signmabit, imahalfpos; 1644 LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5); 1645 LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype, 1646 1LL << (intctype.width - 1)); 1647 LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype, 1648 intctype.width -1); 1649 LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X); 1650 LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y); 1651 LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z); 1652 LLVMValueRef s = coords[0]; 1653 LLVMValueRef t = coords[1]; 1654 LLVMValueRef r = coords[2]; 1655 1656 assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1); 1657 assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1); 1658 assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1); 1659 1660 /* 1661 * get absolute value (for x/y/z face selection) and sign bit 1662 * (for mirroring minor coords and pos/neg face selection) 1663 * of the original coords. 
1664 */ 1665 as = lp_build_abs(&bld->coord_bld, s); 1666 at = lp_build_abs(&bld->coord_bld, t); 1667 ar = lp_build_abs(&bld->coord_bld, r); 1668 1669 /* 1670 * major face determination: select x if x > y else select y 1671 * select z if z >= max(x,y) else select previous result 1672 * if some axis are the same we chose z over y, y over x - the 1673 * dx10 spec seems to ask for it while OpenGL doesn't care (if we 1674 * wouldn't care could save a select or two if using different 1675 * compares and doing at_g_as_ar last since tnewx and tnewz are the 1676 * same). 1677 */ 1678 as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, as, at); 1679 maxasat = lp_build_max(coord_bld, as, at); 1680 ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat); 1681 1682 if (need_derivs && (derivs_in || 1683 ((gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) && 1684 (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX)))) { 1685 /* 1686 * XXX: This is really really complex. 1687 * It is a bit overkill to use this for implicit derivatives as well, 1688 * no way this is worth the cost in practice, but seems to be the 1689 * only way for getting accurate and per-pixel lod values. 
1690 */ 1691 LLVMValueRef ima, imahalf, tmp, ddx[3], ddy[3]; 1692 LLVMValueRef madx, mady, madxdivma, madydivma; 1693 LLVMValueRef sdxi, tdxi, rdxi, sdyi, tdyi, rdyi; 1694 LLVMValueRef tdxnegi, rdxnegi, tdynegi, rdynegi; 1695 LLVMValueRef sdxnewx, sdxnewy, sdxnewz, tdxnewx, tdxnewy, tdxnewz; 1696 LLVMValueRef sdynewx, sdynewy, sdynewz, tdynewx, tdynewy, tdynewz; 1697 LLVMValueRef face_sdx, face_tdx, face_sdy, face_tdy; 1698 /* 1699 * s = 1/2 * ( sc / ma + 1) 1700 * t = 1/2 * ( tc / ma + 1) 1701 * 1702 * s' = 1/2 * (sc' * ma - sc * ma') / ma^2 1703 * t' = 1/2 * (tc' * ma - tc * ma') / ma^2 1704 * 1705 * dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma 1706 * dx.t = 0.5 * (dx.tc - tc * dx.ma / ma) / ma 1707 * dy.s = 0.5 * (dy.sc - sc * dy.ma / ma) / ma 1708 * dy.t = 0.5 * (dy.tc - tc * dy.ma / ma) / ma 1709 */ 1710 1711 /* select ma, calculate ima */ 1712 ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r); 1713 mai = LLVMBuildBitCast(builder, ma, cint_vec_type, ""); 1714 signmabit = LLVMBuildAnd(builder, mai, signmask, ""); 1715 ima = lp_build_div(coord_bld, coord_bld->one, ma); 1716 imahalf = lp_build_mul(coord_bld, posHalf, ima); 1717 imahalfpos = lp_build_abs(coord_bld, imahalf); 1718 1719 if (!derivs_in) { 1720 ddx[0] = lp_build_ddx(coord_bld, s); 1721 ddx[1] = lp_build_ddx(coord_bld, t); 1722 ddx[2] = lp_build_ddx(coord_bld, r); 1723 ddy[0] = lp_build_ddy(coord_bld, s); 1724 ddy[1] = lp_build_ddy(coord_bld, t); 1725 ddy[2] = lp_build_ddy(coord_bld, r); 1726 } 1727 else { 1728 ddx[0] = derivs_in->ddx[0]; 1729 ddx[1] = derivs_in->ddx[1]; 1730 ddx[2] = derivs_in->ddx[2]; 1731 ddy[0] = derivs_in->ddy[0]; 1732 ddy[1] = derivs_in->ddy[1]; 1733 ddy[2] = derivs_in->ddy[2]; 1734 } 1735 1736 /* select major derivatives */ 1737 madx = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, ddx[0], ddx[1], ddx[2]); 1738 mady = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, ddy[0], ddy[1], ddy[2]); 1739 1740 si = LLVMBuildBitCast(builder, s, cint_vec_type, ""); 1741 
ti = LLVMBuildBitCast(builder, t, cint_vec_type, ""); 1742 ri = LLVMBuildBitCast(builder, r, cint_vec_type, ""); 1743 1744 sdxi = LLVMBuildBitCast(builder, ddx[0], cint_vec_type, ""); 1745 tdxi = LLVMBuildBitCast(builder, ddx[1], cint_vec_type, ""); 1746 rdxi = LLVMBuildBitCast(builder, ddx[2], cint_vec_type, ""); 1747 1748 sdyi = LLVMBuildBitCast(builder, ddy[0], cint_vec_type, ""); 1749 tdyi = LLVMBuildBitCast(builder, ddy[1], cint_vec_type, ""); 1750 rdyi = LLVMBuildBitCast(builder, ddy[2], cint_vec_type, ""); 1751 1752 /* 1753 * compute all possible new s/t coords, which does the mirroring, 1754 * and do the same for derivs minor axes. 1755 * snewx = signma * -r; 1756 * tnewx = -t; 1757 * snewy = s; 1758 * tnewy = signma * r; 1759 * snewz = signma * s; 1760 * tnewz = -t; 1761 */ 1762 tnegi = LLVMBuildXor(builder, ti, signmask, ""); 1763 rnegi = LLVMBuildXor(builder, ri, signmask, ""); 1764 tdxnegi = LLVMBuildXor(builder, tdxi, signmask, ""); 1765 rdxnegi = LLVMBuildXor(builder, rdxi, signmask, ""); 1766 tdynegi = LLVMBuildXor(builder, tdyi, signmask, ""); 1767 rdynegi = LLVMBuildXor(builder, rdyi, signmask, ""); 1768 1769 snewx = LLVMBuildXor(builder, signmabit, rnegi, ""); 1770 tnewx = tnegi; 1771 sdxnewx = LLVMBuildXor(builder, signmabit, rdxnegi, ""); 1772 tdxnewx = tdxnegi; 1773 sdynewx = LLVMBuildXor(builder, signmabit, rdynegi, ""); 1774 tdynewx = tdynegi; 1775 1776 snewy = si; 1777 tnewy = LLVMBuildXor(builder, signmabit, ri, ""); 1778 sdxnewy = sdxi; 1779 tdxnewy = LLVMBuildXor(builder, signmabit, rdxi, ""); 1780 sdynewy = sdyi; 1781 tdynewy = LLVMBuildXor(builder, signmabit, rdyi, ""); 1782 1783 snewz = LLVMBuildXor(builder, signmabit, si, ""); 1784 tnewz = tnegi; 1785 sdxnewz = LLVMBuildXor(builder, signmabit, sdxi, ""); 1786 tdxnewz = tdxnegi; 1787 sdynewz = LLVMBuildXor(builder, signmabit, sdyi, ""); 1788 tdynewz = tdynegi; 1789 1790 /* select the mirrored values */ 1791 face = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, facex, facey, facez); 
1792 face_s = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, snewx, snewy, snewz); 1793 face_t = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tnewx, tnewy, tnewz); 1794 face_sdx = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, sdxnewx, sdxnewy, sdxnewz); 1795 face_tdx = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tdxnewx, tdxnewy, tdxnewz); 1796 face_sdy = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, sdynewx, sdynewy, sdynewz); 1797 face_tdy = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tdynewx, tdynewy, tdynewz); 1798 1799 face_s = LLVMBuildBitCast(builder, face_s, coord_vec_type, ""); 1800 face_t = LLVMBuildBitCast(builder, face_t, coord_vec_type, ""); 1801 face_sdx = LLVMBuildBitCast(builder, face_sdx, coord_vec_type, ""); 1802 face_tdx = LLVMBuildBitCast(builder, face_tdx, coord_vec_type, ""); 1803 face_sdy = LLVMBuildBitCast(builder, face_sdy, coord_vec_type, ""); 1804 face_tdy = LLVMBuildBitCast(builder, face_tdy, coord_vec_type, ""); 1805 1806 /* deriv math, dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma */ 1807 madxdivma = lp_build_mul(coord_bld, madx, ima); 1808 tmp = lp_build_mul(coord_bld, madxdivma, face_s); 1809 tmp = lp_build_sub(coord_bld, face_sdx, tmp); 1810 derivs_out->ddx[0] = lp_build_mul(coord_bld, tmp, imahalf); 1811 1812 /* dx.t = 0.5 * (dx.tc - tc * dx.ma / ma) / ma */ 1813 tmp = lp_build_mul(coord_bld, madxdivma, face_t); 1814 tmp = lp_build_sub(coord_bld, face_tdx, tmp); 1815 derivs_out->ddx[1] = lp_build_mul(coord_bld, tmp, imahalf); 1816 1817 /* dy.s = 0.5 * (dy.sc - sc * dy.ma / ma) / ma */ 1818 madydivma = lp_build_mul(coord_bld, mady, ima); 1819 tmp = lp_build_mul(coord_bld, madydivma, face_s); 1820 tmp = lp_build_sub(coord_bld, face_sdy, tmp); 1821 derivs_out->ddy[0] = lp_build_mul(coord_bld, tmp, imahalf); 1822 1823 /* dy.t = 0.5 * (dy.tc - tc * dy.ma / ma) / ma */ 1824 tmp = lp_build_mul(coord_bld, madydivma, face_t); 1825 tmp = lp_build_sub(coord_bld, face_tdy, tmp); 1826 derivs_out->ddy[1] = 
lp_build_mul(coord_bld, tmp, imahalf); 1827 1828 signma = LLVMBuildLShr(builder, mai, signshift, ""); 1829 coords[2] = LLVMBuildOr(builder, face, signma, "face"); 1830 1831 /* project coords */ 1832 face_s = lp_build_mul(coord_bld, face_s, imahalfpos); 1833 face_t = lp_build_mul(coord_bld, face_t, imahalfpos); 1834 1835 coords[0] = lp_build_add(coord_bld, face_s, posHalf); 1836 coords[1] = lp_build_add(coord_bld, face_t, posHalf); 1837 1838 return; 1839 } 1840 1841 else if (need_derivs) { 1842 LLVMValueRef ddx_ddy[2], tmp[3], rho_vec; 1843 static const unsigned char swizzle0[] = { /* no-op swizzle */ 1844 0, LP_BLD_SWIZZLE_DONTCARE, 1845 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE 1846 }; 1847 static const unsigned char swizzle1[] = { 1848 1, LP_BLD_SWIZZLE_DONTCARE, 1849 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE 1850 }; 1851 static const unsigned char swizzle01[] = { /* no-op swizzle */ 1852 0, 1, 1853 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE 1854 }; 1855 static const unsigned char swizzle23[] = { 1856 2, 3, 1857 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE 1858 }; 1859 static const unsigned char swizzle02[] = { 1860 0, 2, 1861 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE 1862 }; 1863 1864 /* 1865 * scale the s/t/r coords pre-select/mirror so we can calculate 1866 * "reasonable" derivs. 1867 */ 1868 ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r); 1869 imahalfpos = lp_build_cube_imapos(coord_bld, ma); 1870 s = lp_build_mul(coord_bld, s, imahalfpos); 1871 t = lp_build_mul(coord_bld, t, imahalfpos); 1872 r = lp_build_mul(coord_bld, r, imahalfpos); 1873 1874 /* 1875 * This isn't quite the same as the "ordinary" (3d deriv) path since we 1876 * know the texture is square which simplifies things (we can omit the 1877 * size mul which happens very early completely here and do it at the 1878 * very end). 
1879 * Also always do calculations according to GALLIVM_DEBUG_NO_RHO_APPROX 1880 * since the error can get quite big otherwise at edges. 1881 * (With no_rho_approx max error is sqrt(2) at edges, same as it is 1882 * without no_rho_approx for 2d textures, otherwise it would be factor 2.) 1883 */ 1884 ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t); 1885 ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r); 1886 1887 ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]); 1888 ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]); 1889 1890 tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01); 1891 tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23); 1892 tmp[2] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02); 1893 1894 rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]); 1895 rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]); 1896 1897 tmp[0] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0); 1898 tmp[1] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1); 1899 *rho = lp_build_max(coord_bld, tmp[0], tmp[1]); 1900 } 1901 1902 if (!need_derivs) { 1903 ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r); 1904 } 1905 mai = LLVMBuildBitCast(builder, ma, cint_vec_type, ""); 1906 signmabit = LLVMBuildAnd(builder, mai, signmask, ""); 1907 1908 si = LLVMBuildBitCast(builder, s, cint_vec_type, ""); 1909 ti = LLVMBuildBitCast(builder, t, cint_vec_type, ""); 1910 ri = LLVMBuildBitCast(builder, r, cint_vec_type, ""); 1911 1912 /* 1913 * compute all possible new s/t coords, which does the mirroring 1914 * snewx = signma * -r; 1915 * tnewx = -t; 1916 * snewy = s; 1917 * tnewy = signma * r; 1918 * snewz = signma * s; 1919 * tnewz = -t; 1920 */ 1921 tnegi = LLVMBuildXor(builder, ti, signmask, ""); 1922 rnegi = LLVMBuildXor(builder, ri, signmask, ""); 1923 1924 snewx = LLVMBuildXor(builder, signmabit, rnegi, ""); 1925 tnewx = tnegi; 1926 1927 snewy = si; 1928 tnewy = LLVMBuildXor(builder, signmabit, 
ri, ""); 1929 1930 snewz = LLVMBuildXor(builder, signmabit, si, ""); 1931 tnewz = tnegi; 1932 1933 /* select the mirrored values */ 1934 face_s = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, snewx, snewy, snewz); 1935 face_t = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tnewx, tnewy, tnewz); 1936 face = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, facex, facey, facez); 1937 1938 face_s = LLVMBuildBitCast(builder, face_s, coord_vec_type, ""); 1939 face_t = LLVMBuildBitCast(builder, face_t, coord_vec_type, ""); 1940 1941 /* add +1 for neg face */ 1942 /* XXX with AVX probably want to use another select here - 1943 * as long as we ensure vblendvps gets used we can actually 1944 * skip the comparison and just use sign as a "mask" directly. 1945 */ 1946 signma = LLVMBuildLShr(builder, mai, signshift, ""); 1947 coords[2] = LLVMBuildOr(builder, face, signma, "face"); 1948 1949 /* project coords */ 1950 if (!need_derivs) { 1951 imahalfpos = lp_build_cube_imapos(coord_bld, ma); 1952 face_s = lp_build_mul(coord_bld, face_s, imahalfpos); 1953 face_t = lp_build_mul(coord_bld, face_t, imahalfpos); 1954 } 1955 1956 coords[0] = lp_build_add(coord_bld, face_s, posHalf); 1957 coords[1] = lp_build_add(coord_bld, face_t, posHalf); 1958 } 1959 1960 1961 /** 1962 * Compute the partial offset of a pixel block along an arbitrary axis. 
1963 * 1964 * @param coord coordinate in pixels 1965 * @param stride number of bytes between rows of successive pixel blocks 1966 * @param block_length number of pixels in a pixels block along the coordinate 1967 * axis 1968 * @param out_offset resulting relative offset of the pixel block in bytes 1969 * @param out_subcoord resulting sub-block pixel coordinate 1970 */ 1971 void 1972 lp_build_sample_partial_offset(struct lp_build_context *bld, 1973 unsigned block_length, 1974 LLVMValueRef coord, 1975 LLVMValueRef stride, 1976 LLVMValueRef *out_offset, 1977 LLVMValueRef *out_subcoord) 1978 { 1979 LLVMBuilderRef builder = bld->gallivm->builder; 1980 LLVMValueRef offset; 1981 LLVMValueRef subcoord; 1982 1983 if (block_length == 1) { 1984 subcoord = bld->zero; 1985 } 1986 else { 1987 /* 1988 * Pixel blocks have power of two dimensions. LLVM should convert the 1989 * rem/div to bit arithmetic. 1990 * TODO: Verify this. 1991 * It does indeed BUT it does transform it to scalar (and back) when doing so 1992 * (using roughly extract, shift/and, mov, unpack) (llvm 2.7). 1993 * The generated code looks seriously unfunny and is quite expensive. 
1994 */ 1995 #if 0 1996 LLVMValueRef block_width = lp_build_const_int_vec(bld->type, block_length); 1997 subcoord = LLVMBuildURem(builder, coord, block_width, ""); 1998 coord = LLVMBuildUDiv(builder, coord, block_width, ""); 1999 #else 2000 unsigned logbase2 = util_logbase2(block_length); 2001 LLVMValueRef block_shift = lp_build_const_int_vec(bld->gallivm, bld->type, logbase2); 2002 LLVMValueRef block_mask = lp_build_const_int_vec(bld->gallivm, bld->type, block_length - 1); 2003 subcoord = LLVMBuildAnd(builder, coord, block_mask, ""); 2004 coord = LLVMBuildLShr(builder, coord, block_shift, ""); 2005 #endif 2006 } 2007 2008 offset = lp_build_mul(bld, coord, stride); 2009 2010 assert(out_offset); 2011 assert(out_subcoord); 2012 2013 *out_offset = offset; 2014 *out_subcoord = subcoord; 2015 } 2016 2017 2018 /** 2019 * Compute the offset of a pixel block. 2020 * 2021 * x, y, z, y_stride, z_stride are vectors, and they refer to pixels. 2022 * 2023 * Returns the relative offset and i,j sub-block coordinates 2024 */ 2025 void 2026 lp_build_sample_offset(struct lp_build_context *bld, 2027 const struct util_format_description *format_desc, 2028 LLVMValueRef x, 2029 LLVMValueRef y, 2030 LLVMValueRef z, 2031 LLVMValueRef y_stride, 2032 LLVMValueRef z_stride, 2033 LLVMValueRef *out_offset, 2034 LLVMValueRef *out_i, 2035 LLVMValueRef *out_j) 2036 { 2037 LLVMValueRef x_stride; 2038 LLVMValueRef offset; 2039 2040 x_stride = lp_build_const_vec(bld->gallivm, bld->type, 2041 format_desc->block.bits/8); 2042 2043 lp_build_sample_partial_offset(bld, 2044 format_desc->block.width, 2045 x, x_stride, 2046 &offset, out_i); 2047 2048 if (y && y_stride) { 2049 LLVMValueRef y_offset; 2050 lp_build_sample_partial_offset(bld, 2051 format_desc->block.height, 2052 y, y_stride, 2053 &y_offset, out_j); 2054 offset = lp_build_add(bld, offset, y_offset); 2055 } 2056 else { 2057 *out_j = bld->zero; 2058 } 2059 2060 if (z && z_stride) { 2061 LLVMValueRef z_offset; 2062 LLVMValueRef k; 2063 
lp_build_sample_partial_offset(bld, 2064 1, /* pixel blocks are always 2D */ 2065 z, z_stride, 2066 &z_offset, &k); 2067 offset = lp_build_add(bld, offset, z_offset); 2068 } 2069 2070 *out_offset = offset; 2071 } 2072