1 /************************************************************************** 2 * 3 * Copyright 2009 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * The above copyright notice and this permission notice (including the 15 * next paragraph) shall be included in all copies or substantial portions 16 * of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 * 26 **************************************************************************/ 27 28 /** 29 * @file 30 * Texture sampling -- common code. 
31 * 32 * @author Jose Fonseca <jfonseca (at) vmware.com> 33 */ 34 35 #include "pipe/p_defines.h" 36 #include "pipe/p_state.h" 37 #include "util/u_format.h" 38 #include "util/u_math.h" 39 #include "util/u_cpu_detect.h" 40 #include "lp_bld_arit.h" 41 #include "lp_bld_const.h" 42 #include "lp_bld_debug.h" 43 #include "lp_bld_printf.h" 44 #include "lp_bld_flow.h" 45 #include "lp_bld_sample.h" 46 #include "lp_bld_swizzle.h" 47 #include "lp_bld_type.h" 48 #include "lp_bld_logic.h" 49 #include "lp_bld_pack.h" 50 #include "lp_bld_quad.h" 51 #include "lp_bld_bitarit.h" 52 53 54 /* 55 * Bri-linear factor. Should be greater than one. 56 */ 57 #define BRILINEAR_FACTOR 2 58 59 /** 60 * Does the given texture wrap mode allow sampling the texture border color? 61 * XXX maybe move this into gallium util code. 62 */ 63 boolean 64 lp_sampler_wrap_mode_uses_border_color(unsigned mode, 65 unsigned min_img_filter, 66 unsigned mag_img_filter) 67 { 68 switch (mode) { 69 case PIPE_TEX_WRAP_REPEAT: 70 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: 71 case PIPE_TEX_WRAP_MIRROR_REPEAT: 72 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: 73 return FALSE; 74 case PIPE_TEX_WRAP_CLAMP: 75 case PIPE_TEX_WRAP_MIRROR_CLAMP: 76 if (min_img_filter == PIPE_TEX_FILTER_NEAREST && 77 mag_img_filter == PIPE_TEX_FILTER_NEAREST) { 78 return FALSE; 79 } else { 80 return TRUE; 81 } 82 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: 83 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: 84 return TRUE; 85 default: 86 assert(0 && "unexpected wrap mode"); 87 return FALSE; 88 } 89 } 90 91 92 /** 93 * Initialize lp_sampler_static_texture_state object with the gallium 94 * texture/sampler_view state (this contains the parts which are 95 * considered static). 
96 */ 97 void 98 lp_sampler_static_texture_state(struct lp_static_texture_state *state, 99 const struct pipe_sampler_view *view) 100 { 101 const struct pipe_resource *texture; 102 103 memset(state, 0, sizeof *state); 104 105 if (!view || !view->texture) 106 return; 107 108 texture = view->texture; 109 110 state->format = view->format; 111 state->swizzle_r = view->swizzle_r; 112 state->swizzle_g = view->swizzle_g; 113 state->swizzle_b = view->swizzle_b; 114 state->swizzle_a = view->swizzle_a; 115 116 state->target = view->target; 117 state->pot_width = util_is_power_of_two(texture->width0); 118 state->pot_height = util_is_power_of_two(texture->height0); 119 state->pot_depth = util_is_power_of_two(texture->depth0); 120 state->level_zero_only = !view->u.tex.last_level; 121 122 /* 123 * the layer / element / level parameters are all either dynamic 124 * state or handled transparently wrt execution. 125 */ 126 } 127 128 129 /** 130 * Initialize lp_sampler_static_sampler_state object with the gallium sampler 131 * state (this contains the parts which are considered static). 132 */ 133 void 134 lp_sampler_static_sampler_state(struct lp_static_sampler_state *state, 135 const struct pipe_sampler_state *sampler) 136 { 137 memset(state, 0, sizeof *state); 138 139 if (!sampler) 140 return; 141 142 /* 143 * We don't copy sampler state over unless it is actually enabled, to avoid 144 * spurious recompiles, as the sampler static state is part of the shader 145 * key. 146 * 147 * Ideally the state tracker or cso_cache module would make all state 148 * canonical, but until that happens it's better to be safe than sorry here. 149 * 150 * XXX: Actually there's much more than can be done here, especially 151 * regarding 1D/2D/3D/CUBE textures, wrap modes, etc. 
152 */ 153 154 state->wrap_s = sampler->wrap_s; 155 state->wrap_t = sampler->wrap_t; 156 state->wrap_r = sampler->wrap_r; 157 state->min_img_filter = sampler->min_img_filter; 158 state->mag_img_filter = sampler->mag_img_filter; 159 state->min_mip_filter = sampler->min_mip_filter; 160 state->seamless_cube_map = sampler->seamless_cube_map; 161 162 if (sampler->max_lod > 0.0f) { 163 state->max_lod_pos = 1; 164 } 165 166 if (sampler->lod_bias != 0.0f) { 167 state->lod_bias_non_zero = 1; 168 } 169 170 if (state->min_mip_filter != PIPE_TEX_MIPFILTER_NONE || 171 state->min_img_filter != state->mag_img_filter) { 172 173 /* If min_lod == max_lod we can greatly simplify mipmap selection. 174 * This is a case that occurs during automatic mipmap generation. 175 */ 176 if (sampler->min_lod == sampler->max_lod) { 177 state->min_max_lod_equal = 1; 178 } else { 179 if (sampler->min_lod > 0.0f) { 180 state->apply_min_lod = 1; 181 } 182 183 /* 184 * XXX this won't do anything with the mesa state tracker which always 185 * sets max_lod to not more than actually present mip maps... 186 */ 187 if (sampler->max_lod < (PIPE_MAX_TEXTURE_LEVELS - 1)) { 188 state->apply_max_lod = 1; 189 } 190 } 191 } 192 193 state->compare_mode = sampler->compare_mode; 194 if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) { 195 state->compare_func = sampler->compare_func; 196 } 197 198 state->normalized_coords = sampler->normalized_coords; 199 } 200 201 202 /** 203 * Generate code to compute coordinate gradient (rho). 204 * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y 205 * 206 * The resulting rho has bld->levelf format (per quad or per element). 
207 */ 208 static LLVMValueRef 209 lp_build_rho(struct lp_build_sample_context *bld, 210 unsigned texture_unit, 211 LLVMValueRef s, 212 LLVMValueRef t, 213 LLVMValueRef r, 214 LLVMValueRef cube_rho, 215 const struct lp_derivatives *derivs) 216 { 217 struct gallivm_state *gallivm = bld->gallivm; 218 struct lp_build_context *int_size_bld = &bld->int_size_in_bld; 219 struct lp_build_context *float_size_bld = &bld->float_size_in_bld; 220 struct lp_build_context *float_bld = &bld->float_bld; 221 struct lp_build_context *coord_bld = &bld->coord_bld; 222 struct lp_build_context *rho_bld = &bld->lodf_bld; 223 const unsigned dims = bld->dims; 224 LLVMValueRef ddx_ddy[2] = {NULL}; 225 LLVMBuilderRef builder = bld->gallivm->builder; 226 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context); 227 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); 228 LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0); 229 LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0); 230 LLVMValueRef rho_vec; 231 LLVMValueRef int_size, float_size; 232 LLVMValueRef rho; 233 LLVMValueRef first_level, first_level_vec; 234 unsigned length = coord_bld->type.length; 235 unsigned num_quads = length / 4; 236 boolean rho_per_quad = rho_bld->type.length != length; 237 boolean no_rho_opt = bld->no_rho_approx && (dims > 1); 238 unsigned i; 239 LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); 240 LLVMValueRef rho_xvec, rho_yvec; 241 242 /* Note that all simplified calculations will only work for isotropic filtering */ 243 244 /* 245 * rho calcs are always per quad except for explicit derivs (excluding 246 * the messy cube maps for now) when requested. 
247 */ 248 249 first_level = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm, 250 bld->context_ptr, texture_unit); 251 first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level); 252 int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec, TRUE); 253 float_size = lp_build_int_to_float(float_size_bld, int_size); 254 255 if (cube_rho) { 256 LLVMValueRef cubesize; 257 LLVMValueRef index0 = lp_build_const_int32(gallivm, 0); 258 259 /* 260 * Cube map code did already everything except size mul and per-quad extraction. 261 * Luckily cube maps are always quadratic! 262 */ 263 if (rho_per_quad) { 264 rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type, 265 rho_bld->type, cube_rho, 0); 266 } 267 else { 268 rho = lp_build_swizzle_scalar_aos(coord_bld, cube_rho, 0, 4); 269 } 270 /* Could optimize this for single quad just skip the broadcast */ 271 cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type, 272 rho_bld->type, float_size, index0); 273 /* skipping sqrt hence returning rho squared */ 274 cubesize = lp_build_mul(rho_bld, cubesize, cubesize); 275 rho = lp_build_mul(rho_bld, cubesize, rho); 276 } 277 else if (derivs) { 278 LLVMValueRef ddmax[3], ddx[3], ddy[3]; 279 for (i = 0; i < dims; i++) { 280 LLVMValueRef floatdim; 281 LLVMValueRef indexi = lp_build_const_int32(gallivm, i); 282 283 floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type, 284 coord_bld->type, float_size, indexi); 285 286 /* 287 * note that for rho_per_quad case could reduce math (at some shuffle 288 * cost), but for now use same code to per-pixel lod case. 
289 */ 290 if (no_rho_opt) { 291 ddx[i] = lp_build_mul(coord_bld, floatdim, derivs->ddx[i]); 292 ddy[i] = lp_build_mul(coord_bld, floatdim, derivs->ddy[i]); 293 ddx[i] = lp_build_mul(coord_bld, ddx[i], ddx[i]); 294 ddy[i] = lp_build_mul(coord_bld, ddy[i], ddy[i]); 295 } 296 else { 297 LLVMValueRef tmpx, tmpy; 298 tmpx = lp_build_abs(coord_bld, derivs->ddx[i]); 299 tmpy = lp_build_abs(coord_bld, derivs->ddy[i]); 300 ddmax[i] = lp_build_max(coord_bld, tmpx, tmpy); 301 ddmax[i] = lp_build_mul(coord_bld, floatdim, ddmax[i]); 302 } 303 } 304 if (no_rho_opt) { 305 rho_xvec = lp_build_add(coord_bld, ddx[0], ddx[1]); 306 rho_yvec = lp_build_add(coord_bld, ddy[0], ddy[1]); 307 if (dims > 2) { 308 rho_xvec = lp_build_add(coord_bld, rho_xvec, ddx[2]); 309 rho_yvec = lp_build_add(coord_bld, rho_yvec, ddy[2]); 310 } 311 rho = lp_build_max(coord_bld, rho_xvec, rho_yvec); 312 /* skipping sqrt hence returning rho squared */ 313 } 314 else { 315 rho = ddmax[0]; 316 if (dims > 1) { 317 rho = lp_build_max(coord_bld, rho, ddmax[1]); 318 if (dims > 2) { 319 rho = lp_build_max(coord_bld, rho, ddmax[2]); 320 } 321 } 322 } 323 if (rho_per_quad) { 324 /* 325 * rho_vec contains per-pixel rho, convert to scalar per quad. 326 */ 327 rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type, 328 rho_bld->type, rho, 0); 329 } 330 } 331 else { 332 /* 333 * This looks all a bit complex, but it's not that bad 334 * (the shuffle code makes it look worse than it is). 335 * Still, might not be ideal for all cases. 
336 */ 337 static const unsigned char swizzle0[] = { /* no-op swizzle */ 338 0, LP_BLD_SWIZZLE_DONTCARE, 339 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE 340 }; 341 static const unsigned char swizzle1[] = { 342 1, LP_BLD_SWIZZLE_DONTCARE, 343 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE 344 }; 345 static const unsigned char swizzle2[] = { 346 2, LP_BLD_SWIZZLE_DONTCARE, 347 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE 348 }; 349 350 if (dims < 2) { 351 ddx_ddy[0] = lp_build_packed_ddx_ddy_onecoord(coord_bld, s); 352 } 353 else if (dims >= 2) { 354 ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t); 355 if (dims > 2) { 356 ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r); 357 } 358 } 359 360 if (no_rho_opt) { 361 static const unsigned char swizzle01[] = { /* no-op swizzle */ 362 0, 1, 363 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE 364 }; 365 static const unsigned char swizzle23[] = { 366 2, 3, 367 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE 368 }; 369 LLVMValueRef ddx_ddys, ddx_ddyt, floatdim, shuffles[LP_MAX_VECTOR_LENGTH / 4]; 370 371 for (i = 0; i < num_quads; i++) { 372 shuffles[i*4+0] = shuffles[i*4+1] = index0; 373 shuffles[i*4+2] = shuffles[i*4+3] = index1; 374 } 375 floatdim = LLVMBuildShuffleVector(builder, float_size, float_size, 376 LLVMConstVector(shuffles, length), ""); 377 ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], floatdim); 378 ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]); 379 ddx_ddys = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01); 380 ddx_ddyt = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23); 381 rho_vec = lp_build_add(coord_bld, ddx_ddys, ddx_ddyt); 382 383 if (dims > 2) { 384 static const unsigned char swizzle02[] = { 385 0, 2, 386 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE 387 }; 388 floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type, 389 coord_bld->type, float_size, index2); 390 ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], 
floatdim); 391 ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]); 392 ddx_ddy[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02); 393 rho_vec = lp_build_add(coord_bld, rho_vec, ddx_ddy[1]); 394 } 395 396 rho_xvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0); 397 rho_yvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1); 398 rho = lp_build_max(coord_bld, rho_xvec, rho_yvec); 399 400 if (rho_per_quad) { 401 rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type, 402 rho_bld->type, rho, 0); 403 } 404 else { 405 rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4); 406 } 407 /* skipping sqrt hence returning rho squared */ 408 } 409 else { 410 ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]); 411 if (dims > 2) { 412 ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]); 413 } 414 else { 415 ddx_ddy[1] = NULL; /* silence compiler warning */ 416 } 417 418 if (dims < 2) { 419 rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle0); 420 rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle2); 421 } 422 else if (dims == 2) { 423 static const unsigned char swizzle02[] = { 424 0, 2, 425 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE 426 }; 427 static const unsigned char swizzle13[] = { 428 1, 3, 429 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE 430 }; 431 rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle02); 432 rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle13); 433 } 434 else { 435 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH]; 436 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH]; 437 assert(dims == 3); 438 for (i = 0; i < num_quads; i++) { 439 shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i); 440 shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2); 441 shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i); 442 shuffles1[4*i + 3] = i32undef; 443 shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1); 444 shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 
3); 445 shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 2); 446 shuffles2[4*i + 3] = i32undef; 447 } 448 rho_xvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1], 449 LLVMConstVector(shuffles1, length), ""); 450 rho_yvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1], 451 LLVMConstVector(shuffles2, length), ""); 452 } 453 454 rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec); 455 456 if (bld->coord_type.length > 4) { 457 /* expand size to each quad */ 458 if (dims > 1) { 459 /* could use some broadcast_vector helper for this? */ 460 LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4]; 461 for (i = 0; i < num_quads; i++) { 462 src[i] = float_size; 463 } 464 float_size = lp_build_concat(bld->gallivm, src, float_size_bld->type, num_quads); 465 } 466 else { 467 float_size = lp_build_broadcast_scalar(coord_bld, float_size); 468 } 469 rho_vec = lp_build_mul(coord_bld, rho_vec, float_size); 470 471 if (dims <= 1) { 472 rho = rho_vec; 473 } 474 else { 475 if (dims >= 2) { 476 LLVMValueRef rho_s, rho_t, rho_r; 477 478 rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0); 479 rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1); 480 481 rho = lp_build_max(coord_bld, rho_s, rho_t); 482 483 if (dims >= 3) { 484 rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2); 485 rho = lp_build_max(coord_bld, rho, rho_r); 486 } 487 } 488 } 489 if (rho_per_quad) { 490 rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type, 491 rho_bld->type, rho, 0); 492 } 493 else { 494 rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4); 495 } 496 } 497 else { 498 if (dims <= 1) { 499 rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, ""); 500 } 501 rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size); 502 503 if (dims <= 1) { 504 rho = rho_vec; 505 } 506 else { 507 if (dims >= 2) { 508 LLVMValueRef rho_s, rho_t, rho_r; 509 510 rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, ""); 511 rho_t = 
LLVMBuildExtractElement(builder, rho_vec, index1, ""); 512 513 rho = lp_build_max(float_bld, rho_s, rho_t); 514 515 if (dims >= 3) { 516 rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, ""); 517 rho = lp_build_max(float_bld, rho, rho_r); 518 } 519 } 520 } 521 if (!rho_per_quad) { 522 rho = lp_build_broadcast_scalar(rho_bld, rho); 523 } 524 } 525 } 526 } 527 528 return rho; 529 } 530 531 532 /* 533 * Bri-linear lod computation 534 * 535 * Use a piece-wise linear approximation of log2 such that: 536 * - round to nearest, for values in the neighborhood of -1, 0, 1, 2, etc. 537 * - linear approximation for values in the neighborhood of 0.5, 1.5., etc, 538 * with the steepness specified in 'factor' 539 * - exact result for 0.5, 1.5, etc. 540 * 541 * 542 * 1.0 - /----* 543 * / 544 * / 545 * / 546 * 0.5 - * 547 * / 548 * / 549 * / 550 * 0.0 - *----/ 551 * 552 * | | 553 * 2^0 2^1 554 * 555 * This is a technique also commonly used in hardware: 556 * - http://ixbtlabs.com/articles2/gffx/nv40-rx800-3.html 557 * 558 * TODO: For correctness, this should only be applied when texture is known to 559 * have regular mipmaps, i.e., mipmaps derived from the base level. 560 * 561 * TODO: This could be done in fixed point, where applicable. 
562 */ 563 static void 564 lp_build_brilinear_lod(struct lp_build_context *bld, 565 LLVMValueRef lod, 566 double factor, 567 LLVMValueRef *out_lod_ipart, 568 LLVMValueRef *out_lod_fpart) 569 { 570 LLVMValueRef lod_fpart; 571 double pre_offset = (factor - 0.5)/factor - 0.5; 572 double post_offset = 1 - factor; 573 574 if (0) { 575 lp_build_printf(bld->gallivm, "lod = %f\n", lod); 576 } 577 578 lod = lp_build_add(bld, lod, 579 lp_build_const_vec(bld->gallivm, bld->type, pre_offset)); 580 581 lp_build_ifloor_fract(bld, lod, out_lod_ipart, &lod_fpart); 582 583 lod_fpart = lp_build_mad(bld, lod_fpart, 584 lp_build_const_vec(bld->gallivm, bld->type, factor), 585 lp_build_const_vec(bld->gallivm, bld->type, post_offset)); 586 587 /* 588 * It's not necessary to clamp lod_fpart since: 589 * - the above expression will never produce numbers greater than one. 590 * - the mip filtering branch is only taken if lod_fpart is positive 591 */ 592 593 *out_lod_fpart = lod_fpart; 594 595 if (0) { 596 lp_build_printf(bld->gallivm, "lod_ipart = %i\n", *out_lod_ipart); 597 lp_build_printf(bld->gallivm, "lod_fpart = %f\n\n", *out_lod_fpart); 598 } 599 } 600 601 602 /* 603 * Combined log2 and brilinear lod computation. 604 * 605 * It's in all identical to calling lp_build_fast_log2() and 606 * lp_build_brilinear_lod() above, but by combining we can compute the integer 607 * and fractional part independently. 
608 */ 609 static void 610 lp_build_brilinear_rho(struct lp_build_context *bld, 611 LLVMValueRef rho, 612 double factor, 613 LLVMValueRef *out_lod_ipart, 614 LLVMValueRef *out_lod_fpart) 615 { 616 LLVMValueRef lod_ipart; 617 LLVMValueRef lod_fpart; 618 619 const double pre_factor = (2*factor - 0.5)/(M_SQRT2*factor); 620 const double post_offset = 1 - 2*factor; 621 622 assert(bld->type.floating); 623 624 assert(lp_check_value(bld->type, rho)); 625 626 /* 627 * The pre factor will make the intersections with the exact powers of two 628 * happen precisely where we want them to be, which means that the integer 629 * part will not need any post adjustments. 630 */ 631 rho = lp_build_mul(bld, rho, 632 lp_build_const_vec(bld->gallivm, bld->type, pre_factor)); 633 634 /* ipart = ifloor(log2(rho)) */ 635 lod_ipart = lp_build_extract_exponent(bld, rho, 0); 636 637 /* fpart = rho / 2**ipart */ 638 lod_fpart = lp_build_extract_mantissa(bld, rho); 639 640 lod_fpart = lp_build_mad(bld, lod_fpart, 641 lp_build_const_vec(bld->gallivm, bld->type, factor), 642 lp_build_const_vec(bld->gallivm, bld->type, post_offset)); 643 644 /* 645 * Like lp_build_brilinear_lod, it's not necessary to clamp lod_fpart since: 646 * - the above expression will never produce numbers greater than one. 647 * - the mip filtering branch is only taken if lod_fpart is positive 648 */ 649 650 *out_lod_ipart = lod_ipart; 651 *out_lod_fpart = lod_fpart; 652 } 653 654 655 /** 656 * Fast implementation of iround(log2(sqrt(x))), based on 657 * log2(x^n) == n*log2(x). 658 * 659 * Gives accurate results all the time. 660 * (Could be trivially extended to handle other power-of-two roots.) 
661 */ 662 static LLVMValueRef 663 lp_build_ilog2_sqrt(struct lp_build_context *bld, 664 LLVMValueRef x) 665 { 666 LLVMBuilderRef builder = bld->gallivm->builder; 667 LLVMValueRef ipart; 668 struct lp_type i_type = lp_int_type(bld->type); 669 LLVMValueRef one = lp_build_const_int_vec(bld->gallivm, i_type, 1); 670 671 assert(bld->type.floating); 672 673 assert(lp_check_value(bld->type, x)); 674 675 /* ipart = log2(x) + 0.5 = 0.5*(log2(x^2) + 1.0) */ 676 ipart = lp_build_extract_exponent(bld, x, 1); 677 ipart = LLVMBuildAShr(builder, ipart, one, ""); 678 679 return ipart; 680 } 681 682 683 /** 684 * Generate code to compute texture level of detail (lambda). 685 * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y 686 * \param lod_bias optional float vector with the shader lod bias 687 * \param explicit_lod optional float vector with the explicit lod 688 * \param cube_rho rho calculated by cube coord mapping (optional) 689 * \param out_lod_ipart integer part of lod 690 * \param out_lod_fpart float part of lod (never larger than 1 but may be negative) 691 * \param out_lod_positive (mask) if lod is positive (i.e. texture is minified) 692 * 693 * The resulting lod can be scalar per quad or be per element. 
694 */ 695 void 696 lp_build_lod_selector(struct lp_build_sample_context *bld, 697 boolean is_lodq, 698 unsigned texture_unit, 699 unsigned sampler_unit, 700 LLVMValueRef s, 701 LLVMValueRef t, 702 LLVMValueRef r, 703 LLVMValueRef cube_rho, 704 const struct lp_derivatives *derivs, 705 LLVMValueRef lod_bias, /* optional */ 706 LLVMValueRef explicit_lod, /* optional */ 707 unsigned mip_filter, 708 LLVMValueRef *out_lod, 709 LLVMValueRef *out_lod_ipart, 710 LLVMValueRef *out_lod_fpart, 711 LLVMValueRef *out_lod_positive) 712 713 { 714 LLVMBuilderRef builder = bld->gallivm->builder; 715 struct lp_sampler_dynamic_state *dynamic_state = bld->dynamic_state; 716 struct lp_build_context *lodf_bld = &bld->lodf_bld; 717 LLVMValueRef lod; 718 719 *out_lod_ipart = bld->lodi_bld.zero; 720 *out_lod_positive = bld->lodi_bld.zero; 721 *out_lod_fpart = lodf_bld->zero; 722 723 /* 724 * For determining min/mag, we follow GL 4.1 spec, 3.9.12 Texture Magnification: 725 * "Implementations may either unconditionally assume c = 0 for the minification 726 * vs. magnification switch-over point, or may choose to make c depend on the 727 * combination of minification and magnification modes as follows: if the 728 * magnification filter is given by LINEAR and the minification filter is given 729 * by NEAREST_MIPMAP_NEAREST or NEAREST_MIPMAP_LINEAR, then c = 0.5. This is 730 * done to ensure that a minified texture does not appear "sharper" than a 731 * magnified texture. Otherwise c = 0." 732 * And 3.9.11 Texture Minification: 733 * "If lod is less than or equal to the constant c (see section 3.9.12) the 734 * texture is said to be magnified; if it is greater, the texture is minified." 735 * So, using 0 as switchover point always, and using magnification for lod == 0. 736 * Note that the always c = 0 behavior is new (first appearing in GL 3.1 spec), 737 * old GL versions required 0.5 for the modes listed above. 738 * I have no clue about the (undocumented) wishes of d3d9/d3d10 here! 
739 */ 740 741 if (bld->static_sampler_state->min_max_lod_equal && !is_lodq) { 742 /* User is forcing sampling from a particular mipmap level. 743 * This is hit during mipmap generation. 744 */ 745 LLVMValueRef min_lod = 746 dynamic_state->min_lod(dynamic_state, bld->gallivm, 747 bld->context_ptr, sampler_unit); 748 749 lod = lp_build_broadcast_scalar(lodf_bld, min_lod); 750 } 751 else { 752 if (explicit_lod) { 753 if (bld->num_lods != bld->coord_type.length) 754 lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type, 755 lodf_bld->type, explicit_lod, 0); 756 else 757 lod = explicit_lod; 758 } 759 else { 760 LLVMValueRef rho; 761 boolean rho_squared = (bld->no_rho_approx && 762 (bld->dims > 1)) || cube_rho; 763 764 rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs); 765 766 /* 767 * Compute lod = log2(rho) 768 */ 769 770 if (!lod_bias && !is_lodq && 771 !bld->static_sampler_state->lod_bias_non_zero && 772 !bld->static_sampler_state->apply_max_lod && 773 !bld->static_sampler_state->apply_min_lod) { 774 /* 775 * Special case when there are no post-log2 adjustments, which 776 * saves instructions but keeping the integer and fractional lod 777 * computations separate from the start. 778 */ 779 780 if (mip_filter == PIPE_TEX_MIPFILTER_NONE || 781 mip_filter == PIPE_TEX_MIPFILTER_NEAREST) { 782 /* 783 * Don't actually need both values all the time, lod_ipart is 784 * needed for nearest mipfilter, lod_positive if min != mag. 785 */ 786 if (rho_squared) { 787 *out_lod_ipart = lp_build_ilog2_sqrt(lodf_bld, rho); 788 } 789 else { 790 *out_lod_ipart = lp_build_ilog2(lodf_bld, rho); 791 } 792 *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER, 793 rho, lodf_bld->one); 794 return; 795 } 796 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR && 797 !bld->no_brilinear && !rho_squared) { 798 /* 799 * This can't work if rho is squared. 
Not sure if it could be 800 * fixed while keeping it worthwile, could also do sqrt here 801 * but brilinear and no_rho_opt seems like a combination not 802 * making much sense anyway so just use ordinary path below. 803 */ 804 lp_build_brilinear_rho(lodf_bld, rho, BRILINEAR_FACTOR, 805 out_lod_ipart, out_lod_fpart); 806 *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER, 807 rho, lodf_bld->one); 808 return; 809 } 810 } 811 812 if (0) { 813 lod = lp_build_log2(lodf_bld, rho); 814 } 815 else { 816 lod = lp_build_fast_log2(lodf_bld, rho); 817 } 818 if (rho_squared) { 819 /* log2(x^2) == 0.5*log2(x) */ 820 lod = lp_build_mul(lodf_bld, lod, 821 lp_build_const_vec(bld->gallivm, lodf_bld->type, 0.5F)); 822 } 823 824 /* add shader lod bias */ 825 if (lod_bias) { 826 if (bld->num_lods != bld->coord_type.length) 827 lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type, 828 lodf_bld->type, lod_bias, 0); 829 lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias"); 830 } 831 } 832 833 /* add sampler lod bias */ 834 if (bld->static_sampler_state->lod_bias_non_zero) { 835 LLVMValueRef sampler_lod_bias = 836 dynamic_state->lod_bias(dynamic_state, bld->gallivm, 837 bld->context_ptr, sampler_unit); 838 sampler_lod_bias = lp_build_broadcast_scalar(lodf_bld, 839 sampler_lod_bias); 840 lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias"); 841 } 842 843 if (is_lodq) { 844 *out_lod = lod; 845 } 846 847 /* clamp lod */ 848 if (bld->static_sampler_state->apply_max_lod) { 849 LLVMValueRef max_lod = 850 dynamic_state->max_lod(dynamic_state, bld->gallivm, 851 bld->context_ptr, sampler_unit); 852 max_lod = lp_build_broadcast_scalar(lodf_bld, max_lod); 853 854 lod = lp_build_min(lodf_bld, lod, max_lod); 855 } 856 if (bld->static_sampler_state->apply_min_lod) { 857 LLVMValueRef min_lod = 858 dynamic_state->min_lod(dynamic_state, bld->gallivm, 859 bld->context_ptr, sampler_unit); 860 min_lod = lp_build_broadcast_scalar(lodf_bld, min_lod); 861 
862 lod = lp_build_max(lodf_bld, lod, min_lod); 863 } 864 865 if (is_lodq) { 866 *out_lod_fpart = lod; 867 return; 868 } 869 } 870 871 *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER, 872 lod, lodf_bld->zero); 873 874 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { 875 if (!bld->no_brilinear) { 876 lp_build_brilinear_lod(lodf_bld, lod, BRILINEAR_FACTOR, 877 out_lod_ipart, out_lod_fpart); 878 } 879 else { 880 lp_build_ifloor_fract(lodf_bld, lod, out_lod_ipart, out_lod_fpart); 881 } 882 883 lp_build_name(*out_lod_fpart, "lod_fpart"); 884 } 885 else { 886 *out_lod_ipart = lp_build_iround(lodf_bld, lod); 887 } 888 889 lp_build_name(*out_lod_ipart, "lod_ipart"); 890 891 return; 892 } 893 894 895 /** 896 * For PIPE_TEX_MIPFILTER_NEAREST, convert int part of lod 897 * to actual mip level. 898 * Note: this is all scalar per quad code. 899 * \param lod_ipart int texture level of detail 900 * \param level_out returns integer 901 * \param out_of_bounds returns per coord out_of_bounds mask if provided 902 */ 903 void 904 lp_build_nearest_mip_level(struct lp_build_sample_context *bld, 905 unsigned texture_unit, 906 LLVMValueRef lod_ipart, 907 LLVMValueRef *level_out, 908 LLVMValueRef *out_of_bounds) 909 { 910 struct lp_build_context *leveli_bld = &bld->leveli_bld; 911 struct lp_sampler_dynamic_state *dynamic_state = bld->dynamic_state; 912 LLVMValueRef first_level, last_level, level; 913 914 first_level = dynamic_state->first_level(dynamic_state, bld->gallivm, 915 bld->context_ptr, texture_unit); 916 last_level = dynamic_state->last_level(dynamic_state, bld->gallivm, 917 bld->context_ptr, texture_unit); 918 first_level = lp_build_broadcast_scalar(leveli_bld, first_level); 919 last_level = lp_build_broadcast_scalar(leveli_bld, last_level); 920 921 level = lp_build_add(leveli_bld, lod_ipart, first_level); 922 923 if (out_of_bounds) { 924 LLVMValueRef out, out1; 925 out = lp_build_cmp(leveli_bld, PIPE_FUNC_LESS, level, first_level); 926 out1 = lp_build_cmp(leveli_bld, 
PIPE_FUNC_GREATER, level, last_level); 927 out = lp_build_or(leveli_bld, out, out1); 928 if (bld->num_mips == bld->coord_bld.type.length) { 929 *out_of_bounds = out; 930 } 931 else if (bld->num_mips == 1) { 932 *out_of_bounds = lp_build_broadcast_scalar(&bld->int_coord_bld, out); 933 } 934 else { 935 assert(bld->num_mips == bld->coord_bld.type.length / 4); 936 *out_of_bounds = lp_build_unpack_broadcast_aos_scalars(bld->gallivm, 937 leveli_bld->type, 938 bld->int_coord_bld.type, 939 out); 940 } 941 level = lp_build_andnot(&bld->int_coord_bld, level, *out_of_bounds); 942 *level_out = level; 943 } 944 else { 945 /* clamp level to legal range of levels */ 946 *level_out = lp_build_clamp(leveli_bld, level, first_level, last_level); 947 948 } 949 } 950 951 952 /** 953 * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad (or per element) int LOD(s) 954 * to two (per-quad) (adjacent) mipmap level indexes, and fix up float lod 955 * part accordingly. 956 * Later, we'll sample from those two mipmap levels and interpolate between them. 
957 */ 958 void 959 lp_build_linear_mip_levels(struct lp_build_sample_context *bld, 960 unsigned texture_unit, 961 LLVMValueRef lod_ipart, 962 LLVMValueRef *lod_fpart_inout, 963 LLVMValueRef *level0_out, 964 LLVMValueRef *level1_out) 965 { 966 LLVMBuilderRef builder = bld->gallivm->builder; 967 struct lp_sampler_dynamic_state *dynamic_state = bld->dynamic_state; 968 struct lp_build_context *leveli_bld = &bld->leveli_bld; 969 struct lp_build_context *levelf_bld = &bld->levelf_bld; 970 LLVMValueRef first_level, last_level; 971 LLVMValueRef clamp_min; 972 LLVMValueRef clamp_max; 973 974 assert(bld->num_lods == bld->num_mips); 975 976 first_level = dynamic_state->first_level(dynamic_state, bld->gallivm, 977 bld->context_ptr, texture_unit); 978 last_level = dynamic_state->last_level(dynamic_state, bld->gallivm, 979 bld->context_ptr, texture_unit); 980 first_level = lp_build_broadcast_scalar(leveli_bld, first_level); 981 last_level = lp_build_broadcast_scalar(leveli_bld, last_level); 982 983 *level0_out = lp_build_add(leveli_bld, lod_ipart, first_level); 984 *level1_out = lp_build_add(leveli_bld, *level0_out, leveli_bld->one); 985 986 /* 987 * Clamp both *level0_out and *level1_out to [first_level, last_level], with 988 * the minimum number of comparisons, and zeroing lod_fpart in the extreme 989 * ends in the process. 
990 */ 991 992 /* *level0_out < first_level */ 993 clamp_min = LLVMBuildICmp(builder, LLVMIntSLT, 994 *level0_out, first_level, 995 "clamp_lod_to_first"); 996 997 *level0_out = LLVMBuildSelect(builder, clamp_min, 998 first_level, *level0_out, ""); 999 1000 *level1_out = LLVMBuildSelect(builder, clamp_min, 1001 first_level, *level1_out, ""); 1002 1003 *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min, 1004 levelf_bld->zero, *lod_fpart_inout, ""); 1005 1006 /* *level0_out >= last_level */ 1007 clamp_max = LLVMBuildICmp(builder, LLVMIntSGE, 1008 *level0_out, last_level, 1009 "clamp_lod_to_last"); 1010 1011 *level0_out = LLVMBuildSelect(builder, clamp_max, 1012 last_level, *level0_out, ""); 1013 1014 *level1_out = LLVMBuildSelect(builder, clamp_max, 1015 last_level, *level1_out, ""); 1016 1017 *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max, 1018 levelf_bld->zero, *lod_fpart_inout, ""); 1019 1020 lp_build_name(*level0_out, "texture%u_miplevel0", texture_unit); 1021 lp_build_name(*level1_out, "texture%u_miplevel1", texture_unit); 1022 lp_build_name(*lod_fpart_inout, "texture%u_mipweight", texture_unit); 1023 } 1024 1025 1026 /** 1027 * Return pointer to a single mipmap level. 1028 * \param level integer mipmap level 1029 */ 1030 LLVMValueRef 1031 lp_build_get_mipmap_level(struct lp_build_sample_context *bld, 1032 LLVMValueRef level) 1033 { 1034 LLVMBuilderRef builder = bld->gallivm->builder; 1035 LLVMValueRef indexes[2], data_ptr, mip_offset; 1036 1037 indexes[0] = lp_build_const_int32(bld->gallivm, 0); 1038 indexes[1] = level; 1039 mip_offset = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, ""); 1040 mip_offset = LLVMBuildLoad(builder, mip_offset, ""); 1041 data_ptr = LLVMBuildGEP(builder, bld->base_ptr, &mip_offset, 1, ""); 1042 return data_ptr; 1043 } 1044 1045 /** 1046 * Return (per-pixel) offsets to mip levels. 
1047 * \param level integer mipmap level 1048 */ 1049 LLVMValueRef 1050 lp_build_get_mip_offsets(struct lp_build_sample_context *bld, 1051 LLVMValueRef level) 1052 { 1053 LLVMBuilderRef builder = bld->gallivm->builder; 1054 LLVMValueRef indexes[2], offsets, offset1; 1055 1056 indexes[0] = lp_build_const_int32(bld->gallivm, 0); 1057 if (bld->num_mips == 1) { 1058 indexes[1] = level; 1059 offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, ""); 1060 offset1 = LLVMBuildLoad(builder, offset1, ""); 1061 offsets = lp_build_broadcast_scalar(&bld->int_coord_bld, offset1); 1062 } 1063 else if (bld->num_mips == bld->coord_bld.type.length / 4) { 1064 unsigned i; 1065 1066 offsets = bld->int_coord_bld.undef; 1067 for (i = 0; i < bld->num_mips; i++) { 1068 LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i); 1069 LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i); 1070 indexes[1] = LLVMBuildExtractElement(builder, level, indexi, ""); 1071 offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, ""); 1072 offset1 = LLVMBuildLoad(builder, offset1, ""); 1073 offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexo, ""); 1074 } 1075 offsets = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, offsets, 0, 4); 1076 } 1077 else { 1078 unsigned i; 1079 1080 assert (bld->num_mips == bld->coord_bld.type.length); 1081 1082 offsets = bld->int_coord_bld.undef; 1083 for (i = 0; i < bld->num_mips; i++) { 1084 LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i); 1085 indexes[1] = LLVMBuildExtractElement(builder, level, indexi, ""); 1086 offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, ""); 1087 offset1 = LLVMBuildLoad(builder, offset1, ""); 1088 offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexi, ""); 1089 } 1090 } 1091 return offsets; 1092 } 1093 1094 1095 /** 1096 * Codegen equivalent for u_minify(). 
1097 * @param lod_scalar if lod is a (broadcasted) scalar 1098 * Return max(1, base_size >> level); 1099 */ 1100 LLVMValueRef 1101 lp_build_minify(struct lp_build_context *bld, 1102 LLVMValueRef base_size, 1103 LLVMValueRef level, 1104 boolean lod_scalar) 1105 { 1106 LLVMBuilderRef builder = bld->gallivm->builder; 1107 assert(lp_check_value(bld->type, base_size)); 1108 assert(lp_check_value(bld->type, level)); 1109 1110 if (level == bld->zero) { 1111 /* if we're using mipmap level zero, no minification is needed */ 1112 return base_size; 1113 } 1114 else { 1115 LLVMValueRef size; 1116 assert(bld->type.sign); 1117 if (lod_scalar || 1118 (util_cpu_caps.has_avx2 || !util_cpu_caps.has_sse)) { 1119 size = LLVMBuildLShr(builder, base_size, level, "minify"); 1120 size = lp_build_max(bld, size, bld->one); 1121 } 1122 else { 1123 /* 1124 * emulate shift with float mul, since intel "forgot" shifts with 1125 * per-element shift count until avx2, which results in terrible 1126 * scalar extraction (both count and value), scalar shift, 1127 * vector reinsertion. Should not be an issue on any non-x86 cpu 1128 * with a vector instruction set. 1129 * On cpus with AMD's XOP this should also be unnecessary but I'm 1130 * not sure if llvm would emit this with current flags. 
1131 */ 1132 LLVMValueRef const127, const23, lf; 1133 struct lp_type ftype; 1134 struct lp_build_context fbld; 1135 ftype = lp_type_float_vec(32, bld->type.length * bld->type.width); 1136 lp_build_context_init(&fbld, bld->gallivm, ftype); 1137 const127 = lp_build_const_int_vec(bld->gallivm, bld->type, 127); 1138 const23 = lp_build_const_int_vec(bld->gallivm, bld->type, 23); 1139 1140 /* calculate 2^(-level) float */ 1141 lf = lp_build_sub(bld, const127, level); 1142 lf = lp_build_shl(bld, lf, const23); 1143 lf = LLVMBuildBitCast(builder, lf, fbld.vec_type, ""); 1144 1145 /* finish shift operation by doing float mul */ 1146 base_size = lp_build_int_to_float(&fbld, base_size); 1147 size = lp_build_mul(&fbld, base_size, lf); 1148 /* 1149 * do the max also with floats because 1150 * a) non-emulated int max requires sse41 1151 * (this is actually a lie as we could cast to 16bit values 1152 * as 16bit is sufficient and 16bit int max is sse2) 1153 * b) with avx we can do int max 4-wide but float max 8-wide 1154 */ 1155 size = lp_build_max(&fbld, size, fbld.one); 1156 size = lp_build_itrunc(&fbld, size); 1157 } 1158 return size; 1159 } 1160 } 1161 1162 1163 /** 1164 * Dereference stride_array[mipmap_level] array to get a stride. 1165 * Return stride as a vector. 
1166 */ 1167 static LLVMValueRef 1168 lp_build_get_level_stride_vec(struct lp_build_sample_context *bld, 1169 LLVMValueRef stride_array, LLVMValueRef level) 1170 { 1171 LLVMBuilderRef builder = bld->gallivm->builder; 1172 LLVMValueRef indexes[2], stride, stride1; 1173 indexes[0] = lp_build_const_int32(bld->gallivm, 0); 1174 if (bld->num_mips == 1) { 1175 indexes[1] = level; 1176 stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, ""); 1177 stride1 = LLVMBuildLoad(builder, stride1, ""); 1178 stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride1); 1179 } 1180 else if (bld->num_mips == bld->coord_bld.type.length / 4) { 1181 LLVMValueRef stride1; 1182 unsigned i; 1183 1184 stride = bld->int_coord_bld.undef; 1185 for (i = 0; i < bld->num_mips; i++) { 1186 LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i); 1187 LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i); 1188 indexes[1] = LLVMBuildExtractElement(builder, level, indexi, ""); 1189 stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, ""); 1190 stride1 = LLVMBuildLoad(builder, stride1, ""); 1191 stride = LLVMBuildInsertElement(builder, stride, stride1, indexo, ""); 1192 } 1193 stride = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0, 4); 1194 } 1195 else { 1196 LLVMValueRef stride1; 1197 unsigned i; 1198 1199 assert (bld->num_mips == bld->coord_bld.type.length); 1200 1201 stride = bld->int_coord_bld.undef; 1202 for (i = 0; i < bld->coord_bld.type.length; i++) { 1203 LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i); 1204 indexes[1] = LLVMBuildExtractElement(builder, level, indexi, ""); 1205 stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, ""); 1206 stride1 = LLVMBuildLoad(builder, stride1, ""); 1207 stride = LLVMBuildInsertElement(builder, stride, stride1, indexi, ""); 1208 } 1209 } 1210 return stride; 1211 } 1212 1213 1214 /** 1215 * When sampling a mipmap, we need to compute the width, height, depth 1216 * of the source levels from the level 
indexes.  This helper function
 * does that.
 */
void
lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
                            LLVMValueRef ilevel,
                            LLVMValueRef *out_size,
                            LLVMValueRef *row_stride_vec,
                            LLVMValueRef *img_stride_vec)
{
   const unsigned dims = bld->dims;
   const unsigned num_quads = bld->coord_bld.type.length / 4;
   LLVMValueRef pieces[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   /*
    * Compute width, height, depth at mipmap level 'ilevel'
    */
   if (bld->num_mips == 1) {
      /* one level for everything: minify the size vector directly */
      LLVMValueRef level_vec =
         lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);

      *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size,
                                  level_vec, TRUE);
   }
   else if (bld->num_mips == num_quads) {
      /*
       * XXX: this should be #ifndef SANE_INSTRUCTION_SET.
       * intel "forgot" the variable shift count instruction until avx2.
       * A harmless 8x32 shift gets translated into 32 instructions
       * (16 extracts, 8 scalar shifts, 8 inserts), llvm is apparently
       * unable to recognize if there are really just 2 different shift
       * count values. So do the shift 4-wide before expansion.
       */
      struct lp_build_context quad_bld;
      struct lp_type quad_type;
      LLVMValueRef base_size;

      quad_type = bld->int_coord_bld.type;
      quad_type.length = 4;

      lp_build_context_init(&quad_bld, bld->gallivm, quad_type);

      if (dims == 1) {
         assert(bld->int_size_in_bld.type.length == 1);
         base_size = lp_build_broadcast_scalar(&quad_bld, bld->int_size);
      }
      else {
         assert(bld->int_size_in_bld.type.length == 4);
         base_size = bld->int_size;
      }

      for (i = 0; i < num_quads; i++) {
         LLVMValueRef quad_idx = lp_build_const_int32(bld->gallivm, i);
         LLVMValueRef quad_level =
            lp_build_extract_broadcast(bld->gallivm,
                                       bld->leveli_bld.type,
                                       quad_bld.type,
                                       ilevel,
                                       quad_idx);

         pieces[i] = lp_build_minify(&quad_bld, base_size, quad_level, TRUE);
      }
      /*
       * out_size is [w0, h0, d0, _, w1, h1, d1, _, ...] vector for dims > 1,
       * [w0, w0, w0, w0, w1, w1, w1, w1, ...] otherwise.
       */
      *out_size = lp_build_concat(bld->gallivm,
                                  pieces,
                                  quad_bld.type,
                                  num_quads);
   }
   else if (dims == 1) {
      /*
       * One level per element, 1d texture:
       * this creates a [w0, w1, w2, w3, ...] vector.
       */
      LLVMValueRef width_vec;

      assert(bld->num_mips == bld->coord_bld.type.length);
      assert(bld->int_size_in_bld.type.length == 1);

      width_vec = lp_build_broadcast_scalar(&bld->int_coord_bld,
                                            bld->int_size);
      *out_size = lp_build_minify(&bld->int_coord_bld, width_vec,
                                  ilevel, FALSE);
   }
   else {
      /* FIXME: this is terrible and results in _huge_ vector
       * (for the dims > 1 case).
       * Should refactor this (together with extract_image_sizes) and do
       * something more useful. Could for instance if we have width,height
       * with 4-wide vector pack all elements into a 8xi16 vector
       * (on which we can still do useful math) instead of using a 16xi32
       * vector.
       * This creates a [w0, h0, d0, _, w1, h1, d1, _, ...] vector.
       */
      assert(bld->num_mips == bld->coord_bld.type.length);

      for (i = 0; i < bld->num_mips; i++) {
         LLVMValueRef elem_idx = lp_build_const_int32(bld->gallivm, i);
         LLVMValueRef elem_level =
            lp_build_extract_broadcast(bld->gallivm, bld->int_coord_type,
                                       bld->int_size_in_bld.type,
                                       ilevel, elem_idx);

         pieces[i] = lp_build_minify(&bld->int_size_in_bld, bld->int_size,
                                     elem_level, TRUE);
      }
      *out_size = lp_build_concat(bld->gallivm, pieces,
                                  bld->int_size_in_bld.type,
                                  bld->num_mips);
   }

   /* Strides are only looked up (and only written) when they are needed. */
   if (dims >= 2) {
      *row_stride_vec = lp_build_get_level_stride_vec(bld,
                                                      bld->row_stride_array,
                                                      ilevel);
   }
   if (dims == 3 || has_layer_coord(bld->static_texture_state->target)) {
      *img_stride_vec = lp_build_get_level_stride_vec(bld,
                                                      bld->img_stride_array,
                                                      ilevel);
   }
}


/**
 * Extract and broadcast texture size.
1338 * 1339 * @param size_type type of the texture size vector (either 1340 * bld->int_size_type or bld->float_size_type) 1341 * @param coord_type type of the texture size vector (either 1342 * bld->int_coord_type or bld->coord_type) 1343 * @param size vector with the texture size (width, height, depth) 1344 */ 1345 void 1346 lp_build_extract_image_sizes(struct lp_build_sample_context *bld, 1347 struct lp_build_context *size_bld, 1348 struct lp_type coord_type, 1349 LLVMValueRef size, 1350 LLVMValueRef *out_width, 1351 LLVMValueRef *out_height, 1352 LLVMValueRef *out_depth) 1353 { 1354 const unsigned dims = bld->dims; 1355 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context); 1356 struct lp_type size_type = size_bld->type; 1357 1358 if (bld->num_mips == 1) { 1359 *out_width = lp_build_extract_broadcast(bld->gallivm, 1360 size_type, 1361 coord_type, 1362 size, 1363 LLVMConstInt(i32t, 0, 0)); 1364 if (dims >= 2) { 1365 *out_height = lp_build_extract_broadcast(bld->gallivm, 1366 size_type, 1367 coord_type, 1368 size, 1369 LLVMConstInt(i32t, 1, 0)); 1370 if (dims == 3) { 1371 *out_depth = lp_build_extract_broadcast(bld->gallivm, 1372 size_type, 1373 coord_type, 1374 size, 1375 LLVMConstInt(i32t, 2, 0)); 1376 } 1377 } 1378 } 1379 else { 1380 unsigned num_quads = bld->coord_bld.type.length / 4; 1381 1382 if (dims == 1) { 1383 *out_width = size; 1384 } 1385 else if (bld->num_mips == num_quads) { 1386 *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0, 4); 1387 if (dims >= 2) { 1388 *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1, 4); 1389 if (dims == 3) { 1390 *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2, 4); 1391 } 1392 } 1393 } 1394 else { 1395 assert(bld->num_mips == bld->coord_type.length); 1396 *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type, 1397 coord_type, size, 0); 1398 if (dims >= 2) { 1399 *out_height = lp_build_pack_aos_scalars(bld->gallivm, size_type, 1400 coord_type, size, 1); 1401 if (dims == 3) { 
1402 *out_depth = lp_build_pack_aos_scalars(bld->gallivm, size_type, 1403 coord_type, size, 2); 1404 } 1405 } 1406 } 1407 } 1408 } 1409 1410 1411 /** 1412 * Unnormalize coords. 1413 * 1414 * @param flt_size vector with the integer texture size (width, height, depth) 1415 */ 1416 void 1417 lp_build_unnormalized_coords(struct lp_build_sample_context *bld, 1418 LLVMValueRef flt_size, 1419 LLVMValueRef *s, 1420 LLVMValueRef *t, 1421 LLVMValueRef *r) 1422 { 1423 const unsigned dims = bld->dims; 1424 LLVMValueRef width; 1425 LLVMValueRef height = NULL; 1426 LLVMValueRef depth = NULL; 1427 1428 lp_build_extract_image_sizes(bld, 1429 &bld->float_size_bld, 1430 bld->coord_type, 1431 flt_size, 1432 &width, 1433 &height, 1434 &depth); 1435 1436 /* s = s * width, t = t * height */ 1437 *s = lp_build_mul(&bld->coord_bld, *s, width); 1438 if (dims >= 2) { 1439 *t = lp_build_mul(&bld->coord_bld, *t, height); 1440 if (dims >= 3) { 1441 *r = lp_build_mul(&bld->coord_bld, *r, depth); 1442 } 1443 } 1444 } 1445 1446 /** 1447 * Generate new coords and faces for cubemap texels falling off the face. 1448 * 1449 * @param face face (center) of the pixel 1450 * @param x0 lower x coord 1451 * @param x1 higher x coord (must be x0 + 1) 1452 * @param y0 lower y coord 1453 * @param y1 higher y coord (must be x0 + 1) 1454 * @param max_coord texture cube (level) size - 1 1455 * @param next_faces new face values when falling off 1456 * @param next_xcoords new x coord values when falling off 1457 * @param next_ycoords new y coord values when falling off 1458 * 1459 * The arrays hold the new values when under/overflow of 1460 * lower x, higher x, lower y, higher y coord would occur (in this order). 1461 * next_xcoords/next_ycoords have two entries each (for both new lower and 1462 * higher coord). 
1463 */ 1464 void 1465 lp_build_cube_new_coords(struct lp_build_context *ivec_bld, 1466 LLVMValueRef face, 1467 LLVMValueRef x0, 1468 LLVMValueRef x1, 1469 LLVMValueRef y0, 1470 LLVMValueRef y1, 1471 LLVMValueRef max_coord, 1472 LLVMValueRef next_faces[4], 1473 LLVMValueRef next_xcoords[4][2], 1474 LLVMValueRef next_ycoords[4][2]) 1475 { 1476 /* 1477 * Lookup tables aren't nice for simd code hence try some logic here. 1478 * (Note that while it would not be necessary to do per-sample (4) lookups 1479 * when using a LUT as it's impossible that texels fall off of positive 1480 * and negative edges simultaneously, it would however be necessary to 1481 * do 2 lookups for corner handling as in this case texels both fall off 1482 * of x and y axes.) 1483 */ 1484 /* 1485 * Next faces (for face 012345): 1486 * x < 0.0 : 451110 1487 * x >= 1.0 : 540001 1488 * y < 0.0 : 225422 1489 * y >= 1.0 : 334533 1490 * Hence nfx+ (and nfy+) == nfx- (nfy-) xor 1 1491 * nfx-: face > 1 ? (face == 5 ? 0 : 1) : (4 + face & 1) 1492 * nfy+: face & ~4 > 1 ? face + 2 : 3; 1493 * This could also use pshufb instead, but would need (manually coded) 1494 * ssse3 intrinsic (llvm won't do non-constant shuffles). 
1495 */ 1496 struct gallivm_state *gallivm = ivec_bld->gallivm; 1497 LLVMValueRef sel, sel_f2345, sel_f23, sel_f2, tmpsel, tmp; 1498 LLVMValueRef faceand1, sel_fand1, maxmx0, maxmx1, maxmy0, maxmy1; 1499 LLVMValueRef c2 = lp_build_const_int_vec(gallivm, ivec_bld->type, 2); 1500 LLVMValueRef c3 = lp_build_const_int_vec(gallivm, ivec_bld->type, 3); 1501 LLVMValueRef c4 = lp_build_const_int_vec(gallivm, ivec_bld->type, 4); 1502 LLVMValueRef c5 = lp_build_const_int_vec(gallivm, ivec_bld->type, 5); 1503 1504 sel = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, face, c5); 1505 tmpsel = lp_build_select(ivec_bld, sel, ivec_bld->zero, ivec_bld->one); 1506 sel_f2345 = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, face, ivec_bld->one); 1507 faceand1 = lp_build_and(ivec_bld, face, ivec_bld->one); 1508 tmp = lp_build_add(ivec_bld, faceand1, c4); 1509 next_faces[0] = lp_build_select(ivec_bld, sel_f2345, tmpsel, tmp); 1510 next_faces[1] = lp_build_xor(ivec_bld, next_faces[0], ivec_bld->one); 1511 1512 tmp = lp_build_andnot(ivec_bld, face, c4); 1513 sel_f23 = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, tmp, ivec_bld->one); 1514 tmp = lp_build_add(ivec_bld, face, c2); 1515 next_faces[3] = lp_build_select(ivec_bld, sel_f23, tmp, c3); 1516 next_faces[2] = lp_build_xor(ivec_bld, next_faces[3], ivec_bld->one); 1517 1518 /* 1519 * new xcoords (for face 012345): 1520 * x < 0.0 : max max t max-t max max 1521 * x >= 1.0 : 0 0 max-t t 0 0 1522 * y < 0.0 : max 0 max-s s s max-s 1523 * y >= 1.0 : max 0 s max-s s max-s 1524 * 1525 * ncx[1] = face & ~4 > 1 ? (face == 2 ? max-t : t) : 0 1526 * ncx[0] = max - ncx[1] 1527 * ncx[3] = face > 1 ? (face & 1 ? max-s : s) : (face & 1) ? 0 : max 1528 * ncx[2] = face & ~4 > 1 ? 
max - ncx[3] : ncx[3] 1529 */ 1530 sel_f2 = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, face, c2); 1531 maxmy0 = lp_build_sub(ivec_bld, max_coord, y0); 1532 tmp = lp_build_select(ivec_bld, sel_f2, maxmy0, y0); 1533 next_xcoords[1][0] = lp_build_select(ivec_bld, sel_f23, tmp, ivec_bld->zero); 1534 next_xcoords[0][0] = lp_build_sub(ivec_bld, max_coord, next_xcoords[1][0]); 1535 maxmy1 = lp_build_sub(ivec_bld, max_coord, y1); 1536 tmp = lp_build_select(ivec_bld, sel_f2, maxmy1, y1); 1537 next_xcoords[1][1] = lp_build_select(ivec_bld, sel_f23, tmp, ivec_bld->zero); 1538 next_xcoords[0][1] = lp_build_sub(ivec_bld, max_coord, next_xcoords[1][1]); 1539 1540 sel_fand1 = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, faceand1, ivec_bld->one); 1541 1542 tmpsel = lp_build_select(ivec_bld, sel_fand1, ivec_bld->zero, max_coord); 1543 maxmx0 = lp_build_sub(ivec_bld, max_coord, x0); 1544 tmp = lp_build_select(ivec_bld, sel_fand1, maxmx0, x0); 1545 next_xcoords[3][0] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel); 1546 tmp = lp_build_sub(ivec_bld, max_coord, next_xcoords[3][0]); 1547 next_xcoords[2][0] = lp_build_select(ivec_bld, sel_f23, tmp, next_xcoords[3][0]); 1548 maxmx1 = lp_build_sub(ivec_bld, max_coord, x1); 1549 tmp = lp_build_select(ivec_bld, sel_fand1, maxmx1, x1); 1550 next_xcoords[3][1] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel); 1551 tmp = lp_build_sub(ivec_bld, max_coord, next_xcoords[3][1]); 1552 next_xcoords[2][1] = lp_build_select(ivec_bld, sel_f23, tmp, next_xcoords[3][1]); 1553 1554 /* 1555 * new ycoords (for face 012345): 1556 * x < 0.0 : t t 0 max t t 1557 * x >= 1.0 : t t 0 max t t 1558 * y < 0.0 : max-s s 0 max max 0 1559 * y >= 1.0 : s max-s 0 max 0 max 1560 * 1561 * ncy[0] = face & ~4 > 1 ? (face == 2 ? 0 : max) : t 1562 * ncy[1] = ncy[0] 1563 * ncy[3] = face > 1 ? (face & 1 ? max : 0) : (face & 1) ? max-s : max 1564 * ncx[2] = face & ~4 > 1 ? 
max - ncx[3] : ncx[3] 1565 */ 1566 tmp = lp_build_select(ivec_bld, sel_f2, ivec_bld->zero, max_coord); 1567 next_ycoords[0][0] = lp_build_select(ivec_bld, sel_f23, tmp, y0); 1568 next_ycoords[1][0] = next_ycoords[0][0]; 1569 next_ycoords[0][1] = lp_build_select(ivec_bld, sel_f23, tmp, y1); 1570 next_ycoords[1][1] = next_ycoords[0][1]; 1571 1572 tmpsel = lp_build_select(ivec_bld, sel_fand1, maxmx0, x0); 1573 tmp = lp_build_select(ivec_bld, sel_fand1, max_coord, ivec_bld->zero); 1574 next_ycoords[3][0] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel); 1575 tmp = lp_build_sub(ivec_bld, max_coord, next_ycoords[3][0]); 1576 next_ycoords[2][0] = lp_build_select(ivec_bld, sel_f23, next_ycoords[3][0], tmp); 1577 tmpsel = lp_build_select(ivec_bld, sel_fand1, maxmx1, x1); 1578 tmp = lp_build_select(ivec_bld, sel_fand1, max_coord, ivec_bld->zero); 1579 next_ycoords[3][1] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel); 1580 tmp = lp_build_sub(ivec_bld, max_coord, next_ycoords[3][1]); 1581 next_ycoords[2][1] = lp_build_select(ivec_bld, sel_f23, next_ycoords[3][1], tmp); 1582 } 1583 1584 1585 /** Helper used by lp_build_cube_lookup() */ 1586 static LLVMValueRef 1587 lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord) 1588 { 1589 /* ima = +0.5 / abs(coord); */ 1590 LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5); 1591 LLVMValueRef absCoord = lp_build_abs(coord_bld, coord); 1592 LLVMValueRef ima = lp_build_div(coord_bld, posHalf, absCoord); 1593 return ima; 1594 } 1595 1596 1597 /** Helper for doing 3-wise selection. 1598 * Returns sel1 ? val2 : (sel0 ? val0 : val1). 
1599 */ 1600 static LLVMValueRef 1601 lp_build_select3(struct lp_build_context *sel_bld, 1602 LLVMValueRef sel0, 1603 LLVMValueRef sel1, 1604 LLVMValueRef val0, 1605 LLVMValueRef val1, 1606 LLVMValueRef val2) 1607 { 1608 LLVMValueRef tmp; 1609 tmp = lp_build_select(sel_bld, sel0, val0, val1); 1610 return lp_build_select(sel_bld, sel1, val2, tmp); 1611 } 1612 1613 1614 /** 1615 * Generate code to do cube face selection and compute per-face texcoords. 1616 */ 1617 void 1618 lp_build_cube_lookup(struct lp_build_sample_context *bld, 1619 LLVMValueRef *coords, 1620 const struct lp_derivatives *derivs_in, /* optional */ 1621 LLVMValueRef *rho, 1622 struct lp_derivatives *derivs_out, /* optional */ 1623 boolean need_derivs) 1624 { 1625 struct lp_build_context *coord_bld = &bld->coord_bld; 1626 LLVMBuilderRef builder = bld->gallivm->builder; 1627 struct gallivm_state *gallivm = bld->gallivm; 1628 LLVMValueRef si, ti, ri; 1629 1630 /* 1631 * Do per-pixel face selection. We cannot however (as we used to do) 1632 * simply calculate the derivs afterwards (which is very bogus for 1633 * explicit derivs btw) because the values would be "random" when 1634 * not all pixels lie on the same face. So what we do here is just 1635 * calculate the derivatives after scaling the coords by the absolute 1636 * value of the inverse major axis, and essentially do rho calculation 1637 * steps as if it were a 3d texture. This is perfect if all pixels hit 1638 * the same face, but not so great at edges, I believe the max error 1639 * should be sqrt(2) with no_rho_approx or 2 otherwise (essentially measuring 1640 * the 3d distance between 2 points on the cube instead of measuring up/down 1641 * the edge). Still this is possibly a win over just selecting the same face 1642 * for all pixels. Unfortunately, something like that doesn't work for 1643 * explicit derivatives. 
1644 */ 1645 struct lp_build_context *cint_bld = &bld->int_coord_bld; 1646 struct lp_type intctype = cint_bld->type; 1647 LLVMTypeRef coord_vec_type = coord_bld->vec_type; 1648 LLVMTypeRef cint_vec_type = cint_bld->vec_type; 1649 LLVMValueRef as, at, ar, face, face_s, face_t; 1650 LLVMValueRef as_ge_at, maxasat, ar_ge_as_at; 1651 LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz; 1652 LLVMValueRef tnegi, rnegi; 1653 LLVMValueRef ma, mai, signma, signmabit, imahalfpos; 1654 LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5); 1655 LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype, 1656 1LL << (intctype.width - 1)); 1657 LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype, 1658 intctype.width -1); 1659 LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X); 1660 LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y); 1661 LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z); 1662 LLVMValueRef s = coords[0]; 1663 LLVMValueRef t = coords[1]; 1664 LLVMValueRef r = coords[2]; 1665 1666 assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1); 1667 assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1); 1668 assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1); 1669 1670 /* 1671 * get absolute value (for x/y/z face selection) and sign bit 1672 * (for mirroring minor coords and pos/neg face selection) 1673 * of the original coords. 
1674 */ 1675 as = lp_build_abs(&bld->coord_bld, s); 1676 at = lp_build_abs(&bld->coord_bld, t); 1677 ar = lp_build_abs(&bld->coord_bld, r); 1678 1679 /* 1680 * major face determination: select x if x > y else select y 1681 * select z if z >= max(x,y) else select previous result 1682 * if some axis are the same we chose z over y, y over x - the 1683 * dx10 spec seems to ask for it while OpenGL doesn't care (if we 1684 * wouldn't care could save a select or two if using different 1685 * compares and doing at_g_as_ar last since tnewx and tnewz are the 1686 * same). 1687 */ 1688 as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, as, at); 1689 maxasat = lp_build_max(coord_bld, as, at); 1690 ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat); 1691 1692 if (need_derivs && (derivs_in || (bld->no_quad_lod && bld->no_rho_approx))) { 1693 /* 1694 * XXX: This is really really complex. 1695 * It is a bit overkill to use this for implicit derivatives as well, 1696 * no way this is worth the cost in practice, but seems to be the 1697 * only way for getting accurate and per-pixel lod values. 
1698 */ 1699 LLVMValueRef ima, imahalf, tmp, ddx[3], ddy[3]; 1700 LLVMValueRef madx, mady, madxdivma, madydivma; 1701 LLVMValueRef sdxi, tdxi, rdxi, sdyi, tdyi, rdyi; 1702 LLVMValueRef tdxnegi, rdxnegi, tdynegi, rdynegi; 1703 LLVMValueRef sdxnewx, sdxnewy, sdxnewz, tdxnewx, tdxnewy, tdxnewz; 1704 LLVMValueRef sdynewx, sdynewy, sdynewz, tdynewx, tdynewy, tdynewz; 1705 LLVMValueRef face_sdx, face_tdx, face_sdy, face_tdy; 1706 /* 1707 * s = 1/2 * ( sc / ma + 1) 1708 * t = 1/2 * ( tc / ma + 1) 1709 * 1710 * s' = 1/2 * (sc' * ma - sc * ma') / ma^2 1711 * t' = 1/2 * (tc' * ma - tc * ma') / ma^2 1712 * 1713 * dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma 1714 * dx.t = 0.5 * (dx.tc - tc * dx.ma / ma) / ma 1715 * dy.s = 0.5 * (dy.sc - sc * dy.ma / ma) / ma 1716 * dy.t = 0.5 * (dy.tc - tc * dy.ma / ma) / ma 1717 */ 1718 1719 /* select ma, calculate ima */ 1720 ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r); 1721 mai = LLVMBuildBitCast(builder, ma, cint_vec_type, ""); 1722 signmabit = LLVMBuildAnd(builder, mai, signmask, ""); 1723 ima = lp_build_div(coord_bld, coord_bld->one, ma); 1724 imahalf = lp_build_mul(coord_bld, posHalf, ima); 1725 imahalfpos = lp_build_abs(coord_bld, imahalf); 1726 1727 if (!derivs_in) { 1728 ddx[0] = lp_build_ddx(coord_bld, s); 1729 ddx[1] = lp_build_ddx(coord_bld, t); 1730 ddx[2] = lp_build_ddx(coord_bld, r); 1731 ddy[0] = lp_build_ddy(coord_bld, s); 1732 ddy[1] = lp_build_ddy(coord_bld, t); 1733 ddy[2] = lp_build_ddy(coord_bld, r); 1734 } 1735 else { 1736 ddx[0] = derivs_in->ddx[0]; 1737 ddx[1] = derivs_in->ddx[1]; 1738 ddx[2] = derivs_in->ddx[2]; 1739 ddy[0] = derivs_in->ddy[0]; 1740 ddy[1] = derivs_in->ddy[1]; 1741 ddy[2] = derivs_in->ddy[2]; 1742 } 1743 1744 /* select major derivatives */ 1745 madx = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, ddx[0], ddx[1], ddx[2]); 1746 mady = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, ddy[0], ddy[1], ddy[2]); 1747 1748 si = LLVMBuildBitCast(builder, s, cint_vec_type, ""); 1749 
ti = LLVMBuildBitCast(builder, t, cint_vec_type, ""); 1750 ri = LLVMBuildBitCast(builder, r, cint_vec_type, ""); 1751 1752 sdxi = LLVMBuildBitCast(builder, ddx[0], cint_vec_type, ""); 1753 tdxi = LLVMBuildBitCast(builder, ddx[1], cint_vec_type, ""); 1754 rdxi = LLVMBuildBitCast(builder, ddx[2], cint_vec_type, ""); 1755 1756 sdyi = LLVMBuildBitCast(builder, ddy[0], cint_vec_type, ""); 1757 tdyi = LLVMBuildBitCast(builder, ddy[1], cint_vec_type, ""); 1758 rdyi = LLVMBuildBitCast(builder, ddy[2], cint_vec_type, ""); 1759 1760 /* 1761 * compute all possible new s/t coords, which does the mirroring, 1762 * and do the same for derivs minor axes. 1763 * snewx = signma * -r; 1764 * tnewx = -t; 1765 * snewy = s; 1766 * tnewy = signma * r; 1767 * snewz = signma * s; 1768 * tnewz = -t; 1769 */ 1770 tnegi = LLVMBuildXor(builder, ti, signmask, ""); 1771 rnegi = LLVMBuildXor(builder, ri, signmask, ""); 1772 tdxnegi = LLVMBuildXor(builder, tdxi, signmask, ""); 1773 rdxnegi = LLVMBuildXor(builder, rdxi, signmask, ""); 1774 tdynegi = LLVMBuildXor(builder, tdyi, signmask, ""); 1775 rdynegi = LLVMBuildXor(builder, rdyi, signmask, ""); 1776 1777 snewx = LLVMBuildXor(builder, signmabit, rnegi, ""); 1778 tnewx = tnegi; 1779 sdxnewx = LLVMBuildXor(builder, signmabit, rdxnegi, ""); 1780 tdxnewx = tdxnegi; 1781 sdynewx = LLVMBuildXor(builder, signmabit, rdynegi, ""); 1782 tdynewx = tdynegi; 1783 1784 snewy = si; 1785 tnewy = LLVMBuildXor(builder, signmabit, ri, ""); 1786 sdxnewy = sdxi; 1787 tdxnewy = LLVMBuildXor(builder, signmabit, rdxi, ""); 1788 sdynewy = sdyi; 1789 tdynewy = LLVMBuildXor(builder, signmabit, rdyi, ""); 1790 1791 snewz = LLVMBuildXor(builder, signmabit, si, ""); 1792 tnewz = tnegi; 1793 sdxnewz = LLVMBuildXor(builder, signmabit, sdxi, ""); 1794 tdxnewz = tdxnegi; 1795 sdynewz = LLVMBuildXor(builder, signmabit, sdyi, ""); 1796 tdynewz = tdynegi; 1797 1798 /* select the mirrored values */ 1799 face = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, facex, facey, facez); 
1800 face_s = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, snewx, snewy, snewz); 1801 face_t = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tnewx, tnewy, tnewz); 1802 face_sdx = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, sdxnewx, sdxnewy, sdxnewz); 1803 face_tdx = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tdxnewx, tdxnewy, tdxnewz); 1804 face_sdy = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, sdynewx, sdynewy, sdynewz); 1805 face_tdy = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tdynewx, tdynewy, tdynewz); 1806 1807 face_s = LLVMBuildBitCast(builder, face_s, coord_vec_type, ""); 1808 face_t = LLVMBuildBitCast(builder, face_t, coord_vec_type, ""); 1809 face_sdx = LLVMBuildBitCast(builder, face_sdx, coord_vec_type, ""); 1810 face_tdx = LLVMBuildBitCast(builder, face_tdx, coord_vec_type, ""); 1811 face_sdy = LLVMBuildBitCast(builder, face_sdy, coord_vec_type, ""); 1812 face_tdy = LLVMBuildBitCast(builder, face_tdy, coord_vec_type, ""); 1813 1814 /* deriv math, dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma */ 1815 madxdivma = lp_build_mul(coord_bld, madx, ima); 1816 tmp = lp_build_mul(coord_bld, madxdivma, face_s); 1817 tmp = lp_build_sub(coord_bld, face_sdx, tmp); 1818 derivs_out->ddx[0] = lp_build_mul(coord_bld, tmp, imahalf); 1819 1820 /* dx.t = 0.5 * (dx.tc - tc * dx.ma / ma) / ma */ 1821 tmp = lp_build_mul(coord_bld, madxdivma, face_t); 1822 tmp = lp_build_sub(coord_bld, face_tdx, tmp); 1823 derivs_out->ddx[1] = lp_build_mul(coord_bld, tmp, imahalf); 1824 1825 /* dy.s = 0.5 * (dy.sc - sc * dy.ma / ma) / ma */ 1826 madydivma = lp_build_mul(coord_bld, mady, ima); 1827 tmp = lp_build_mul(coord_bld, madydivma, face_s); 1828 tmp = lp_build_sub(coord_bld, face_sdy, tmp); 1829 derivs_out->ddy[0] = lp_build_mul(coord_bld, tmp, imahalf); 1830 1831 /* dy.t = 0.5 * (dy.tc - tc * dy.ma / ma) / ma */ 1832 tmp = lp_build_mul(coord_bld, madydivma, face_t); 1833 tmp = lp_build_sub(coord_bld, face_tdy, tmp); 1834 derivs_out->ddy[1] = 
lp_build_mul(coord_bld, tmp, imahalf); 1835 1836 signma = LLVMBuildLShr(builder, mai, signshift, ""); 1837 coords[2] = LLVMBuildOr(builder, face, signma, "face"); 1838 1839 /* project coords */ 1840 face_s = lp_build_mul(coord_bld, face_s, imahalfpos); 1841 face_t = lp_build_mul(coord_bld, face_t, imahalfpos); 1842 1843 coords[0] = lp_build_add(coord_bld, face_s, posHalf); 1844 coords[1] = lp_build_add(coord_bld, face_t, posHalf); 1845 1846 return; 1847 } 1848 1849 else if (need_derivs) { 1850 LLVMValueRef ddx_ddy[2], tmp[3], rho_vec; 1851 static const unsigned char swizzle0[] = { /* no-op swizzle */ 1852 0, LP_BLD_SWIZZLE_DONTCARE, 1853 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE 1854 }; 1855 static const unsigned char swizzle1[] = { 1856 1, LP_BLD_SWIZZLE_DONTCARE, 1857 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE 1858 }; 1859 static const unsigned char swizzle01[] = { /* no-op swizzle */ 1860 0, 1, 1861 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE 1862 }; 1863 static const unsigned char swizzle23[] = { 1864 2, 3, 1865 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE 1866 }; 1867 static const unsigned char swizzle02[] = { 1868 0, 2, 1869 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE 1870 }; 1871 1872 /* 1873 * scale the s/t/r coords pre-select/mirror so we can calculate 1874 * "reasonable" derivs. 1875 */ 1876 ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r); 1877 imahalfpos = lp_build_cube_imapos(coord_bld, ma); 1878 s = lp_build_mul(coord_bld, s, imahalfpos); 1879 t = lp_build_mul(coord_bld, t, imahalfpos); 1880 r = lp_build_mul(coord_bld, r, imahalfpos); 1881 1882 /* 1883 * This isn't quite the same as the "ordinary" (3d deriv) path since we 1884 * know the texture is square which simplifies things (we can omit the 1885 * size mul which happens very early completely here and do it at the 1886 * very end). 
1887 * Also always do calculations according to GALLIVM_DEBUG_NO_RHO_APPROX 1888 * since the error can get quite big otherwise at edges. 1889 * (With no_rho_approx max error is sqrt(2) at edges, same as it is 1890 * without no_rho_approx for 2d textures, otherwise it would be factor 2.) 1891 */ 1892 ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t); 1893 ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r); 1894 1895 ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]); 1896 ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]); 1897 1898 tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01); 1899 tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23); 1900 tmp[2] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02); 1901 1902 rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]); 1903 rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]); 1904 1905 tmp[0] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0); 1906 tmp[1] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1); 1907 *rho = lp_build_max(coord_bld, tmp[0], tmp[1]); 1908 } 1909 1910 if (!need_derivs) { 1911 ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r); 1912 } 1913 mai = LLVMBuildBitCast(builder, ma, cint_vec_type, ""); 1914 signmabit = LLVMBuildAnd(builder, mai, signmask, ""); 1915 1916 si = LLVMBuildBitCast(builder, s, cint_vec_type, ""); 1917 ti = LLVMBuildBitCast(builder, t, cint_vec_type, ""); 1918 ri = LLVMBuildBitCast(builder, r, cint_vec_type, ""); 1919 1920 /* 1921 * compute all possible new s/t coords, which does the mirroring 1922 * snewx = signma * -r; 1923 * tnewx = -t; 1924 * snewy = s; 1925 * tnewy = signma * r; 1926 * snewz = signma * s; 1927 * tnewz = -t; 1928 */ 1929 tnegi = LLVMBuildXor(builder, ti, signmask, ""); 1930 rnegi = LLVMBuildXor(builder, ri, signmask, ""); 1931 1932 snewx = LLVMBuildXor(builder, signmabit, rnegi, ""); 1933 tnewx = tnegi; 1934 1935 snewy = si; 1936 tnewy = LLVMBuildXor(builder, signmabit, 
ri, ""); 1937 1938 snewz = LLVMBuildXor(builder, signmabit, si, ""); 1939 tnewz = tnegi; 1940 1941 /* select the mirrored values */ 1942 face_s = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, snewx, snewy, snewz); 1943 face_t = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tnewx, tnewy, tnewz); 1944 face = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, facex, facey, facez); 1945 1946 face_s = LLVMBuildBitCast(builder, face_s, coord_vec_type, ""); 1947 face_t = LLVMBuildBitCast(builder, face_t, coord_vec_type, ""); 1948 1949 /* add +1 for neg face */ 1950 /* XXX with AVX probably want to use another select here - 1951 * as long as we ensure vblendvps gets used we can actually 1952 * skip the comparison and just use sign as a "mask" directly. 1953 */ 1954 signma = LLVMBuildLShr(builder, mai, signshift, ""); 1955 coords[2] = LLVMBuildOr(builder, face, signma, "face"); 1956 1957 /* project coords */ 1958 if (!need_derivs) { 1959 imahalfpos = lp_build_cube_imapos(coord_bld, ma); 1960 face_s = lp_build_mul(coord_bld, face_s, imahalfpos); 1961 face_t = lp_build_mul(coord_bld, face_t, imahalfpos); 1962 } 1963 1964 coords[0] = lp_build_add(coord_bld, face_s, posHalf); 1965 coords[1] = lp_build_add(coord_bld, face_t, posHalf); 1966 } 1967 1968 1969 /** 1970 * Compute the partial offset of a pixel block along an arbitrary axis. 
1971 * 1972 * @param coord coordinate in pixels 1973 * @param stride number of bytes between rows of successive pixel blocks 1974 * @param block_length number of pixels in a pixels block along the coordinate 1975 * axis 1976 * @param out_offset resulting relative offset of the pixel block in bytes 1977 * @param out_subcoord resulting sub-block pixel coordinate 1978 */ 1979 void 1980 lp_build_sample_partial_offset(struct lp_build_context *bld, 1981 unsigned block_length, 1982 LLVMValueRef coord, 1983 LLVMValueRef stride, 1984 LLVMValueRef *out_offset, 1985 LLVMValueRef *out_subcoord) 1986 { 1987 LLVMBuilderRef builder = bld->gallivm->builder; 1988 LLVMValueRef offset; 1989 LLVMValueRef subcoord; 1990 1991 if (block_length == 1) { 1992 subcoord = bld->zero; 1993 } 1994 else { 1995 /* 1996 * Pixel blocks have power of two dimensions. LLVM should convert the 1997 * rem/div to bit arithmetic. 1998 * TODO: Verify this. 1999 * It does indeed BUT it does transform it to scalar (and back) when doing so 2000 * (using roughly extract, shift/and, mov, unpack) (llvm 2.7). 2001 * The generated code looks seriously unfunny and is quite expensive. 
2002 */ 2003 #if 0 2004 LLVMValueRef block_width = lp_build_const_int_vec(bld->type, block_length); 2005 subcoord = LLVMBuildURem(builder, coord, block_width, ""); 2006 coord = LLVMBuildUDiv(builder, coord, block_width, ""); 2007 #else 2008 unsigned logbase2 = util_logbase2(block_length); 2009 LLVMValueRef block_shift = lp_build_const_int_vec(bld->gallivm, bld->type, logbase2); 2010 LLVMValueRef block_mask = lp_build_const_int_vec(bld->gallivm, bld->type, block_length - 1); 2011 subcoord = LLVMBuildAnd(builder, coord, block_mask, ""); 2012 coord = LLVMBuildLShr(builder, coord, block_shift, ""); 2013 #endif 2014 } 2015 2016 offset = lp_build_mul(bld, coord, stride); 2017 2018 assert(out_offset); 2019 assert(out_subcoord); 2020 2021 *out_offset = offset; 2022 *out_subcoord = subcoord; 2023 } 2024 2025 2026 /** 2027 * Compute the offset of a pixel block. 2028 * 2029 * x, y, z, y_stride, z_stride are vectors, and they refer to pixels. 2030 * 2031 * Returns the relative offset and i,j sub-block coordinates 2032 */ 2033 void 2034 lp_build_sample_offset(struct lp_build_context *bld, 2035 const struct util_format_description *format_desc, 2036 LLVMValueRef x, 2037 LLVMValueRef y, 2038 LLVMValueRef z, 2039 LLVMValueRef y_stride, 2040 LLVMValueRef z_stride, 2041 LLVMValueRef *out_offset, 2042 LLVMValueRef *out_i, 2043 LLVMValueRef *out_j) 2044 { 2045 LLVMValueRef x_stride; 2046 LLVMValueRef offset; 2047 2048 x_stride = lp_build_const_vec(bld->gallivm, bld->type, 2049 format_desc->block.bits/8); 2050 2051 lp_build_sample_partial_offset(bld, 2052 format_desc->block.width, 2053 x, x_stride, 2054 &offset, out_i); 2055 2056 if (y && y_stride) { 2057 LLVMValueRef y_offset; 2058 lp_build_sample_partial_offset(bld, 2059 format_desc->block.height, 2060 y, y_stride, 2061 &y_offset, out_j); 2062 offset = lp_build_add(bld, offset, y_offset); 2063 } 2064 else { 2065 *out_j = bld->zero; 2066 } 2067 2068 if (z && z_stride) { 2069 LLVMValueRef z_offset; 2070 LLVMValueRef k; 2071 
lp_build_sample_partial_offset(bld, 2072 1, /* pixel blocks are always 2D */ 2073 z, z_stride, 2074 &z_offset, &k); 2075 offset = lp_build_add(bld, offset, z_offset); 2076 } 2077 2078 *out_offset = offset; 2079 } 2080