/*
 * Mesa 3-D graphics library
 *
 * Copyright 2012 Intel Corporation
 * Copyright 2013 Google
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Chad Versace <chad.versace@linux.intel.com>
 *    Frank Henigman <fjhenigman@google.com>
 */

#include <string.h>

#include "util/macros.h"

#include "brw_context.h"
#include "intel_tiled_memcpy.h"

#if defined(__SSSE3__)
#include <tmmintrin.h>
#elif defined(__SSE2__)
#include <emmintrin.h>
#endif

#define FILE_DEBUG_FLAG DEBUG_TEXTURE

#define ALIGN_DOWN(a, b) ROUND_DOWN_TO(a, b)
#define ALIGN_UP(a, b) ALIGN(a, b)

/* Tile dimensions.  Width and span are in bytes, height is in pixels (i.e.
 * unitless).  A "span" is the maximum number of bytes we can copy from
 * linear to tiled without needing to calculate a new destination address.
 */
static const uint32_t xtile_width = 512;
static const uint32_t xtile_height = 8;
static const uint32_t xtile_span = 64;
static const uint32_t ytile_width = 128;
static const uint32_t ytile_height = 32;
static const uint32_t ytile_span = 16;

static inline uint32_t
ror(uint32_t n, uint32_t d)
{
   return (n >> d) | (n << (32 - d));
}

static inline uint32_t
bswap32(uint32_t n)
{
#if defined(HAVE___BUILTIN_BSWAP32)
   return __builtin_bswap32(n);
#else
   return (n >> 24) |
          ((n >> 8) & 0x0000ff00) |
          ((n << 8) & 0x00ff0000) |
          (n << 24);
#endif
}
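
/* A worked example of the scalar swap used below: little-endian RGBA
 * bytes R,G,B,A load as the uint32_t 0xAABBGGRR; bswap32() turns that
 * into 0xRRGGBBAA and ror(..., 8) into 0xAARRGGBB, which stores back as
 * the bytes B,G,R,A, i.e. BGRA.
 */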
/**
 * Copy RGBA to BGRA - swap R and B.
 */
static inline void *
rgba8_copy(void *dst, const void *src, size_t bytes)
{
   uint32_t *d = dst;
   uint32_t const *s = src;

   assert(bytes % 4 == 0);

   while (bytes >= 4) {
      *d = ror(bswap32(*s), 8);
      d += 1;
      s += 1;
      bytes -= 4;
   }
   return dst;
}

#ifdef __SSSE3__
static const uint8_t rgba8_permutation[16] =
   { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };

static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   _mm_store_si128(dst,
                   _mm_shuffle_epi8(_mm_loadu_si128(src),
                                    *(__m128i *)rgba8_permutation));
}

static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   _mm_storeu_si128(dst,
                    _mm_shuffle_epi8(_mm_load_si128(src),
                                     *(__m128i *)rgba8_permutation));
}

#elif defined(__SSE2__)
static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   __m128i srcreg, dstreg, agmask, ag, rb, br;

   agmask = _mm_set1_epi32(0xFF00FF00);
   srcreg = _mm_loadu_si128((__m128i *)src);

   rb = _mm_andnot_si128(agmask, srcreg);
   ag = _mm_and_si128(agmask, srcreg);
   br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                            _MM_SHUFFLE(2, 3, 0, 1));
   dstreg = _mm_or_si128(ag, br);

   _mm_store_si128((__m128i *)dst, dstreg);
}

static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   __m128i srcreg, dstreg, agmask, ag, rb, br;

   agmask = _mm_set1_epi32(0xFF00FF00);
   srcreg = _mm_load_si128((__m128i *)src);

   rb = _mm_andnot_si128(agmask, srcreg);
   ag = _mm_and_si128(agmask, srcreg);
   br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                            _MM_SHUFFLE(2, 3, 0, 1));
   dstreg = _mm_or_si128(ag, br);

   _mm_storeu_si128((__m128i *)dst, dstreg);
}
#endif

/**
 * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
 */
static inline void *
rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
{
   assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_dst(dst +  0, src +  0);
      rgba8_copy_16_aligned_dst(dst + 16, src + 16);
      rgba8_copy_16_aligned_dst(dst + 32, src + 32);
      rgba8_copy_16_aligned_dst(dst + 48, src + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_dst(dst, src);
      src += 16;
      dst += 16;
      bytes -= 16;
   }
#endif

   rgba8_copy(dst, src, bytes);

   return dst;
}

/**
 * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
 */
static inline void *
rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
{
   assert(bytes == 0 || !(((uintptr_t)src) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_src(dst +  0, src +  0);
      rgba8_copy_16_aligned_src(dst + 16, src + 16);
      rgba8_copy_16_aligned_src(dst + 32, src + 32);
      rgba8_copy_16_aligned_src(dst + 48, src + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_src(dst, src);
      src += 16;
      dst += 16;
      bytes -= 16;
   }
#endif

   rgba8_copy(dst, src, bytes);

   return dst;
}
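
/* For illustration: within one X tile (512 bytes wide, 8 rows), the byte
 * at tile-local (x, y) sits at offset y * 512 + x before swizzling, so
 * each row of the tile is contiguous and every 64-byte span can be handed
 * to a single mem_copy call.
 */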
/**
 * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
 * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
 * The first and last ranges must be shorter than a "span" (the longest linear
 * stretch within a tile) and the middle must equal a whole number of spans.
 * Ranges may be empty.  The region copied must land entirely within one tile.
 * 'dst' is the start of the tile and 'src' is the corresponding
 * address to copy from, though copying begins at (x0, y0).
 * To enable swizzling 'swizzle_bit' must be 1<<6, otherwise zero.
 * Swizzling flips bit 6 in the copy destination offset when certain other
 * bits are set in it.
 */
typedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                             uint32_t y0, uint32_t y1,
                             char *dst, const char *src,
                             int32_t linear_pitch,
                             uint32_t swizzle_bit,
                             mem_copy_fn mem_copy);

/**
 * Copy texture data from linear to X tile layout.
 *
 * \copydoc tile_copy_fn
 *
 * The mem_copy parameters allow the user to specify an alternative mem_copy
 * function that, for instance, may do RGBA -> BGRA swizzling.  The first
 * function must handle any memory alignment while the second function must
 * only handle 16-byte alignment in whichever side (source or destination) is
 * tiled.
 */
static inline void
linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t src_pitch,
                 uint32_t swizzle_bit,
                 mem_copy_fn mem_copy,
                 mem_copy_fn mem_copy_align16)
{
   /* The copy destination offset for each range copied is the sum of
    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
    */
   uint32_t xo, yo;

   src += (ptrdiff_t)y0 * src_pitch;

   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
      /* Bits 9 and 10 of the copy destination offset control swizzling.
       * Only 'yo' contributes to those bits in the total offset,
       * so calculate 'swizzle' just once per row.
       * Move bits 9 and 10 three and four places respectively down
       * to bit 6 and xor them.
       */
      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;

      mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);

      for (xo = x1; xo < x2; xo += xtile_span) {
         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
      }

      mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

      src += src_pitch;
   }
}
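
/* Worked example of the Y-tile addressing used below (values follow from
 * the constants above): with column_width = 16 and bytes_per_column =
 * 16 * 32 = 512, tile-local (x = 40, y = 3) maps to
 * (40 % 16) + (40 / 16) * 512 + 3 * 16 = 8 + 1024 + 48 = 1080
 * before swizzling.
 */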
/**
 * Copy texture data from linear to Y tile layout.
 *
 * \copydoc tile_copy_fn
 */
static inline void
linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t src_pitch,
                 uint32_t swizzle_bit,
                 mem_copy_fn mem_copy,
                 mem_copy_fn mem_copy_align16)
{
   /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
    * as the tile).  Thus the destination offset for (x,y) is the sum of:
    *   (x % column_width)                    // position within column
    *   (x / column_width) * bytes_per_column // column number * bytes per column
    *   y * column_width
    *
    * The copy destination offset for each range copied is the sum of
    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
    */
   const uint32_t column_width = ytile_span;
   const uint32_t bytes_per_column = column_width * ytile_height;

   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;

   /* Bit 9 of the destination offset controls swizzling.
    * Only the X offset contributes to bit 9 of the total offset,
    * so swizzle can be calculated in advance for these X positions.
    * Move bit 9 three places down to bit 6.
    */
   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;

   uint32_t x, yo;

   src += (ptrdiff_t)y0 * src_pitch;

   for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
      uint32_t xo = xo1;
      uint32_t swizzle = swizzle1;

      mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);

      /* Step by spans/columns.  As it happens, the swizzle bit flips
       * at each step so we don't need to calculate it explicitly.
       */
      for (x = x1; x < x2; x += ytile_span) {
         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
         xo += bytes_per_column;
         swizzle ^= swizzle_bit;
      }

      mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

      src += src_pitch;
   }
}

/**
 * Copy texture data from X tile layout to linear.
 *
 * \copydoc tile_copy_fn
 */
static inline void
xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t dst_pitch,
                 uint32_t swizzle_bit,
                 mem_copy_fn mem_copy,
                 mem_copy_fn mem_copy_align16)
{
   /* The tiled source offset for each range copied is the sum of
    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
    */
   uint32_t xo, yo;

   dst += (ptrdiff_t)y0 * dst_pitch;

   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
      /* Bits 9 and 10 of the copy source offset control swizzling.
       * Only 'yo' contributes to those bits in the total offset,
       * so calculate 'swizzle' just once per row.
       * Move bits 9 and 10 three and four places respectively down
       * to bit 6 and xor them.
       */
      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;

      mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);

      for (xo = x1; xo < x2; xo += xtile_span) {
         mem_copy_align16(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
      }

      mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

      dst += dst_pitch;
   }
}
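
/* Worked swizzle example: with swizzle_bit = 1 << 6, an X-tile row with
 * yo = 0x200 (bit 9 set, bit 10 clear) yields
 * ((0x200 >> 3) ^ (0x200 >> 4)) & 0x40 = (0x40 ^ 0x20) & 0x40 = 0x40,
 * so bit 6 of every copy address in that row is flipped.
 */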
/**
 * Copy texture data from Y tile layout to linear.
 *
 * \copydoc tile_copy_fn
 */
static inline void
ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t dst_pitch,
                 uint32_t swizzle_bit,
                 mem_copy_fn mem_copy,
                 mem_copy_fn mem_copy_align16)
{
   /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
    * as the tile).  Thus the source offset for (x,y) is the sum of:
    *   (x % column_width)                    // position within column
    *   (x / column_width) * bytes_per_column // column number * bytes per column
    *   y * column_width
    *
    * The tiled source offset for each range copied is the sum of
    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
    */
   const uint32_t column_width = ytile_span;
   const uint32_t bytes_per_column = column_width * ytile_height;

   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;

   /* Bit 9 of the source offset controls swizzling.
    * Only the X offset contributes to bit 9 of the total offset,
    * so swizzle can be calculated in advance for these X positions.
    * Move bit 9 three places down to bit 6.
    */
   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;

   uint32_t x, yo;

   dst += (ptrdiff_t)y0 * dst_pitch;

   for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
      uint32_t xo = xo1;
      uint32_t swizzle = swizzle1;

      mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);

      /* Step by spans/columns.  As it happens, the swizzle bit flips
       * at each step so we don't need to calculate it explicitly.
       */
      for (x = x1; x < x2; x += ytile_span) {
         mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
         xo += bytes_per_column;
         swizzle ^= swizzle_bit;
      }

      mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

      dst += dst_pitch;
   }
}


/**
 * Copy texture data from linear to X tile layout, faster.
 *
 * Same as \ref linear_to_xtiled but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t src_pitch,
                        uint32_t swizzle_bit,
                        mem_copy_fn mem_copy)
{
   if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
      if (mem_copy == memcpy)
         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit,
                                 memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   }
   linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                    dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
}
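
/* The _faster variants below follow the same pattern as
 * linear_to_xtiled_faster() above: for a whole-tile copy they call the
 * generic copier with compile-time-constant bounds, e.g.
 * linear_to_xtiled(0, 0, 512, 512, 0, 8, ...), so that FLATTEN plus
 * constant propagation lets the compiler specialize and unroll the
 * copy loops.
 */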
/**
 * Copy texture data from linear to Y tile layout, faster.
 *
 * Same as \ref linear_to_ytiled but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t src_pitch,
                        uint32_t swizzle_bit,
                        mem_copy_fn mem_copy)
{
   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
      if (mem_copy == memcpy)
         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   }
   linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                    dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
}

/**
 * Copy texture data from X tile layout to linear, faster.
 *
 * Same as \ref xtiled_to_linear but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t dst_pitch,
                        uint32_t swizzle_bit,
                        mem_copy_fn mem_copy)
{
   if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
      if (mem_copy == memcpy)
         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
      else
         unreachable("not reached");
   }
   xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                    dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
}

/**
 * Copy texture data from Y tile layout to linear, faster.
 *
 * Same as \ref ytiled_to_linear but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t dst_pitch,
                        uint32_t swizzle_bit,
                        mem_copy_fn mem_copy)
{
   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
      if (mem_copy == memcpy)
         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
      else
         unreachable("not reached");
   }
   ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                    dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
}
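
/* A hypothetical sketch of how the entry points below are typically used
 * (the names 'bo_map', 'pixels', 'pitch', 'surface_pitch' and the pixel
 * rectangle are made up for illustration): to upload a w x h region of
 * 4-byte pixels at pixel (px, py) of an X-tiled surface, a caller would
 * do roughly
 *
 *    linear_to_tiled(px * 4, (px + w) * 4, py, py + h,
 *                    bo_map, pixels - (py * pitch + px * 4),
 *                    surface_pitch, pitch,
 *                    brw->has_swizzling, I915_TILING_X, memcpy);
 *
 * since 'dst' must point at the start of the tiled surface and 'src' at
 * the linear address that corresponds to that same origin.
 */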
/**
 * Copy from linear to tiled texture.
 *
 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
 * pieces that do not cross tile boundaries and copy each piece with a tile
 * copy function (\ref tile_copy_fn).
 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
 * The Y range is in pixels (i.e. unitless).
 * 'dst' is the start of the texture and 'src' is the corresponding
 * address to copy from, though copying begins at (xt1, yt1).
 */
void
linear_to_tiled(uint32_t xt1, uint32_t xt2,
                uint32_t yt1, uint32_t yt2,
                char *dst, const char *src,
                uint32_t dst_pitch, int32_t src_pitch,
                bool has_swizzling,
                uint32_t tiling,
                mem_copy_fn mem_copy)
{
   tile_copy_fn tile_copy;
   uint32_t xt0, xt3;
   uint32_t yt0, yt3;
   uint32_t xt, yt;
   uint32_t tw, th, span;
   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;

   if (tiling == I915_TILING_X) {
      tw = xtile_width;
      th = xtile_height;
      span = xtile_span;
      tile_copy = linear_to_xtiled_faster;
   } else if (tiling == I915_TILING_Y) {
      tw = ytile_width;
      th = ytile_height;
      span = ytile_span;
      tile_copy = linear_to_ytiled_faster;
   } else {
      unreachable("unsupported tiling");
   }

   /* Round out to tile boundaries. */
   xt0 = ALIGN_DOWN(xt1, tw);
   xt3 = ALIGN_UP  (xt2, tw);
   yt0 = ALIGN_DOWN(yt1, th);
   yt3 = ALIGN_UP  (yt2, th);

   /* Loop over all tiles to which we have something to copy.
    * 'xt' and 'yt' are the origin of the destination tile, whether copying
    * a full or partial tile.
    * tile_copy() copies one tile or partial tile.
    * Looping x inside y is the faster memory access pattern.
    */
   for (yt = yt0; yt < yt3; yt += th) {
      for (xt = xt0; xt < xt3; xt += tw) {
         /* The area to update is [x0,x3) x [y0,y1).
          * May not want the whole tile, hence the min and max.
          */
         uint32_t x0 = MAX2(xt1, xt);
         uint32_t y0 = MAX2(yt1, yt);
         uint32_t x3 = MIN2(xt2, xt + tw);
         uint32_t y1 = MIN2(yt2, yt + th);

         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
          * the middle interval is the longest span-aligned part.
          * The sub-ranges could be empty.
          */
         uint32_t x1, x2;
         x1 = ALIGN_UP(x0, span);
         if (x1 > x3)
            x1 = x2 = x3;
         else
            x2 = ALIGN_DOWN(x3, span);

         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
         assert(x1 - x0 < span && x3 - x2 < span);
         assert(x3 - x0 <= tw);
         assert((x2 - x1) % span == 0);

         /* Translate by (xt,yt) for single-tile copier. */
         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
                   y0-yt, y1-yt,
                   dst + (ptrdiff_t)xt * th + (ptrdiff_t)yt * dst_pitch,
                   src + (ptrdiff_t)xt      + (ptrdiff_t)yt * src_pitch,
                   src_pitch,
                   swizzle_bit,
                   mem_copy);
      }
   }
}
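
/* Worked example of the span split above: with span = 16, x0 = 7 and
 * x3 = 100 give x1 = ALIGN_UP(7, 16) = 16 and x2 = ALIGN_DOWN(100, 16)
 * = 96, i.e. a 9-byte head, five whole 16-byte spans and a 4-byte tail.
 */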
/**
 * Copy from tiled to linear texture.
 *
 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
 * pieces that do not cross tile boundaries and copy each piece with a tile
 * copy function (\ref tile_copy_fn).
 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
 * The Y range is in pixels (i.e. unitless).
 * 'dst' is the start of the texture and 'src' is the corresponding
 * address to copy from, though copying begins at (xt1, yt1).
 */
void
tiled_to_linear(uint32_t xt1, uint32_t xt2,
                uint32_t yt1, uint32_t yt2,
                char *dst, const char *src,
                int32_t dst_pitch, uint32_t src_pitch,
                bool has_swizzling,
                uint32_t tiling,
                mem_copy_fn mem_copy)
{
   tile_copy_fn tile_copy;
   uint32_t xt0, xt3;
   uint32_t yt0, yt3;
   uint32_t xt, yt;
   uint32_t tw, th, span;
   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;

   if (tiling == I915_TILING_X) {
      tw = xtile_width;
      th = xtile_height;
      span = xtile_span;
      tile_copy = xtiled_to_linear_faster;
   } else if (tiling == I915_TILING_Y) {
      tw = ytile_width;
      th = ytile_height;
      span = ytile_span;
      tile_copy = ytiled_to_linear_faster;
   } else {
      unreachable("unsupported tiling");
   }

   /* Round out to tile boundaries. */
   xt0 = ALIGN_DOWN(xt1, tw);
   xt3 = ALIGN_UP  (xt2, tw);
   yt0 = ALIGN_DOWN(yt1, th);
   yt3 = ALIGN_UP  (yt2, th);

   /* Loop over all tiles from which we have something to copy.
    * 'xt' and 'yt' are the origin of the source tile, whether copying
    * a full or partial tile.
    * tile_copy() copies one tile or partial tile.
    * Looping x inside y is the faster memory access pattern.
    */
   for (yt = yt0; yt < yt3; yt += th) {
      for (xt = xt0; xt < xt3; xt += tw) {
         /* The area to update is [x0,x3) x [y0,y1).
          * May not want the whole tile, hence the min and max.
          */
         uint32_t x0 = MAX2(xt1, xt);
         uint32_t y0 = MAX2(yt1, yt);
         uint32_t x3 = MIN2(xt2, xt + tw);
         uint32_t y1 = MIN2(yt2, yt + th);

         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
          * the middle interval is the longest span-aligned part.
          * The sub-ranges could be empty.
          */
         uint32_t x1, x2;
         x1 = ALIGN_UP(x0, span);
         if (x1 > x3)
            x1 = x2 = x3;
         else
            x2 = ALIGN_DOWN(x3, span);

         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
         assert(x1 - x0 < span && x3 - x2 < span);
         assert(x3 - x0 <= tw);
         assert((x2 - x1) % span == 0);

         /* Translate by (xt,yt) for single-tile copier. */
         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
                   y0-yt, y1-yt,
                   dst + (ptrdiff_t)xt      + (ptrdiff_t)yt * dst_pitch,
                   src + (ptrdiff_t)xt * th + (ptrdiff_t)yt * src_pitch,
                   dst_pitch,
                   swizzle_bit,
                   mem_copy);
      }
   }
}
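
/* Illustrative caller-side sketch for intel_get_memcpy() below:
 *
 *    mem_copy_fn mem_copy = NULL;
 *    uint32_t cpp;
 *    if (!intel_get_memcpy(MESA_FORMAT_B8G8R8A8_UNORM, GL_RGBA,
 *                          GL_UNSIGNED_BYTE, &mem_copy, &cpp))
 *       return false;
 *    // mem_copy is now rgba8_copy and cpp is 4
 */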
/**
 * Determine which copy function to use for the given format combination
 *
 * The only two possible copy functions which are ever returned are a
 * direct memcpy and an RGBA <-> BGRA copy function.  Since RGBA -> BGRA and
 * BGRA -> RGBA are exactly the same operation (and memcpy is obviously
 * symmetric), it doesn't matter whether the copy is from the tiled image
 * to the untiled or vice versa.  The copy function required is the same in
 * either case so this function can be used.
 *
 * \param[in]  tiledFormat The format of the tiled image
 * \param[in]  format      The GL format of the client data
 * \param[in]  type        The GL type of the client data
 * \param[out] mem_copy    Will be set to one of either the standard
 *                         library's memcpy or a different copy function
 *                         that performs an RGBA to BGRA conversion
 * \param[out] cpp         Number of bytes per pixel
 *
 * \return true if the format and type combination are valid
 */
bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
                      GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp)
{
   if (type == GL_UNSIGNED_INT_8_8_8_8_REV &&
       !(format == GL_RGBA || format == GL_BGRA))
      return false; /* Invalid type/format combination */

   /* Start from NULL so the check at the bottom is well-defined even when
    * no case below matches.
    */
   *mem_copy = NULL;

   if ((tiledFormat == MESA_FORMAT_L_UNORM8 && format == GL_LUMINANCE) ||
       (tiledFormat == MESA_FORMAT_A_UNORM8 && format == GL_ALPHA)) {
      *cpp = 1;
      *mem_copy = memcpy;
   } else if ((tiledFormat == MESA_FORMAT_B8G8R8A8_UNORM) ||
              (tiledFormat == MESA_FORMAT_B8G8R8X8_UNORM) ||
              (tiledFormat == MESA_FORMAT_B8G8R8A8_SRGB) ||
              (tiledFormat == MESA_FORMAT_B8G8R8X8_SRGB)) {
      *cpp = 4;
      if (format == GL_BGRA) {
         *mem_copy = memcpy;
      } else if (format == GL_RGBA) {
         *mem_copy = rgba8_copy;
      }
   } else if ((tiledFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
              (tiledFormat == MESA_FORMAT_R8G8B8X8_UNORM) ||
              (tiledFormat == MESA_FORMAT_R8G8B8A8_SRGB) ||
              (tiledFormat == MESA_FORMAT_R8G8B8X8_SRGB)) {
      *cpp = 4;
      if (format == GL_BGRA) {
         /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
          * use the same function.
          */
         *mem_copy = rgba8_copy;
      } else if (format == GL_RGBA) {
         *mem_copy = memcpy;
      }
   }

   if (!(*mem_copy))
      return false;

   return true;
}