1 /* 2 * Copyright 2013 Ilia Mirkin 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 */ 22 23 #include "nv50/nv84_video.h" 24 25 #include "util/u_sse.h" 26 27 struct h264_iparm1 { 28 uint8_t scaling_lists_4x4[6][16]; // 00 29 uint8_t scaling_lists_8x8[2][64]; // 60 30 uint32_t width; // e0 31 uint32_t height; // e4 32 uint64_t ref1_addrs[16]; // e8 33 uint64_t ref2_addrs[16]; // 168 34 uint32_t unk1e8; 35 uint32_t unk1ec; 36 uint32_t w1; // 1f0 37 uint32_t w2; // 1f4 38 uint32_t w3; // 1f8 39 uint32_t h1; // 1fc 40 uint32_t h2; // 200 41 uint32_t h3; // 204 42 uint32_t mb_adaptive_frame_field_flag; // 208 43 uint32_t field_pic_flag; // 20c 44 uint32_t format; // 210 45 uint32_t unk214; // 214 46 }; 47 48 struct h264_iparm2 { 49 uint32_t width; // 00 50 uint32_t height; // 04 51 uint32_t mbs; // 08 52 uint32_t w1; // 0c 53 uint32_t w2; // 10 54 uint32_t w3; // 14 55 uint32_t h1; // 18 56 uint32_t h2; // 1c 57 uint32_t h3; // 20 58 uint32_t unk24; 59 uint32_t mb_adaptive_frame_field_flag; // 28 60 uint32_t top; // 2c 61 uint32_t bottom; // 30 62 uint32_t is_reference; // 34 63 }; 64 65 void 66 nv84_decoder_vp_h264(struct nv84_decoder *dec, 67 struct pipe_h264_picture_desc *desc, 68 struct nv84_video_buffer *dest) 69 { 70 struct h264_iparm1 param1; 71 struct h264_iparm2 param2; 72 int i, width = align(dest->base.width, 16), 73 height = align(dest->base.height, 16); 74 75 struct nouveau_pushbuf *push = dec->vp_pushbuf; 76 struct nouveau_pushbuf_refn bo_refs[] = { 77 { dest->interlaced, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM }, 78 { dest->full, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM }, 79 { dec->vpring, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM }, 80 { dec->mbring, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM }, 81 { dec->vp_params, NOUVEAU_BO_RDWR | NOUVEAU_BO_GART }, 82 { dec->fence, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM }, 83 }; 84 int num_refs = ARRAY_SIZE(bo_refs); 85 bool is_ref = desc->is_reference; 86 87 STATIC_ASSERT(sizeof(struct h264_iparm1) == 0x218); 88 STATIC_ASSERT(sizeof(struct h264_iparm2) == 0x38); 89 90 memset(¶m1, 0, sizeof(param1)); 91 memset(¶m2, 0, sizeof(param2)); 92 93 memcpy(¶m1.scaling_lists_4x4, desc->pps->ScalingList4x4, 94 sizeof(param1.scaling_lists_4x4)); 95 memcpy(¶m1.scaling_lists_8x8, desc->pps->ScalingList8x8, 96 sizeof(param1.scaling_lists_8x8)); 97 98 param1.width = width; 99 param1.w1 = param1.w2 = param1.w3 = align(width, 64); 100 param1.height = param1.h2 = height; 101 param1.h1 = param1.h3 = align(height, 32); 102 param1.format = 0x3231564e; /* 'NV12' */ 103 param1.mb_adaptive_frame_field_flag = desc->pps->sps->mb_adaptive_frame_field_flag; 104 param1.field_pic_flag = desc->field_pic_flag; 105 106 param2.width = width; 107 param2.w1 = param2.w2 = param2.w3 = param1.w1; 108 if (desc->field_pic_flag) 109 param2.height = align(height, 32) / 2; 110 else 111 param2.height = height; 112 param2.h1 = param2.h2 = align(height, 32); 113 param2.h3 = height; 114 param2.mbs = width * height >> 8; 115 if (desc->field_pic_flag) { 116 param2.top = desc->bottom_field_flag ? 2 : 1; 117 param2.bottom = desc->bottom_field_flag; 118 } 119 param2.mb_adaptive_frame_field_flag = desc->pps->sps->mb_adaptive_frame_field_flag; 120 param2.is_reference = desc->is_reference; 121 122 PUSH_SPACE(push, 5 + 16 + 3 + 2 + 6 + (is_ref ? 2 : 0) + 3 + 2 + 4 + 2); 123 124 struct nouveau_bo *ref2_default = dest->full; 125 126 for (i = 0; i < 16; i++) { 127 struct nv84_video_buffer *buf = (struct nv84_video_buffer *)desc->ref[i]; 128 struct nouveau_bo *bo1, *bo2; 129 if (buf) { 130 bo1 = buf->interlaced; 131 bo2 = buf->full; 132 if (i == 0) 133 ref2_default = buf->full; 134 } else { 135 bo1 = dest->interlaced; 136 bo2 = ref2_default; 137 } 138 param1.ref1_addrs[i] = bo1->offset; 139 param1.ref2_addrs[i] = bo2->offset; 140 struct nouveau_pushbuf_refn bo_refs[] = { 141 { bo1, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM }, 142 { bo2, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM }, 143 }; 144 nouveau_pushbuf_refn(push, bo_refs, ARRAY_SIZE(bo_refs)); 145 } 146 147 memcpy(dec->vp_params->map, ¶m1, sizeof(param1)); 148 memcpy(dec->vp_params->map + 0x400, ¶m2, sizeof(param2)); 149 150 nouveau_pushbuf_refn(push, bo_refs, num_refs); 151 152 /* Wait for BSP to have completed */ 153 BEGIN_NV04(push, SUBC_VP(0x10), 4); 154 PUSH_DATAh(push, dec->fence->offset); 155 PUSH_DATA (push, dec->fence->offset); 156 PUSH_DATA (push, 2); 157 PUSH_DATA (push, 1); /* wait for sem == 2 */ 158 159 /* VP step 1 */ 160 BEGIN_NV04(push, SUBC_VP(0x400), 15); 161 PUSH_DATA (push, 1); 162 PUSH_DATA (push, param2.mbs); 163 PUSH_DATA (push, 0x3987654); /* each nibble probably a dma index */ 164 PUSH_DATA (push, 0x55001); /* constant */ 165 PUSH_DATA (push, dec->vp_params->offset >> 8); 166 PUSH_DATA (push, (dec->vpring->offset + dec->vpring_residual) >> 8); 167 PUSH_DATA (push, dec->vpring_ctrl); 168 PUSH_DATA (push, dec->vpring->offset >> 8); 169 PUSH_DATA (push, dec->bitstream->size / 2 - 0x700); 170 PUSH_DATA (push, (dec->mbring->offset + dec->mbring->size - 0x2000) >> 8); 171 PUSH_DATA (push, (dec->vpring->offset + dec->vpring_ctrl + 172 dec->vpring_residual + dec->vpring_deblock) >> 8); 173 PUSH_DATA (push, 0); 174 PUSH_DATA (push, 0x100008); 175 PUSH_DATA (push, dest->interlaced->offset >> 8); 176 PUSH_DATA (push, 0); 177 178 BEGIN_NV04(push, SUBC_VP(0x620), 2); 179 PUSH_DATA (push, 0); 180 PUSH_DATA (push, 0); 181 182 BEGIN_NV04(push, SUBC_VP(0x300), 1); 183 PUSH_DATA (push, 0); 184 185 /* VP step 2 */ 186 BEGIN_NV04(push, SUBC_VP(0x400), 5); 187 PUSH_DATA (push, 0x54530201); 188 PUSH_DATA (push, (dec->vp_params->offset >> 8) + 0x4); 189 PUSH_DATA (push, (dec->vpring->offset + dec->vpring_ctrl + 190 dec->vpring_residual) >> 8); 191 PUSH_DATA (push, dest->interlaced->offset >> 8); 192 PUSH_DATA (push, dest->interlaced->offset >> 8); 193 194 if (is_ref) { 195 BEGIN_NV04(push, SUBC_VP(0x414), 1); 196 PUSH_DATA (push, dest->full->offset >> 8); 197 } 198 199 BEGIN_NV04(push, SUBC_VP(0x620), 2); 200 PUSH_DATAh(push, dec->vp_fw2_offset); 201 PUSH_DATA (push, dec->vp_fw2_offset); 202 203 BEGIN_NV04(push, SUBC_VP(0x300), 1); 204 PUSH_DATA (push, 0); 205 206 /* Set the semaphore back to 1 */ 207 BEGIN_NV04(push, SUBC_VP(0x610), 3); 208 PUSH_DATAh(push, dec->fence->offset); 209 PUSH_DATA (push, dec->fence->offset); 210 PUSH_DATA (push, 1); 211 212 /* Write to the semaphore location, intr */ 213 BEGIN_NV04(push, SUBC_VP(0x304), 1); 214 PUSH_DATA (push, 0x101); 215 216 for (i = 0; i < 2; i++) { 217 struct nv50_miptree *mt = nv50_miptree(dest->resources[i]); 218 mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING; 219 } 220 221 PUSH_KICK (push); 222 } 223 224 static inline int16_t inverse_quantize(int16_t val, uint8_t quant, int mpeg1) { 225 int16_t ret = val * quant / 16; 226 if (mpeg1 && ret) { 227 if (ret > 0) 228 ret = (ret - 1) | 1; 229 else 230 ret = (ret + 1) | 1; 231 } 232 if (ret < -2048) 233 ret = -2048; 234 else if (ret > 2047) 235 ret = 2047; 236 return ret; 237 } 238 239 struct mpeg12_mb_info { 240 uint32_t index; 241 uint8_t unk4; 242 uint8_t unk5; 243 uint16_t coded_block_pattern; 244 uint8_t block_counts[6]; 245 uint16_t PMV[8]; 246 uint16_t skipped; 247 }; 248 249 void 250 nv84_decoder_vp_mpeg12_mb(struct nv84_decoder *dec, 251 struct pipe_mpeg12_picture_desc *desc, 252 const struct pipe_mpeg12_macroblock *macrob) 253 { 254 STATIC_ASSERT(sizeof(struct mpeg12_mb_info) == 32); 255 256 struct mpeg12_mb_info info = {0}; 257 int i, sum = 0, mask, block_index, count; 258 const int16_t *blocks; 259 int intra = macrob->macroblock_type & PIPE_MPEG12_MB_TYPE_INTRA; 260 int motion = macrob->macroblock_type & 261 (PIPE_MPEG12_MB_TYPE_MOTION_FORWARD | PIPE_MPEG12_MB_TYPE_MOTION_BACKWARD); 262 const uint8_t *quant_matrix = intra ? dec->mpeg12_intra_matrix : 263 dec->mpeg12_non_intra_matrix; 264 int mpeg1 = dec->base.profile == PIPE_VIDEO_PROFILE_MPEG1; 265 266 info.index = macrob->y * mb(dec->base.width) + macrob->x; 267 info.unk4 = motion; 268 if (intra) 269 info.unk4 |= 1; 270 if (macrob->macroblock_modes.bits.dct_type) 271 info.unk4 |= 0x20; 272 info.unk5 = (macrob->motion_vertical_field_select << 4) | 273 (macrob->macroblock_modes.value & 0xf); 274 info.coded_block_pattern = macrob->coded_block_pattern; 275 if (motion) { 276 memcpy(info.PMV, macrob->PMV, sizeof(info.PMV)); 277 } 278 blocks = macrob->blocks; 279 for (mask = 0x20, block_index = 0; mask > 0; mask >>= 1, block_index++) { 280 if ((macrob->coded_block_pattern & mask) == 0) 281 continue; 282 283 count = 0; 284 285 /* 286 * The observation here is that there are a lot of 0's, and things go 287 * a lot faster if one skips over them. 288 */ 289 290 #if defined(PIPE_ARCH_SSE) && defined(PIPE_ARCH_X86_64) 291 /* Note that the SSE implementation is much more tuned to X86_64. As it's not 292 * benchmarked on X86_32, disable it there. I suspect that the code needs to 293 * be reorganized in terms of 32-bit wide data in order to be more 294 * efficient. NV84+ were released well into the 64-bit CPU era, so it should 295 * be a minority case. 296 */ 297 298 /* This returns a 16-bit bit-mask, each 2 bits are both 1 or both 0, depending 299 * on whether the corresponding (16-bit) word in blocks is zero or non-zero. */ 300 #define wordmask(blocks, zero) \ 301 (uint64_t)(_mm_movemask_epi8( \ 302 _mm_cmpeq_epi16( \ 303 zero, _mm_load_si128((__m128i *)(blocks))))) 304 305 __m128i zero = _mm_setzero_si128(); 306 307 /* TODO: Look into doing the inverse quantization in terms of SSE 308 * operations unconditionally, when necessary. */ 309 uint64_t bmask0 = wordmask(blocks, zero); 310 bmask0 |= wordmask(blocks + 8, zero) << 16; 311 bmask0 |= wordmask(blocks + 16, zero) << 32; 312 bmask0 |= wordmask(blocks + 24, zero) << 48; 313 uint64_t bmask1 = wordmask(blocks + 32, zero); 314 bmask1 |= wordmask(blocks + 40, zero) << 16; 315 bmask1 |= wordmask(blocks + 48, zero) << 32; 316 bmask1 |= wordmask(blocks + 56, zero) << 48; 317 318 /* The wordmask macro returns the inverse of what we want, since it 319 * returns a 1 for equal-to-zero. Invert. */ 320 bmask0 = ~bmask0; 321 bmask1 = ~bmask1; 322 323 /* Note that the bitmask is actually sequences of 2 bits for each block 324 * index. This is because there is no movemask_epi16. That means that 325 * (a) ffs will never return 64, since the prev bit will always be set 326 * in that case, and (b) we need to do an extra bit shift. Or'ing the 327 * bitmasks together is faster than having a loop that computes them one 328 * at a time and processes them, on a Core i7-920. Trying to put bmask 329 * into an array and then looping also slows things down. 330 */ 331 332 /* shift needs to be the same width as i, and unsigned so that / 2 333 * becomes a rshift operation */ 334 uint32_t shift; 335 i = 0; 336 337 if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) { 338 int16_t tmp; 339 while ((shift = __builtin_ffsll(bmask0))) { 340 i += (shift - 1) / 2; 341 bmask0 >>= shift - 1; 342 *dec->mpeg12_data++ = dec->zscan[i] * 2; 343 tmp = inverse_quantize(blocks[i], quant_matrix[i], mpeg1); 344 *dec->mpeg12_data++ = tmp; 345 sum += tmp; 346 count++; 347 i++; 348 bmask0 >>= 2; 349 } 350 i = 32; 351 while ((shift = __builtin_ffsll(bmask1))) { 352 i += (shift - 1) / 2; 353 bmask1 >>= shift - 1; 354 *dec->mpeg12_data++ = dec->zscan[i] * 2; 355 tmp = inverse_quantize(blocks[i], quant_matrix[i], mpeg1); 356 *dec->mpeg12_data++ = tmp; 357 sum += tmp; 358 count++; 359 i++; 360 bmask1 >>= 2; 361 } 362 } else { 363 while ((shift = __builtin_ffsll(bmask0))) { 364 i += (shift - 1) / 2; 365 bmask0 >>= shift - 1; 366 *dec->mpeg12_data++ = i * 2; 367 *dec->mpeg12_data++ = blocks[i]; 368 count++; 369 i++; 370 bmask0 >>= 2; 371 } 372 i = 32; 373 while ((shift = __builtin_ffsll(bmask1))) { 374 i += (shift - 1) / 2; 375 bmask1 >>= shift - 1; 376 *dec->mpeg12_data++ = i * 2; 377 *dec->mpeg12_data++ = blocks[i]; 378 count++; 379 i++; 380 bmask1 >>= 2; 381 } 382 } 383 #undef wordmask 384 #else 385 386 /* 387 * This loop looks ridiculously written... and it is. I tried a lot of 388 * different ways of achieving this scan, and this was the fastest, at 389 * least on a Core i7-920. Note that it's not necessary to skip the 0's, 390 * the firmware will deal with those just fine. But it's faster to skip 391 * them. Note to people trying benchmarks: make sure to use realistic 392 * mpeg data, which can often be a single data point first followed by 393 * 63 0's, or <data> 7x <0> <data> 7x <0> etc. 394 */ 395 i = 0; 396 if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) { 397 while (true) { 398 int16_t tmp; 399 while (likely(i < 64 && !(tmp = blocks[i]))) i++; 400 if (i >= 64) break; 401 *dec->mpeg12_data++ = dec->zscan[i] * 2; 402 tmp = inverse_quantize(tmp, quant_matrix[i], mpeg1); 403 *dec->mpeg12_data++ = tmp; 404 sum += tmp; 405 count++; 406 i++; 407 } 408 } else { 409 while (true) { 410 int16_t tmp; 411 while (likely(i < 64 && !(tmp = blocks[i]))) i++; 412 if (i >= 64) break; 413 *dec->mpeg12_data++ = i * 2; 414 *dec->mpeg12_data++ = tmp; 415 count++; 416 i++; 417 } 418 } 419 420 #endif 421 422 if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) { 423 if (!mpeg1 && (sum & 1) == 0) { 424 if (count && *(dec->mpeg12_data - 2) == 63 * 2) { 425 uint16_t *val = dec->mpeg12_data - 1; 426 if (*val & 1) *val -= 1; 427 else *val += 1; 428 } else { 429 *dec->mpeg12_data++ = 63 * 2; 430 *dec->mpeg12_data++ = 1; 431 count++; 432 } 433 } 434 } 435 436 if (count) { 437 *(dec->mpeg12_data - 2) |= 1; 438 } else { 439 *dec->mpeg12_data++ = 1; 440 *dec->mpeg12_data++ = 0; 441 count = 1; 442 } 443 info.block_counts[block_index] = count; 444 blocks += 64; 445 } 446 447 memcpy(dec->mpeg12_mb_info, &info, sizeof(info)); 448 dec->mpeg12_mb_info += sizeof(info); 449 450 if (macrob->num_skipped_macroblocks) { 451 info.index++; 452 info.coded_block_pattern = 0; 453 info.skipped = macrob->num_skipped_macroblocks - 1; 454 memset(info.block_counts, 0, sizeof(info.block_counts)); 455 memcpy(dec->mpeg12_mb_info, &info, sizeof(info)); 456 dec->mpeg12_mb_info += sizeof(info); 457 } 458 } 459 460 struct mpeg12_header { 461 uint32_t luma_top_size; // 00 462 uint32_t luma_bottom_size; // 04 463 uint32_t chroma_top_size; // 08 464 uint32_t mbs; // 0c 465 uint32_t mb_info_size; // 10 466 uint32_t mb_width_minus1; // 14 467 uint32_t mb_height_minus1; // 18 468 uint32_t width; // 1c 469 uint32_t height; // 20 470 uint8_t progressive; // 24 471 uint8_t mocomp_only; // 25 472 uint8_t frames; // 26 473 uint8_t picture_structure; // 27 474 uint32_t unk28; // 28 -- 0x50100 475 uint32_t unk2c; // 2c 476 uint32_t pad[4 * 13]; 477 }; 478 479 void 480 nv84_decoder_vp_mpeg12(struct nv84_decoder *dec, 481 struct pipe_mpeg12_picture_desc *desc, 482 struct nv84_video_buffer *dest) 483 { 484 struct nouveau_pushbuf *push = dec->vp_pushbuf; 485 struct nv84_video_buffer *ref1 = (struct nv84_video_buffer *)desc->ref[0]; 486 struct nv84_video_buffer *ref2 = (struct nv84_video_buffer *)desc->ref[1]; 487 struct nouveau_pushbuf_refn bo_refs[] = { 488 { dest->interlaced, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM }, 489 { NULL, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM }, 490 { NULL, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM }, 491 { dec->mpeg12_bo, NOUVEAU_BO_RDWR | NOUVEAU_BO_GART }, 492 }; 493 int i, num_refs = ARRAY_SIZE(bo_refs); 494 struct mpeg12_header header = {0}; 495 struct nv50_miptree *y = nv50_miptree(dest->resources[0]); 496 struct nv50_miptree *uv = nv50_miptree(dest->resources[1]); 497 498 STATIC_ASSERT(sizeof(struct mpeg12_header) == 0x100); 499 500 if (!ref1) 501 ref1 = dest; 502 if (!ref2) 503 ref2 = dest; 504 bo_refs[1].bo = ref1->interlaced; 505 bo_refs[2].bo = ref2->interlaced; 506 507 header.luma_top_size = y->layer_stride; 508 header.luma_bottom_size = y->layer_stride; 509 header.chroma_top_size = uv->layer_stride; 510 header.mbs = mb(dec->base.width) * mb(dec->base.height); 511 header.mb_info_size = dec->mpeg12_mb_info - dec->mpeg12_bo->map - 0x100; 512 header.mb_width_minus1 = mb(dec->base.width) - 1; 513 header.mb_height_minus1 = mb(dec->base.height) - 1; 514 header.width = align(dec->base.width, 16); 515 header.height = align(dec->base.height, 16); 516 header.progressive = desc->frame_pred_frame_dct; 517 header.frames = 1 + (desc->ref[0] != NULL) + (desc->ref[1] != NULL); 518 header.picture_structure = desc->picture_structure; 519 header.unk28 = 0x50100; 520 521 memcpy(dec->mpeg12_bo->map, &header, sizeof(header)); 522 523 PUSH_SPACE(push, 10 + 3 + 2); 524 525 nouveau_pushbuf_refn(push, bo_refs, num_refs); 526 527 BEGIN_NV04(push, SUBC_VP(0x400), 9); 528 PUSH_DATA (push, 0x543210); /* each nibble possibly a dma index */ 529 PUSH_DATA (push, 0x555001); /* constant */ 530 PUSH_DATA (push, dec->mpeg12_bo->offset >> 8); 531 PUSH_DATA (push, (dec->mpeg12_bo->offset + 0x100) >> 8); 532 PUSH_DATA (push, (dec->mpeg12_bo->offset + 0x100 + 533 align(0x20 * mb(dec->base.width) * 534 mb(dec->base.height), 0x100)) >> 8); 535 PUSH_DATA (push, dest->interlaced->offset >> 8); 536 PUSH_DATA (push, ref1->interlaced->offset >> 8); 537 PUSH_DATA (push, ref2->interlaced->offset >> 8); 538 PUSH_DATA (push, 6 * 64 * 8 * header.mbs); 539 540 BEGIN_NV04(push, SUBC_VP(0x620), 2); 541 PUSH_DATA (push, 0); 542 PUSH_DATA (push, 0); 543 544 BEGIN_NV04(push, SUBC_VP(0x300), 1); 545 PUSH_DATA (push, 0); 546 547 for (i = 0; i < 2; i++) { 548 struct nv50_miptree *mt = nv50_miptree(dest->resources[i]); 549 mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING; 550 } 551 PUSH_KICK (push); 552 } 553