Home | History | Annotate | Download | only in nv50
      1 /*
      2  * Copyright 2013 Ilia Mirkin
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice shall be included in
     12  * all copies or substantial portions of the Software.
     13  *
     14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
     18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
     19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
     20  * OTHER DEALINGS IN THE SOFTWARE.
     21  */
     22 
     23 #include "nv50/nv84_video.h"
     24 
     25 #include "util/u_sse.h"
     26 
     27 struct h264_iparm1 {
     28    uint8_t scaling_lists_4x4[6][16]; // 00
     29    uint8_t scaling_lists_8x8[2][64]; // 60
     30    uint32_t width; // e0
     31    uint32_t height; // e4
     32    uint64_t ref1_addrs[16]; // e8
     33    uint64_t ref2_addrs[16]; // 168
     34    uint32_t unk1e8;
     35    uint32_t unk1ec;
     36    uint32_t w1; // 1f0
     37    uint32_t w2; // 1f4
     38    uint32_t w3; // 1f8
     39    uint32_t h1; // 1fc
     40    uint32_t h2; // 200
     41    uint32_t h3; // 204
     42    uint32_t mb_adaptive_frame_field_flag; // 208
     43    uint32_t field_pic_flag; // 20c
     44    uint32_t format; // 210
     45    uint32_t unk214; // 214
     46 };
     47 
     48 struct h264_iparm2 {
     49    uint32_t width; // 00
     50    uint32_t height; // 04
     51    uint32_t mbs; // 08
     52    uint32_t w1; // 0c
     53    uint32_t w2; // 10
     54    uint32_t w3; // 14
     55    uint32_t h1; // 18
     56    uint32_t h2; // 1c
     57    uint32_t h3; // 20
     58    uint32_t unk24;
     59    uint32_t mb_adaptive_frame_field_flag; // 28
     60    uint32_t top; // 2c
     61    uint32_t bottom; // 30
     62    uint32_t is_reference; // 34
     63 };
     64 
     65 void
     66 nv84_decoder_vp_h264(struct nv84_decoder *dec,
     67                      struct pipe_h264_picture_desc *desc,
     68                      struct nv84_video_buffer *dest)
     69 {
     70    struct h264_iparm1 param1;
     71    struct h264_iparm2 param2;
     72    int i, width = align(dest->base.width, 16),
     73       height = align(dest->base.height, 16);
     74 
     75    struct nouveau_pushbuf *push = dec->vp_pushbuf;
     76    struct nouveau_pushbuf_refn bo_refs[] = {
     77       { dest->interlaced, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
     78       { dest->full, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
     79       { dec->vpring, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
     80       { dec->mbring, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
     81       { dec->vp_params, NOUVEAU_BO_RDWR | NOUVEAU_BO_GART },
     82       { dec->fence, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
     83    };
     84    int num_refs = ARRAY_SIZE(bo_refs);
     85    bool is_ref = desc->is_reference;
     86 
     87    STATIC_ASSERT(sizeof(struct h264_iparm1) == 0x218);
     88    STATIC_ASSERT(sizeof(struct h264_iparm2) == 0x38);
     89 
     90    memset(&param1, 0, sizeof(param1));
     91    memset(&param2, 0, sizeof(param2));
     92 
     93    memcpy(&param1.scaling_lists_4x4, desc->pps->ScalingList4x4,
     94           sizeof(param1.scaling_lists_4x4));
     95    memcpy(&param1.scaling_lists_8x8, desc->pps->ScalingList8x8,
     96           sizeof(param1.scaling_lists_8x8));
     97 
     98    param1.width = width;
     99    param1.w1 = param1.w2 = param1.w3 = align(width, 64);
    100    param1.height = param1.h2 = height;
    101    param1.h1 = param1.h3 = align(height, 32);
    102    param1.format = 0x3231564e; /* 'NV12' */
    103    param1.mb_adaptive_frame_field_flag = desc->pps->sps->mb_adaptive_frame_field_flag;
    104    param1.field_pic_flag = desc->field_pic_flag;
    105 
    106    param2.width = width;
    107    param2.w1 = param2.w2 = param2.w3 = param1.w1;
    108    if (desc->field_pic_flag)
    109       param2.height = align(height, 32) / 2;
    110    else
    111       param2.height = height;
    112    param2.h1 = param2.h2 = align(height, 32);
    113    param2.h3 = height;
    114    param2.mbs = width * height >> 8;
    115    if (desc->field_pic_flag) {
    116       param2.top = desc->bottom_field_flag ? 2 : 1;
    117       param2.bottom = desc->bottom_field_flag;
    118    }
    119    param2.mb_adaptive_frame_field_flag = desc->pps->sps->mb_adaptive_frame_field_flag;
    120    param2.is_reference = desc->is_reference;
    121 
    122    PUSH_SPACE(push, 5 + 16 + 3 + 2 + 6 + (is_ref ? 2 : 0) + 3 + 2 + 4 + 2);
    123 
    124    struct nouveau_bo *ref2_default = dest->full;
    125 
    126    for (i = 0; i < 16; i++) {
    127       struct nv84_video_buffer *buf = (struct nv84_video_buffer *)desc->ref[i];
    128       struct nouveau_bo *bo1, *bo2;
    129       if (buf) {
    130          bo1 = buf->interlaced;
    131          bo2 = buf->full;
    132          if (i == 0)
    133             ref2_default = buf->full;
    134       } else {
    135          bo1 = dest->interlaced;
    136          bo2 = ref2_default;
    137       }
    138       param1.ref1_addrs[i] = bo1->offset;
    139       param1.ref2_addrs[i] = bo2->offset;
    140       struct nouveau_pushbuf_refn bo_refs[] = {
    141          { bo1, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
    142          { bo2, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
    143       };
    144       nouveau_pushbuf_refn(push, bo_refs, ARRAY_SIZE(bo_refs));
    145    }
    146 
    147    memcpy(dec->vp_params->map, &param1, sizeof(param1));
    148    memcpy(dec->vp_params->map + 0x400, &param2, sizeof(param2));
    149 
    150    nouveau_pushbuf_refn(push, bo_refs, num_refs);
    151 
    152    /* Wait for BSP to have completed */
    153    BEGIN_NV04(push, SUBC_VP(0x10), 4);
    154    PUSH_DATAh(push, dec->fence->offset);
    155    PUSH_DATA (push, dec->fence->offset);
    156    PUSH_DATA (push, 2);
    157    PUSH_DATA (push, 1); /* wait for sem == 2 */
    158 
    159    /* VP step 1 */
    160    BEGIN_NV04(push, SUBC_VP(0x400), 15);
    161    PUSH_DATA (push, 1);
    162    PUSH_DATA (push, param2.mbs);
    163    PUSH_DATA (push, 0x3987654); /* each nibble probably a dma index */
    164    PUSH_DATA (push, 0x55001); /* constant */
    165    PUSH_DATA (push, dec->vp_params->offset >> 8);
    166    PUSH_DATA (push, (dec->vpring->offset + dec->vpring_residual) >> 8);
    167    PUSH_DATA (push, dec->vpring_ctrl);
    168    PUSH_DATA (push, dec->vpring->offset >> 8);
    169    PUSH_DATA (push, dec->bitstream->size / 2 - 0x700);
    170    PUSH_DATA (push, (dec->mbring->offset + dec->mbring->size - 0x2000) >> 8);
    171    PUSH_DATA (push, (dec->vpring->offset + dec->vpring_ctrl +
    172                      dec->vpring_residual + dec->vpring_deblock) >> 8);
    173    PUSH_DATA (push, 0);
    174    PUSH_DATA (push, 0x100008);
    175    PUSH_DATA (push, dest->interlaced->offset >> 8);
    176    PUSH_DATA (push, 0);
    177 
    178    BEGIN_NV04(push, SUBC_VP(0x620), 2);
    179    PUSH_DATA (push, 0);
    180    PUSH_DATA (push, 0);
    181 
    182    BEGIN_NV04(push, SUBC_VP(0x300), 1);
    183    PUSH_DATA (push, 0);
    184 
    185    /* VP step 2 */
    186    BEGIN_NV04(push, SUBC_VP(0x400), 5);
    187    PUSH_DATA (push, 0x54530201);
    188    PUSH_DATA (push, (dec->vp_params->offset >> 8) + 0x4);
    189    PUSH_DATA (push, (dec->vpring->offset + dec->vpring_ctrl +
    190                      dec->vpring_residual) >> 8);
    191    PUSH_DATA (push, dest->interlaced->offset >> 8);
    192    PUSH_DATA (push, dest->interlaced->offset >> 8);
    193 
    194    if (is_ref) {
    195       BEGIN_NV04(push, SUBC_VP(0x414), 1);
    196       PUSH_DATA (push, dest->full->offset >> 8);
    197    }
    198 
    199    BEGIN_NV04(push, SUBC_VP(0x620), 2);
    200    PUSH_DATAh(push, dec->vp_fw2_offset);
    201    PUSH_DATA (push, dec->vp_fw2_offset);
    202 
    203    BEGIN_NV04(push, SUBC_VP(0x300), 1);
    204    PUSH_DATA (push, 0);
    205 
    206    /* Set the semaphore back to 1 */
    207    BEGIN_NV04(push, SUBC_VP(0x610), 3);
    208    PUSH_DATAh(push, dec->fence->offset);
    209    PUSH_DATA (push, dec->fence->offset);
    210    PUSH_DATA (push, 1);
    211 
    212    /* Write to the semaphore location, intr */
    213    BEGIN_NV04(push, SUBC_VP(0x304), 1);
    214    PUSH_DATA (push, 0x101);
    215 
    216    for (i = 0; i < 2; i++) {
    217       struct nv50_miptree *mt = nv50_miptree(dest->resources[i]);
    218       mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
    219    }
    220 
    221    PUSH_KICK (push);
    222 }
    223 
    224 static inline int16_t inverse_quantize(int16_t val, uint8_t quant, int mpeg1) {
    225    int16_t ret = val * quant / 16;
    226    if (mpeg1 && ret) {
    227       if (ret > 0)
    228          ret = (ret - 1) | 1;
    229       else
    230          ret = (ret + 1) | 1;
    231    }
    232    if (ret < -2048)
    233       ret = -2048;
    234    else if (ret > 2047)
    235       ret = 2047;
    236    return ret;
    237 }
    238 
    239 struct mpeg12_mb_info {
    240    uint32_t index;
    241    uint8_t unk4;
    242    uint8_t unk5;
    243    uint16_t coded_block_pattern;
    244    uint8_t block_counts[6];
    245    uint16_t PMV[8];
    246    uint16_t skipped;
    247 };
    248 
    249 void
    250 nv84_decoder_vp_mpeg12_mb(struct nv84_decoder *dec,
    251                           struct pipe_mpeg12_picture_desc *desc,
    252                           const struct pipe_mpeg12_macroblock *macrob)
    253 {
    254    STATIC_ASSERT(sizeof(struct mpeg12_mb_info) == 32);
    255 
    256    struct mpeg12_mb_info info = {0};
    257    int i, sum = 0, mask, block_index, count;
    258    const int16_t *blocks;
    259    int intra = macrob->macroblock_type & PIPE_MPEG12_MB_TYPE_INTRA;
    260    int motion = macrob->macroblock_type &
    261       (PIPE_MPEG12_MB_TYPE_MOTION_FORWARD | PIPE_MPEG12_MB_TYPE_MOTION_BACKWARD);
    262    const uint8_t *quant_matrix = intra ? dec->mpeg12_intra_matrix :
    263       dec->mpeg12_non_intra_matrix;
    264    int mpeg1 = dec->base.profile == PIPE_VIDEO_PROFILE_MPEG1;
    265 
    266    info.index = macrob->y * mb(dec->base.width) + macrob->x;
    267    info.unk4 = motion;
    268    if (intra)
    269       info.unk4 |= 1;
    270    if (macrob->macroblock_modes.bits.dct_type)
    271       info.unk4 |= 0x20;
    272    info.unk5 = (macrob->motion_vertical_field_select << 4) |
    273       (macrob->macroblock_modes.value & 0xf);
    274    info.coded_block_pattern = macrob->coded_block_pattern;
    275    if (motion) {
    276       memcpy(info.PMV, macrob->PMV, sizeof(info.PMV));
    277    }
    278    blocks = macrob->blocks;
    279    for (mask = 0x20, block_index = 0; mask > 0; mask >>= 1, block_index++) {
    280       if ((macrob->coded_block_pattern & mask) == 0)
    281          continue;
    282 
    283       count = 0;
    284 
    285       /*
    286        * The observation here is that there are a lot of 0's, and things go
    287        * a lot faster if one skips over them.
    288        */
    289 
    290 #if defined(PIPE_ARCH_SSE) && defined(PIPE_ARCH_X86_64)
    291 /* Note that the SSE implementation is much more tuned to X86_64. As it's not
    292  * benchmarked on X86_32, disable it there. I suspect that the code needs to
    293  * be reorganized in terms of 32-bit wide data in order to be more
    294  * efficient. NV84+ were released well into the 64-bit CPU era, so it should
    295  * be a minority case.
    296  */
    297 
    298 /* This returns a 16-bit bit-mask, each 2 bits are both 1 or both 0, depending
    299  * on whether the corresponding (16-bit) word in blocks is zero or non-zero. */
    300 #define wordmask(blocks, zero) \
    301       (uint64_t)(_mm_movemask_epi8( \
    302                        _mm_cmpeq_epi16( \
    303                              zero, _mm_load_si128((__m128i *)(blocks)))))
    304 
    305       __m128i zero = _mm_setzero_si128();
    306 
    307       /* TODO: Look into doing the inverse quantization in terms of SSE
    308        * operations unconditionally, when necessary. */
    309       uint64_t bmask0 = wordmask(blocks, zero);
    310       bmask0 |= wordmask(blocks + 8, zero) << 16;
    311       bmask0 |= wordmask(blocks + 16, zero) << 32;
    312       bmask0 |= wordmask(blocks + 24, zero) << 48;
    313       uint64_t bmask1 = wordmask(blocks + 32, zero);
    314       bmask1 |= wordmask(blocks + 40, zero) << 16;
    315       bmask1 |= wordmask(blocks + 48, zero) << 32;
    316       bmask1 |= wordmask(blocks + 56, zero) << 48;
    317 
    318       /* The wordmask macro returns the inverse of what we want, since it
    319        * returns a 1 for equal-to-zero. Invert. */
    320       bmask0 = ~bmask0;
    321       bmask1 = ~bmask1;
    322 
    323       /* Note that the bitmask is actually sequences of 2 bits for each block
    324        * index. This is because there is no movemask_epi16. That means that
    325        * (a) ffs will never return 64, since the prev bit will always be set
    326        * in that case, and (b) we need to do an extra bit shift. Or'ing the
    327        * bitmasks together is faster than having a loop that computes them one
    328        * at a time and processes them, on a Core i7-920. Trying to put bmask
    329        * into an array and then looping also slows things down.
    330        */
    331 
    332       /* shift needs to be the same width as i, and unsigned so that / 2
    333        * becomes a rshift operation */
    334       uint32_t shift;
    335       i = 0;
    336 
    337       if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {
    338          int16_t tmp;
    339          while ((shift = __builtin_ffsll(bmask0))) {
    340             i += (shift - 1) / 2;
    341             bmask0 >>= shift - 1;
    342             *dec->mpeg12_data++ = dec->zscan[i] * 2;
    343             tmp = inverse_quantize(blocks[i], quant_matrix[i], mpeg1);
    344             *dec->mpeg12_data++ = tmp;
    345             sum += tmp;
    346             count++;
    347             i++;
    348             bmask0 >>= 2;
    349          }
    350          i = 32;
    351          while ((shift = __builtin_ffsll(bmask1))) {
    352             i += (shift - 1) / 2;
    353             bmask1 >>= shift - 1;
    354             *dec->mpeg12_data++ = dec->zscan[i] * 2;
    355             tmp = inverse_quantize(blocks[i], quant_matrix[i], mpeg1);
    356             *dec->mpeg12_data++ = tmp;
    357             sum += tmp;
    358             count++;
    359             i++;
    360             bmask1 >>= 2;
    361          }
    362       } else {
    363          while ((shift = __builtin_ffsll(bmask0))) {
    364             i += (shift - 1) / 2;
    365             bmask0 >>= shift - 1;
    366             *dec->mpeg12_data++ = i * 2;
    367             *dec->mpeg12_data++ = blocks[i];
    368             count++;
    369             i++;
    370             bmask0 >>= 2;
    371          }
    372          i = 32;
    373          while ((shift = __builtin_ffsll(bmask1))) {
    374             i += (shift - 1) / 2;
    375             bmask1 >>= shift - 1;
    376             *dec->mpeg12_data++ = i * 2;
    377             *dec->mpeg12_data++ = blocks[i];
    378             count++;
    379             i++;
    380             bmask1 >>= 2;
    381          }
    382       }
    383 #undef wordmask
    384 #else
    385 
    386       /*
    387        * This loop looks ridiculously written... and it is. I tried a lot of
    388        * different ways of achieving this scan, and this was the fastest, at
    389        * least on a Core i7-920. Note that it's not necessary to skip the 0's,
    390        * the firmware will deal with those just fine. But it's faster to skip
    391        * them. Note to people trying benchmarks: make sure to use realistic
    392        * mpeg data, which can often be a single data point first followed by
    393        * 63 0's, or <data> 7x <0> <data> 7x <0> etc.
    394        */
    395       i = 0;
    396       if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {
    397          while (true) {
    398             int16_t tmp;
    399             while (likely(i < 64 && !(tmp = blocks[i]))) i++;
    400             if (i >= 64) break;
    401             *dec->mpeg12_data++ = dec->zscan[i] * 2;
    402             tmp = inverse_quantize(tmp, quant_matrix[i], mpeg1);
    403             *dec->mpeg12_data++ = tmp;
    404             sum += tmp;
    405             count++;
    406             i++;
    407          }
    408       } else {
    409          while (true) {
    410             int16_t tmp;
    411             while (likely(i < 64 && !(tmp = blocks[i]))) i++;
    412             if (i >= 64) break;
    413             *dec->mpeg12_data++ = i * 2;
    414             *dec->mpeg12_data++ = tmp;
    415             count++;
    416             i++;
    417          }
    418       }
    419 
    420 #endif
    421 
    422       if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {
    423          if (!mpeg1 && (sum & 1) == 0) {
    424             if (count && *(dec->mpeg12_data - 2) == 63 * 2) {
    425                uint16_t *val = dec->mpeg12_data - 1;
    426                if (*val & 1) *val -= 1;
    427                else *val += 1;
    428             } else {
    429                *dec->mpeg12_data++ = 63 * 2;
    430                *dec->mpeg12_data++ = 1;
    431                count++;
    432             }
    433          }
    434       }
    435 
    436       if (count) {
    437          *(dec->mpeg12_data - 2) |= 1;
    438       } else {
    439          *dec->mpeg12_data++ = 1;
    440          *dec->mpeg12_data++ = 0;
    441          count = 1;
    442       }
    443       info.block_counts[block_index] = count;
    444       blocks += 64;
    445    }
    446 
    447    memcpy(dec->mpeg12_mb_info, &info, sizeof(info));
    448    dec->mpeg12_mb_info += sizeof(info);
    449 
    450    if (macrob->num_skipped_macroblocks) {
    451       info.index++;
    452       info.coded_block_pattern = 0;
    453       info.skipped = macrob->num_skipped_macroblocks - 1;
    454       memset(info.block_counts, 0, sizeof(info.block_counts));
    455       memcpy(dec->mpeg12_mb_info, &info, sizeof(info));
    456       dec->mpeg12_mb_info += sizeof(info);
    457    }
    458 }
    459 
    460 struct mpeg12_header {
    461    uint32_t luma_top_size; // 00
    462    uint32_t luma_bottom_size; // 04
    463    uint32_t chroma_top_size; // 08
    464    uint32_t mbs; // 0c
    465    uint32_t mb_info_size; // 10
    466    uint32_t mb_width_minus1; // 14
    467    uint32_t mb_height_minus1; // 18
    468    uint32_t width; // 1c
    469    uint32_t height; // 20
    470    uint8_t progressive; // 24
    471    uint8_t mocomp_only; // 25
    472    uint8_t frames; // 26
    473    uint8_t picture_structure; // 27
    474    uint32_t unk28; // 28 -- 0x50100
    475    uint32_t unk2c; // 2c
    476    uint32_t pad[4 * 13];
    477 };
    478 
    479 void
    480 nv84_decoder_vp_mpeg12(struct nv84_decoder *dec,
    481                        struct pipe_mpeg12_picture_desc *desc,
    482                        struct nv84_video_buffer *dest)
    483 {
    484    struct nouveau_pushbuf *push = dec->vp_pushbuf;
    485    struct nv84_video_buffer *ref1 = (struct nv84_video_buffer *)desc->ref[0];
    486    struct nv84_video_buffer *ref2 = (struct nv84_video_buffer *)desc->ref[1];
    487    struct nouveau_pushbuf_refn bo_refs[] = {
    488       { dest->interlaced, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
    489       { NULL, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
    490       { NULL, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
    491       { dec->mpeg12_bo, NOUVEAU_BO_RDWR | NOUVEAU_BO_GART },
    492    };
    493    int i, num_refs = ARRAY_SIZE(bo_refs);
    494    struct mpeg12_header header = {0};
    495    struct nv50_miptree *y = nv50_miptree(dest->resources[0]);
    496    struct nv50_miptree *uv = nv50_miptree(dest->resources[1]);
    497 
    498    STATIC_ASSERT(sizeof(struct mpeg12_header) == 0x100);
    499 
    500    if (!ref1)
    501       ref1 = dest;
    502    if (!ref2)
    503       ref2 = dest;
    504    bo_refs[1].bo = ref1->interlaced;
    505    bo_refs[2].bo = ref2->interlaced;
    506 
    507    header.luma_top_size = y->layer_stride;
    508    header.luma_bottom_size = y->layer_stride;
    509    header.chroma_top_size = uv->layer_stride;
    510    header.mbs = mb(dec->base.width) * mb(dec->base.height);
    511    header.mb_info_size = dec->mpeg12_mb_info - dec->mpeg12_bo->map - 0x100;
    512    header.mb_width_minus1 = mb(dec->base.width) - 1;
    513    header.mb_height_minus1 = mb(dec->base.height) - 1;
    514    header.width = align(dec->base.width, 16);
    515    header.height = align(dec->base.height, 16);
    516    header.progressive = desc->frame_pred_frame_dct;
    517    header.frames = 1 + (desc->ref[0] != NULL) + (desc->ref[1] != NULL);
    518    header.picture_structure = desc->picture_structure;
    519    header.unk28 = 0x50100;
    520 
    521    memcpy(dec->mpeg12_bo->map, &header, sizeof(header));
    522 
    523    PUSH_SPACE(push, 10 + 3 + 2);
    524 
    525    nouveau_pushbuf_refn(push, bo_refs, num_refs);
    526 
    527    BEGIN_NV04(push, SUBC_VP(0x400), 9);
    528    PUSH_DATA (push, 0x543210); /* each nibble possibly a dma index */
    529    PUSH_DATA (push, 0x555001); /* constant */
    530    PUSH_DATA (push, dec->mpeg12_bo->offset >> 8);
    531    PUSH_DATA (push, (dec->mpeg12_bo->offset + 0x100) >> 8);
    532    PUSH_DATA (push, (dec->mpeg12_bo->offset + 0x100 +
    533                      align(0x20 * mb(dec->base.width) *
    534                            mb(dec->base.height), 0x100)) >> 8);
    535    PUSH_DATA (push, dest->interlaced->offset >> 8);
    536    PUSH_DATA (push, ref1->interlaced->offset >> 8);
    537    PUSH_DATA (push, ref2->interlaced->offset >> 8);
    538    PUSH_DATA (push, 6 * 64 * 8 * header.mbs);
    539 
    540    BEGIN_NV04(push, SUBC_VP(0x620), 2);
    541    PUSH_DATA (push, 0);
    542    PUSH_DATA (push, 0);
    543 
    544    BEGIN_NV04(push, SUBC_VP(0x300), 1);
    545    PUSH_DATA (push, 0);
    546 
    547    for (i = 0; i < 2; i++) {
    548       struct nv50_miptree *mt = nv50_miptree(dest->resources[i]);
    549       mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
    550    }
    551    PUSH_KICK (push);
    552 }
    553