Home | History | Annotate | Download | only in vl
      1 /**************************************************************************
      2  *
 * Copyright 2010 Christian König
      4  * All Rights Reserved.
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the
      8  * "Software"), to deal in the Software without restriction, including
      9  * without limitation the rights to use, copy, modify, merge, publish,
     10  * distribute, sub license, and/or sell copies of the Software, and to
     11  * permit persons to whom the Software is furnished to do so, subject to
     12  * the following conditions:
     13  *
     14  * The above copyright notice and this permission notice (including the
     15  * next paragraph) shall be included in all copies or substantial portions
     16  * of the Software.
     17  *
     18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
     19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
     21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
     22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
     23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
     24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     25  *
     26  **************************************************************************/
     27 
     28 #include <assert.h>
     29 
     30 #include "pipe/p_context.h"
     31 #include "pipe/p_screen.h"
     32 
     33 #include "util/u_draw.h"
     34 #include "util/u_sampler.h"
     35 #include "util/u_memory.h"
     36 
     37 #include "tgsi/tgsi_ureg.h"
     38 
     39 #include "vl_defines.h"
     40 #include "vl_types.h"
     41 #include "vl_vertex_buffers.h"
     42 #include "vl_idct.h"
     43 
     44 enum VS_OUTPUT
     45 {
     46    VS_O_VPOS = 0,
     47    VS_O_L_ADDR0 = 0,
     48    VS_O_L_ADDR1,
     49    VS_O_R_ADDR0,
     50    VS_O_R_ADDR1
     51 };
     52 
     53 /**
     54  * The DCT matrix stored as hex representation of floats. Equal to the following equation:
     55  * for (i = 0; i < 8; ++i)
     56  *    for (j = 0; j < 8; ++j)
     57  *       if (i == 0) const_matrix[i][j] = 1.0f / sqrtf(8.0f);
     58  *       else const_matrix[i][j] = sqrtf(2.0f / 8.0f) * cosf((2 * j + 1) * i * M_PI / (2.0f * 8.0f));
     59  */
     60 static const uint32_t const_matrix[8][8] = {
     61    { 0x3eb504f3, 0x3eb504f3, 0x3eb504f3, 0x3eb504f3, 0x3eb504f3, 0x3eb504f3, 0x3eb504f3, 0x3eb504f3 },
     62    { 0x3efb14be, 0x3ed4db31, 0x3e8e39da, 0x3dc7c5c4, 0xbdc7c5c2, 0xbe8e39d9, 0xbed4db32, 0xbefb14bf },
     63    { 0x3eec835f, 0x3e43ef15, 0xbe43ef14, 0xbeec835e, 0xbeec835f, 0xbe43ef1a, 0x3e43ef1b, 0x3eec835f },
     64    { 0x3ed4db31, 0xbdc7c5c2, 0xbefb14bf, 0xbe8e39dd, 0x3e8e39d7, 0x3efb14bf, 0x3dc7c5d0, 0xbed4db34 },
     65    { 0x3eb504f3, 0xbeb504f3, 0xbeb504f4, 0x3eb504f1, 0x3eb504f3, 0xbeb504f0, 0xbeb504ef, 0x3eb504f4 },
     66    { 0x3e8e39da, 0xbefb14bf, 0x3dc7c5c8, 0x3ed4db32, 0xbed4db34, 0xbdc7c5bb, 0x3efb14bf, 0xbe8e39d7 },
     67    { 0x3e43ef15, 0xbeec835f, 0x3eec835f, 0xbe43ef07, 0xbe43ef23, 0x3eec8361, 0xbeec835c, 0x3e43ef25 },
     68    { 0x3dc7c5c4, 0xbe8e39dd, 0x3ed4db32, 0xbefb14c0, 0x3efb14be, 0xbed4db31, 0x3e8e39ce, 0xbdc7c596 },
     69 };
     70 
     71 static void
     72 calc_addr(struct ureg_program *shader, struct ureg_dst addr[2],
     73           struct ureg_src tc, struct ureg_src start, bool right_side,
     74           bool transposed, float size)
     75 {
     76    unsigned wm_start = (right_side == transposed) ? TGSI_WRITEMASK_X : TGSI_WRITEMASK_Y;
     77    unsigned sw_start = right_side ? TGSI_SWIZZLE_Y : TGSI_SWIZZLE_X;
     78 
     79    unsigned wm_tc = (right_side == transposed) ? TGSI_WRITEMASK_Y : TGSI_WRITEMASK_X;
     80    unsigned sw_tc = right_side ? TGSI_SWIZZLE_X : TGSI_SWIZZLE_Y;
     81 
     82    /*
     83     * addr[0..1].(start) = right_side ? start.x : tc.x
     84     * addr[0..1].(tc) = right_side ? tc.y : start.y
     85     * addr[0..1].z = tc.z
     86     * addr[1].(start) += 1.0f / scale
     87     */
     88    ureg_MOV(shader, ureg_writemask(addr[0], wm_start), ureg_scalar(start, sw_start));
     89    ureg_MOV(shader, ureg_writemask(addr[0], wm_tc), ureg_scalar(tc, sw_tc));
     90 
     91    ureg_ADD(shader, ureg_writemask(addr[1], wm_start), ureg_scalar(start, sw_start), ureg_imm1f(shader, 1.0f / size));
     92    ureg_MOV(shader, ureg_writemask(addr[1], wm_tc), ureg_scalar(tc, sw_tc));
     93 }
     94 
     95 static void
     96 increment_addr(struct ureg_program *shader, struct ureg_dst daddr[2],
     97                struct ureg_src saddr[2], bool right_side, bool transposed,
     98                int pos, float size)
     99 {
    100    unsigned wm_start = (right_side == transposed) ? TGSI_WRITEMASK_X : TGSI_WRITEMASK_Y;
    101    unsigned wm_tc = (right_side == transposed) ? TGSI_WRITEMASK_Y : TGSI_WRITEMASK_X;
    102 
    103    /*
    104     * daddr[0..1].(start) = saddr[0..1].(start)
    105     * daddr[0..1].(tc) = saddr[0..1].(tc)
    106     */
    107 
    108    ureg_MOV(shader, ureg_writemask(daddr[0], wm_start), saddr[0]);
    109    ureg_ADD(shader, ureg_writemask(daddr[0], wm_tc), saddr[0], ureg_imm1f(shader, pos / size));
    110    ureg_MOV(shader, ureg_writemask(daddr[1], wm_start), saddr[1]);
    111    ureg_ADD(shader, ureg_writemask(daddr[1], wm_tc), saddr[1], ureg_imm1f(shader, pos / size));
    112 }
    113 
    114 static void
    115 fetch_four(struct ureg_program *shader, struct ureg_dst m[2], struct ureg_src addr[2],
    116            struct ureg_src sampler, bool resource3d)
    117 {
    118    ureg_TEX(shader, m[0], resource3d ? TGSI_TEXTURE_3D : TGSI_TEXTURE_2D, addr[0], sampler);
    119    ureg_TEX(shader, m[1], resource3d ? TGSI_TEXTURE_3D : TGSI_TEXTURE_2D, addr[1], sampler);
    120 }
    121 
    122 static void
    123 matrix_mul(struct ureg_program *shader, struct ureg_dst dst, struct ureg_dst l[2], struct ureg_dst r[2])
    124 {
    125    struct ureg_dst tmp;
    126 
    127    tmp = ureg_DECL_temporary(shader);
    128 
    129    /*
    130     * tmp.xy = dot4(m[0][0..1], m[1][0..1])
    131     * dst = tmp.x + tmp.y
    132     */
    133    ureg_DP4(shader, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_src(l[0]), ureg_src(r[0]));
    134    ureg_DP4(shader, ureg_writemask(tmp, TGSI_WRITEMASK_Y), ureg_src(l[1]), ureg_src(r[1]));
    135    ureg_ADD(shader, dst,
    136       ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X),
    137       ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_Y));
    138 
    139    ureg_release_temporary(shader, tmp);
    140 }
    141 
    142 static void *
    143 create_mismatch_vert_shader(struct vl_idct *idct)
    144 {
    145    struct ureg_program *shader;
    146    struct ureg_src vpos;
    147    struct ureg_src scale;
    148    struct ureg_dst t_tex;
    149    struct ureg_dst o_vpos, o_addr[2];
    150 
    151    shader = ureg_create(PIPE_SHADER_VERTEX);
    152    if (!shader)
    153       return NULL;
    154 
    155    vpos = ureg_DECL_vs_input(shader, VS_I_VPOS);
    156 
    157    t_tex = ureg_DECL_temporary(shader);
    158 
    159    o_vpos = ureg_DECL_output(shader, TGSI_SEMANTIC_POSITION, VS_O_VPOS);
    160 
    161    o_addr[0] = ureg_DECL_output(shader, TGSI_SEMANTIC_GENERIC, VS_O_L_ADDR0);
    162    o_addr[1] = ureg_DECL_output(shader, TGSI_SEMANTIC_GENERIC, VS_O_L_ADDR1);
    163 
    164    /*
    165     * scale = (VL_BLOCK_WIDTH, VL_BLOCK_HEIGHT) / (dst.width, dst.height)
    166     *
    167     * t_vpos = vpos + 7 / VL_BLOCK_WIDTH
    168     * o_vpos.xy = t_vpos * scale
    169     *
    170     * o_addr = calc_addr(...)
    171     *
    172     */
    173 
    174    scale = ureg_imm2f(shader,
    175       (float)VL_BLOCK_WIDTH / idct->buffer_width,
    176       (float)VL_BLOCK_HEIGHT / idct->buffer_height);
    177 
    178    ureg_MAD(shader, ureg_writemask(o_vpos, TGSI_WRITEMASK_XY), vpos, scale, scale);
    179    ureg_MOV(shader, ureg_writemask(o_vpos, TGSI_WRITEMASK_ZW), ureg_imm1f(shader, 1.0f));
    180 
    181    ureg_MUL(shader, ureg_writemask(t_tex, TGSI_WRITEMASK_XY), vpos, scale);
    182    calc_addr(shader, o_addr, ureg_src(t_tex), ureg_src(t_tex), false, false, idct->buffer_width / 4);
    183 
    184    ureg_release_temporary(shader, t_tex);
    185 
    186    ureg_END(shader);
    187 
    188    return ureg_create_shader_and_destroy(shader, idct->pipe);
    189 }
    190 
    191 static void *
    192 create_mismatch_frag_shader(struct vl_idct *idct)
    193 {
    194    struct ureg_program *shader;
    195 
    196    struct ureg_src addr[2];
    197 
    198    struct ureg_dst m[8][2];
    199    struct ureg_dst fragment;
    200 
    201    unsigned i;
    202 
    203    shader = ureg_create(PIPE_SHADER_FRAGMENT);
    204    if (!shader)
    205       return NULL;
    206 
    207    addr[0] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, VS_O_L_ADDR0, TGSI_INTERPOLATE_LINEAR);
    208    addr[1] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, VS_O_L_ADDR1, TGSI_INTERPOLATE_LINEAR);
    209 
    210    fragment = ureg_DECL_output(shader, TGSI_SEMANTIC_COLOR, 0);
    211 
    212    for (i = 0; i < 8; ++i) {
    213       m[i][0] = ureg_DECL_temporary(shader);
    214       m[i][1] = ureg_DECL_temporary(shader);
    215    }
    216 
    217    for (i = 0; i < 8; ++i) {
    218       increment_addr(shader, m[i], addr, false, false, i, idct->buffer_height);
    219    }
    220 
    221    for (i = 0; i < 8; ++i) {
    222       struct ureg_src s_addr[2];
    223       s_addr[0] = ureg_src(m[i][0]);
    224       s_addr[1] = ureg_src(m[i][1]);
    225       fetch_four(shader, m[i], s_addr, ureg_DECL_sampler(shader, 0), false);
    226    }
    227 
    228    for (i = 1; i < 8; ++i) {
    229       ureg_ADD(shader, m[0][0], ureg_src(m[0][0]), ureg_src(m[i][0]));
    230       ureg_ADD(shader, m[0][1], ureg_src(m[0][1]), ureg_src(m[i][1]));
    231    }
    232 
    233    ureg_ADD(shader, m[0][0], ureg_src(m[0][0]), ureg_src(m[0][1]));
    234    ureg_DP4(shader, m[0][0], ureg_abs(ureg_src(m[0][0])), ureg_imm1f(shader, 1 << 14));
    235 
    236    ureg_MUL(shader, ureg_writemask(m[0][0], TGSI_WRITEMASK_W), ureg_abs(ureg_src(m[7][1])), ureg_imm1f(shader, 1 << 14));
    237    ureg_FRC(shader, m[0][0], ureg_src(m[0][0]));
    238    ureg_SGT(shader, m[0][0], ureg_imm1f(shader, 0.5f), ureg_abs(ureg_src(m[0][0])));
    239 
    240    ureg_CMP(shader, ureg_writemask(m[0][0], TGSI_WRITEMASK_W), ureg_negate(ureg_src(m[0][0])),
    241             ureg_imm1f(shader, 1.0f / (1 << 15)), ureg_imm1f(shader, -1.0f / (1 << 15)));
    242    ureg_MUL(shader, ureg_writemask(m[0][0], TGSI_WRITEMASK_W), ureg_src(m[0][0]),
    243             ureg_scalar(ureg_src(m[0][0]), TGSI_SWIZZLE_X));
    244 
    245    ureg_MOV(shader, ureg_writemask(fragment, TGSI_WRITEMASK_XYZ), ureg_src(m[7][1]));
    246    ureg_ADD(shader, ureg_writemask(fragment, TGSI_WRITEMASK_W), ureg_src(m[0][0]), ureg_src(m[7][1]));
    247 
    248    for (i = 0; i < 8; ++i) {
    249       ureg_release_temporary(shader, m[i][0]);
    250       ureg_release_temporary(shader, m[i][1]);
    251    }
    252 
    253    ureg_END(shader);
    254 
    255    return ureg_create_shader_and_destroy(shader, idct->pipe);
    256 }
    257 
    258 static void *
    259 create_stage1_vert_shader(struct vl_idct *idct)
    260 {
    261    struct ureg_program *shader;
    262    struct ureg_src vrect, vpos;
    263    struct ureg_src scale;
    264    struct ureg_dst t_tex, t_start;
    265    struct ureg_dst o_vpos, o_l_addr[2], o_r_addr[2];
    266 
    267    shader = ureg_create(PIPE_SHADER_VERTEX);
    268    if (!shader)
    269       return NULL;
    270 
    271    vrect = ureg_DECL_vs_input(shader, VS_I_RECT);
    272    vpos = ureg_DECL_vs_input(shader, VS_I_VPOS);
    273 
    274    t_tex = ureg_DECL_temporary(shader);
    275    t_start = ureg_DECL_temporary(shader);
    276 
    277    o_vpos = ureg_DECL_output(shader, TGSI_SEMANTIC_POSITION, VS_O_VPOS);
    278 
    279    o_l_addr[0] = ureg_DECL_output(shader, TGSI_SEMANTIC_GENERIC, VS_O_L_ADDR0);
    280    o_l_addr[1] = ureg_DECL_output(shader, TGSI_SEMANTIC_GENERIC, VS_O_L_ADDR1);
    281 
    282    o_r_addr[0] = ureg_DECL_output(shader, TGSI_SEMANTIC_GENERIC, VS_O_R_ADDR0);
    283    o_r_addr[1] = ureg_DECL_output(shader, TGSI_SEMANTIC_GENERIC, VS_O_R_ADDR1);
    284 
    285    /*
    286     * scale = (VL_BLOCK_WIDTH, VL_BLOCK_HEIGHT) / (dst.width, dst.height)
    287     *
    288     * t_vpos = vpos + vrect
    289     * o_vpos.xy = t_vpos * scale
    290     * o_vpos.zw = vpos
    291     *
    292     * o_l_addr = calc_addr(...)
    293     * o_r_addr = calc_addr(...)
    294     *
    295     */
    296 
    297    scale = ureg_imm2f(shader,
    298       (float)VL_BLOCK_WIDTH / idct->buffer_width,
    299       (float)VL_BLOCK_HEIGHT / idct->buffer_height);
    300 
    301    ureg_ADD(shader, ureg_writemask(t_tex, TGSI_WRITEMASK_XY), vpos, vrect);
    302    ureg_MUL(shader, ureg_writemask(t_tex, TGSI_WRITEMASK_XY), ureg_src(t_tex), scale);
    303 
    304    ureg_MOV(shader, ureg_writemask(o_vpos, TGSI_WRITEMASK_XY), ureg_src(t_tex));
    305    ureg_MOV(shader, ureg_writemask(o_vpos, TGSI_WRITEMASK_ZW), ureg_imm1f(shader, 1.0f));
    306 
    307    ureg_MUL(shader, ureg_writemask(t_start, TGSI_WRITEMASK_XY), vpos, scale);
    308 
    309    calc_addr(shader, o_l_addr, ureg_src(t_tex), ureg_src(t_start), false, false, idct->buffer_width / 4);
    310    calc_addr(shader, o_r_addr, vrect, ureg_imm1f(shader, 0.0f), true, true, VL_BLOCK_WIDTH / 4);
    311 
    312    ureg_release_temporary(shader, t_tex);
    313    ureg_release_temporary(shader, t_start);
    314 
    315    ureg_END(shader);
    316 
    317    return ureg_create_shader_and_destroy(shader, idct->pipe);
    318 }
    319 
    320 static void *
    321 create_stage1_frag_shader(struct vl_idct *idct)
    322 {
    323    struct ureg_program *shader;
    324    struct ureg_src l_addr[2], r_addr[2];
    325    struct ureg_dst l[4][2], r[2];
    326    struct ureg_dst *fragment;
    327    unsigned i;
    328    int j;
    329 
    330    shader = ureg_create(PIPE_SHADER_FRAGMENT);
    331    if (!shader)
    332       return NULL;
    333 
    334    fragment = MALLOC(idct->nr_of_render_targets * sizeof(struct ureg_dst));
    335 
    336    l_addr[0] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, VS_O_L_ADDR0, TGSI_INTERPOLATE_LINEAR);
    337    l_addr[1] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, VS_O_L_ADDR1, TGSI_INTERPOLATE_LINEAR);
    338 
    339    r_addr[0] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, VS_O_R_ADDR0, TGSI_INTERPOLATE_LINEAR);
    340    r_addr[1] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, VS_O_R_ADDR1, TGSI_INTERPOLATE_LINEAR);
    341 
    342    for (i = 0; i < idct->nr_of_render_targets; ++i)
    343        fragment[i] = ureg_DECL_output(shader, TGSI_SEMANTIC_COLOR, i);
    344 
    345    for (i = 0; i < 4; ++i) {
    346       l[i][0] = ureg_DECL_temporary(shader);
    347       l[i][1] = ureg_DECL_temporary(shader);
    348    }
    349 
    350    r[0] = ureg_DECL_temporary(shader);
    351    r[1] = ureg_DECL_temporary(shader);
    352 
    353    for (i = 0; i < 4; ++i) {
    354       increment_addr(shader, l[i], l_addr, false, false, i - 2, idct->buffer_height);
    355    }
    356 
    357    for (i = 0; i < 4; ++i) {
    358       struct ureg_src s_addr[2];
    359       s_addr[0] = ureg_src(l[i][0]);
    360       s_addr[1] = ureg_src(l[i][1]);
    361       fetch_four(shader, l[i], s_addr, ureg_DECL_sampler(shader, 0), false);
    362    }
    363 
    364    for (i = 0; i < idct->nr_of_render_targets; ++i) {
    365       struct ureg_src s_addr[2];
    366 
    367       increment_addr(shader, r, r_addr, true, true, i - (signed)idct->nr_of_render_targets / 2, VL_BLOCK_HEIGHT);
    368 
    369       s_addr[0] = ureg_src(r[0]);
    370       s_addr[1] = ureg_src(r[1]);
    371       fetch_four(shader, r, s_addr, ureg_DECL_sampler(shader, 1), false);
    372 
    373       for (j = 0; j < 4; ++j) {
    374          matrix_mul(shader, ureg_writemask(fragment[i], TGSI_WRITEMASK_X << j), l[j], r);
    375       }
    376    }
    377 
    378    for (i = 0; i < 4; ++i) {
    379       ureg_release_temporary(shader, l[i][0]);
    380       ureg_release_temporary(shader, l[i][1]);
    381    }
    382    ureg_release_temporary(shader, r[0]);
    383    ureg_release_temporary(shader, r[1]);
    384 
    385    ureg_END(shader);
    386 
    387    FREE(fragment);
    388 
    389    return ureg_create_shader_and_destroy(shader, idct->pipe);
    390 }
    391 
    392 void
    393 vl_idct_stage2_vert_shader(struct vl_idct *idct, struct ureg_program *shader,
    394                            unsigned first_output, struct ureg_dst tex)
    395 {
    396    struct ureg_src vrect, vpos;
    397    struct ureg_src scale;
    398    struct ureg_dst t_start;
    399    struct ureg_dst o_l_addr[2], o_r_addr[2];
    400 
    401    vrect = ureg_DECL_vs_input(shader, VS_I_RECT);
    402    vpos = ureg_DECL_vs_input(shader, VS_I_VPOS);
    403 
    404    t_start = ureg_DECL_temporary(shader);
    405 
    406    --first_output;
    407 
    408    o_l_addr[0] = ureg_DECL_output(shader, TGSI_SEMANTIC_GENERIC, first_output + VS_O_L_ADDR0);
    409    o_l_addr[1] = ureg_DECL_output(shader, TGSI_SEMANTIC_GENERIC, first_output + VS_O_L_ADDR1);
    410 
    411    o_r_addr[0] = ureg_DECL_output(shader, TGSI_SEMANTIC_GENERIC, first_output + VS_O_R_ADDR0);
    412    o_r_addr[1] = ureg_DECL_output(shader, TGSI_SEMANTIC_GENERIC, first_output + VS_O_R_ADDR1);
    413 
    414    scale = ureg_imm2f(shader,
    415       (float)VL_BLOCK_WIDTH / idct->buffer_width,
    416       (float)VL_BLOCK_HEIGHT / idct->buffer_height);
    417 
    418    ureg_MUL(shader, ureg_writemask(tex, TGSI_WRITEMASK_Z),
    419       ureg_scalar(vrect, TGSI_SWIZZLE_X),
    420       ureg_imm1f(shader, VL_BLOCK_WIDTH / idct->nr_of_render_targets));
    421    ureg_MUL(shader, ureg_writemask(t_start, TGSI_WRITEMASK_XY), vpos, scale);
    422 
    423    calc_addr(shader, o_l_addr, vrect, ureg_imm1f(shader, 0.0f), false, false, VL_BLOCK_WIDTH / 4);
    424    calc_addr(shader, o_r_addr, ureg_src(tex), ureg_src(t_start), true, false, idct->buffer_height / 4);
    425 
    426    ureg_MOV(shader, ureg_writemask(o_r_addr[0], TGSI_WRITEMASK_Z), ureg_src(tex));
    427    ureg_MOV(shader, ureg_writemask(o_r_addr[1], TGSI_WRITEMASK_Z), ureg_src(tex));
    428 }
    429 
    430 void
    431 vl_idct_stage2_frag_shader(struct vl_idct *idct, struct ureg_program *shader,
    432                            unsigned first_input, struct ureg_dst fragment)
    433 {
    434    struct ureg_src l_addr[2], r_addr[2];
    435 
    436    struct ureg_dst l[2], r[2];
    437 
    438    --first_input;
    439 
    440    l_addr[0] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, first_input + VS_O_L_ADDR0, TGSI_INTERPOLATE_LINEAR);
    441    l_addr[1] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, first_input + VS_O_L_ADDR1, TGSI_INTERPOLATE_LINEAR);
    442 
    443    r_addr[0] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, first_input + VS_O_R_ADDR0, TGSI_INTERPOLATE_LINEAR);
    444    r_addr[1] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, first_input + VS_O_R_ADDR1, TGSI_INTERPOLATE_LINEAR);
    445 
    446    l[0] = ureg_DECL_temporary(shader);
    447    l[1] = ureg_DECL_temporary(shader);
    448    r[0] = ureg_DECL_temporary(shader);
    449    r[1] = ureg_DECL_temporary(shader);
    450 
    451    fetch_four(shader, l, l_addr, ureg_DECL_sampler(shader, 1), false);
    452    fetch_four(shader, r, r_addr, ureg_DECL_sampler(shader, 0), true);
    453 
    454    matrix_mul(shader, fragment, l, r);
    455 
    456    ureg_release_temporary(shader, l[0]);
    457    ureg_release_temporary(shader, l[1]);
    458    ureg_release_temporary(shader, r[0]);
    459    ureg_release_temporary(shader, r[1]);
    460 }
    461 
    462 static bool
    463 init_shaders(struct vl_idct *idct)
    464 {
    465    idct->vs_mismatch = create_mismatch_vert_shader(idct);
    466    if (!idct->vs_mismatch)
    467       goto error_vs_mismatch;
    468 
    469    idct->fs_mismatch = create_mismatch_frag_shader(idct);
    470    if (!idct->fs_mismatch)
    471       goto error_fs_mismatch;
    472 
    473    idct->vs = create_stage1_vert_shader(idct);
    474    if (!idct->vs)
    475       goto error_vs;
    476 
    477    idct->fs = create_stage1_frag_shader(idct);
    478    if (!idct->fs)
    479       goto error_fs;
    480 
    481    return true;
    482 
    483 error_fs:
    484    idct->pipe->delete_vs_state(idct->pipe, idct->vs);
    485 
    486 error_vs:
    487    idct->pipe->delete_vs_state(idct->pipe, idct->vs_mismatch);
    488 
    489 error_fs_mismatch:
    490    idct->pipe->delete_vs_state(idct->pipe, idct->fs);
    491 
    492 error_vs_mismatch:
    493    return false;
    494 }
    495 
    496 static void
    497 cleanup_shaders(struct vl_idct *idct)
    498 {
    499    idct->pipe->delete_vs_state(idct->pipe, idct->vs_mismatch);
    500    idct->pipe->delete_fs_state(idct->pipe, idct->fs_mismatch);
    501    idct->pipe->delete_vs_state(idct->pipe, idct->vs);
    502    idct->pipe->delete_fs_state(idct->pipe, idct->fs);
    503 }
    504 
    505 static bool
    506 init_state(struct vl_idct *idct)
    507 {
    508    struct pipe_blend_state blend;
    509    struct pipe_rasterizer_state rs_state;
    510    struct pipe_sampler_state sampler;
    511    unsigned i;
    512 
    513    assert(idct);
    514 
    515    memset(&rs_state, 0, sizeof(rs_state));
    516    rs_state.point_size = 1;
    517    rs_state.half_pixel_center = true;
    518    rs_state.bottom_edge_rule = true;
    519    rs_state.depth_clip = 1;
    520    idct->rs_state = idct->pipe->create_rasterizer_state(idct->pipe, &rs_state);
    521    if (!idct->rs_state)
    522       goto error_rs_state;
    523 
    524    memset(&blend, 0, sizeof blend);
    525 
    526    blend.independent_blend_enable = 0;
    527    blend.rt[0].blend_enable = 0;
    528    blend.rt[0].rgb_func = PIPE_BLEND_ADD;
    529    blend.rt[0].rgb_src_factor = PIPE_BLENDFACTOR_ONE;
    530    blend.rt[0].rgb_dst_factor = PIPE_BLENDFACTOR_ONE;
    531    blend.rt[0].alpha_func = PIPE_BLEND_ADD;
    532    blend.rt[0].alpha_src_factor = PIPE_BLENDFACTOR_ONE;
    533    blend.rt[0].alpha_dst_factor = PIPE_BLENDFACTOR_ONE;
    534    blend.logicop_enable = 0;
    535    blend.logicop_func = PIPE_LOGICOP_CLEAR;
    536    /* Needed to allow color writes to FB, even if blending disabled */
    537    blend.rt[0].colormask = PIPE_MASK_RGBA;
    538    blend.dither = 0;
    539    idct->blend = idct->pipe->create_blend_state(idct->pipe, &blend);
    540    if (!idct->blend)
    541       goto error_blend;
    542 
    543    for (i = 0; i < 2; ++i) {
    544       memset(&sampler, 0, sizeof(sampler));
    545       sampler.wrap_s = PIPE_TEX_WRAP_REPEAT;
    546       sampler.wrap_t = PIPE_TEX_WRAP_REPEAT;
    547       sampler.wrap_r = PIPE_TEX_WRAP_REPEAT;
    548       sampler.min_img_filter = PIPE_TEX_FILTER_NEAREST;
    549       sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
    550       sampler.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
    551       sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
    552       sampler.compare_func = PIPE_FUNC_ALWAYS;
    553       sampler.normalized_coords = 1;
    554       idct->samplers[i] = idct->pipe->create_sampler_state(idct->pipe, &sampler);
    555       if (!idct->samplers[i])
    556          goto error_samplers;
    557    }
    558 
    559    return true;
    560 
    561 error_samplers:
    562    for (i = 0; i < 2; ++i)
    563       if (idct->samplers[i])
    564          idct->pipe->delete_sampler_state(idct->pipe, idct->samplers[i]);
    565 
    566    idct->pipe->delete_rasterizer_state(idct->pipe, idct->rs_state);
    567 
    568 error_blend:
    569    idct->pipe->delete_blend_state(idct->pipe, idct->blend);
    570 
    571 error_rs_state:
    572    return false;
    573 }
    574 
    575 static void
    576 cleanup_state(struct vl_idct *idct)
    577 {
    578    unsigned i;
    579 
    580    for (i = 0; i < 2; ++i)
    581       idct->pipe->delete_sampler_state(idct->pipe, idct->samplers[i]);
    582 
    583    idct->pipe->delete_rasterizer_state(idct->pipe, idct->rs_state);
    584    idct->pipe->delete_blend_state(idct->pipe, idct->blend);
    585 }
    586 
    587 static bool
    588 init_source(struct vl_idct *idct, struct vl_idct_buffer *buffer)
    589 {
    590    struct pipe_resource *tex;
    591    struct pipe_surface surf_templ;
    592 
    593    assert(idct && buffer);
    594 
    595    tex = buffer->sampler_views.individual.source->texture;
    596 
    597    buffer->fb_state_mismatch.width = tex->width0;
    598    buffer->fb_state_mismatch.height = tex->height0;
    599    buffer->fb_state_mismatch.nr_cbufs = 1;
    600 
    601    memset(&surf_templ, 0, sizeof(surf_templ));
    602    surf_templ.format = tex->format;
    603    surf_templ.u.tex.first_layer = 0;
    604    surf_templ.u.tex.last_layer = 0;
    605    buffer->fb_state_mismatch.cbufs[0] = idct->pipe->create_surface(idct->pipe, tex, &surf_templ);
    606 
    607    buffer->viewport_mismatch.scale[0] = tex->width0;
    608    buffer->viewport_mismatch.scale[1] = tex->height0;
    609    buffer->viewport_mismatch.scale[2] = 1;
    610 
    611    return true;
    612 }
    613 
    614 static void
    615 cleanup_source(struct vl_idct_buffer *buffer)
    616 {
    617    assert(buffer);
    618 
    619    pipe_surface_reference(&buffer->fb_state_mismatch.cbufs[0], NULL);
    620 
    621    pipe_sampler_view_reference(&buffer->sampler_views.individual.source, NULL);
    622 }
    623 
    624 static bool
    625 init_intermediate(struct vl_idct *idct, struct vl_idct_buffer *buffer)
    626 {
    627    struct pipe_resource *tex;
    628    struct pipe_surface surf_templ;
    629    unsigned i;
    630 
    631    assert(idct && buffer);
    632 
    633    tex = buffer->sampler_views.individual.intermediate->texture;
    634 
    635    buffer->fb_state.width = tex->width0;
    636    buffer->fb_state.height = tex->height0;
    637    buffer->fb_state.nr_cbufs = idct->nr_of_render_targets;
    638    for(i = 0; i < idct->nr_of_render_targets; ++i) {
    639       memset(&surf_templ, 0, sizeof(surf_templ));
    640       surf_templ.format = tex->format;
    641       surf_templ.u.tex.first_layer = i;
    642       surf_templ.u.tex.last_layer = i;
    643       buffer->fb_state.cbufs[i] = idct->pipe->create_surface(
    644          idct->pipe, tex, &surf_templ);
    645 
    646       if (!buffer->fb_state.cbufs[i])
    647          goto error_surfaces;
    648    }
    649 
    650    buffer->viewport.scale[0] = tex->width0;
    651    buffer->viewport.scale[1] = tex->height0;
    652    buffer->viewport.scale[2] = 1;
    653 
    654    return true;
    655 
    656 error_surfaces:
    657    for(i = 0; i < idct->nr_of_render_targets; ++i)
    658       pipe_surface_reference(&buffer->fb_state.cbufs[i], NULL);
    659 
    660    return false;
    661 }
    662 
    663 static void
    664 cleanup_intermediate(struct vl_idct_buffer *buffer)
    665 {
    666    unsigned i;
    667 
    668    assert(buffer);
    669 
    670    for(i = 0; i < PIPE_MAX_COLOR_BUFS; ++i)
    671       pipe_surface_reference(&buffer->fb_state.cbufs[i], NULL);
    672 
    673    pipe_sampler_view_reference(&buffer->sampler_views.individual.intermediate, NULL);
    674 }
    675 
    676 struct pipe_sampler_view *
    677 vl_idct_upload_matrix(struct pipe_context *pipe, float scale)
    678 {
    679    struct pipe_resource tex_templ, *matrix;
    680    struct pipe_sampler_view sv_templ, *sv;
    681    struct pipe_transfer *buf_transfer;
    682    unsigned i, j, pitch;
    683    float *f;
    684 
    685    struct pipe_box rect =
    686    {
    687       0, 0, 0,
    688       VL_BLOCK_WIDTH / 4,
    689       VL_BLOCK_HEIGHT,
    690       1
    691    };
    692 
    693    assert(pipe);
    694 
    695    memset(&tex_templ, 0, sizeof(tex_templ));
    696    tex_templ.target = PIPE_TEXTURE_2D;
    697    tex_templ.format = PIPE_FORMAT_R32G32B32A32_FLOAT;
    698    tex_templ.last_level = 0;
    699    tex_templ.width0 = 2;
    700    tex_templ.height0 = 8;
    701    tex_templ.depth0 = 1;
    702    tex_templ.array_size = 1;
    703    tex_templ.usage = PIPE_USAGE_IMMUTABLE;
    704    tex_templ.bind = PIPE_BIND_SAMPLER_VIEW;
    705    tex_templ.flags = 0;
    706 
    707    matrix = pipe->screen->resource_create(pipe->screen, &tex_templ);
    708    if (!matrix)
    709       goto error_matrix;
    710 
    711    f = pipe->transfer_map(pipe, matrix, 0,
    712                                      PIPE_TRANSFER_WRITE |
    713                                      PIPE_TRANSFER_DISCARD_RANGE,
    714                                      &rect, &buf_transfer);
    715    if (!f)
    716       goto error_map;
    717 
    718    pitch = buf_transfer->stride / sizeof(float);
    719 
    720    for(i = 0; i < VL_BLOCK_HEIGHT; ++i)
    721       for(j = 0; j < VL_BLOCK_WIDTH; ++j)
    722          // transpose and scale
    723          f[i * pitch + j] = ((const float (*)[8])const_matrix)[j][i] * scale;
    724 
    725    pipe->transfer_unmap(pipe, buf_transfer);
    726 
    727    memset(&sv_templ, 0, sizeof(sv_templ));
    728    u_sampler_view_default_template(&sv_templ, matrix, matrix->format);
    729    sv = pipe->create_sampler_view(pipe, matrix, &sv_templ);
    730    pipe_resource_reference(&matrix, NULL);
    731    if (!sv)
    732       goto error_map;
    733 
    734    return sv;
    735 
    736 error_map:
    737    pipe_resource_reference(&matrix, NULL);
    738 
    739 error_matrix:
    740    return NULL;
    741 }
    742 
    743 bool vl_idct_init(struct vl_idct *idct, struct pipe_context *pipe,
    744                   unsigned buffer_width, unsigned buffer_height,
    745                   unsigned nr_of_render_targets,
    746                   struct pipe_sampler_view *matrix,
    747                   struct pipe_sampler_view *transpose)
    748 {
    749    assert(idct && pipe);
    750    assert(matrix && transpose);
    751 
    752    idct->pipe = pipe;
    753    idct->buffer_width = buffer_width;
    754    idct->buffer_height = buffer_height;
    755    idct->nr_of_render_targets = nr_of_render_targets;
    756 
    757    pipe_sampler_view_reference(&idct->matrix, matrix);
    758    pipe_sampler_view_reference(&idct->transpose, transpose);
    759 
    760    if(!init_shaders(idct))
    761       return false;
    762 
    763    if(!init_state(idct)) {
    764       cleanup_shaders(idct);
    765       return false;
    766    }
    767 
    768    return true;
    769 }
    770 
    771 void
    772 vl_idct_cleanup(struct vl_idct *idct)
    773 {
    774    cleanup_shaders(idct);
    775    cleanup_state(idct);
    776 
    777    pipe_sampler_view_reference(&idct->matrix, NULL);
    778    pipe_sampler_view_reference(&idct->transpose, NULL);
    779 }
    780 
    781 bool
    782 vl_idct_init_buffer(struct vl_idct *idct, struct vl_idct_buffer *buffer,
    783                     struct pipe_sampler_view *source,
    784                     struct pipe_sampler_view *intermediate)
    785 {
    786    assert(buffer && idct);
    787    assert(source && intermediate);
    788 
    789    memset(buffer, 0, sizeof(struct vl_idct_buffer));
    790 
    791    pipe_sampler_view_reference(&buffer->sampler_views.individual.matrix, idct->matrix);
    792    pipe_sampler_view_reference(&buffer->sampler_views.individual.source, source);
    793    pipe_sampler_view_reference(&buffer->sampler_views.individual.transpose, idct->transpose);
    794    pipe_sampler_view_reference(&buffer->sampler_views.individual.intermediate, intermediate);
    795 
    796    if (!init_source(idct, buffer))
    797       return false;
    798 
    799    if (!init_intermediate(idct, buffer))
    800       return false;
    801 
    802    return true;
    803 }
    804 
    805 void
    806 vl_idct_cleanup_buffer(struct vl_idct_buffer *buffer)
    807 {
    808    assert(buffer);
    809 
    810    cleanup_source(buffer);
    811    cleanup_intermediate(buffer);
    812 
    813    pipe_sampler_view_reference(&buffer->sampler_views.individual.matrix, NULL);
    814    pipe_sampler_view_reference(&buffer->sampler_views.individual.transpose, NULL);
    815 }
    816 
    817 void
    818 vl_idct_flush(struct vl_idct *idct, struct vl_idct_buffer *buffer, unsigned num_instances)
    819 {
    820    assert(buffer);
    821 
    822    idct->pipe->bind_rasterizer_state(idct->pipe, idct->rs_state);
    823    idct->pipe->bind_blend_state(idct->pipe, idct->blend);
    824 
    825    idct->pipe->bind_sampler_states(idct->pipe, PIPE_SHADER_FRAGMENT,
    826                                    0, 2, idct->samplers);
    827 
    828    idct->pipe->set_sampler_views(idct->pipe, PIPE_SHADER_FRAGMENT, 0, 2,
    829                                  buffer->sampler_views.stage[0]);
    830 
    831    /* mismatch control */
    832    idct->pipe->set_framebuffer_state(idct->pipe, &buffer->fb_state_mismatch);
    833    idct->pipe->set_viewport_states(idct->pipe, 0, 1, &buffer->viewport_mismatch);
    834    idct->pipe->bind_vs_state(idct->pipe, idct->vs_mismatch);
    835    idct->pipe->bind_fs_state(idct->pipe, idct->fs_mismatch);
    836    util_draw_arrays_instanced(idct->pipe, PIPE_PRIM_POINTS, 0, 1, 0, num_instances);
    837 
    838    /* first stage */
    839    idct->pipe->set_framebuffer_state(idct->pipe, &buffer->fb_state);
    840    idct->pipe->set_viewport_states(idct->pipe, 0, 1, &buffer->viewport);
    841    idct->pipe->bind_vs_state(idct->pipe, idct->vs);
    842    idct->pipe->bind_fs_state(idct->pipe, idct->fs);
    843    util_draw_arrays_instanced(idct->pipe, PIPE_PRIM_QUADS, 0, 4, 0, num_instances);
    844 }
    845 
    846 void
    847 vl_idct_prepare_stage2(struct vl_idct *idct, struct vl_idct_buffer *buffer)
    848 {
    849    assert(buffer);
    850 
    851    /* second stage */
    852    idct->pipe->bind_rasterizer_state(idct->pipe, idct->rs_state);
    853    idct->pipe->bind_sampler_states(idct->pipe, PIPE_SHADER_FRAGMENT,
    854                                    0, 2, idct->samplers);
    855    idct->pipe->set_sampler_views(idct->pipe, PIPE_SHADER_FRAGMENT,
    856                                  0, 2, buffer->sampler_views.stage[1]);
    857 }
    858 
    859