Home | History | Annotate | Download | only in vl
      1 /**************************************************************************
      2  *
      3  * Copyright 2010 Christian König
      4  * All Rights Reserved.
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the
      8  * "Software"), to deal in the Software without restriction, including
      9  * without limitation the rights to use, copy, modify, merge, publish,
     10  * distribute, sub license, and/or sell copies of the Software, and to
     11  * permit persons to whom the Software is furnished to do so, subject to
     12  * the following conditions:
     13  *
     14  * The above copyright notice and this permission notice (including the
     15  * next paragraph) shall be included in all copies or substantial portions
     16  * of the Software.
     17  *
     18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
     19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
     21  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
     22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
     23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
     24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     25  *
     26  **************************************************************************/
     27 
     28 #include <assert.h>
     29 
     30 #include "pipe/p_context.h"
     31 #include "pipe/p_screen.h"
     32 
     33 #include "util/u_draw.h"
     34 #include "util/u_sampler.h"
     35 #include "util/u_memory.h"
     36 
     37 #include "tgsi/tgsi_ureg.h"
     38 
     39 #include "vl_defines.h"
     40 #include "vl_types.h"
     41 #include "vl_vertex_buffers.h"
     42 #include "vl_idct.h"
     43 
     44 enum VS_OUTPUT
     45 {
     46    VS_O_VPOS = 0,
     47    VS_O_L_ADDR0 = 0,
     48    VS_O_L_ADDR1,
     49    VS_O_R_ADDR0,
     50    VS_O_R_ADDR1
     51 };
     52 
     53 /**
     54  * The DCT matrix stored as hex representation of floats. Equal to the following equation:
     55  * for (i = 0; i < 8; ++i)
     56  *    for (j = 0; j < 8; ++j)
     57  *       if (i == 0) const_matrix[i][j] = 1.0f / sqrtf(8.0f);
     58  *       else const_matrix[i][j] = sqrtf(2.0f / 8.0f) * cosf((2 * j + 1) * i * M_PI / (2.0f * 8.0f));
     59  */
     60 static const uint32_t const_matrix[8][8] = {
     61    { 0x3eb504f3, 0x3eb504f3, 0x3eb504f3, 0x3eb504f3, 0x3eb504f3, 0x3eb504f3, 0x3eb504f3, 0x3eb504f3 },
     62    { 0x3efb14be, 0x3ed4db31, 0x3e8e39da, 0x3dc7c5c4, 0xbdc7c5c2, 0xbe8e39d9, 0xbed4db32, 0xbefb14bf },
     63    { 0x3eec835f, 0x3e43ef15, 0xbe43ef14, 0xbeec835e, 0xbeec835f, 0xbe43ef1a, 0x3e43ef1b, 0x3eec835f },
     64    { 0x3ed4db31, 0xbdc7c5c2, 0xbefb14bf, 0xbe8e39dd, 0x3e8e39d7, 0x3efb14bf, 0x3dc7c5d0, 0xbed4db34 },
     65    { 0x3eb504f3, 0xbeb504f3, 0xbeb504f4, 0x3eb504f1, 0x3eb504f3, 0xbeb504f0, 0xbeb504ef, 0x3eb504f4 },
     66    { 0x3e8e39da, 0xbefb14bf, 0x3dc7c5c8, 0x3ed4db32, 0xbed4db34, 0xbdc7c5bb, 0x3efb14bf, 0xbe8e39d7 },
     67    { 0x3e43ef15, 0xbeec835f, 0x3eec835f, 0xbe43ef07, 0xbe43ef23, 0x3eec8361, 0xbeec835c, 0x3e43ef25 },
     68    { 0x3dc7c5c4, 0xbe8e39dd, 0x3ed4db32, 0xbefb14c0, 0x3efb14be, 0xbed4db31, 0x3e8e39ce, 0xbdc7c596 },
     69 };
     70 
     71 static void
     72 calc_addr(struct ureg_program *shader, struct ureg_dst addr[2],
     73           struct ureg_src tc, struct ureg_src start, bool right_side,
     74           bool transposed, float size)
     75 {
     76    unsigned wm_start = (right_side == transposed) ? TGSI_WRITEMASK_X : TGSI_WRITEMASK_Y;
     77    unsigned sw_start = right_side ? TGSI_SWIZZLE_Y : TGSI_SWIZZLE_X;
     78 
     79    unsigned wm_tc = (right_side == transposed) ? TGSI_WRITEMASK_Y : TGSI_WRITEMASK_X;
     80    unsigned sw_tc = right_side ? TGSI_SWIZZLE_X : TGSI_SWIZZLE_Y;
     81 
     82    /*
     83     * addr[0..1].(start) = right_side ? start.x : tc.x
     84     * addr[0..1].(tc) = right_side ? tc.y : start.y
     85     * addr[0..1].z = tc.z
     86     * addr[1].(start) += 1.0f / scale
     87     */
     88    ureg_MOV(shader, ureg_writemask(addr[0], wm_start), ureg_scalar(start, sw_start));
     89    ureg_MOV(shader, ureg_writemask(addr[0], wm_tc), ureg_scalar(tc, sw_tc));
     90 
     91    ureg_ADD(shader, ureg_writemask(addr[1], wm_start), ureg_scalar(start, sw_start), ureg_imm1f(shader, 1.0f / size));
     92    ureg_MOV(shader, ureg_writemask(addr[1], wm_tc), ureg_scalar(tc, sw_tc));
     93 }
     94 
     95 static void
     96 increment_addr(struct ureg_program *shader, struct ureg_dst daddr[2],
     97                struct ureg_src saddr[2], bool right_side, bool transposed,
     98                int pos, float size)
     99 {
    100    unsigned wm_start = (right_side == transposed) ? TGSI_WRITEMASK_X : TGSI_WRITEMASK_Y;
    101    unsigned wm_tc = (right_side == transposed) ? TGSI_WRITEMASK_Y : TGSI_WRITEMASK_X;
    102 
    103    /*
    104     * daddr[0..1].(start) = saddr[0..1].(start)
    105     * daddr[0..1].(tc) = saddr[0..1].(tc)
    106     */
    107 
    108    ureg_MOV(shader, ureg_writemask(daddr[0], wm_start), saddr[0]);
    109    ureg_ADD(shader, ureg_writemask(daddr[0], wm_tc), saddr[0], ureg_imm1f(shader, pos / size));
    110    ureg_MOV(shader, ureg_writemask(daddr[1], wm_start), saddr[1]);
    111    ureg_ADD(shader, ureg_writemask(daddr[1], wm_tc), saddr[1], ureg_imm1f(shader, pos / size));
    112 }
    113 
    114 static void
    115 fetch_four(struct ureg_program *shader, struct ureg_dst m[2], struct ureg_src addr[2],
    116            struct ureg_src sampler, bool resource3d)
    117 {
    118    ureg_TEX(shader, m[0], resource3d ? TGSI_TEXTURE_3D : TGSI_TEXTURE_2D, addr[0], sampler);
    119    ureg_TEX(shader, m[1], resource3d ? TGSI_TEXTURE_3D : TGSI_TEXTURE_2D, addr[1], sampler);
    120 }
    121 
    122 static void
    123 matrix_mul(struct ureg_program *shader, struct ureg_dst dst, struct ureg_dst l[2], struct ureg_dst r[2])
    124 {
    125    struct ureg_dst tmp;
    126 
    127    tmp = ureg_DECL_temporary(shader);
    128 
    129    /*
    130     * tmp.xy = dot4(m[0][0..1], m[1][0..1])
    131     * dst = tmp.x + tmp.y
    132     */
    133    ureg_DP4(shader, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_src(l[0]), ureg_src(r[0]));
    134    ureg_DP4(shader, ureg_writemask(tmp, TGSI_WRITEMASK_Y), ureg_src(l[1]), ureg_src(r[1]));
    135    ureg_ADD(shader, dst,
    136       ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X),
    137       ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_Y));
    138 
    139    ureg_release_temporary(shader, tmp);
    140 }
    141 
    142 static void *
    143 create_mismatch_vert_shader(struct vl_idct *idct)
    144 {
    145    struct ureg_program *shader;
    146    struct ureg_src vpos;
    147    struct ureg_src scale;
    148    struct ureg_dst t_tex;
    149    struct ureg_dst o_vpos, o_addr[2];
    150 
    151    shader = ureg_create(TGSI_PROCESSOR_VERTEX);
    152    if (!shader)
    153       return NULL;
    154 
    155    vpos = ureg_DECL_vs_input(shader, VS_I_VPOS);
    156 
    157    t_tex = ureg_DECL_temporary(shader);
    158 
    159    o_vpos = ureg_DECL_output(shader, TGSI_SEMANTIC_POSITION, VS_O_VPOS);
    160 
    161    o_addr[0] = ureg_DECL_output(shader, TGSI_SEMANTIC_GENERIC, VS_O_L_ADDR0);
    162    o_addr[1] = ureg_DECL_output(shader, TGSI_SEMANTIC_GENERIC, VS_O_L_ADDR1);
    163 
    164    /*
    165     * scale = (VL_BLOCK_WIDTH, VL_BLOCK_HEIGHT) / (dst.width, dst.height)
    166     *
    167     * t_vpos = vpos + 7 / VL_BLOCK_WIDTH
    168     * o_vpos.xy = t_vpos * scale
    169     *
    170     * o_addr = calc_addr(...)
    171     *
    172     */
    173 
    174    scale = ureg_imm2f(shader,
    175       (float)VL_BLOCK_WIDTH / idct->buffer_width,
    176       (float)VL_BLOCK_HEIGHT / idct->buffer_height);
    177 
    178    ureg_MAD(shader, ureg_writemask(o_vpos, TGSI_WRITEMASK_XY), vpos, scale, scale);
    179    ureg_MOV(shader, ureg_writemask(o_vpos, TGSI_WRITEMASK_ZW), ureg_imm1f(shader, 1.0f));
    180 
    181    ureg_MUL(shader, ureg_writemask(t_tex, TGSI_WRITEMASK_XY), vpos, scale);
    182    calc_addr(shader, o_addr, ureg_src(t_tex), ureg_src(t_tex), false, false, idct->buffer_width / 4);
    183 
    184    ureg_release_temporary(shader, t_tex);
    185 
    186    ureg_END(shader);
    187 
    188    return ureg_create_shader_and_destroy(shader, idct->pipe);
    189 }
    190 
    191 static void *
    192 create_mismatch_frag_shader(struct vl_idct *idct)
    193 {
    194    struct ureg_program *shader;
    195 
    196    struct ureg_src addr[2];
    197 
    198    struct ureg_dst m[8][2];
    199    struct ureg_dst fragment;
    200 
    201    unsigned i;
    202 
    203    shader = ureg_create(TGSI_PROCESSOR_FRAGMENT);
    204    if (!shader)
    205       return NULL;
    206 
    207    addr[0] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, VS_O_L_ADDR0, TGSI_INTERPOLATE_LINEAR);
    208    addr[1] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, VS_O_L_ADDR1, TGSI_INTERPOLATE_LINEAR);
    209 
    210    fragment = ureg_DECL_output(shader, TGSI_SEMANTIC_COLOR, 0);
    211 
    212    for (i = 0; i < 8; ++i) {
    213       m[i][0] = ureg_DECL_temporary(shader);
    214       m[i][1] = ureg_DECL_temporary(shader);
    215    }
    216 
    217    for (i = 0; i < 8; ++i) {
    218       increment_addr(shader, m[i], addr, false, false, i, idct->buffer_height);
    219    }
    220 
    221    for (i = 0; i < 8; ++i) {
    222       struct ureg_src s_addr[2];
    223       s_addr[0] = ureg_src(m[i][0]);
    224       s_addr[1] = ureg_src(m[i][1]);
    225       fetch_four(shader, m[i], s_addr, ureg_DECL_sampler(shader, 0), false);
    226    }
    227 
    228    for (i = 1; i < 8; ++i) {
    229       ureg_ADD(shader, m[0][0], ureg_src(m[0][0]), ureg_src(m[i][0]));
    230       ureg_ADD(shader, m[0][1], ureg_src(m[0][1]), ureg_src(m[i][1]));
    231    }
    232 
    233    ureg_ADD(shader, m[0][0], ureg_src(m[0][0]), ureg_src(m[0][1]));
    234    ureg_DP4(shader, m[0][0], ureg_abs(ureg_src(m[0][0])), ureg_imm1f(shader, 1 << 14));
    235 
    236    ureg_MUL(shader, ureg_writemask(m[0][0], TGSI_WRITEMASK_W), ureg_abs(ureg_src(m[7][1])), ureg_imm1f(shader, 1 << 14));
    237    ureg_FRC(shader, m[0][0], ureg_src(m[0][0]));
    238    ureg_SGT(shader, m[0][0], ureg_imm1f(shader, 0.5f), ureg_abs(ureg_src(m[0][0])));
    239 
    240    ureg_CMP(shader, ureg_writemask(m[0][0], TGSI_WRITEMASK_W), ureg_negate(ureg_src(m[0][0])),
    241             ureg_imm1f(shader, 1.0f / (1 << 15)), ureg_imm1f(shader, -1.0f / (1 << 15)));
    242    ureg_MUL(shader, ureg_writemask(m[0][0], TGSI_WRITEMASK_W), ureg_src(m[0][0]),
    243             ureg_scalar(ureg_src(m[0][0]), TGSI_SWIZZLE_X));
    244 
    245    ureg_MOV(shader, ureg_writemask(fragment, TGSI_WRITEMASK_XYZ), ureg_src(m[7][1]));
    246    ureg_ADD(shader, ureg_writemask(fragment, TGSI_WRITEMASK_W), ureg_src(m[0][0]), ureg_src(m[7][1]));
    247 
    248    for (i = 0; i < 8; ++i) {
    249       ureg_release_temporary(shader, m[i][0]);
    250       ureg_release_temporary(shader, m[i][1]);
    251    }
    252 
    253    ureg_END(shader);
    254 
    255    return ureg_create_shader_and_destroy(shader, idct->pipe);
    256 }
    257 
    258 static void *
    259 create_stage1_vert_shader(struct vl_idct *idct)
    260 {
    261    struct ureg_program *shader;
    262    struct ureg_src vrect, vpos;
    263    struct ureg_src scale;
    264    struct ureg_dst t_tex, t_start;
    265    struct ureg_dst o_vpos, o_l_addr[2], o_r_addr[2];
    266 
    267    shader = ureg_create(TGSI_PROCESSOR_VERTEX);
    268    if (!shader)
    269       return NULL;
    270 
    271    vrect = ureg_DECL_vs_input(shader, VS_I_RECT);
    272    vpos = ureg_DECL_vs_input(shader, VS_I_VPOS);
    273 
    274    t_tex = ureg_DECL_temporary(shader);
    275    t_start = ureg_DECL_temporary(shader);
    276 
    277    o_vpos = ureg_DECL_output(shader, TGSI_SEMANTIC_POSITION, VS_O_VPOS);
    278 
    279    o_l_addr[0] = ureg_DECL_output(shader, TGSI_SEMANTIC_GENERIC, VS_O_L_ADDR0);
    280    o_l_addr[1] = ureg_DECL_output(shader, TGSI_SEMANTIC_GENERIC, VS_O_L_ADDR1);
    281 
    282    o_r_addr[0] = ureg_DECL_output(shader, TGSI_SEMANTIC_GENERIC, VS_O_R_ADDR0);
    283    o_r_addr[1] = ureg_DECL_output(shader, TGSI_SEMANTIC_GENERIC, VS_O_R_ADDR1);
    284 
    285    /*
    286     * scale = (VL_BLOCK_WIDTH, VL_BLOCK_HEIGHT) / (dst.width, dst.height)
    287     *
    288     * t_vpos = vpos + vrect
    289     * o_vpos.xy = t_vpos * scale
    290     * o_vpos.zw = vpos
    291     *
    292     * o_l_addr = calc_addr(...)
    293     * o_r_addr = calc_addr(...)
    294     *
    295     */
    296 
    297    scale = ureg_imm2f(shader,
    298       (float)VL_BLOCK_WIDTH / idct->buffer_width,
    299       (float)VL_BLOCK_HEIGHT / idct->buffer_height);
    300 
    301    ureg_ADD(shader, ureg_writemask(t_tex, TGSI_WRITEMASK_XY), vpos, vrect);
    302    ureg_MUL(shader, ureg_writemask(t_tex, TGSI_WRITEMASK_XY), ureg_src(t_tex), scale);
    303 
    304    ureg_MOV(shader, ureg_writemask(o_vpos, TGSI_WRITEMASK_XY), ureg_src(t_tex));
    305    ureg_MOV(shader, ureg_writemask(o_vpos, TGSI_WRITEMASK_ZW), ureg_imm1f(shader, 1.0f));
    306 
    307    ureg_MUL(shader, ureg_writemask(t_start, TGSI_WRITEMASK_XY), vpos, scale);
    308 
    309    calc_addr(shader, o_l_addr, ureg_src(t_tex), ureg_src(t_start), false, false, idct->buffer_width / 4);
    310    calc_addr(shader, o_r_addr, vrect, ureg_imm1f(shader, 0.0f), true, true, VL_BLOCK_WIDTH / 4);
    311 
    312    ureg_release_temporary(shader, t_tex);
    313    ureg_release_temporary(shader, t_start);
    314 
    315    ureg_END(shader);
    316 
    317    return ureg_create_shader_and_destroy(shader, idct->pipe);
    318 }
    319 
    320 static void *
    321 create_stage1_frag_shader(struct vl_idct *idct)
    322 {
    323    struct ureg_program *shader;
    324 
    325    struct ureg_src l_addr[2], r_addr[2];
    326 
    327    struct ureg_dst l[4][2], r[2];
    328    struct ureg_dst *fragment;
    329 
    330    int i, j;
    331 
    332    shader = ureg_create(TGSI_PROCESSOR_FRAGMENT);
    333    if (!shader)
    334       return NULL;
    335 
    336    fragment = MALLOC(idct->nr_of_render_targets * sizeof(struct ureg_dst));
    337 
    338    l_addr[0] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, VS_O_L_ADDR0, TGSI_INTERPOLATE_LINEAR);
    339    l_addr[1] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, VS_O_L_ADDR1, TGSI_INTERPOLATE_LINEAR);
    340 
    341    r_addr[0] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, VS_O_R_ADDR0, TGSI_INTERPOLATE_LINEAR);
    342    r_addr[1] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, VS_O_R_ADDR1, TGSI_INTERPOLATE_LINEAR);
    343 
    344    for (i = 0; i < idct->nr_of_render_targets; ++i)
    345        fragment[i] = ureg_DECL_output(shader, TGSI_SEMANTIC_COLOR, i);
    346 
    347    for (i = 0; i < 4; ++i) {
    348       l[i][0] = ureg_DECL_temporary(shader);
    349       l[i][1] = ureg_DECL_temporary(shader);
    350    }
    351 
    352    r[0] = ureg_DECL_temporary(shader);
    353    r[1] = ureg_DECL_temporary(shader);
    354 
    355    for (i = 0; i < 4; ++i) {
    356       increment_addr(shader, l[i], l_addr, false, false, i - 2, idct->buffer_height);
    357    }
    358 
    359    for (i = 0; i < 4; ++i) {
    360       struct ureg_src s_addr[2];
    361       s_addr[0] = ureg_src(l[i][0]);
    362       s_addr[1] = ureg_src(l[i][1]);
    363       fetch_four(shader, l[i], s_addr, ureg_DECL_sampler(shader, 0), false);
    364    }
    365 
    366    for (i = 0; i < idct->nr_of_render_targets; ++i) {
    367       struct ureg_src s_addr[2];
    368 
    369       increment_addr(shader, r, r_addr, true, true, i - (signed)idct->nr_of_render_targets / 2, VL_BLOCK_HEIGHT);
    370 
    371       s_addr[0] = ureg_src(r[0]);
    372       s_addr[1] = ureg_src(r[1]);
    373       fetch_four(shader, r, s_addr, ureg_DECL_sampler(shader, 1), false);
    374 
    375       for (j = 0; j < 4; ++j) {
    376          matrix_mul(shader, ureg_writemask(fragment[i], TGSI_WRITEMASK_X << j), l[j], r);
    377       }
    378    }
    379 
    380    for (i = 0; i < 4; ++i) {
    381       ureg_release_temporary(shader, l[i][0]);
    382       ureg_release_temporary(shader, l[i][1]);
    383    }
    384    ureg_release_temporary(shader, r[0]);
    385    ureg_release_temporary(shader, r[1]);
    386 
    387    ureg_END(shader);
    388 
    389    FREE(fragment);
    390 
    391    return ureg_create_shader_and_destroy(shader, idct->pipe);
    392 }
    393 
    394 void
    395 vl_idct_stage2_vert_shader(struct vl_idct *idct, struct ureg_program *shader,
    396                            unsigned first_output, struct ureg_dst tex)
    397 {
    398    struct ureg_src vrect, vpos;
    399    struct ureg_src scale;
    400    struct ureg_dst t_start;
    401    struct ureg_dst o_l_addr[2], o_r_addr[2];
    402 
    403    vrect = ureg_DECL_vs_input(shader, VS_I_RECT);
    404    vpos = ureg_DECL_vs_input(shader, VS_I_VPOS);
    405 
    406    t_start = ureg_DECL_temporary(shader);
    407 
    408    --first_output;
    409 
    410    o_l_addr[0] = ureg_DECL_output(shader, TGSI_SEMANTIC_GENERIC, first_output + VS_O_L_ADDR0);
    411    o_l_addr[1] = ureg_DECL_output(shader, TGSI_SEMANTIC_GENERIC, first_output + VS_O_L_ADDR1);
    412 
    413    o_r_addr[0] = ureg_DECL_output(shader, TGSI_SEMANTIC_GENERIC, first_output + VS_O_R_ADDR0);
    414    o_r_addr[1] = ureg_DECL_output(shader, TGSI_SEMANTIC_GENERIC, first_output + VS_O_R_ADDR1);
    415 
    416    scale = ureg_imm2f(shader,
    417       (float)VL_BLOCK_WIDTH / idct->buffer_width,
    418       (float)VL_BLOCK_HEIGHT / idct->buffer_height);
    419 
    420    ureg_MUL(shader, ureg_writemask(tex, TGSI_WRITEMASK_Z),
    421       ureg_scalar(vrect, TGSI_SWIZZLE_X),
    422       ureg_imm1f(shader, VL_BLOCK_WIDTH / idct->nr_of_render_targets));
    423    ureg_MUL(shader, ureg_writemask(t_start, TGSI_WRITEMASK_XY), vpos, scale);
    424 
    425    calc_addr(shader, o_l_addr, vrect, ureg_imm1f(shader, 0.0f), false, false, VL_BLOCK_WIDTH / 4);
    426    calc_addr(shader, o_r_addr, ureg_src(tex), ureg_src(t_start), true, false, idct->buffer_height / 4);
    427 
    428    ureg_MOV(shader, ureg_writemask(o_r_addr[0], TGSI_WRITEMASK_Z), ureg_src(tex));
    429    ureg_MOV(shader, ureg_writemask(o_r_addr[1], TGSI_WRITEMASK_Z), ureg_src(tex));
    430 }
    431 
    432 void
    433 vl_idct_stage2_frag_shader(struct vl_idct *idct, struct ureg_program *shader,
    434                            unsigned first_input, struct ureg_dst fragment)
    435 {
    436    struct ureg_src l_addr[2], r_addr[2];
    437 
    438    struct ureg_dst l[2], r[2];
    439 
    440    --first_input;
    441 
    442    l_addr[0] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, first_input + VS_O_L_ADDR0, TGSI_INTERPOLATE_LINEAR);
    443    l_addr[1] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, first_input + VS_O_L_ADDR1, TGSI_INTERPOLATE_LINEAR);
    444 
    445    r_addr[0] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, first_input + VS_O_R_ADDR0, TGSI_INTERPOLATE_LINEAR);
    446    r_addr[1] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, first_input + VS_O_R_ADDR1, TGSI_INTERPOLATE_LINEAR);
    447 
    448    l[0] = ureg_DECL_temporary(shader);
    449    l[1] = ureg_DECL_temporary(shader);
    450    r[0] = ureg_DECL_temporary(shader);
    451    r[1] = ureg_DECL_temporary(shader);
    452 
    453    fetch_four(shader, l, l_addr, ureg_DECL_sampler(shader, 1), false);
    454    fetch_four(shader, r, r_addr, ureg_DECL_sampler(shader, 0), true);
    455 
    456    matrix_mul(shader, fragment, l, r);
    457 
    458    ureg_release_temporary(shader, l[0]);
    459    ureg_release_temporary(shader, l[1]);
    460    ureg_release_temporary(shader, r[0]);
    461    ureg_release_temporary(shader, r[1]);
    462 }
    463 
    464 static bool
    465 init_shaders(struct vl_idct *idct)
    466 {
    467    idct->vs_mismatch = create_mismatch_vert_shader(idct);
    468    if (!idct->vs_mismatch)
    469       goto error_vs_mismatch;
    470 
    471    idct->fs_mismatch = create_mismatch_frag_shader(idct);
    472    if (!idct->fs_mismatch)
    473       goto error_fs_mismatch;
    474 
    475    idct->vs = create_stage1_vert_shader(idct);
    476    if (!idct->vs)
    477       goto error_vs;
    478 
    479    idct->fs = create_stage1_frag_shader(idct);
    480    if (!idct->fs)
    481       goto error_fs;
    482 
    483    return true;
    484 
    485 error_fs:
    486    idct->pipe->delete_vs_state(idct->pipe, idct->vs);
    487 
    488 error_vs:
    489    idct->pipe->delete_vs_state(idct->pipe, idct->vs_mismatch);
    490 
    491 error_fs_mismatch:
    492    idct->pipe->delete_vs_state(idct->pipe, idct->fs);
    493 
    494 error_vs_mismatch:
    495    return false;
    496 }
    497 
    498 static void
    499 cleanup_shaders(struct vl_idct *idct)
    500 {
    501    idct->pipe->delete_vs_state(idct->pipe, idct->vs_mismatch);
    502    idct->pipe->delete_fs_state(idct->pipe, idct->fs_mismatch);
    503    idct->pipe->delete_vs_state(idct->pipe, idct->vs);
    504    idct->pipe->delete_fs_state(idct->pipe, idct->fs);
    505 }
    506 
    507 static bool
    508 init_state(struct vl_idct *idct)
    509 {
    510    struct pipe_blend_state blend;
    511    struct pipe_rasterizer_state rs_state;
    512    struct pipe_sampler_state sampler;
    513    unsigned i;
    514 
    515    assert(idct);
    516 
    517    memset(&rs_state, 0, sizeof(rs_state));
    518    rs_state.point_size = 1;
    519    rs_state.gl_rasterization_rules = true;
    520    rs_state.depth_clip = 1;
    521    idct->rs_state = idct->pipe->create_rasterizer_state(idct->pipe, &rs_state);
    522    if (!idct->rs_state)
    523       goto error_rs_state;
    524 
    525    memset(&blend, 0, sizeof blend);
    526 
    527    blend.independent_blend_enable = 0;
    528    blend.rt[0].blend_enable = 0;
    529    blend.rt[0].rgb_func = PIPE_BLEND_ADD;
    530    blend.rt[0].rgb_src_factor = PIPE_BLENDFACTOR_ONE;
    531    blend.rt[0].rgb_dst_factor = PIPE_BLENDFACTOR_ONE;
    532    blend.rt[0].alpha_func = PIPE_BLEND_ADD;
    533    blend.rt[0].alpha_src_factor = PIPE_BLENDFACTOR_ONE;
    534    blend.rt[0].alpha_dst_factor = PIPE_BLENDFACTOR_ONE;
    535    blend.logicop_enable = 0;
    536    blend.logicop_func = PIPE_LOGICOP_CLEAR;
    537    /* Needed to allow color writes to FB, even if blending disabled */
    538    blend.rt[0].colormask = PIPE_MASK_RGBA;
    539    blend.dither = 0;
    540    idct->blend = idct->pipe->create_blend_state(idct->pipe, &blend);
    541    if (!idct->blend)
    542       goto error_blend;
    543 
    544    for (i = 0; i < 2; ++i) {
    545       memset(&sampler, 0, sizeof(sampler));
    546       sampler.wrap_s = PIPE_TEX_WRAP_REPEAT;
    547       sampler.wrap_t = PIPE_TEX_WRAP_REPEAT;
    548       sampler.wrap_r = PIPE_TEX_WRAP_REPEAT;
    549       sampler.min_img_filter = PIPE_TEX_FILTER_NEAREST;
    550       sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
    551       sampler.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
    552       sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
    553       sampler.compare_func = PIPE_FUNC_ALWAYS;
    554       sampler.normalized_coords = 1;
    555       idct->samplers[i] = idct->pipe->create_sampler_state(idct->pipe, &sampler);
    556       if (!idct->samplers[i])
    557          goto error_samplers;
    558    }
    559 
    560    return true;
    561 
    562 error_samplers:
    563    for (i = 0; i < 2; ++i)
    564       if (idct->samplers[i])
    565          idct->pipe->delete_sampler_state(idct->pipe, idct->samplers[i]);
    566 
    567    idct->pipe->delete_rasterizer_state(idct->pipe, idct->rs_state);
    568 
    569 error_blend:
    570    idct->pipe->delete_blend_state(idct->pipe, idct->blend);
    571 
    572 error_rs_state:
    573    return false;
    574 }
    575 
    576 static void
    577 cleanup_state(struct vl_idct *idct)
    578 {
    579    unsigned i;
    580 
    581    for (i = 0; i < 2; ++i)
    582       idct->pipe->delete_sampler_state(idct->pipe, idct->samplers[i]);
    583 
    584    idct->pipe->delete_rasterizer_state(idct->pipe, idct->rs_state);
    585    idct->pipe->delete_blend_state(idct->pipe, idct->blend);
    586 }
    587 
    588 static bool
    589 init_source(struct vl_idct *idct, struct vl_idct_buffer *buffer)
    590 {
    591    struct pipe_resource *tex;
    592    struct pipe_surface surf_templ;
    593 
    594    assert(idct && buffer);
    595 
    596    tex = buffer->sampler_views.individual.source->texture;
    597 
    598    buffer->fb_state_mismatch.width = tex->width0;
    599    buffer->fb_state_mismatch.height = tex->height0;
    600    buffer->fb_state_mismatch.nr_cbufs = 1;
    601 
    602    memset(&surf_templ, 0, sizeof(surf_templ));
    603    surf_templ.format = tex->format;
    604    surf_templ.u.tex.first_layer = 0;
    605    surf_templ.u.tex.last_layer = 0;
    606    surf_templ.usage = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET;
    607    buffer->fb_state_mismatch.cbufs[0] = idct->pipe->create_surface(idct->pipe, tex, &surf_templ);
    608 
    609    buffer->viewport_mismatch.scale[0] = tex->width0;
    610    buffer->viewport_mismatch.scale[1] = tex->height0;
    611    buffer->viewport_mismatch.scale[2] = 1;
    612    buffer->viewport_mismatch.scale[3] = 1;
    613 
    614    return true;
    615 }
    616 
    617 static void
    618 cleanup_source(struct vl_idct_buffer *buffer)
    619 {
    620    assert(buffer);
    621 
    622    pipe_surface_reference(&buffer->fb_state_mismatch.cbufs[0], NULL);
    623 
    624    pipe_sampler_view_reference(&buffer->sampler_views.individual.source, NULL);
    625 }
    626 
    627 static bool
    628 init_intermediate(struct vl_idct *idct, struct vl_idct_buffer *buffer)
    629 {
    630    struct pipe_resource *tex;
    631    struct pipe_surface surf_templ;
    632    unsigned i;
    633 
    634    assert(idct && buffer);
    635 
    636    tex = buffer->sampler_views.individual.intermediate->texture;
    637 
    638    buffer->fb_state.width = tex->width0;
    639    buffer->fb_state.height = tex->height0;
    640    buffer->fb_state.nr_cbufs = idct->nr_of_render_targets;
    641    for(i = 0; i < idct->nr_of_render_targets; ++i) {
    642       memset(&surf_templ, 0, sizeof(surf_templ));
    643       surf_templ.format = tex->format;
    644       surf_templ.u.tex.first_layer = i;
    645       surf_templ.u.tex.last_layer = i;
    646       surf_templ.usage = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET;
    647       buffer->fb_state.cbufs[i] = idct->pipe->create_surface(
    648          idct->pipe, tex, &surf_templ);
    649 
    650       if (!buffer->fb_state.cbufs[i])
    651          goto error_surfaces;
    652    }
    653 
    654    buffer->viewport.scale[0] = tex->width0;
    655    buffer->viewport.scale[1] = tex->height0;
    656    buffer->viewport.scale[2] = 1;
    657    buffer->viewport.scale[3] = 1;
    658 
    659    return true;
    660 
    661 error_surfaces:
    662    for(i = 0; i < idct->nr_of_render_targets; ++i)
    663       pipe_surface_reference(&buffer->fb_state.cbufs[i], NULL);
    664 
    665    return false;
    666 }
    667 
    668 static void
    669 cleanup_intermediate(struct vl_idct_buffer *buffer)
    670 {
    671    unsigned i;
    672 
    673    assert(buffer);
    674 
    675    for(i = 0; i < PIPE_MAX_COLOR_BUFS; ++i)
    676       pipe_surface_reference(&buffer->fb_state.cbufs[i], NULL);
    677 
    678    pipe_sampler_view_reference(&buffer->sampler_views.individual.intermediate, NULL);
    679 }
    680 
    681 struct pipe_sampler_view *
    682 vl_idct_upload_matrix(struct pipe_context *pipe, float scale)
    683 {
    684    struct pipe_resource tex_templ, *matrix;
    685    struct pipe_sampler_view sv_templ, *sv;
    686    struct pipe_transfer *buf_transfer;
    687    unsigned i, j, pitch;
    688    float *f;
    689 
    690    struct pipe_box rect =
    691    {
    692       0, 0, 0,
    693       VL_BLOCK_WIDTH / 4,
    694       VL_BLOCK_HEIGHT,
    695       1
    696    };
    697 
    698    assert(pipe);
    699 
    700    memset(&tex_templ, 0, sizeof(tex_templ));
    701    tex_templ.target = PIPE_TEXTURE_2D;
    702    tex_templ.format = PIPE_FORMAT_R32G32B32A32_FLOAT;
    703    tex_templ.last_level = 0;
    704    tex_templ.width0 = 2;
    705    tex_templ.height0 = 8;
    706    tex_templ.depth0 = 1;
    707    tex_templ.array_size = 1;
    708    tex_templ.usage = PIPE_USAGE_IMMUTABLE;
    709    tex_templ.bind = PIPE_BIND_SAMPLER_VIEW;
    710    tex_templ.flags = 0;
    711 
    712    matrix = pipe->screen->resource_create(pipe->screen, &tex_templ);
    713    if (!matrix)
    714       goto error_matrix;
    715 
    716    buf_transfer = pipe->get_transfer
    717    (
    718       pipe, matrix,
    719       0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
    720       &rect
    721    );
    722    if (!buf_transfer)
    723       goto error_transfer;
    724 
    725    pitch = buf_transfer->stride / sizeof(float);
    726 
    727    f = pipe->transfer_map(pipe, buf_transfer);
    728    if (!f)
    729       goto error_map;
    730 
    731    for(i = 0; i < VL_BLOCK_HEIGHT; ++i)
    732       for(j = 0; j < VL_BLOCK_WIDTH; ++j)
    733          // transpose and scale
    734          f[i * pitch + j] = ((const float (*)[8])const_matrix)[j][i] * scale;
    735 
    736    pipe->transfer_unmap(pipe, buf_transfer);
    737    pipe->transfer_destroy(pipe, buf_transfer);
    738 
    739    memset(&sv_templ, 0, sizeof(sv_templ));
    740    u_sampler_view_default_template(&sv_templ, matrix, matrix->format);
    741    sv = pipe->create_sampler_view(pipe, matrix, &sv_templ);
    742    pipe_resource_reference(&matrix, NULL);
    743    if (!sv)
    744       goto error_map;
    745 
    746    return sv;
    747 
    748 error_map:
    749    pipe->transfer_destroy(pipe, buf_transfer);
    750 
    751 error_transfer:
    752    pipe_resource_reference(&matrix, NULL);
    753 
    754 error_matrix:
    755    return NULL;
    756 }
    757 
    758 bool vl_idct_init(struct vl_idct *idct, struct pipe_context *pipe,
    759                   unsigned buffer_width, unsigned buffer_height,
    760                   unsigned nr_of_render_targets,
    761                   struct pipe_sampler_view *matrix,
    762                   struct pipe_sampler_view *transpose)
    763 {
    764    assert(idct && pipe);
    765    assert(matrix && transpose);
    766 
    767    idct->pipe = pipe;
    768    idct->buffer_width = buffer_width;
    769    idct->buffer_height = buffer_height;
    770    idct->nr_of_render_targets = nr_of_render_targets;
    771 
    772    pipe_sampler_view_reference(&idct->matrix, matrix);
    773    pipe_sampler_view_reference(&idct->transpose, transpose);
    774 
    775    if(!init_shaders(idct))
    776       return false;
    777 
    778    if(!init_state(idct)) {
    779       cleanup_shaders(idct);
    780       return false;
    781    }
    782 
    783    return true;
    784 }
    785 
    786 void
    787 vl_idct_cleanup(struct vl_idct *idct)
    788 {
    789    cleanup_shaders(idct);
    790    cleanup_state(idct);
    791 
    792    pipe_sampler_view_reference(&idct->matrix, NULL);
    793    pipe_sampler_view_reference(&idct->transpose, NULL);
    794 }
    795 
    796 bool
    797 vl_idct_init_buffer(struct vl_idct *idct, struct vl_idct_buffer *buffer,
    798                     struct pipe_sampler_view *source,
    799                     struct pipe_sampler_view *intermediate)
    800 {
    801    assert(buffer && idct);
    802    assert(source && intermediate);
    803 
    804    memset(buffer, 0, sizeof(struct vl_idct_buffer));
    805 
    806    pipe_sampler_view_reference(&buffer->sampler_views.individual.matrix, idct->matrix);
    807    pipe_sampler_view_reference(&buffer->sampler_views.individual.source, source);
    808    pipe_sampler_view_reference(&buffer->sampler_views.individual.transpose, idct->transpose);
    809    pipe_sampler_view_reference(&buffer->sampler_views.individual.intermediate, intermediate);
    810 
    811    if (!init_source(idct, buffer))
    812       return false;
    813 
    814    if (!init_intermediate(idct, buffer))
    815       return false;
    816 
    817    return true;
    818 }
    819 
    820 void
    821 vl_idct_cleanup_buffer(struct vl_idct_buffer *buffer)
    822 {
    823    assert(buffer);
    824 
    825    cleanup_source(buffer);
    826    cleanup_intermediate(buffer);
    827 
    828    pipe_sampler_view_reference(&buffer->sampler_views.individual.matrix, NULL);
    829    pipe_sampler_view_reference(&buffer->sampler_views.individual.transpose, NULL);
    830 }
    831 
    832 void
    833 vl_idct_flush(struct vl_idct *idct, struct vl_idct_buffer *buffer, unsigned num_instances)
    834 {
    835    assert(buffer);
    836 
    837    idct->pipe->bind_rasterizer_state(idct->pipe, idct->rs_state);
    838    idct->pipe->bind_blend_state(idct->pipe, idct->blend);
    839    idct->pipe->bind_fragment_sampler_states(idct->pipe, 2, idct->samplers);
    840    idct->pipe->set_fragment_sampler_views(idct->pipe, 2, buffer->sampler_views.stage[0]);
    841 
    842    /* mismatch control */
    843    idct->pipe->set_framebuffer_state(idct->pipe, &buffer->fb_state_mismatch);
    844    idct->pipe->set_viewport_state(idct->pipe, &buffer->viewport_mismatch);
    845    idct->pipe->bind_vs_state(idct->pipe, idct->vs_mismatch);
    846    idct->pipe->bind_fs_state(idct->pipe, idct->fs_mismatch);
    847    util_draw_arrays_instanced(idct->pipe, PIPE_PRIM_POINTS, 0, 1, 0, num_instances);
    848 
    849    /* first stage */
    850    idct->pipe->set_framebuffer_state(idct->pipe, &buffer->fb_state);
    851    idct->pipe->set_viewport_state(idct->pipe, &buffer->viewport);
    852    idct->pipe->bind_vs_state(idct->pipe, idct->vs);
    853    idct->pipe->bind_fs_state(idct->pipe, idct->fs);
    854    util_draw_arrays_instanced(idct->pipe, PIPE_PRIM_QUADS, 0, 4, 0, num_instances);
    855 }
    856 
    857 void
    858 vl_idct_prepare_stage2(struct vl_idct *idct, struct vl_idct_buffer *buffer)
    859 {
    860    assert(buffer);
    861 
    862    /* second stage */
    863    idct->pipe->bind_rasterizer_state(idct->pipe, idct->rs_state);
    864    idct->pipe->bind_fragment_sampler_states(idct->pipe, 2, idct->samplers);
    865    idct->pipe->set_fragment_sampler_views(idct->pipe, 2, buffer->sampler_views.stage[1]);
    866 }
    867 
    868