Home | History | Annotate | Download | only in translate
      1 /*
      2  * Copyright 2003 Tungsten Graphics, inc.
      3  * All Rights Reserved.
      4  *
      5  * Permission is hereby granted, free of charge, to any person obtaining a
      6  * copy of this software and associated documentation files (the "Software"),
      7  * to deal in the Software without restriction, including without limitation
      8  * on the rights to use, copy, modify, merge, publish, distribute, sub
      9  * license, and/or sell copies of the Software, and to permit persons to whom
     10  * the Software is furnished to do so, subject to the following conditions:
     11  *
     12  * The above copyright notice and this permission notice (including the next
     13  * paragraph) shall be included in all copies or substantial portions of the
     14  * Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
     19  * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
     20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
     21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
     22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
     23  *
     24  * Authors:
     25  *    Keith Whitwell <keithw (at) tungstengraphics.com>
     26  */
     27 
     28 
     29 #include "pipe/p_config.h"
     30 #include "pipe/p_compiler.h"
     31 #include "util/u_memory.h"
     32 #include "util/u_math.h"
     33 #include "util/u_format.h"
     34 
     35 #include "translate.h"
     36 
     37 
     38 #if (defined(PIPE_ARCH_X86) || (defined(PIPE_ARCH_X86_64) && !defined(__MINGW32__))) && !defined(PIPE_SUBSYSTEM_EMBEDDED)
     39 
     40 #include "rtasm/rtasm_cpu.h"
     41 #include "rtasm/rtasm_x86sse.h"
     42 
     43 
     44 #define X    0
     45 #define Y    1
     46 #define Z    2
     47 #define W    3
     48 
     49 
/* One source vertex buffer bound to the translate object. */
struct translate_buffer {
   const void *base_ptr;   /* start of the buffer's vertex data */
   uintptr_t stride;       /* byte distance between successive vertices */
   unsigned max_index;     /* largest vertex index valid for this buffer */
};
     55 
/* A (buffer, instance divisor) pairing; several variants may refer to
 * the same underlying buffer with different divisors.
 */
struct translate_buffer_variant {
   unsigned buffer_index;        /* index into translate_sse::buffer[] */
   unsigned instance_divisor;    /* 0 = per-vertex data, else per-instance step */
   void *ptr;                    /* updated either per vertex or per instance */
};
     61 
     62 
     63 #define ELEMENT_BUFFER_INSTANCE_ID  1001
     64 
     65 #define NUM_CONSTS 7
     66 
/* Symbolic indices into consts[] below (and translate_sse::consts). */
enum
{
   CONST_IDENTITY,          /* {0, 0, 0, 1} */
   CONST_INV_127,           /* 1/127,  snorm8 -> float scale */
   CONST_INV_255,           /* 1/255,  unorm8 -> float scale */
   CONST_INV_32767,         /* 1/32767, snorm16 -> float scale */
   CONST_INV_65535,         /* 1/65535, unorm16 -> float scale */
   CONST_INV_2147483647,    /* 1/2^31-1, 32-bit normalized -> float scale */
   CONST_255                /* 255.0 */
};
     77 
/* Replicate a scalar into all four lanes of a vector constant. */
#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
/* Master copy of the XMM constants, in CONST_* order.
 * NOTE(review): presumably copied into translate_sse::consts at creation
 * time so get_const() can address them relative to machine_EDI -- the
 * copy itself is not visible in this chunk; confirm against the create
 * function.
 */
static float consts[NUM_CONSTS][4] = {
      {0, 0, 0, 1},            /* CONST_IDENTITY */
      C(1.0 / 127.0),          /* CONST_INV_127 */
      C(1.0 / 255.0),          /* CONST_INV_255 */
      C(1.0 / 32767.0),        /* CONST_INV_32767 */
      C(1.0 / 65535.0),        /* CONST_INV_65535 */
      C(1.0 / 2147483647.0),   /* CONST_INV_2147483647 */
      C(255.0)                 /* CONST_255 */
};
#undef C
     89 
/* State for the SSE translate backend.  This struct doubles as the
 * run-time "machine": the generated code receives its address in
 * machine_EDI and reads consts[] (and other fields) at fixed offsets
 * from it -- see get_const()/get_offset().
 */
struct translate_sse {
   struct translate translate;         /* base interface; kept first so the
                                        * struct can be used as a translate* */

   /* Generated code, one function per fetch mode.
    * NOTE(review): elt/elt16/elt8 appear to correspond to 32/16/8-bit
    * index types -- confirm against the (out-of-view) create code. */
   struct x86_function linear_func;
   struct x86_function elt_func;
   struct x86_function elt16_func;
   struct x86_function elt8_func;
   struct x86_function *func;          /* function currently being emitted into */

   PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4]; /* 16-byte aligned constants,
                                                    * loaded by generated code */
   int8_t reg_to_const[16];            /* CONST_* cached in each XMM reg, negative if none */
   int8_t const_to_reg[NUM_CONSTS];    /* XMM reg caching each CONST_*, negative if none */

   struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer variants can map to a single buffer. */
   struct translate_buffer_variant buffer_variant[PIPE_MAX_ATTRIBS];
   unsigned nr_buffer_variants;

   /* Multiple elements can map to a single buffer variant. */
   unsigned element_to_buffer_variant[PIPE_MAX_ATTRIBS];

   boolean use_instancing;
   unsigned instance_id;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg tmp2_EDX;
   struct x86_reg src_ECX;
   struct x86_reg idx_ESI;     /* either start+i or &elt[i] */
   struct x86_reg machine_EDI; /* points at this struct at run time */
   struct x86_reg outbuf_EBX;
   struct x86_reg count_EBP;    /* decrements to zero */
};
    127 
    128 static int get_offset( const void *a, const void *b )
    129 {
    130    return (const char *)b - (const char *)a;
    131 }
    132 
/* Return an XMM register holding the vector constant 'id' (a CONST_*
 * value), emitting a movaps from p->consts into p->func if the constant
 * is not already cached in a register.  Only XMM2..XMM7 are used as
 * cache slots; XMM0/XMM1 are left free for use as scratch registers by
 * the load/convert helpers below.
 */
static struct x86_reg get_const( struct translate_sse *p, unsigned id)
{
   struct x86_reg reg;
   unsigned i;

   /* Fast path: constant already lives in a register. */
   if(p->const_to_reg[id] >= 0)
      return x86_make_reg(file_XMM, p->const_to_reg[id]);

   /* Find a free slot among XMM2..XMM7. */
   for(i = 2; i < 8; ++i)
   {
      if(p->reg_to_const[i] < 0)
         break;
   }

   /* TODO: be smarter here */
   /* All slots occupied: evict whatever sits in XMM7. */
   if(i == 8)
      --i;

   reg = x86_make_reg(file_XMM, i);

   /* Invalidate the mapping of the constant previously cached here. */
   if(p->reg_to_const[i] >= 0)
      p->const_to_reg[p->reg_to_const[i]] = -1;

   p->reg_to_const[i] = id;
   p->const_to_reg[id] = i;

   /* TODO: this should happen outside the loop, if possible */
   /* Emit: movaps reg, [machine_EDI + offsetof(consts[id])]. */
   sse_movaps(p->func, reg,
         x86_make_disp(p->machine_EDI,
               get_offset(p, &p->consts[id][0])));

   return reg;
}
    166 
    167 /* load the data in a SSE2 register, padding with zeros */
/* Load 'size' bytes from the memory operand 'src' into the XMM register
 * 'data', zero-padding the register up to 16 bytes.
 *
 * Supported sizes are 1, 2, 3, 4, 6, 8, 12 and 16 bytes; any other size
 * returns FALSE with nothing emitted for that size.  Clobbers tmp_EAX
 * and XMM1 for some sizes.  Requires SSE2.
 */
static boolean emit_load_sse2( struct translate_sse *p,
                               struct x86_reg data,
                               struct x86_reg src,
                               unsigned size)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   struct x86_reg tmp = p->tmp_EAX;
   switch(size)
   {
   case 1:
      /* zero-extend a single byte through the GPR */
      x86_movzx8(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 2:
      /* zero-extend one 16-bit word through the GPR */
      x86_movzx16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 3:
      /* assemble three bytes in the GPR: byte 2 into bits 16..23, then
       * the 16-bit load fills bits 0..15 without touching the rest */
      x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
      x86_shl_imm(p->func, tmp, 16);
      x86_mov16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 4:
      sse2_movd(p->func, data, src);    /* movd zeroes the upper lanes */
      break;
   case 6:
      /* low dword directly, trailing word via the GPR, then interleave */
      sse2_movd(p->func, data, src);
      x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
      sse2_movd(p->func, tmpXMM, tmp);
      sse2_punpckldq(p->func, data, tmpXMM);
      break;
   case 8:
      sse2_movq(p->func, data, src);    /* movq zeroes the upper qword */
      break;
   case 12:
      /* low qword, then the third dword via XMM1 */
      sse2_movq(p->func, data, src);
      sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
      sse2_punpcklqdq(p->func, data, tmpXMM);
      break;
   case 16:
      sse2_movdqu(p->func, data, src);  /* full unaligned 16-byte load */
      break;
   default:
      return FALSE;
   }
   return TRUE;
}
    216 
/* this value can be passed for the out_chans argument */
#define CHANNELS_0001 5

/* Load 'chans' float values from the memory operand 'arg0' into 'data',
 * padding the register with zeroes at least up to 'out_chans' channels.
 *
 * If out_chans is set to CHANNELS_0001, then the fourth lane is padded
 * with 1.0 instead of 0.  Only pass CHANNELS_0001 when chans < 4,
 * otherwise results are undefined.
 */
static void emit_load_float32( struct translate_sse *p,
                               struct x86_reg data,
                               struct x86_reg arg0,
                               unsigned out_chans,
                               unsigned chans)
{
   switch(chans)
   {
   case 1:
      /* movss zeroes lanes 1..3:   a 0 0 0
       * or in CONST_IDENTITY:      a 0 0 1
       */
      sse_movss(p->func, data, arg0);
      if(out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
      break;
   case 2:
      /* set the high lanes first:  0 0 0 1 (or leave/zero them)
       * then fill the low pair:    a b 0 1
       */
      if(out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
      else if(out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
      sse_movlps(p->func, data, arg0);
      break;
   case 3:
      /* Have to jump through some hoops:
       *
       * c 0 0 0
       * c 0 0 1 if out_chans == CHANNELS_0001
       * 0 0 c 0/1
       * a b c 0/1
       */
      sse_movss(p->func, data, x86_make_disp(arg0, 8));
      if(out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X,Y,Z,W) );
      sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
      sse_movlps(p->func, data, arg0);
      break;
   case 4:
      sse_movups(p->func, data, arg0);  /* unaligned full-vector load */
      break;
   }
}
    272 
/* This function behaves like emit_load_float32, but loads 64-bit
 * floating point numbers, converting them to 32-bit ones (losing
 * precision).  Requires SSE2; clobbers XMM1 for chans == 3 or 4.
 */
static void emit_load_float64to32( struct translate_sse *p,
                                   struct x86_reg data,
                                   struct x86_reg arg0,
                                   unsigned out_chans,
                                   unsigned chans)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   switch(chans)
   {
   case 1:
      sse2_movsd(p->func, data, arg0);
      if(out_chans > 1)
         sse2_cvtpd2ps(p->func, data, data);  /* cvtpd2ps zeroes the upper lanes */
      else
         sse2_cvtsd2ss(p->func, data, data);
      if(out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W)  );
      break;
   case 2:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      if(out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
      else if(out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
      break;
   case 3:
      /* first two doubles converted in 'data', third one via XMM1 */
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      if(out_chans > 3)
         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);  /* also zeroes lane 3 */
      else
         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      if(out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
      break;
   case 4:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      break;
   }
}
    323 
    324 static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr,  struct x86_reg src_xmm)
    325 {
    326    if(x86_target(p->func) != X86_32)
    327       x64_mov64(p->func, dst_gpr, src_gpr);
    328    else
    329    {
    330       /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
    331       if(x86_target_caps(p->func) & X86_SSE2)
    332          sse2_movq(p->func, dst_xmm, src_xmm);
    333       else
    334          sse_movlps(p->func, dst_xmm, src_xmm);
    335    }
    336 }
    337 
    338 static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src)
    339 {
    340    emit_mov64(p, dst_gpr, dst_xmm, src, src);
    341 }
    342 
    343 static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm)
    344 {
    345    emit_mov64(p, dst, dst, src_gpr, src_xmm);
    346 }
    347 
    348 static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
    349 {
    350    if(x86_target_caps(p->func) & X86_SSE2)
    351       sse2_movdqu(p->func, dst, src);
    352    else
    353       sse_movups(p->func, dst, src);
    354 }
    355 
/* Emit a fixed-size copy of 'size' bytes from memory operand 'src' to
 * memory operand 'dst'.  Supported sizes: 1, 2, 3, 4, 6, any multiple
 * of 4 without SSE, and 8/12/16/24/32 with SSE (asserts otherwise).
 * Clobbers tmp_EAX, tmp2_EDX, XMM0 and XMM1 depending on the path taken.
 *
 * TODO: this uses unaligned accesses liberally, which is great on Nehalem,
 * but may or may not be good on older processors
 * TODO: may perhaps want to use non-temporal stores here if possible
 */
static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size)
{
   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
   struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
   struct x86_reg dataGPR = p->tmp_EAX;
   struct x86_reg dataGPR2 = p->tmp2_EDX;

   if(size < 8)
   {
      /* Small copies go through the GPRs regardless of SSE support. */
      switch (size)
      {
      case 1:
         x86_mov8(p->func, dataGPR, src);
         x86_mov8(p->func, dst, dataGPR);
         break;
      case 2:
         x86_mov16(p->func, dataGPR, src);
         x86_mov16(p->func, dst, dataGPR);
         break;
      case 3:
         /* 16-bit + 8-bit pieces, using a second GPR */
         x86_mov16(p->func, dataGPR, src);
         x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
         x86_mov16(p->func, dst, dataGPR);
         x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
         break;
      case 4:
         x86_mov(p->func, dataGPR, src);
         x86_mov(p->func, dst, dataGPR);
         break;
      case 6:
         /* 32-bit + 16-bit pieces, using a second GPR */
         x86_mov(p->func, dataGPR, src);
         x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
         x86_mov(p->func, dst, dataGPR);
         x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
         break;
      }
   }
   else if(!(x86_target_caps(p->func) & X86_SSE))
   {
      /* No SSE: unrolled dword-at-a-time copy; size must be a multiple of 4. */
      unsigned i = 0;
      assert((size & 3) == 0);
      for(i = 0; i < size; i += 4)
      {
         x86_mov(p->func, dataGPR, x86_make_disp(src, i));
         x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
      }
   }
   else
   {
      /* SSE available: copy in 8/16-byte chunks.  All loads are emitted
       * before the first store so src and dst may not overlap safely --
       * callers pass distinct vertex/output buffers. */
      switch(size)
      {
      case 8:
         emit_load64(p, dataGPR, dataXMM, src);
         emit_store64(p, dst, dataGPR, dataXMM);
         break;
      case 12:
         emit_load64(p, dataGPR2, dataXMM, src);
         x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
         emit_store64(p, dst, dataGPR2, dataXMM);
         x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
         break;
      case 16:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dst, dataXMM);
         break;
      case 24:
         emit_mov128(p, dataXMM, src);
         emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
         break;
      case 32:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
         break;
      default:
         assert(0);
      }
   }
}
    442 
    443 static boolean translate_attr_convert( struct translate_sse *p,
    444                                const struct translate_element *a,
    445                                struct x86_reg src,
    446                                struct x86_reg dst)
    447 
    448 {
    449    const struct util_format_description* input_desc = util_format_description(a->input_format);
    450    const struct util_format_description* output_desc = util_format_description(a->output_format);
    451    unsigned i;
    452    boolean id_swizzle = TRUE;
    453    unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE};
    454    unsigned needed_chans = 0;
    455    unsigned imms[2] = {0, 0x3f800000};
    456 
    457    if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE)
    458       return FALSE;
    459 
    460    if(input_desc->channel[0].size & 7)
    461       return FALSE;
    462 
    463    if(input_desc->colorspace != output_desc->colorspace)
    464       return FALSE;
    465 
    466    for(i = 1; i < input_desc->nr_channels; ++i)
    467    {
    468       if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0])))
    469          return FALSE;
    470    }
    471 
    472    for(i = 1; i < output_desc->nr_channels; ++i)
    473    {
    474       if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0])))
    475          return FALSE;
    476    }
    477 
    478    for(i = 0; i < output_desc->nr_channels; ++i)
    479    {
    480       if(output_desc->swizzle[i] < 4)
    481          swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
    482    }
    483 
    484    if((x86_target_caps(p->func) & X86_SSE) && (0
    485          || a->output_format == PIPE_FORMAT_R32_FLOAT
    486          || a->output_format == PIPE_FORMAT_R32G32_FLOAT
    487          || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
    488          || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT))
    489    {
    490       struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
    491 
    492       for(i = 0; i < output_desc->nr_channels; ++i)
    493       {
    494          if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
    495             swizzle[i] = i;
    496       }
    497 
    498       for(i = 0; i < output_desc->nr_channels; ++i)
    499       {
    500          if(swizzle[i] < 4)
    501             needed_chans = MAX2(needed_chans, swizzle[i] + 1);
    502          if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
    503             id_swizzle = FALSE;
    504       }
    505 
    506       if(needed_chans > 0)
    507       {
    508          switch(input_desc->channel[0].type)
    509          {
    510          case UTIL_FORMAT_TYPE_UNSIGNED:
    511             if(!(x86_target_caps(p->func) & X86_SSE2))
    512                return FALSE;
    513             emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
    514 
    515             /* TODO: add support for SSE4.1 pmovzx */
    516             switch(input_desc->channel[0].size)
    517             {
    518             case 8:
    519                /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */
    520                sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
    521                sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
    522                break;
    523             case 16:
    524                sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
    525                break;
    526             case 32: /* we lose precision here */
    527                sse2_psrld_imm(p->func, dataXMM, 1);
    528                break;
    529             default:
    530                return FALSE;
    531             }
    532             sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
    533             if(input_desc->channel[0].normalized)
    534             {
    535                struct x86_reg factor;
    536                switch(input_desc->channel[0].size)
    537                {
    538                case 8:
    539                   factor = get_const(p, CONST_INV_255);
    540                   break;
    541                case 16:
    542                   factor = get_const(p, CONST_INV_65535);
    543                   break;
    544                case 32:
    545                   factor = get_const(p, CONST_INV_2147483647);
    546                   break;
    547                default:
    548                   assert(0);
    549                   factor.disp = 0;
    550                   factor.file = 0;
    551                   factor.idx = 0;
    552                   factor.mod = 0;
    553                   break;
    554                }
    555                sse_mulps(p->func, dataXMM, factor);
    556             }
    557             else if(input_desc->channel[0].size == 32)
    558                sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */
    559             break;
    560          case UTIL_FORMAT_TYPE_SIGNED:
    561             if(!(x86_target_caps(p->func) & X86_SSE2))
    562                return FALSE;
    563             emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
    564 
    565             /* TODO: add support for SSE4.1 pmovsx */
    566             switch(input_desc->channel[0].size)
    567             {
    568             case 8:
    569                sse2_punpcklbw(p->func, dataXMM, dataXMM);
    570                sse2_punpcklbw(p->func, dataXMM, dataXMM);
    571                sse2_psrad_imm(p->func, dataXMM, 24);
    572                break;
    573             case 16:
    574                sse2_punpcklwd(p->func, dataXMM, dataXMM);
    575                sse2_psrad_imm(p->func, dataXMM, 16);
    576                break;
    577             case 32: /* we lose precision here */
    578                break;
    579             default:
    580                return FALSE;
    581             }
    582             sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
    583             if(input_desc->channel[0].normalized)
    584             {
    585                struct x86_reg factor;
    586                switch(input_desc->channel[0].size)
    587                {
    588                case 8:
    589                   factor = get_const(p, CONST_INV_127);
    590                   break;
    591                case 16:
    592                   factor = get_const(p, CONST_INV_32767);
    593                   break;
    594                case 32:
    595                   factor = get_const(p, CONST_INV_2147483647);
    596                   break;
    597                default:
    598                   assert(0);
    599                   factor.disp = 0;
    600                   factor.file = 0;
    601                   factor.idx = 0;
    602                   factor.mod = 0;
    603                   break;
    604                }
    605                sse_mulps(p->func, dataXMM, factor);
    606             }
    607             break;
    608 
    609             break;
    610          case UTIL_FORMAT_TYPE_FLOAT:
    611             if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64)
    612                return FALSE;
    613             if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3)
    614             {
    615                swizzle[3] = UTIL_FORMAT_SWIZZLE_W;
    616                needed_chans = CHANNELS_0001;
    617             }
    618             switch(input_desc->channel[0].size)
    619             {
    620             case 32:
    621                emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
    622                break;
    623             case 64: /* we lose precision here */
    624                if(!(x86_target_caps(p->func) & X86_SSE2))
    625                   return FALSE;
    626                emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
    627                break;
    628             default:
    629                return FALSE;
    630             }
    631             break;
    632          default:
    633             return FALSE;
    634          }
    635 
    636          if(!id_swizzle)
    637             sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) );
    638       }
    639 
    640       if(output_desc->nr_channels >= 4
    641             && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
    642             && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
    643             && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
    644             && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
    645             )
    646          sse_movups(p->func, dst, dataXMM);
    647       else
    648       {
    649          if(output_desc->nr_channels >= 2
    650                && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
    651                && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
    652             sse_movlps(p->func, dst, dataXMM);
    653          else
    654          {
    655             if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
    656                sse_movss(p->func, dst, dataXMM);
    657             else
    658                x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
    659 
    660             if(output_desc->nr_channels >= 2)
    661             {
    662                if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
    663                {
    664                   sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
    665                   sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
    666                }
    667                else
    668                   x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
    669             }
    670          }
    671 
    672          if(output_desc->nr_channels >= 3)
    673          {
    674             if(output_desc->nr_channels >= 4
    675                   && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
    676                   && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
    677                sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
    678             else
    679             {
    680                if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
    681                {
    682                   sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
    683                   sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
    684                }
    685                else
    686                   x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
    687 
    688                if(output_desc->nr_channels >= 4)
    689                {
    690                   if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
    691                   {
    692                      sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
    693                      sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
    694                   }
    695                   else
    696                      x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
    697                }
    698             }
    699          }
    700       }
    701       return TRUE;
    702    }
    703    else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16
    704          && output_desc->channel[0].normalized == input_desc->channel[0].normalized
    705          && (0
    706                || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
    707                || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
    708                || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
    709                ))
    710    {
    711       struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
    712       struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
    713       struct x86_reg tmp = p->tmp_EAX;
    714       unsigned imms[2] = {0, 1};
    715 
    716       for(i = 0; i < output_desc->nr_channels; ++i)
    717       {
    718          if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
    719             swizzle[i] = i;
    720       }
    721 
    722       for(i = 0; i < output_desc->nr_channels; ++i)
    723       {
    724          if(swizzle[i] < 4)
    725             needed_chans = MAX2(needed_chans, swizzle[i] + 1);
    726          if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
    727             id_swizzle = FALSE;
    728       }
    729 
    730       if(needed_chans > 0)
    731       {
    732          emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
    733 
    734          switch(input_desc->channel[0].type)
    735          {
    736          case UTIL_FORMAT_TYPE_UNSIGNED:
    737             if(input_desc->channel[0].normalized)
    738             {
    739                sse2_punpcklbw(p->func, dataXMM, dataXMM);
    740                if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
    741         	       sse2_psrlw_imm(p->func, dataXMM, 1);
    742             }
    743             else
    744                sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
    745             break;
    746          case UTIL_FORMAT_TYPE_SIGNED:
    747             if(input_desc->channel[0].normalized)
    748             {
    749                sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
    750                sse2_punpcklbw(p->func, tmpXMM, dataXMM);
    751                sse2_psllw_imm(p->func, dataXMM, 9);
    752                sse2_psrlw_imm(p->func, dataXMM, 8);
    753                sse2_por(p->func, tmpXMM, dataXMM);
    754                sse2_psrlw_imm(p->func, dataXMM, 7);
    755                sse2_por(p->func, tmpXMM, dataXMM);
    756                {
    757                   struct x86_reg t = dataXMM;
    758                   dataXMM = tmpXMM;
    759                   tmpXMM = t;
    760                }
    761             }
    762             else
    763             {
    764                sse2_punpcklbw(p->func, dataXMM, dataXMM);
    765                sse2_psraw_imm(p->func, dataXMM, 8);
    766             }
    767             break;
    768          default:
    769             assert(0);
    770          }
    771 
    772          if(output_desc->channel[0].normalized)
    773             imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff;
    774 
    775          if(!id_swizzle)
    776             sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
    777       }
    778 
    779       if(output_desc->nr_channels >= 4
    780             && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
    781             && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
    782             && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
    783             && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
    784             )
    785          sse2_movq(p->func, dst, dataXMM);
    786       else
    787       {
    788          if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
    789          {
    790             if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
    791                sse2_movd(p->func, dst, dataXMM);
    792             else
    793             {
    794                sse2_movd(p->func, tmp, dataXMM);
    795                x86_mov16(p->func, dst, tmp);
    796                if(output_desc->nr_channels >= 2)
    797                   x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
    798             }
    799          }
    800          else
    801          {
    802             if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0)
    803                x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
    804             else
    805             {
    806                x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
    807                if(output_desc->nr_channels >= 2)
    808                {
    809                   sse2_movd(p->func, tmp, dataXMM);
    810                   x86_shr_imm(p->func, tmp, 16);
    811                   x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
    812                }
    813             }
    814          }
    815 
    816          if(output_desc->nr_channels >= 3)
    817          {
    818             if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
    819             {
    820                if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
    821                {
    822                   sse2_psrlq_imm(p->func, dataXMM, 32);
    823                   sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
    824                }
    825                else
    826                {
    827                   sse2_psrlq_imm(p->func, dataXMM, 32);
    828                   sse2_movd(p->func, tmp, dataXMM);
    829                   x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
    830                   if(output_desc->nr_channels >= 4)
    831                   {
    832                      x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
    833                   }
    834                }
    835             }
    836             else
    837             {
    838                if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0)
    839                   x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
    840                else
    841                {
    842                   x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
    843 
    844                   if(output_desc->nr_channels >= 4)
    845                   {
    846                      sse2_psrlq_imm(p->func, dataXMM, 48);
    847                      sse2_movd(p->func, tmp, dataXMM);
    848                      x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
    849                   }
    850                }
    851             }
    852          }
    853       }
    854       return TRUE;
    855    }
    856    else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0])))
    857    {
    858       struct x86_reg tmp = p->tmp_EAX;
    859       unsigned i;
    860       if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4
    861                      && swizzle[0] == UTIL_FORMAT_SWIZZLE_W
    862                      && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z
    863                      && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y
    864                      && swizzle[3] == UTIL_FORMAT_SWIZZLE_X)
    865       {
    866          /* TODO: support movbe */
    867          x86_mov(p->func, tmp, src);
    868          x86_bswap(p->func, tmp);
    869          x86_mov(p->func, dst, tmp);
    870          return TRUE;
    871       }
    872 
    873       for(i = 0; i < output_desc->nr_channels; ++i)
    874       {
    875          switch(output_desc->channel[0].size)
    876          {
    877          case 8:
    878             if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
    879             {
    880                unsigned v = 0;
    881                if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
    882                {
    883                   switch(output_desc->channel[0].type)
    884                   {
    885                   case UTIL_FORMAT_TYPE_UNSIGNED:
    886                      v = output_desc->channel[0].normalized ? 0xff : 1;
    887                      break;
    888                   case UTIL_FORMAT_TYPE_SIGNED:
    889                      v = output_desc->channel[0].normalized ? 0x7f : 1;
    890                      break;
    891                   default:
    892                      return FALSE;
    893                   }
    894                }
    895                x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
    896             }
    897             else
    898             {
    899                x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
    900                x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
    901             }
    902             break;
    903          case 16:
    904             if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
    905             {
    906                unsigned v = 0;
    907                if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
    908                {
    909                   switch(output_desc->channel[1].type)
    910                   {
    911                   case UTIL_FORMAT_TYPE_UNSIGNED:
    912                      v = output_desc->channel[1].normalized ? 0xffff : 1;
    913                      break;
    914                   case UTIL_FORMAT_TYPE_SIGNED:
    915                      v = output_desc->channel[1].normalized ? 0x7fff : 1;
    916                      break;
    917                   case UTIL_FORMAT_TYPE_FLOAT:
    918                      v = 0x3c00;
    919                      break;
    920                   default:
    921                      return FALSE;
    922                   }
    923                }
    924                x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
    925             }
    926             else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0)
    927                x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
    928             else
    929             {
    930                x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
    931                x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
    932             }
    933             break;
    934          case 32:
    935             if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
    936             {
    937                unsigned v = 0;
    938                if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
    939                {
    940                   switch(output_desc->channel[1].type)
    941                   {
    942                   case UTIL_FORMAT_TYPE_UNSIGNED:
    943                      v = output_desc->channel[1].normalized ? 0xffffffff : 1;
    944                      break;
    945                   case UTIL_FORMAT_TYPE_SIGNED:
    946                      v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
    947                      break;
    948                   case UTIL_FORMAT_TYPE_FLOAT:
    949                      v = 0x3f800000;
    950                      break;
    951                   default:
    952                      return FALSE;
    953                   }
    954                }
    955                x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
    956             }
    957             else
    958             {
    959                x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
    960                x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
    961             }
    962             break;
    963          case 64:
    964             if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
    965             {
    966                unsigned l = 0;
    967                unsigned h = 0;
    968                if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
    969                {
    970                   switch(output_desc->channel[1].type)
    971                   {
    972                   case UTIL_FORMAT_TYPE_UNSIGNED:
    973                      h = output_desc->channel[1].normalized ? 0xffffffff : 0;
    974                      l = output_desc->channel[1].normalized ? 0xffffffff : 1;
    975                      break;
    976                   case UTIL_FORMAT_TYPE_SIGNED:
    977                      h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
    978                      l = output_desc->channel[1].normalized ? 0xffffffff : 1;
    979                      break;
    980                   case UTIL_FORMAT_TYPE_FLOAT:
    981                      h = 0x3ff00000;
    982                      l = 0;
    983                      break;
    984                   default:
    985                      return FALSE;
    986                   }
    987                }
    988                x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
    989                x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
    990             }
    991             else
    992             {
    993                if(x86_target_caps(p->func) & X86_SSE)
    994                {
    995                   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
    996                   emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8));
    997                   emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
    998                }
    999                else
   1000                {
   1001                   x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
   1002                   x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
   1003                   x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4));
   1004                   x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
   1005                }
   1006             }
   1007             break;
   1008          default:
   1009             return FALSE;
   1010          }
   1011       }
   1012       return TRUE;
   1013    }
   1014    /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
   1015    else if((x86_target_caps(p->func) & X86_SSE2) &&
   1016          a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && (0
   1017                || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
   1018                || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM
   1019          ))
   1020    {
   1021       struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
   1022 
   1023       /* load */
   1024       sse_movups(p->func, dataXMM, src);
   1025 
   1026       if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM)
   1027          sse_shufps(p->func, dataXMM, dataXMM, SHUF(2,1,0,3));
   1028 
   1029       /* scale by 255.0 */
   1030       sse_mulps(p->func, dataXMM, get_const(p, CONST_255));
   1031 
   1032       /* pack and emit */
   1033       sse2_cvtps2dq(p->func, dataXMM, dataXMM);
   1034       sse2_packssdw(p->func, dataXMM, dataXMM);
   1035       sse2_packuswb(p->func, dataXMM, dataXMM);
   1036       sse2_movd(p->func, dst, dataXMM);
   1037 
   1038       return TRUE;
   1039    }
   1040 
   1041    return FALSE;
   1042 }
   1043 
   1044 static boolean translate_attr( struct translate_sse *p,
   1045 			       const struct translate_element *a,
   1046 			       struct x86_reg src,
   1047 			       struct x86_reg dst)
   1048 {
   1049    if(a->input_format == a->output_format)
   1050    {
   1051       emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
   1052       return TRUE;
   1053    }
   1054 
   1055    return translate_attr_convert(p, a, src, dst);
   1056 }
   1057 
/* Emit code run once before the main vertex loop: for every buffer
 * variant that is fetched linearly (no index buffer) or per-instance,
 * compute the pointer to its first attribute (base_ptr + stride *
 * index) and stash it either in ESI (single linear buffer fast path)
 * or in the variant's `ptr` slot in the machine struct.
 */
static boolean init_inputs( struct translate_sse *p,
                            unsigned index_size )
{
   unsigned i;
   struct x86_reg instance_id = x86_make_disp(p->machine_EDI,
                                              get_offset(p, &p->instance_id));

   for (i = 0; i < p->nr_buffer_variants; i++) {
      struct translate_buffer_variant *variant = &p->buffer_variant[i];
      struct translate_buffer *buffer = &p->buffer[variant->buffer_index];

      /* Indexed, non-instanced variants compute their pointer per
       * element inside the loop instead (see get_buffer_ptr()).
       */
      if (!index_size || variant->instance_divisor) {
         struct x86_reg buf_max_index = x86_make_disp(p->machine_EDI,
                                                     get_offset(p, &buffer->max_index));
         struct x86_reg buf_stride   = x86_make_disp(p->machine_EDI,
                                                     get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr      = x86_make_disp(p->machine_EDI,
                                                     get_offset(p, &variant->ptr));
         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI,
                                                     get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_ESI;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          *   base_ptr + stride * index, where index depends on instance divisor
          */
         if (variant->instance_divisor) {
            /* Our index is instance ID divided by instance divisor.
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (variant->instance_divisor != 1) {
               struct x86_reg tmp_EDX = p->tmp2_EDX;
               struct x86_reg tmp_ECX = p->src_ECX;

               /* TODO: Add x86_shr() to rtasm and use it whenever
                *       instance divisor is power of two.
                */

               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
               x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */
            }

            /* XXX we need to clamp the index here too, but to a
             * per-array max value, not the draw->pt.max_index value
             * that's being given to us via translate->set_buffer().
             */
         } else {
            x86_mov(p->func, tmp_EAX, elt);

            /* Clamp to max_index
             */
            x86_cmp(p->func, tmp_EAX, buf_max_index);
            x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE);
         }

         x86_imul(p->func, tmp_EAX, buf_stride);
         x64_rexw(p->func);
         x86_add(p->func, tmp_EAX, buf_base_ptr);

         /* NOTE(review): this cmp of the vertex count against the
          * computed buffer pointer sets flags that nothing visibly
          * consumes before the next flag-clobbering instruction —
          * looks vestigial; confirm against rtasm history before
          * removing.
          */
         x86_cmp(p->func, p->count_EBP, p->tmp_EAX);

         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (!index_size && p->nr_buffer_variants == 1)
         {
            x64_rexw(p->func);
            x86_mov(p->func, elt, tmp_EAX);
         }
         else
         {
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, tmp_EAX);
         }
      }
   }

   return TRUE;
}
   1139 
   1140 
/* Emit code that yields a register/memory operand addressing the
 * current attribute for buffer variant var_idx.  Four cases:
 *  - the instance-ID pseudo buffer: address of the stored instance id;
 *  - single linear buffer: ESI already holds the pointer (set up by
 *    init_inputs / incr_inputs);
 *  - linear or per-instance variant: reload the pointer cached in the
 *    variant's `ptr` slot;
 *  - indexed fetch: load the element index (8/16/32-bit), clamp it to
 *    max_index, and compute base_ptr + index * stride into ECX.
 */
static struct x86_reg get_buffer_ptr( struct translate_sse *p,
                                      unsigned index_size,
                                      unsigned var_idx,
                                      struct x86_reg elt )
{
   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
      return x86_make_disp(p->machine_EDI,
                           get_offset(p, &p->instance_id));
   }
   if (!index_size && p->nr_buffer_variants == 1) {
      return p->idx_ESI;
   }
   else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {
      struct x86_reg ptr = p->src_ECX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer_variant[var_idx].ptr));

      x64_rexw(p->func);
      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   else {
      struct x86_reg ptr = p->src_ECX;
      const struct translate_buffer_variant *variant = &p->buffer_variant[var_idx];

      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].stride));

      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].base_ptr));

      struct x86_reg buf_max_index =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].max_index));



      /* Calculate pointer to current attrib:
       */
      /* Zero-extend the element index to 32 bits according to the
       * index buffer's element size.
       */
      switch(index_size)
      {
      case 1:
         x86_movzx8(p->func, ptr, elt);
         break;
      case 2:
         x86_movzx16(p->func, ptr, elt);
         break;
      case 4:
         x86_mov(p->func, ptr, elt);
         break;
      }

      /* Clamp to max_index
       */
      x86_cmp(p->func, ptr, buf_max_index);
      x86_cmovcc(p->func, ptr, buf_max_index, cc_AE);

      x86_imul(p->func, ptr, buf_stride);
      x64_rexw(p->func);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}
   1207 
   1208 
   1209 
/* Emit code that advances the input pointers/index at the bottom of
 * the vertex loop: bump ESI by the stride (single linear buffer), bump
 * every non-instanced variant's cached pointer (multiple linear
 * buffers), or advance the element pointer by index_size (indexed).
 * Per-instance pointers are deliberately left untouched.
 */
static boolean incr_inputs( struct translate_sse *p,
                            unsigned index_size )
{
   if (!index_size && p->nr_buffer_variants == 1) {
      struct x86_reg stride = x86_make_disp(p->machine_EDI,
                                            get_offset(p, &p->buffer[0].stride));

      if (p->buffer_variant[0].instance_divisor == 0) {
         x64_rexw(p->func);
         x86_add(p->func, p->idx_ESI, stride);
         /* Prefetch a cache line well ahead of the new position. */
         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
      }
   }
   else if (!index_size) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_variants; i++) {
         struct translate_buffer_variant *variant = &p->buffer_variant[i];
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
                                                get_offset(p, &variant->ptr));
         struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
                                                   get_offset(p, &p->buffer[variant->buffer_index].stride));

         if (variant->instance_divisor == 0) {
            x86_mov(p->func, p->tmp_EAX, buf_stride);
            x64_rexw(p->func);
            x86_add(p->func, p->tmp_EAX, buf_ptr);
            if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      x64_rexw(p->func);
      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
   }

   return TRUE;
}
   1252 
   1253 
   1254 /* Build run( struct translate *machine,
   1255  *            unsigned start,
   1256  *            unsigned count,
   1257  *            void *output_buffer )
   1258  * or
   1259  *  run_elts( struct translate *machine,
   1260  *            unsigned *elts,
   1261  *            unsigned count,
   1262  *            void *output_buffer )
   1263  *
   1264  *  Lots of hardcoding
   1265  *
   1266  * EAX -- pointer to current output vertex
   1267  * ECX -- pointer to current attribute
   1268  *
   1269  */
/* Generate one complete run()/run_elts() style function into `func`.
 * index_size is 0 for the linear path, or 1/2/4 for 8/16/32-bit
 * element indices.  Emits the ABI prologue, the per-vertex translate
 * loop over all key elements, and the epilogue.  Returns FALSE if an
 * attribute could not be translated.
 */
static boolean build_vertex_emit( struct translate_sse *p,
				  struct x86_function *func,
				  unsigned index_size )
{
   int fixup, label;
   unsigned j;

   /* Invalidate the constant-register cache (0xff == no mapping). */
   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));

   p->tmp_EAX       = x86_make_reg(file_REG32, reg_AX);
   p->idx_ESI       = x86_make_reg(file_REG32, reg_SI);
   p->outbuf_EBX    = x86_make_reg(file_REG32, reg_BX);
   p->machine_EDI   = x86_make_reg(file_REG32, reg_DI);
   p->count_EBP     = x86_make_reg(file_REG32, reg_BP);
   p->tmp2_EDX     = x86_make_reg(file_REG32, reg_DX);
   p->src_ECX     = x86_make_reg(file_REG32, reg_CX);

   p->func = func;

   x86_init_func(p->func);

   /* Win64 ABI: XMM6/XMM7 are callee-saved, so spill them to the
    * caller-provided shadow space before use.
    */
   if(x86_target(p->func) == X86_64_WIN64_ABI)
   {
	   /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */
	   sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6));
	   sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7));
   }

   x86_push(p->func, p->outbuf_EBX);
   x86_push(p->func, p->count_EBP);

/* on non-Win64 x86-64, these are already in the right registers */
   if(x86_target(p->func) != X86_64_STD_ABI)
   {
      x86_push(p->func, p->machine_EDI);
      x86_push(p->func, p->idx_ESI);

      x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
      x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
   }

   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));

   /* Arg 5 (output buffer) is a pointer: needs a 64-bit move off x86-32. */
   if(x86_target(p->func) != X86_32)
      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
   else
      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));

   /* Load instance ID.
    */
   if (p->use_instancing) {
      x86_mov(p->func,
              p->tmp_EAX,
              x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, index_size);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
      int last_variant = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned variant = p->element_to_buffer_variant[j];

         /* Figure out source pointer address:
          */
         /* Only recompute the buffer pointer when the element comes
          * from a different variant than the previous one.
          */
         if (variant != last_variant) {
            last_variant = variant;
            vb = get_buffer_ptr(p, index_size, variant, elt);
         }

         if (!translate_attr( p, a,
                              x86_make_disp(vb, a->input_offset),
                              x86_make_disp(p->outbuf_EBX, a->output_offset)))
            return FALSE;
      }

      /* Next output vertex:
       */
      x64_rexw(p->func);
      x86_lea(p->func,
              p->outbuf_EBX,
              x86_make_disp(p->outbuf_EBX,
                            p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs( p, index_size );
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_EBP);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */

   if(x86_target(p->func) != X86_64_STD_ABI)
   {
      x86_pop(p->func, p->idx_ESI);
      x86_pop(p->func, p->machine_EDI);
   }

   x86_pop(p->func, p->count_EBP);
   x86_pop(p->func, p->outbuf_EBX);

   /* Restore the callee-saved XMM registers spilled in the prologue. */
   if(x86_target(p->func) == X86_64_WIN64_ABI)
   {
	   sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
	   sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
   }
   x86_ret(p->func);

   return TRUE;
}
   1413 
   1414 
   1415 
   1416 
   1417 
   1418 
   1419 
   1420 static void translate_sse_set_buffer( struct translate *translate,
   1421 				unsigned buf,
   1422 				const void *ptr,
   1423 				unsigned stride,
   1424 				unsigned max_index )
   1425 {
   1426    struct translate_sse *p = (struct translate_sse *)translate;
   1427 
   1428    if (buf < p->nr_buffers) {
   1429       p->buffer[buf].base_ptr = (char *)ptr;
   1430       p->buffer[buf].stride = stride;
   1431       p->buffer[buf].max_index = max_index;
   1432    }
   1433 
   1434    if (0) debug_printf("%s %d/%d: %p %d\n",
   1435                        __FUNCTION__, buf,
   1436                        p->nr_buffers,
   1437                        ptr, stride);
   1438 }
   1439 
   1440 
   1441 static void translate_sse_release( struct translate *translate )
   1442 {
   1443    struct translate_sse *p = (struct translate_sse *)translate;
   1444 
   1445    x86_release_func( &p->elt8_func );
   1446    x86_release_func( &p->elt16_func );
   1447    x86_release_func( &p->elt_func );
   1448    x86_release_func( &p->linear_func );
   1449 
   1450    os_free_aligned(p);
   1451 }
   1452 
   1453 
   1454 struct translate *translate_sse2_create( const struct translate_key *key )
   1455 {
   1456    struct translate_sse *p = NULL;
   1457    unsigned i;
   1458 
   1459    /* this is misnamed, it actually refers to whether rtasm is enabled or not */
   1460    if (!rtasm_cpu_has_sse())
   1461       goto fail;
   1462 
   1463    p = os_malloc_aligned(sizeof(struct translate_sse), 16);
   1464    if (p == NULL)
   1465       goto fail;
   1466    memset(p, 0, sizeof(*p));
   1467    memcpy(p->consts, consts, sizeof(consts));
   1468 
   1469    p->translate.key = *key;
   1470    p->translate.release = translate_sse_release;
   1471    p->translate.set_buffer = translate_sse_set_buffer;
   1472 
   1473    for (i = 0; i < key->nr_elements; i++) {
   1474       if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
   1475          unsigned j;
   1476 
   1477          p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);
   1478 
   1479          if (key->element[i].instance_divisor) {
   1480             p->use_instancing = TRUE;
   1481          }
   1482 
   1483          /*
   1484           * Map vertex element to vertex buffer variant.
   1485           */
   1486          for (j = 0; j < p->nr_buffer_variants; j++) {
   1487             if (p->buffer_variant[j].buffer_index == key->element[i].input_buffer &&
   1488                 p->buffer_variant[j].instance_divisor == key->element[i].instance_divisor) {
   1489                break;
   1490             }
   1491          }
   1492          if (j == p->nr_buffer_variants) {
   1493             p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
   1494             p->buffer_variant[j].instance_divisor = key->element[i].instance_divisor;
   1495             p->nr_buffer_variants++;
   1496          }
   1497          p->element_to_buffer_variant[i] = j;
   1498       } else {
   1499          assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);
   1500 
   1501          p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
   1502       }
   1503    }
   1504 
   1505    if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);
   1506 
   1507    if (!build_vertex_emit(p, &p->linear_func, 0))
   1508       goto fail;
   1509 
   1510    if (!build_vertex_emit(p, &p->elt_func, 4))
   1511       goto fail;
   1512 
   1513    if (!build_vertex_emit(p, &p->elt16_func, 2))
   1514       goto fail;
   1515 
   1516    if (!build_vertex_emit(p, &p->elt8_func, 1))
   1517       goto fail;
   1518 
   1519    p->translate.run = (run_func) x86_get_func(&p->linear_func);
   1520    if (p->translate.run == NULL)
   1521       goto fail;
   1522 
   1523    p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
   1524    if (p->translate.run_elts == NULL)
   1525       goto fail;
   1526 
   1527    p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
   1528    if (p->translate.run_elts16 == NULL)
   1529       goto fail;
   1530 
   1531    p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
   1532    if (p->translate.run_elts8 == NULL)
   1533       goto fail;
   1534 
   1535    return &p->translate;
   1536 
   1537  fail:
   1538    if (p)
   1539       translate_sse_release( &p->translate );
   1540 
   1541    return NULL;
   1542 }
   1543 
   1544 
   1545 
   1546 #else
   1547 
/* Stub for builds without x86/x86-64 rtasm support: SSE translate is
 * unavailable, so return NULL and let callers fall back to the generic
 * translate implementation.
 */
struct translate *translate_sse2_create( const struct translate_key *key )
{
   return NULL;
}
   1552 
   1553 #endif
   1554