Home | History | Annotate | Download | only in jitter
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file blend_jit.cpp
     24 *
     25 * @brief Implementation of the blend jitter
     26 *
     27 * Notes:
     28 *
     29 ******************************************************************************/
     30 #include "jit_pch.hpp"
     31 #include "builder.h"
     32 #include "jit_api.h"
     33 #include "blend_jit.h"
     34 #include "gen_state_llvm.h"
     35 
     36 // components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized
     37 #define QUANTIZE_THRESHOLD 2
     38 
     39 using namespace llvm;
     40 using namespace SwrJit;
     41 
     42 //////////////////////////////////////////////////////////////////////////
     43 /// Interface to Jitting a blend shader
     44 //////////////////////////////////////////////////////////////////////////
     45 struct BlendJit : public Builder
     46 {
     47     BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){};
     48 
     49     template<bool Color, bool Alpha>
     50     void GenerateBlendFactor(SWR_BLEND_FACTOR factor, Value* constColor[4], Value* src[4], Value* src1[4], Value* dst[4], Value* result[4])
     51     {
     52         Value* out[4];
     53 
     54         switch (factor)
     55         {
     56         case BLENDFACTOR_ONE:
     57             out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f);
     58             break;
     59         case BLENDFACTOR_SRC_COLOR:
     60             out[0] = src[0];
     61             out[1] = src[1];
     62             out[2] = src[2];
     63             out[3] = src[3];
     64             break;
     65         case BLENDFACTOR_SRC_ALPHA:
     66             out[0] = out[1] = out[2] = out[3] = src[3];
     67             break;
     68         case BLENDFACTOR_DST_ALPHA:
     69             out[0] = out[1] = out[2] = out[3] = dst[3];
     70             break;
     71         case BLENDFACTOR_DST_COLOR:
     72             out[0] = dst[0];
     73             out[1] = dst[1];
     74             out[2] = dst[2];
     75             out[3] = dst[3];
     76             break;
     77         case BLENDFACTOR_SRC_ALPHA_SATURATE:
     78             out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3]));
     79             out[3] = VIMMED1(1.0f);
     80             break;
     81         case BLENDFACTOR_CONST_COLOR:
     82             out[0] = constColor[0];
     83             out[1] = constColor[1];
     84             out[2] = constColor[2];
     85             out[3] = constColor[3];
     86             break;
     87         case BLENDFACTOR_CONST_ALPHA:
     88             out[0] = out[1] = out[2] = out[3] = constColor[3];
     89             break;
     90         case BLENDFACTOR_SRC1_COLOR:
     91             out[0] = src1[0];
     92             out[1] = src1[1];
     93             out[2] = src1[2];
     94             out[3] = src1[3];
     95             break;
     96         case BLENDFACTOR_SRC1_ALPHA:
     97             out[0] = out[1] = out[2] = out[3] = src1[3];
     98             break;
     99         case BLENDFACTOR_ZERO:
    100             out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
    101             break;
    102         case BLENDFACTOR_INV_SRC_COLOR:
    103             out[0] = FSUB(VIMMED1(1.0f), src[0]);
    104             out[1] = FSUB(VIMMED1(1.0f), src[1]);
    105             out[2] = FSUB(VIMMED1(1.0f), src[2]);
    106             out[3] = FSUB(VIMMED1(1.0f), src[3]);
    107             break;
    108         case BLENDFACTOR_INV_SRC_ALPHA:
    109             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]);
    110             break;
    111         case BLENDFACTOR_INV_DST_ALPHA:
    112             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]);
    113             break;
    114         case BLENDFACTOR_INV_DST_COLOR:
    115             out[0] = FSUB(VIMMED1(1.0f), dst[0]);
    116             out[1] = FSUB(VIMMED1(1.0f), dst[1]);
    117             out[2] = FSUB(VIMMED1(1.0f), dst[2]);
    118             out[3] = FSUB(VIMMED1(1.0f), dst[3]);
    119             break;
    120         case BLENDFACTOR_INV_CONST_COLOR:
    121             out[0] = FSUB(VIMMED1(1.0f), constColor[0]);
    122             out[1] = FSUB(VIMMED1(1.0f), constColor[1]);
    123             out[2] = FSUB(VIMMED1(1.0f), constColor[2]);
    124             out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
    125             break;
    126         case BLENDFACTOR_INV_CONST_ALPHA:
    127             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
    128             break;
    129         case BLENDFACTOR_INV_SRC1_COLOR:
    130             out[0] = FSUB(VIMMED1(1.0f), src1[0]);
    131             out[1] = FSUB(VIMMED1(1.0f), src1[1]);
    132             out[2] = FSUB(VIMMED1(1.0f), src1[2]);
    133             out[3] = FSUB(VIMMED1(1.0f), src1[3]);
    134             break;
    135         case BLENDFACTOR_INV_SRC1_ALPHA:
    136             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]);
    137             break;
    138         default:
    139             SWR_INVALID("Unsupported blend factor: %d", factor);
    140             out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
    141             break;
    142         }
    143 
    144         if (Color)
    145         {
    146             result[0] = out[0];
    147             result[1] = out[1];
    148             result[2] = out[2];
    149         }
    150 
    151         if (Alpha)
    152         {
    153             result[3] = out[3];
    154         }
    155     }
    156 
    157     void Clamp(SWR_FORMAT format, Value* src[4])
    158     {
    159         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    160         SWR_TYPE type = info.type[0];
    161 
    162         switch (type)
    163         {
    164         default:
    165             break;
    166 
    167         case SWR_TYPE_UNORM:
    168             src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f));
    169             src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f));
    170             src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f));
    171             src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f));
    172             break;
    173 
    174         case SWR_TYPE_SNORM:
    175             src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f));
    176             src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f));
    177             src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f));
    178             src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f));
    179             break;
    180 
    181         case SWR_TYPE_UNKNOWN: SWR_INVALID("Unsupport format type: %d", type);
    182         }
    183     }
    184 
    185     void ApplyDefaults(SWR_FORMAT format, Value* src[4])
    186     {
    187         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    188 
    189         bool valid[] = { false, false, false, false };
    190         for (uint32_t c = 0; c < info.numComps; ++c)
    191         {
    192             valid[info.swizzle[c]] = true;
    193         }
    194 
    195         for (uint32_t c = 0; c < 4; ++c)
    196         {
    197             if (!valid[c])
    198             {
    199                 src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty);
    200             }
    201         }
    202     }
    203 
    204     void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4])
    205     {
    206         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    207 
    208         for (uint32_t c = 0; c < info.numComps; ++c)
    209         {
    210             if (info.type[c] == SWR_TYPE_UNUSED)
    211             {
    212                 src[info.swizzle[c]] = BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty);
    213             }
    214         }
    215     }
    216 
    217     void Quantize(SWR_FORMAT format, Value* src[4])
    218     {
    219         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    220         for (uint32_t c = 0; c < info.numComps; ++c)
    221         {
    222             if (info.bpc[c] <= QUANTIZE_THRESHOLD && info.type[c] != SWR_TYPE_UNUSED)
    223             {
    224                 uint32_t swizComp = info.swizzle[c];
    225                 float factor = (float)((1 << info.bpc[c]) - 1);
    226                 switch (info.type[c])
    227                 {
    228                 case SWR_TYPE_UNORM:
    229                     src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f));
    230                     src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO));
    231                     src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f /factor));
    232                     break;
    233                 default: SWR_INVALID("Unsupported format type: %d", info.type[c]);
    234                 }
    235             }
    236         }
    237     }
    238 
    239     template<bool Color, bool Alpha>
    240     void BlendFunc(SWR_BLEND_OP blendOp, Value* src[4], Value* srcFactor[4], Value* dst[4], Value* dstFactor[4], Value* result[4])
    241     {
    242         Value* out[4];
    243         Value* srcBlend[4];
    244         Value* dstBlend[4];
    245         for (uint32_t i = 0; i < 4; ++i)
    246         {
    247             srcBlend[i] = FMUL(src[i], srcFactor[i]);
    248             dstBlend[i] = FMUL(dst[i], dstFactor[i]);
    249         }
    250 
    251         switch (blendOp)
    252         {
    253         case BLENDOP_ADD:
    254             out[0] = FADD(srcBlend[0], dstBlend[0]);
    255             out[1] = FADD(srcBlend[1], dstBlend[1]);
    256             out[2] = FADD(srcBlend[2], dstBlend[2]);
    257             out[3] = FADD(srcBlend[3], dstBlend[3]);
    258             break;
    259 
    260         case BLENDOP_SUBTRACT:
    261             out[0] = FSUB(srcBlend[0], dstBlend[0]);
    262             out[1] = FSUB(srcBlend[1], dstBlend[1]);
    263             out[2] = FSUB(srcBlend[2], dstBlend[2]);
    264             out[3] = FSUB(srcBlend[3], dstBlend[3]);
    265             break;
    266 
    267         case BLENDOP_REVSUBTRACT:
    268             out[0] = FSUB(dstBlend[0], srcBlend[0]);
    269             out[1] = FSUB(dstBlend[1], srcBlend[1]);
    270             out[2] = FSUB(dstBlend[2], srcBlend[2]);
    271             out[3] = FSUB(dstBlend[3], srcBlend[3]);
    272             break;
    273 
    274         case BLENDOP_MIN:
    275             out[0] = VMINPS(src[0], dst[0]);
    276             out[1] = VMINPS(src[1], dst[1]);
    277             out[2] = VMINPS(src[2], dst[2]);
    278             out[3] = VMINPS(src[3], dst[3]);
    279             break;
    280 
    281         case BLENDOP_MAX:
    282             out[0] = VMAXPS(src[0], dst[0]);
    283             out[1] = VMAXPS(src[1], dst[1]);
    284             out[2] = VMAXPS(src[2], dst[2]);
    285             out[3] = VMAXPS(src[3], dst[3]);
    286             break;
    287 
    288         default:
    289             SWR_INVALID("Unsupported blend operation: %d", blendOp);
    290             out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
    291             break;
    292         }
    293 
    294         if (Color)
    295         {
    296             result[0] = out[0];
    297             result[1] = out[1];
    298             result[2] = out[2];
    299         }
    300 
    301         if (Alpha)
    302         {
    303             result[3] = out[3];
    304         }
    305     }
    306 
    307     void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4])
    308     {
    309         // Op: (s == PS output, d = RT contents)
    310         switch(logicOp)
    311         {
    312         case LOGICOP_CLEAR:
    313             result[0] = VIMMED1(0);
    314             result[1] = VIMMED1(0);
    315             result[2] = VIMMED1(0);
    316             result[3] = VIMMED1(0);
    317             break;
    318 
    319         case LOGICOP_NOR:
    320             // ~(s | d)
    321             result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
    322             result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
    323             result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
    324             result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
    325             break;
    326 
    327         case LOGICOP_AND_INVERTED:
    328             // ~s & d
    329             // todo: use avx andnot instr when I can find the intrinsic to call
    330             result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
    331             result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
    332             result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
    333             result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
    334             break;
    335 
    336         case LOGICOP_COPY_INVERTED:
    337             // ~s
    338             result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF));
    339             result[1] = XOR(src[1], VIMMED1(0xFFFFFFFF));
    340             result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF));
    341             result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF));
    342             break;
    343 
    344         case LOGICOP_AND_REVERSE:
    345             // s & ~d
    346             // todo: use avx andnot instr when I can find the intrinsic to call
    347             result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
    348             result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
    349             result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
    350             result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
    351             break;
    352 
    353         case LOGICOP_INVERT:
    354             // ~d
    355             result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF));
    356             result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF));
    357             result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF));
    358             result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF));
    359             break;
    360 
    361         case LOGICOP_XOR:
    362             // s ^ d
    363             result[0] = XOR(src[0], dst[0]);
    364             result[1] = XOR(src[1], dst[1]);
    365             result[2] = XOR(src[2], dst[2]);
    366             result[3] = XOR(src[3], dst[3]);
    367             break;
    368 
    369         case LOGICOP_NAND:
    370             // ~(s & d)
    371             result[0] = XOR(AND(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
    372             result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
    373             result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
    374             result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
    375             break;
    376 
    377         case LOGICOP_AND:
    378             // s & d
    379             result[0] = AND(src[0], dst[0]);
    380             result[1] = AND(src[1], dst[1]);
    381             result[2] = AND(src[2], dst[2]);
    382             result[3] = AND(src[3], dst[3]);
    383             break;
    384 
    385         case LOGICOP_EQUIV:
    386             // ~(s ^ d)
    387             result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
    388             result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
    389             result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
    390             result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
    391             break;
    392 
    393         case LOGICOP_NOOP:
    394             result[0] = dst[0];
    395             result[1] = dst[1];
    396             result[2] = dst[2];
    397             result[3] = dst[3];
    398             break;
    399 
    400         case LOGICOP_OR_INVERTED:
    401             // ~s | d
    402             result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
    403             result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
    404             result[2] = OR(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
    405             result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
    406             break;
    407 
    408         case LOGICOP_COPY:
    409             result[0] = src[0];
    410             result[1] = src[1];
    411             result[2] = src[2];
    412             result[3] = src[3];
    413             break;
    414 
    415         case LOGICOP_OR_REVERSE:
    416             // s | ~d
    417             result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
    418             result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
    419             result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
    420             result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
    421             break;
    422 
    423         case LOGICOP_OR:
    424             // s | d
    425             result[0] = OR(src[0], dst[0]);
    426             result[1] = OR(src[1], dst[1]);
    427             result[2] = OR(src[2], dst[2]);
    428             result[3] = OR(src[3], dst[3]);
    429             break;
    430 
    431         case LOGICOP_SET:
    432             result[0] = VIMMED1(0xFFFFFFFF);
    433             result[1] = VIMMED1(0xFFFFFFFF);
    434             result[2] = VIMMED1(0xFFFFFFFF);
    435             result[3] = VIMMED1(0xFFFFFFFF);
    436             break;
    437 
    438         default:
    439             SWR_INVALID("Unsupported logic operation: %d", logicOp);
    440             result[0] = result[1] = result[2] = result[3] = VIMMED1(0.0f);
    441             break;
    442         }
    443     }
    444 
    445     void AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask)
    446     {
    447         // load uint32_t reference
    448         Value* pRef = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_alphaTestReference }));
    449 
    450         // load alpha
    451         Value* pAlpha = LOAD(ppAlpha);
    452 
    453         Value* pTest = nullptr;
    454         if (state.alphaTestFormat == ALPHA_TEST_UNORM8)
    455         {
    456             // convert float alpha to unorm8
    457             Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f));
    458             pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty);
    459 
    460             // compare
    461             switch (state.alphaTestFunction)
    462             {
    463             case ZFUNC_ALWAYS:  pTest = VIMMED1(true); break;
    464             case ZFUNC_NEVER:   pTest = VIMMED1(false); break;
    465             case ZFUNC_LT:      pTest = ICMP_ULT(pAlphaU8, pRef); break;
    466             case ZFUNC_EQ:      pTest = ICMP_EQ(pAlphaU8, pRef); break;
    467             case ZFUNC_LE:      pTest = ICMP_ULE(pAlphaU8, pRef); break;
    468             case ZFUNC_GT:      pTest = ICMP_UGT(pAlphaU8, pRef); break;
    469             case ZFUNC_NE:      pTest = ICMP_NE(pAlphaU8, pRef); break;
    470             case ZFUNC_GE:      pTest = ICMP_UGE(pAlphaU8, pRef); break;
    471             default:
    472                 SWR_INVALID("Invalid alpha test function");
    473                 break;
    474             }
    475         }
    476         else
    477         {
    478             // cast ref to float
    479             pRef = BITCAST(pRef, mSimdFP32Ty);
    480 
    481             // compare
    482             switch (state.alphaTestFunction)
    483             {
    484             case ZFUNC_ALWAYS:  pTest = VIMMED1(true); break;
    485             case ZFUNC_NEVER:   pTest = VIMMED1(false); break;
    486             case ZFUNC_LT:      pTest = FCMP_OLT(pAlpha, pRef); break;
    487             case ZFUNC_EQ:      pTest = FCMP_OEQ(pAlpha, pRef); break;
    488             case ZFUNC_LE:      pTest = FCMP_OLE(pAlpha, pRef); break;
    489             case ZFUNC_GT:      pTest = FCMP_OGT(pAlpha, pRef); break;
    490             case ZFUNC_NE:      pTest = FCMP_ONE(pAlpha, pRef); break;
    491             case ZFUNC_GE:      pTest = FCMP_OGE(pAlpha, pRef); break;
    492             default:
    493                 SWR_INVALID("Invalid alpha test function");
    494                 break;
    495             }
    496         }
    497 
    498         // load current mask
    499         Value* pMask = LOAD(ppMask);
    500 
    501         // convert to int1 mask
    502         pMask = MASK(pMask);
    503 
    504         // and with alpha test result
    505         pMask = AND(pMask, pTest);
    506 
    507         // convert back to vector mask
    508         pMask = VMASK(pMask);
    509 
    510         // store new mask
    511         STORE(pMask, ppMask);
    512     }
    513 
    514     Function* Create(const BLEND_COMPILE_STATE& state)
    515     {
    516         std::stringstream fnName("BLND_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
    517         fnName << ComputeCRC(0, &state, sizeof(state));
    518 
    519         // blend function signature
    520         //typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, uint8_t*, simdvector&, simdscalari*, simdscalari*);
    521 
    522         std::vector<Type*> args{
    523             PointerType::get(Gen_SWR_BLEND_STATE(JM()), 0), // SWR_BLEND_STATE*
    524             PointerType::get(mSimdFP32Ty, 0),               // simdvector& src
    525             PointerType::get(mSimdFP32Ty, 0),               // simdvector& src1
    526             PointerType::get(mSimdFP32Ty, 0),               // src0alpha
    527             Type::getInt32Ty(JM()->mContext),               // sampleNum
    528             PointerType::get(mSimdFP32Ty, 0),               // uint8_t* pDst
    529             PointerType::get(mSimdFP32Ty, 0),               // simdvector& result
    530             PointerType::get(mSimdInt32Ty, 0),              // simdscalari* oMask
    531             PointerType::get(mSimdInt32Ty, 0),              // simdscalari* pMask
    532         };
    533 
    534         FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
    535         Function* blendFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
    536         blendFunc->getParent()->setModuleIdentifier(blendFunc->getName());
    537 
    538         BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc);
    539 
    540         IRB()->SetInsertPoint(entry);
    541 
    542         // arguments
    543         auto argitr = blendFunc->arg_begin();
    544         Value* pBlendState = &*argitr++;
    545         pBlendState->setName("pBlendState");
    546         Value* pSrc = &*argitr++;
    547         pSrc->setName("src");
    548         Value* pSrc1 = &*argitr++;
    549         pSrc1->setName("src1");
    550         Value* pSrc0Alpha = &*argitr++;
    551         pSrc0Alpha->setName("src0alpha");
    552         Value* sampleNum = &*argitr++;
    553         sampleNum->setName("sampleNum");
    554         Value* pDst = &*argitr++;
    555         pDst->setName("pDst");
    556         Value* pResult = &*argitr++;
    557         pResult->setName("result");
    558         Value* ppoMask = &*argitr++;
    559         ppoMask->setName("ppoMask");
    560         Value* ppMask = &*argitr++;
    561         ppMask->setName("pMask");
    562 
    563         static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
    564         Value* dst[4];
    565         Value* constantColor[4];
    566         Value* src[4];
    567         Value* src1[4];
    568         Value* result[4];
    569         for (uint32_t i = 0; i < 4; ++i)
    570         {
    571             // load hot tile
    572             dst[i] = LOAD(pDst, { i });
    573 
    574             // load constant color
    575             constantColor[i] = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_constantColor, i }));
    576 
    577             // load src
    578             src[i] = LOAD(pSrc, { i });
    579 
    580             // load src1
    581             src1[i] = LOAD(pSrc1, { i });
    582         }
    583         Value* currentSampleMask = VIMMED1(-1);
    584         if (state.desc.alphaToCoverageEnable)
    585         {
    586             Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
    587             uint32_t bits = (1 << state.desc.numSamples) - 1;
    588             currentSampleMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
    589             currentSampleMask = FP_TO_SI(FADD(currentSampleMask, VIMMED1(0.5f)), mSimdInt32Ty);
    590         }
    591 
    592         // alpha test
    593         if (state.desc.alphaTestEnable)
    594         {
    595             AlphaTest(state, pBlendState, pSrc0Alpha, ppMask);
    596         }
    597 
    598         // color blend
    599         if (state.blendState.blendEnable)
    600         {
    601             // clamp sources
    602             Clamp(state.format, src);
    603             Clamp(state.format, src1);
    604             Clamp(state.format, dst);
    605             Clamp(state.format, constantColor);
    606 
    607             // apply defaults to hottile contents to take into account missing components
    608             ApplyDefaults(state.format, dst);
    609 
    610             // Force defaults for unused 'X' components
    611             ApplyUnusedDefaults(state.format, dst);
    612 
    613             // Quantize low precision components
    614             Quantize(state.format, dst);
    615 
    616             // special case clamping for R11G11B10_float which has no sign bit
    617             if (state.format == R11G11B10_FLOAT)
    618             {
    619                 dst[0] = VMAXPS(dst[0], VIMMED1(0.0f));
    620                 dst[1] = VMAXPS(dst[1], VIMMED1(0.0f));
    621                 dst[2] = VMAXPS(dst[2], VIMMED1(0.0f));
    622                 dst[3] = VMAXPS(dst[3], VIMMED1(0.0f));
    623             }
    624 
    625             Value* srcFactor[4];
    626             Value* dstFactor[4];
    627             if (state.desc.independentAlphaBlendEnable)
    628             {
    629                 GenerateBlendFactor<true, false>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
    630                 GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor, constantColor, src, src1, dst, srcFactor);
    631 
    632                 GenerateBlendFactor<true, false>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
    633                 GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor, constantColor, src, src1, dst, dstFactor);
    634 
    635                 BlendFunc<true, false>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
    636                 BlendFunc<false, true>(state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
    637             }
    638             else
    639             {
    640                 GenerateBlendFactor<true, true>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
    641                 GenerateBlendFactor<true, true>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
    642 
    643                 BlendFunc<true, true>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
    644             }
    645 
    646             // store results out
    647             for (uint32_t i = 0; i < 4; ++i)
    648             {
    649                 STORE(result[i], pResult, { i });
    650             }
    651         }
    652 
    653         if(state.blendState.logicOpEnable)
    654         {
    655             const SWR_FORMAT_INFO& info = GetFormatInfo(state.format);
    656             Value* vMask[4];
    657             float scale[4];
    658 
    659             if (!state.blendState.blendEnable)
    660             {
    661                 Clamp(state.format, src);
    662                 Clamp(state.format, dst);
    663             }
    664 
    665             for(uint32_t i = 0; i < 4; i++)
    666             {
    667                 if (info.type[i] == SWR_TYPE_UNUSED)
    668                 {
    669                     continue;
    670                 }
    671 
    672                 if (info.bpc[i] >= 32)
    673                 {
    674                     vMask[i] = VIMMED1(0xFFFFFFFF);
    675                     scale[i] = 0xFFFFFFFF;
    676                 }
    677                 else
    678                 {
    679                     vMask[i] = VIMMED1((1 << info.bpc[i]) - 1);
    680                     if (info.type[i] == SWR_TYPE_SNORM)
    681                         scale[i] = (1 << (info.bpc[i] - 1)) - 1;
    682                     else
    683                         scale[i] = (1 << info.bpc[i]) - 1;
    684                 }
    685 
    686                 switch (info.type[i])
    687                 {
    688                 default:
    689                     SWR_INVALID("Unsupported type for logic op: %d", info.type[i]);
    690                     break;
    691 
    692                 case SWR_TYPE_UNKNOWN:
    693                 case SWR_TYPE_UNUSED:
    694                     // fallthrough
    695 
    696                 case SWR_TYPE_UINT:
    697                 case SWR_TYPE_SINT:
    698                     src[i] = BITCAST(src[i], mSimdInt32Ty);
    699                     dst[i] = BITCAST(dst[i], mSimdInt32Ty);
    700                     break;
    701                 case SWR_TYPE_SNORM:
    702                     src[i] = FP_TO_SI(
    703                         FMUL(src[i], VIMMED1(scale[i])),
    704                         mSimdInt32Ty);
    705                     dst[i] = FP_TO_SI(
    706                         FMUL(dst[i], VIMMED1(scale[i])),
    707                         mSimdInt32Ty);
    708                     break;
    709                 case SWR_TYPE_UNORM:
    710                     src[i] = FP_TO_UI(
    711                         FMUL(src[i], VIMMED1(scale[i])),
    712                         mSimdInt32Ty);
    713                     dst[i] = FP_TO_UI(
    714                         FMUL(dst[i], VIMMED1(scale[i])),
    715                         mSimdInt32Ty);
    716                     break;
    717                 }
    718             }
    719 
    720             LogicOpFunc(state.blendState.logicOpFunc, src, dst, result);
    721 
    722             // store results out
    723             for(uint32_t i = 0; i < 4; ++i)
    724             {
    725                 if (info.type[i] == SWR_TYPE_UNUSED)
    726                 {
    727                     continue;
    728                 }
    729 
    730                 // clear upper bits from PS output not in RT format after doing logic op
    731                 result[i] = AND(result[i], vMask[i]);
    732 
    733                 switch (info.type[i])
    734                 {
    735                 default:
    736                     SWR_INVALID("Unsupported type for logic op: %d", info.type[i]);
    737                     break;
    738 
    739                 case SWR_TYPE_UNKNOWN:
    740                 case SWR_TYPE_UNUSED:
    741                     // fallthrough
    742 
    743                 case SWR_TYPE_UINT:
    744                 case SWR_TYPE_SINT:
    745                     result[i] = BITCAST(result[i], mSimdFP32Ty);
    746                     break;
    747                 case SWR_TYPE_SNORM:
    748                     result[i] = SHL(result[i], C(32 - info.bpc[i]));
    749                     result[i] = ASHR(result[i], C(32 - info.bpc[i]));
    750                     result[i] = FMUL(SI_TO_FP(result[i], mSimdFP32Ty),
    751                                      VIMMED1(1.0f / scale[i]));
    752                     break;
    753                 case SWR_TYPE_UNORM:
    754                     result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty),
    755                                      VIMMED1(1.0f / scale[i]));
    756                     break;
    757                 }
    758 
    759                 STORE(result[i], pResult, {i});
    760             }
    761         }
    762 
    763         if(state.desc.oMaskEnable)
    764         {
    765             assert(!(state.desc.alphaToCoverageEnable));
    766             // load current mask
    767             Value* oMask = LOAD(ppoMask);
    768             currentSampleMask = AND(oMask, currentSampleMask);
    769         }
    770 
    771         if(state.desc.sampleMaskEnable)
    772         {
    773             Value* sampleMask = LOAD(pBlendState, { 0, SWR_BLEND_STATE_sampleMask});
    774             currentSampleMask = AND(VBROADCAST(sampleMask), currentSampleMask);
    775         }
    776 
    777         if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
    778            state.desc.oMaskEnable)
    779         {
    780             // load coverage mask and mask off any lanes with no samples
    781             Value* pMask = LOAD(ppMask);
    782             Value* sampleMasked = SHL(C(1), sampleNum);
    783             currentSampleMask = AND(currentSampleMask, VBROADCAST(sampleMasked));
    784             currentSampleMask = S_EXT(ICMP_UGT(currentSampleMask, VBROADCAST(C(0))), mSimdInt32Ty);
    785             Value* outputMask = AND(pMask, currentSampleMask);
    786             // store new mask
    787             STORE(outputMask, GEP(ppMask, C(0)));
    788         }
    789 
    790         RET_VOID();
    791 
    792         JitManager::DumpToFile(blendFunc, "");
    793 
    794         ::FunctionPassManager passes(JM()->mpCurrentModule);
    795 
    796         passes.add(createBreakCriticalEdgesPass());
    797         passes.add(createCFGSimplificationPass());
    798         passes.add(createEarlyCSEPass());
    799         passes.add(createPromoteMemoryToRegisterPass());
    800         passes.add(createCFGSimplificationPass());
    801         passes.add(createEarlyCSEPass());
    802         passes.add(createInstructionCombiningPass());
    803         passes.add(createInstructionSimplifierPass());
    804         passes.add(createConstantPropagationPass());
    805         passes.add(createSCCPPass());
    806         passes.add(createAggressiveDCEPass());
    807 
    808         passes.run(*blendFunc);
    809 
    810         JitManager::DumpToFile(blendFunc, "optimized");
    811 
    812         return blendFunc;
    813     }
    814 };
    815 
    816 //////////////////////////////////////////////////////////////////////////
    817 /// @brief JITs from fetch shader IR
    818 /// @param hJitMgr - JitManager handle
    819 /// @param func   - LLVM function IR
    820 /// @return PFN_FETCH_FUNC - pointer to fetch code
    821 PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc)
    822 {
    823     const llvm::Function *func = (const llvm::Function*)hFunc;
    824     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
    825     PFN_BLEND_JIT_FUNC pfnBlend;
    826     pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
    827     // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
    828     pJitMgr->mIsModuleFinalized = true;
    829 
    830     return pfnBlend;
    831 }
    832 
    833 //////////////////////////////////////////////////////////////////////////
    834 /// @brief JIT compiles blend shader
    835 /// @param hJitMgr - JitManager handle
    836 /// @param state   - blend state to build function from
    837 extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, const BLEND_COMPILE_STATE& state)
    838 {
    839     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
    840 
    841     pJitMgr->SetupNewModule();
    842 
    843     BlendJit theJit(pJitMgr);
    844     HANDLE hFunc = theJit.Create(state);
    845 
    846     return JitBlendFunc(hJitMgr, hFunc);
    847 }
    848