Home | History | Annotate | Download | only in jitter
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file blend_jit.cpp
     24 *
     25 * @brief Implementation of the blend jitter
     26 *
     27 * Notes:
     28 *
     29 ******************************************************************************/
     30 #include "jit_api.h"
     31 #include "blend_jit.h"
     32 #include "builder.h"
     33 #include "state_llvm.h"
     34 
     35 #include <sstream>
     36 
     37 // components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized
     38 #define QUANTIZE_THRESHOLD 2
     39 
     40 using namespace llvm;
     41 using namespace SwrJit;
     42 
     43 //////////////////////////////////////////////////////////////////////////
     44 /// Interface to Jitting a blend shader
     45 //////////////////////////////////////////////////////////////////////////
     46 struct BlendJit : public Builder
     47 {
     48     BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){};
     49 
     50     template<bool Color, bool Alpha>
     51     void GenerateBlendFactor(SWR_BLEND_FACTOR factor, Value* constColor[4], Value* src[4], Value* src1[4], Value* dst[4], Value* result[4])
     52     {
     53         Value* out[4];
     54 
     55         switch (factor)
     56         {
     57         case BLENDFACTOR_ONE:
     58             out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f);
     59             break;
     60         case BLENDFACTOR_SRC_COLOR:
     61             out[0] = src[0];
     62             out[1] = src[1];
     63             out[2] = src[2];
     64             out[3] = src[3];
     65             break;
     66         case BLENDFACTOR_SRC_ALPHA:
     67             out[0] = out[1] = out[2] = out[3] = src[3];
     68             break;
     69         case BLENDFACTOR_DST_ALPHA:
     70             out[0] = out[1] = out[2] = out[3] = dst[3];
     71             break;
     72         case BLENDFACTOR_DST_COLOR:
     73             out[0] = dst[0];
     74             out[1] = dst[1];
     75             out[2] = dst[2];
     76             out[3] = dst[3];
     77             break;
     78         case BLENDFACTOR_SRC_ALPHA_SATURATE:
     79             out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3]));
     80             out[3] = VIMMED1(1.0f);
     81             break;
     82         case BLENDFACTOR_CONST_COLOR:
     83             out[0] = constColor[0];
     84             out[1] = constColor[1];
     85             out[2] = constColor[2];
     86             out[3] = constColor[3];
     87             break;
     88         case BLENDFACTOR_CONST_ALPHA:
     89             out[0] = out[1] = out[2] = out[3] = constColor[3];
     90             break;
     91         case BLENDFACTOR_SRC1_COLOR:
     92             out[0] = src1[0];
     93             out[1] = src1[1];
     94             out[2] = src1[2];
     95             out[3] = src1[3];
     96             break;
     97         case BLENDFACTOR_SRC1_ALPHA:
     98             out[0] = out[1] = out[2] = out[3] = src1[3];
     99             break;
    100         case BLENDFACTOR_ZERO:
    101             out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
    102             break;
    103         case BLENDFACTOR_INV_SRC_COLOR:
    104             out[0] = FSUB(VIMMED1(1.0f), src[0]);
    105             out[1] = FSUB(VIMMED1(1.0f), src[1]);
    106             out[2] = FSUB(VIMMED1(1.0f), src[2]);
    107             out[3] = FSUB(VIMMED1(1.0f), src[3]);
    108             break;
    109         case BLENDFACTOR_INV_SRC_ALPHA:
    110             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]);
    111             break;
    112         case BLENDFACTOR_INV_DST_ALPHA:
    113             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]);
    114             break;
    115         case BLENDFACTOR_INV_DST_COLOR:
    116             out[0] = FSUB(VIMMED1(1.0f), dst[0]);
    117             out[1] = FSUB(VIMMED1(1.0f), dst[1]);
    118             out[2] = FSUB(VIMMED1(1.0f), dst[2]);
    119             out[3] = FSUB(VIMMED1(1.0f), dst[3]);
    120             break;
    121         case BLENDFACTOR_INV_CONST_COLOR:
    122             out[0] = FSUB(VIMMED1(1.0f), constColor[0]);
    123             out[1] = FSUB(VIMMED1(1.0f), constColor[1]);
    124             out[2] = FSUB(VIMMED1(1.0f), constColor[2]);
    125             out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
    126             break;
    127         case BLENDFACTOR_INV_CONST_ALPHA:
    128             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
    129             break;
    130         case BLENDFACTOR_INV_SRC1_COLOR:
    131             out[0] = FSUB(VIMMED1(1.0f), src1[0]);
    132             out[1] = FSUB(VIMMED1(1.0f), src1[1]);
    133             out[2] = FSUB(VIMMED1(1.0f), src1[2]);
    134             out[3] = FSUB(VIMMED1(1.0f), src1[3]);
    135             break;
    136         case BLENDFACTOR_INV_SRC1_ALPHA:
    137             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]);
    138             break;
    139         default:
    140             SWR_ASSERT(false, "Unsupported blend factor: %d", factor);
    141             out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
    142             break;
    143         }
    144 
    145         if (Color)
    146         {
    147             result[0] = out[0];
    148             result[1] = out[1];
    149             result[2] = out[2];
    150         }
    151 
    152         if (Alpha)
    153         {
    154             result[3] = out[3];
    155         }
    156     }
    157 
    158     void Clamp(SWR_FORMAT format, Value* src[4])
    159     {
    160         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    161         SWR_TYPE type = info.type[0];
    162 
    163         switch (type)
    164         {
    165         case SWR_TYPE_FLOAT:
    166             break;
    167 
    168         case SWR_TYPE_UNORM:
    169             src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f));
    170             src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f));
    171             src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f));
    172             src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f));
    173             break;
    174 
    175         case SWR_TYPE_SNORM:
    176             src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f));
    177             src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f));
    178             src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f));
    179             src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f));
    180             break;
    181 
    182         default: SWR_ASSERT(false, "Unsupport format type: %d", type);
    183         }
    184     }
    185 
    186     void ApplyDefaults(SWR_FORMAT format, Value* src[4])
    187     {
    188         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    189 
    190         bool valid[] = { false, false, false, false };
    191         for (uint32_t c = 0; c < info.numComps; ++c)
    192         {
    193             valid[info.swizzle[c]] = true;
    194         }
    195 
    196         for (uint32_t c = 0; c < 4; ++c)
    197         {
    198             if (!valid[c])
    199             {
    200                 src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty);
    201             }
    202         }
    203     }
    204 
    205     void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4])
    206     {
    207         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    208 
    209         for (uint32_t c = 0; c < info.numComps; ++c)
    210         {
    211             if (info.type[c] == SWR_TYPE_UNUSED)
    212             {
    213                 src[info.swizzle[c]] = BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty);
    214             }
    215         }
    216     }
    217 
    218     void Quantize(SWR_FORMAT format, Value* src[4])
    219     {
    220         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    221         for (uint32_t c = 0; c < info.numComps; ++c)
    222         {
    223             if (info.bpc[c] <= QUANTIZE_THRESHOLD && info.type[c] != SWR_TYPE_UNUSED)
    224             {
    225                 uint32_t swizComp = info.swizzle[c];
    226                 float factor = (float)((1 << info.bpc[c]) - 1);
    227                 switch (info.type[c])
    228                 {
    229                 case SWR_TYPE_UNORM:
    230                     src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f));
    231                     src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO));
    232                     src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f /factor));
    233                     break;
    234                 default: SWR_ASSERT(false, "Unsupported format type: %d", info.type[c]);
    235                 }
    236             }
    237         }
    238     }
    239 
    240     template<bool Color, bool Alpha>
    241     void BlendFunc(SWR_BLEND_OP blendOp, Value* src[4], Value* srcFactor[4], Value* dst[4], Value* dstFactor[4], Value* result[4])
    242     {
    243         Value* out[4];
    244         Value* srcBlend[4];
    245         Value* dstBlend[4];
    246         for (uint32_t i = 0; i < 4; ++i)
    247         {
    248             srcBlend[i] = FMUL(src[i], srcFactor[i]);
    249             dstBlend[i] = FMUL(dst[i], dstFactor[i]);
    250         }
    251 
    252         switch (blendOp)
    253         {
    254         case BLENDOP_ADD:
    255             out[0] = FADD(srcBlend[0], dstBlend[0]);
    256             out[1] = FADD(srcBlend[1], dstBlend[1]);
    257             out[2] = FADD(srcBlend[2], dstBlend[2]);
    258             out[3] = FADD(srcBlend[3], dstBlend[3]);
    259             break;
    260 
    261         case BLENDOP_SUBTRACT:
    262             out[0] = FSUB(srcBlend[0], dstBlend[0]);
    263             out[1] = FSUB(srcBlend[1], dstBlend[1]);
    264             out[2] = FSUB(srcBlend[2], dstBlend[2]);
    265             out[3] = FSUB(srcBlend[3], dstBlend[3]);
    266             break;
    267 
    268         case BLENDOP_REVSUBTRACT:
    269             out[0] = FSUB(dstBlend[0], srcBlend[0]);
    270             out[1] = FSUB(dstBlend[1], srcBlend[1]);
    271             out[2] = FSUB(dstBlend[2], srcBlend[2]);
    272             out[3] = FSUB(dstBlend[3], srcBlend[3]);
    273             break;
    274 
    275         case BLENDOP_MIN:
    276             out[0] = VMINPS(src[0], dst[0]);
    277             out[1] = VMINPS(src[1], dst[1]);
    278             out[2] = VMINPS(src[2], dst[2]);
    279             out[3] = VMINPS(src[3], dst[3]);
    280             break;
    281 
    282         case BLENDOP_MAX:
    283             out[0] = VMAXPS(src[0], dst[0]);
    284             out[1] = VMAXPS(src[1], dst[1]);
    285             out[2] = VMAXPS(src[2], dst[2]);
    286             out[3] = VMAXPS(src[3], dst[3]);
    287             break;
    288 
    289         default:
    290             SWR_ASSERT(false, "Unsupported blend operation: %d", blendOp);
    291             out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
    292             break;
    293         }
    294 
    295         if (Color)
    296         {
    297             result[0] = out[0];
    298             result[1] = out[1];
    299             result[2] = out[2];
    300         }
    301 
    302         if (Alpha)
    303         {
    304             result[3] = out[3];
    305         }
    306     }
    307 
    308     void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4])
    309     {
    310         // Op: (s == PS output, d = RT contents)
    311         switch(logicOp)
    312         {
    313         case LOGICOP_CLEAR:
    314             result[0] = VIMMED1(0);
    315             result[1] = VIMMED1(0);
    316             result[2] = VIMMED1(0);
    317             result[3] = VIMMED1(0);
    318             break;
    319 
    320         case LOGICOP_NOR:
    321             // ~(s | d)
    322             result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
    323             result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
    324             result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
    325             result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
    326             break;
    327 
    328         case LOGICOP_AND_INVERTED:
    329             // ~s & d
    330             // todo: use avx andnot instr when I can find the intrinsic to call
    331             result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
    332             result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
    333             result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
    334             result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
    335             break;
    336 
    337         case LOGICOP_COPY_INVERTED:
    338             // ~s
    339             result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF));
    340             result[1] = XOR(src[1], VIMMED1(0xFFFFFFFF));
    341             result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF));
    342             result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF));
    343             break;
    344 
    345         case LOGICOP_AND_REVERSE:
    346             // s & ~d
    347             // todo: use avx andnot instr when I can find the intrinsic to call
    348             result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
    349             result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
    350             result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
    351             result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
    352             break;
    353 
    354         case LOGICOP_INVERT:
    355             // ~d
    356             result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF));
    357             result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF));
    358             result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF));
    359             result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF));
    360             break;
    361 
    362         case LOGICOP_XOR:
    363             // s ^ d
    364             result[0] = XOR(src[0], dst[0]);
    365             result[1] = XOR(src[1], dst[1]);
    366             result[2] = XOR(src[2], dst[2]);
    367             result[3] = XOR(src[3], dst[3]);
    368             break;
    369 
    370         case LOGICOP_NAND:
    371             // ~(s & d)
    372             result[0] = XOR(AND(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
    373             result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
    374             result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
    375             result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
    376             break;
    377 
    378         case LOGICOP_AND:
    379             // s & d
    380             result[0] = AND(src[0], dst[0]);
    381             result[1] = AND(src[1], dst[1]);
    382             result[2] = AND(src[2], dst[2]);
    383             result[3] = AND(src[3], dst[3]);
    384             break;
    385 
    386         case LOGICOP_EQUIV:
    387             // ~(s ^ d)
    388             result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
    389             result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
    390             result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
    391             result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
    392             break;
    393 
    394         case LOGICOP_NOOP:
    395             result[0] = dst[0];
    396             result[1] = dst[1];
    397             result[2] = dst[2];
    398             result[3] = dst[3];
    399             break;
    400 
    401         case LOGICOP_OR_INVERTED:
    402             // ~s | d
    403             result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
    404             result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
    405             result[2] = OR(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
    406             result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
    407             break;
    408 
    409         case LOGICOP_COPY:
    410             result[0] = src[0];
    411             result[1] = src[1];
    412             result[2] = src[2];
    413             result[3] = src[3];
    414             break;
    415 
    416         case LOGICOP_OR_REVERSE:
    417             // s | ~d
    418             result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
    419             result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
    420             result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
    421             result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
    422             break;
    423 
    424         case LOGICOP_OR:
    425             // s | d
    426             result[0] = OR(src[0], dst[0]);
    427             result[1] = OR(src[1], dst[1]);
    428             result[2] = OR(src[2], dst[2]);
    429             result[3] = OR(src[3], dst[3]);
    430             break;
    431 
    432         case LOGICOP_SET:
    433             result[0] = VIMMED1(0xFFFFFFFF);
    434             result[1] = VIMMED1(0xFFFFFFFF);
    435             result[2] = VIMMED1(0xFFFFFFFF);
    436             result[3] = VIMMED1(0xFFFFFFFF);
    437             break;
    438 
    439         default:
    440             SWR_ASSERT(false, "Unsupported logic operation: %d", logicOp);
    441             result[0] = result[1] = result[2] = result[3] = VIMMED1(0.0f);
    442             break;
    443         }
    444     }
    445 
    446     void AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask)
    447     {
    448         // load uint32_t reference
    449         Value* pRef = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_alphaTestReference }));
    450 
    451         // load alpha
    452         Value* pAlpha = LOAD(ppAlpha);
    453 
    454         Value* pTest = nullptr;
    455         if (state.alphaTestFormat == ALPHA_TEST_UNORM8)
    456         {
    457             // convert float alpha to unorm8
    458             Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f));
    459             pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty);
    460 
    461             // compare
    462             switch (state.alphaTestFunction)
    463             {
    464             case ZFUNC_ALWAYS:  pTest = VIMMED1(true); break;
    465             case ZFUNC_NEVER:   pTest = VIMMED1(false); break;
    466             case ZFUNC_LT:      pTest = ICMP_ULT(pAlphaU8, pRef); break;
    467             case ZFUNC_EQ:      pTest = ICMP_EQ(pAlphaU8, pRef); break;
    468             case ZFUNC_LE:      pTest = ICMP_ULE(pAlphaU8, pRef); break;
    469             case ZFUNC_GT:      pTest = ICMP_UGT(pAlphaU8, pRef); break;
    470             case ZFUNC_NE:      pTest = ICMP_NE(pAlphaU8, pRef); break;
    471             case ZFUNC_GE:      pTest = ICMP_UGE(pAlphaU8, pRef); break;
    472             default:
    473                 SWR_ASSERT(false, "Invalid alpha test function");
    474                 break;
    475             }
    476         }
    477         else
    478         {
    479             // cast ref to float
    480             pRef = BITCAST(pRef, mSimdFP32Ty);
    481 
    482             // compare
    483             switch (state.alphaTestFunction)
    484             {
    485             case ZFUNC_ALWAYS:  pTest = VIMMED1(true); break;
    486             case ZFUNC_NEVER:   pTest = VIMMED1(false); break;
    487             case ZFUNC_LT:      pTest = FCMP_OLT(pAlpha, pRef); break;
    488             case ZFUNC_EQ:      pTest = FCMP_OEQ(pAlpha, pRef); break;
    489             case ZFUNC_LE:      pTest = FCMP_OLE(pAlpha, pRef); break;
    490             case ZFUNC_GT:      pTest = FCMP_OGT(pAlpha, pRef); break;
    491             case ZFUNC_NE:      pTest = FCMP_ONE(pAlpha, pRef); break;
    492             case ZFUNC_GE:      pTest = FCMP_OGE(pAlpha, pRef); break;
    493             default:
    494                 SWR_ASSERT(false, "Invalid alpha test function");
    495                 break;
    496             }
    497         }
    498 
    499         // load current mask
    500         Value* pMask = LOAD(ppMask);
    501 
    502         // convert to int1 mask
    503         pMask = MASK(pMask);
    504 
    505         // and with alpha test result
    506         pMask = AND(pMask, pTest);
    507 
    508         // convert back to vector mask
    509         pMask = VMASK(pMask);
    510 
    511         // store new mask
    512         STORE(pMask, ppMask);
    513     }
    514 
    515     Function* Create(const BLEND_COMPILE_STATE& state)
    516     {
    517         static std::size_t jitNum = 0;
    518 
    519         std::stringstream fnName("BlendShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
    520         fnName << jitNum++;
    521 
    522         // blend function signature
    523         //typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*);
    524 
    525         std::vector<Type*> args{
    526             PointerType::get(Gen_SWR_BLEND_STATE(JM()), 0), // SWR_BLEND_STATE*
    527             PointerType::get(mSimdFP32Ty, 0),               // simdvector& src
    528             PointerType::get(mSimdFP32Ty, 0),               // simdvector& src1
    529             PointerType::get(mSimdFP32Ty, 0),               // src0alpha
    530             Type::getInt32Ty(JM()->mContext),               // sampleNum
    531             PointerType::get(mSimdFP32Ty, 0),               // uint8_t* pDst
    532             PointerType::get(mSimdFP32Ty, 0),               // simdvector& result
    533             PointerType::get(mSimdInt32Ty, 0),              // simdscalari* oMask
    534             PointerType::get(mSimdInt32Ty, 0),              // simdscalari* pMask
    535         };
    536 
    537         FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
    538         Function* blendFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
    539 
    540         BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc);
    541 
    542         IRB()->SetInsertPoint(entry);
    543 
    544         // arguments
    545         auto argitr = blendFunc->arg_begin();
    546         Value* pBlendState = &*argitr++;
    547         pBlendState->setName("pBlendState");
    548         Value* pSrc = &*argitr++;
    549         pSrc->setName("src");
    550         Value* pSrc1 = &*argitr++;
    551         pSrc1->setName("src1");
    552         Value* pSrc0Alpha = &*argitr++;
    553         pSrc0Alpha->setName("src0alpha");
    554         Value* sampleNum = &*argitr++;
    555         sampleNum->setName("sampleNum");
    556         Value* pDst = &*argitr++;
    557         pDst->setName("pDst");
    558         Value* pResult = &*argitr++;
    559         pResult->setName("result");
    560         Value* ppoMask = &*argitr++;
    561         ppoMask->setName("ppoMask");
    562         Value* ppMask = &*argitr++;
    563         ppMask->setName("pMask");
    564 
    565         static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
    566         Value* dst[4];
    567         Value* constantColor[4];
    568         Value* src[4];
    569         Value* src1[4];
    570         Value* result[4];
    571         for (uint32_t i = 0; i < 4; ++i)
    572         {
    573             // load hot tile
    574             dst[i] = LOAD(pDst, { i });
    575 
    576             // load constant color
    577             constantColor[i] = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_constantColor, i }));
    578 
    579             // load src
    580             src[i] = LOAD(pSrc, { i });
    581 
    582             // load src1
    583             src1[i] = LOAD(pSrc1, { i });
    584         }
    585         Value* currentMask = VIMMED1(-1);
    586         if (state.desc.alphaToCoverageEnable)
    587         {
    588             Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
    589             uint32_t bits = (1 << state.desc.numSamples) - 1;
    590             currentMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
    591             currentMask = FP_TO_SI(FADD(currentMask, VIMMED1(0.5f)), mSimdInt32Ty);
    592         }
    593 
    594         // alpha test
    595         if (state.desc.alphaTestEnable)
    596         {
    597             AlphaTest(state, pBlendState, pSrc0Alpha, ppMask);
    598         }
    599 
    600         // color blend
    601         if (state.blendState.blendEnable)
    602         {
    603             // clamp sources
    604             Clamp(state.format, src);
    605             Clamp(state.format, src1);
    606             Clamp(state.format, dst);
    607             Clamp(state.format, constantColor);
    608 
    609             // apply defaults to hottile contents to take into account missing components
    610             ApplyDefaults(state.format, dst);
    611 
    612             // Force defaults for unused 'X' components
    613             ApplyUnusedDefaults(state.format, dst);
    614 
    615             // Quantize low precision components
    616             Quantize(state.format, dst);
    617 
    618             // special case clamping for R11G11B10_float which has no sign bit
    619             if (state.format == R11G11B10_FLOAT)
    620             {
    621                 dst[0] = VMAXPS(dst[0], VIMMED1(0.0f));
    622                 dst[1] = VMAXPS(dst[1], VIMMED1(0.0f));
    623                 dst[2] = VMAXPS(dst[2], VIMMED1(0.0f));
    624                 dst[3] = VMAXPS(dst[3], VIMMED1(0.0f));
    625             }
    626 
    627             Value* srcFactor[4];
    628             Value* dstFactor[4];
    629             if (state.desc.independentAlphaBlendEnable)
    630             {
    631                 GenerateBlendFactor<true, false>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
    632                 GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor, constantColor, src, src1, dst, srcFactor);
    633 
    634                 GenerateBlendFactor<true, false>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
    635                 GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor, constantColor, src, src1, dst, dstFactor);
    636 
    637                 BlendFunc<true, false>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
    638                 BlendFunc<false, true>(state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
    639             }
    640             else
    641             {
    642                 GenerateBlendFactor<true, true>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
    643                 GenerateBlendFactor<true, true>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
    644 
    645                 BlendFunc<true, true>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
    646             }
    647 
    648             // store results out
    649             for (uint32_t i = 0; i < 4; ++i)
    650             {
    651                 STORE(result[i], pResult, { i });
    652             }
    653         }
    654 
    655         if(state.blendState.logicOpEnable)
    656         {
    657             const SWR_FORMAT_INFO& info = GetFormatInfo(state.format);
    658             Value* vMask[4];
    659             float scale[4];
    660 
    661             if (!state.blendState.blendEnable)
    662             {
    663                 Clamp(state.format, src);
    664                 Clamp(state.format, dst);
    665             }
    666 
    667             for(uint32_t i = 0; i < 4; i++)
    668             {
    669                 if (info.type[i] == SWR_TYPE_UNUSED)
    670                 {
    671                     continue;
    672                 }
    673 
    674                 if (info.bpc[i] >= 32) {
    675                     vMask[i] = VIMMED1(0xFFFFFFFF);
    676                     scale[i] = 0xFFFFFFFF;
    677                 } else {
    678                     vMask[i] = VIMMED1((1 << info.bpc[i]) - 1);
    679                     if (info.type[i] == SWR_TYPE_SNORM)
    680                         scale[i] = (1 << (info.bpc[i] - 1)) - 1;
    681                     else
    682                         scale[i] = (1 << info.bpc[i]) - 1;
    683                 }
    684 
    685                 switch (info.type[i]) {
    686                 default:
    687                     SWR_ASSERT(0, "Unsupported type for logic op\n");
    688                     /* fallthrough */
    689                 case SWR_TYPE_UINT:
    690                 case SWR_TYPE_SINT:
    691                     src[i] = BITCAST(src[i], mSimdInt32Ty);
    692                     dst[i] = BITCAST(dst[i], mSimdInt32Ty);
    693                     break;
    694                 case SWR_TYPE_SNORM:
    695                     src[i] = FP_TO_SI(
    696                         FMUL(src[i], VIMMED1(scale[i])),
    697                         mSimdInt32Ty);
    698                     dst[i] = FP_TO_SI(
    699                         FMUL(dst[i], VIMMED1(scale[i])),
    700                         mSimdInt32Ty);
    701                     break;
    702                 case SWR_TYPE_UNORM:
    703                     src[i] = FP_TO_UI(
    704                         FMUL(src[i], VIMMED1(scale[i])),
    705                         mSimdInt32Ty);
    706                     dst[i] = FP_TO_UI(
    707                         FMUL(dst[i], VIMMED1(scale[i])),
    708                         mSimdInt32Ty);
    709                     break;
    710                 }
    711             }
    712 
    713             LogicOpFunc(state.blendState.logicOpFunc, src, dst, result);
    714 
    715             // store results out
    716             for(uint32_t i = 0; i < 4; ++i)
    717             {
    718                 if (info.type[i] == SWR_TYPE_UNUSED)
    719                 {
    720                     continue;
    721                 }
    722 
    723                 // clear upper bits from PS output not in RT format after doing logic op
    724                 result[i] = AND(result[i], vMask[i]);
    725 
    726                 switch (info.type[i]) {
    727                 default:
    728                     SWR_ASSERT(0, "Unsupported type for logic op\n");
    729                     /* fallthrough */
    730                 case SWR_TYPE_UINT:
    731                 case SWR_TYPE_SINT:
    732                     result[i] = BITCAST(result[i], mSimdFP32Ty);
    733                     break;
    734                 case SWR_TYPE_SNORM:
    735                     result[i] = SHL(result[i], C(32 - info.bpc[i]));
    736                     result[i] = ASHR(result[i], C(32 - info.bpc[i]));
    737                     result[i] = FMUL(SI_TO_FP(result[i], mSimdFP32Ty),
    738                                      VIMMED1(1.0f / scale[i]));
    739                     break;
    740                 case SWR_TYPE_UNORM:
    741                     result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty),
    742                                      VIMMED1(1.0f / scale[i]));
    743                     break;
    744                 }
    745 
    746                 STORE(result[i], pResult, {i});
    747             }
    748         }
    749 
    750         if(state.desc.oMaskEnable)
    751         {
    752             assert(!(state.desc.alphaToCoverageEnable));
    753             // load current mask
    754             Value* oMask = LOAD(ppoMask);
    755             Value* sampleMasked = VBROADCAST(SHL(C(1), sampleNum));
    756             oMask = AND(oMask, sampleMasked);
    757             currentMask = AND(oMask, currentMask);
    758         }
    759 
    760         if(state.desc.sampleMaskEnable)
    761         {
    762             Value* sampleMask = LOAD(pBlendState, { 0, SWR_BLEND_STATE_sampleMask});
    763             Value* sampleMasked = SHL(C(1), sampleNum);
    764             sampleMask = AND(sampleMask, sampleMasked);
    765             sampleMask = VBROADCAST(ICMP_SGT(sampleMask, C(0)));
    766             sampleMask = S_EXT(sampleMask, mSimdInt32Ty);
    767             currentMask = AND(sampleMask, currentMask);
    768         }
    769 
    770         if (state.desc.alphaToCoverageEnable)
    771         {
    772             Value* sampleMasked = SHL(C(1), sampleNum);
    773             currentMask = AND(currentMask, VBROADCAST(sampleMasked));
    774         }
    775 
    776         if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
    777            state.desc.oMaskEnable)
    778         {
    779             // load current mask
    780             Value* pMask = LOAD(ppMask);
    781             currentMask = S_EXT(ICMP_SGT(currentMask, VBROADCAST(C(0))), mSimdInt32Ty);
    782             Value* outputMask = AND(pMask, currentMask);
    783             // store new mask
    784             STORE(outputMask, GEP(ppMask, C(0)));
    785         }
    786 
    787         RET_VOID();
    788 
    789         JitManager::DumpToFile(blendFunc, "");
    790 
    791         ::FunctionPassManager passes(JM()->mpCurrentModule);
    792 
    793         passes.add(createBreakCriticalEdgesPass());
    794         passes.add(createCFGSimplificationPass());
    795         passes.add(createEarlyCSEPass());
    796         passes.add(createPromoteMemoryToRegisterPass());
    797         passes.add(createCFGSimplificationPass());
    798         passes.add(createEarlyCSEPass());
    799         passes.add(createInstructionCombiningPass());
    800         passes.add(createInstructionSimplifierPass());
    801         passes.add(createConstantPropagationPass());
    802         passes.add(createSCCPPass());
    803         passes.add(createAggressiveDCEPass());
    804 
    805         passes.run(*blendFunc);
    806 
    807         JitManager::DumpToFile(blendFunc, "optimized");
    808 
    809         return blendFunc;
    810     }
    811 };
    812 
    813 //////////////////////////////////////////////////////////////////////////
    814 /// @brief JITs from fetch shader IR
    815 /// @param hJitMgr - JitManager handle
    816 /// @param func   - LLVM function IR
    817 /// @return PFN_FETCH_FUNC - pointer to fetch code
    818 PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc)
    819 {
    820     const llvm::Function *func = (const llvm::Function*)hFunc;
    821     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
    822     PFN_BLEND_JIT_FUNC pfnBlend;
    823     pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
    824     // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
    825     pJitMgr->mIsModuleFinalized = true;
    826 
    827     return pfnBlend;
    828 }
    829 
    830 //////////////////////////////////////////////////////////////////////////
    831 /// @brief JIT compiles blend shader
    832 /// @param hJitMgr - JitManager handle
    833 /// @param state   - blend state to build function from
    834 extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, const BLEND_COMPILE_STATE& state)
    835 {
    836     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
    837 
    838     pJitMgr->SetupNewModule();
    839 
    840     BlendJit theJit(pJitMgr);
    841     HANDLE hFunc = theJit.Create(state);
    842 
    843     return JitBlendFunc(hJitMgr, hFunc);
    844 }
    845