Home | History | Annotate | Download | only in jitter
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file streamout_jit.cpp
     24 *
     25 * @brief Implementation of the streamout jitter
     26 *
     27 * Notes:
     28 *
     29 ******************************************************************************/
     30 #include "jit_pch.hpp"
     31 #include "builder.h"
     32 #include "jit_api.h"
     33 #include "streamout_jit.h"
     34 #include "gen_state_llvm.h"
     35 
     36 using namespace llvm;
     37 using namespace SwrJit;
     38 
     39 //////////////////////////////////////////////////////////////////////////
     40 /// Interface to Jitting a fetch shader
     41 //////////////////////////////////////////////////////////////////////////
     42 struct StreamOutJit : public Builder
     43 {
     44     StreamOutJit(JitManager* pJitMgr) : Builder(pJitMgr){};
     45 
     46     // returns pointer to SWR_STREAMOUT_BUFFER
     47     Value* getSOBuffer(Value* pSoCtx, uint32_t buffer)
     48     {
     49         return LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer });
     50     }
     51 
     52 
     53     //////////////////////////////////////////////////////////////////////////
     54     // @brief checks if streamout buffer is oob
     55     // @return <i1> true/false
     56     Value* oob(const STREAMOUT_COMPILE_STATE& state, Value* pSoCtx, uint32_t buffer)
     57     {
     58         Value* returnMask = C(false);
     59 
     60         Value* pBuf = getSOBuffer(pSoCtx, buffer);
     61 
     62         // load enable
     63         // @todo bool data types should generate <i1> llvm type
     64         Value* enabled = TRUNC(LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_enable }), IRB()->getInt1Ty());
     65 
     66         // load buffer size
     67         Value* bufferSize = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_bufferSize });
     68 
     69         // load current streamOffset
     70         Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
     71 
     72         // load buffer pitch
     73         Value* pitch = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch });
     74 
     75         // buffer is considered oob if in use in a decl but not enabled
     76         returnMask = OR(returnMask, NOT(enabled));
     77 
     78         // buffer is oob if cannot fit a prims worth of verts
     79         Value* newOffset = ADD(streamOffset, MUL(pitch, C(state.numVertsPerPrim)));
     80         returnMask = OR(returnMask, ICMP_SGT(newOffset, bufferSize));
     81 
     82         return returnMask;
     83     }
     84 
     85 
     86     //////////////////////////////////////////////////////////////////////////
     87     // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector,
     88     //        packing the active mask bits
     89     //        ex. bitmask 0011 -> (0, 1, 0, 0)
     90     //            bitmask 1000 -> (3, 0, 0, 0)
     91     //            bitmask 1100 -> (2, 3, 0, 0)
     92     Value* PackMask(uint32_t bitmask)
     93     {
     94         std::vector<Constant*> indices(4, C(0));
     95         DWORD index;
     96         uint32_t elem = 0;
     97         while (_BitScanForward(&index, bitmask))
     98         {
     99             indices[elem++] = C((int)index);
    100             bitmask &= ~(1 << index);
    101         }
    102 
    103         return ConstantVector::get(indices);
    104     }
    105 
    106     //////////////////////////////////////////////////////////////////////////
    107     // @brief convert scalar bitmask to <4xfloat> bitmask
    108     Value* ToMask(uint32_t bitmask)
    109     {
    110         std::vector<Constant*> indices;
    111         for (uint32_t i = 0; i < 4; ++i)
    112         {
    113             if (bitmask & (1 << i))
    114             {
    115                 indices.push_back(C(-1.0f));
    116             }
    117             else
    118             {
    119                 indices.push_back(C(0.0f));
    120             }
    121         }
    122         return ConstantVector::get(indices);
    123     }
    124 
    125     //////////////////////////////////////////////////////////////////////////
    126     // @brief processes a single decl from the streamout stream. Reads 4 components from the input
    127     //        stream and writes N components to the output buffer given the componentMask or if
    128     //        a hole, just increments the buffer pointer
    129     // @param pStream - pointer to current attribute
    130     // @param pOutBuffers - pointers to the current location of each output buffer
    131     // @param decl - input decl
    132     void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl)
    133     {
    134         // @todo add this to x86 macros
    135         Function* maskStore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskstore_ps);
    136 
    137         uint32_t numComponents = _mm_popcnt_u32(decl.componentMask);
    138         uint32_t packedMask = (1 << numComponents) - 1;
    139         if (!decl.hole)
    140         {
    141             // increment stream pointer to correct slot
    142             Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot));
    143 
    144             // load 4 components from stream
    145             Type* simd4Ty = VectorType::get(IRB()->getFloatTy(), 4);
    146             Type* simd4PtrTy = PointerType::get(simd4Ty, 0);
    147             pAttrib = BITCAST(pAttrib, simd4PtrTy);
    148             Value *vattrib = LOAD(pAttrib);
    149 
    150             // shuffle/pack enabled components
    151             Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask));
    152 
    153             // store to output buffer
    154             // cast SO buffer to i8*, needed by maskstore
    155             Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(mInt8Ty, 0));
    156 
    157             // cast input to <4xfloat>
    158             Value* src = BITCAST(vpackedAttrib, simd4Ty);
    159 
    160             // cast mask to <4xint>
    161             Value* mask = ToMask(packedMask);
    162             mask = BITCAST(mask, VectorType::get(IRB()->getInt32Ty(), 4));
    163             CALL(maskStore, {pOut, mask, src});
    164         }
    165 
    166         // increment SO buffer
    167         pOutBuffers[decl.bufferIndex] = GEP(pOutBuffers[decl.bufferIndex], C(numComponents));
    168     }
    169 
    170     //////////////////////////////////////////////////////////////////////////
    171     // @brief builds a single vertex worth of data for the given stream
    172     // @param streamState - state for this stream
    173     // @param pCurVertex - pointer to src stream vertex data
    174     // @param pOutBuffer - pointers to up to 4 SO buffers
    175     void buildVertex(const STREAMOUT_STREAM& streamState, Value* pCurVertex, Value* pOutBuffer[4])
    176     {
    177         for (uint32_t d = 0; d < streamState.numDecls; ++d)
    178         {
    179             const STREAMOUT_DECL& decl = streamState.decl[d];
    180             buildDecl(pCurVertex, pOutBuffer, decl);
    181         }
    182     }
    183 
    184     void buildStream(const STREAMOUT_COMPILE_STATE& state, const STREAMOUT_STREAM& streamState, Value* pSoCtx, BasicBlock* returnBB, Function* soFunc)
    185     {
    186         // get list of active SO buffers
    187         std::unordered_set<uint32_t> activeSOBuffers;
    188         for (uint32_t d = 0; d < streamState.numDecls; ++d)
    189         {
    190             const STREAMOUT_DECL& decl = streamState.decl[d];
    191             activeSOBuffers.insert(decl.bufferIndex);
    192         }
    193 
    194         // always increment numPrimStorageNeeded
    195         Value *numPrimStorageNeeded = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded });
    196         numPrimStorageNeeded = ADD(numPrimStorageNeeded, C(1));
    197         STORE(numPrimStorageNeeded, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded });
    198 
    199         // check OOB on active SO buffers.  If any buffer is out of bound, don't write
    200         // the primitive to any buffer
    201         Value* oobMask = C(false);
    202         for (uint32_t buffer : activeSOBuffers)
    203         {
    204             oobMask = OR(oobMask, oob(state, pSoCtx, buffer));
    205         }
    206 
    207         BasicBlock* validBB = BasicBlock::Create(JM()->mContext, "valid", soFunc);
    208 
    209         // early out if OOB
    210         COND_BR(oobMask, returnBB, validBB);
    211 
    212         IRB()->SetInsertPoint(validBB);
    213 
    214         Value* numPrimsWritten = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten });
    215         numPrimsWritten = ADD(numPrimsWritten, C(1));
    216         STORE(numPrimsWritten, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten });
    217 
    218         // compute start pointer for each output buffer
    219         Value* pOutBuffer[4];
    220         Value* pOutBufferStartVertex[4];
    221         Value* outBufferPitch[4];
    222         for (uint32_t b: activeSOBuffers)
    223         {
    224             Value* pBuf = getSOBuffer(pSoCtx, b);
    225             Value* pData = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pBuffer });
    226             Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
    227             pOutBuffer[b] = GEP(pData, streamOffset);
    228             pOutBufferStartVertex[b] = pOutBuffer[b];
    229 
    230             outBufferPitch[b] = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch });
    231         }
    232 
    233         // loop over the vertices of the prim
    234         Value* pStreamData = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pPrimData });
    235         for (uint32_t v = 0; v < state.numVertsPerPrim; ++v)
    236         {
    237             buildVertex(streamState, pStreamData, pOutBuffer);
    238 
    239             // increment stream and output buffer pointers
    240             // stream verts are always 32*4 dwords apart
    241             pStreamData = GEP(pStreamData, C(SWR_VTX_NUM_SLOTS * 4));
    242 
    243             // output buffers offset using pitch in buffer state
    244             for (uint32_t b : activeSOBuffers)
    245             {
    246                 pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]);
    247                 pOutBuffer[b] = pOutBufferStartVertex[b];
    248             }
    249         }
    250 
    251         // update each active buffer's streamOffset
    252         for (uint32_t b : activeSOBuffers)
    253         {
    254             Value* pBuf = getSOBuffer(pSoCtx, b);
    255             Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
    256             streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b]));
    257             STORE(streamOffset, pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
    258         }
    259     }
    260 
    261     Function* Create(const STREAMOUT_COMPILE_STATE& state)
    262     {
    263         std::stringstream fnName("SO_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
    264         fnName << ComputeCRC(0, &state, sizeof(state));
    265 
    266         // SO function signature
    267         // typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT*)
    268 
    269         std::vector<Type*> args{
    270             PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
    271         };
    272 
    273         FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
    274         Function* soFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
    275 
    276         soFunc->getParent()->setModuleIdentifier(soFunc->getName());
    277 
    278         // create return basic block
    279         BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", soFunc);
    280         BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc);
    281 
    282         IRB()->SetInsertPoint(entry);
    283 
    284         // arguments
    285         auto argitr = soFunc->arg_begin();
    286         Value* pSoCtx = &*argitr++;
    287         pSoCtx->setName("pSoCtx");
    288 
    289         const STREAMOUT_STREAM& streamState = state.stream;
    290         buildStream(state, streamState, pSoCtx, returnBB, soFunc);
    291 
    292         BR(returnBB);
    293 
    294         IRB()->SetInsertPoint(returnBB);
    295         RET_VOID();
    296 
    297         JitManager::DumpToFile(soFunc, "SoFunc");
    298 
    299         ::FunctionPassManager passes(JM()->mpCurrentModule);
    300 
    301         passes.add(createBreakCriticalEdgesPass());
    302         passes.add(createCFGSimplificationPass());
    303         passes.add(createEarlyCSEPass());
    304         passes.add(createPromoteMemoryToRegisterPass());
    305         passes.add(createCFGSimplificationPass());
    306         passes.add(createEarlyCSEPass());
    307         passes.add(createInstructionCombiningPass());
    308         passes.add(createInstructionSimplifierPass());
    309         passes.add(createConstantPropagationPass());
    310         passes.add(createSCCPPass());
    311         passes.add(createAggressiveDCEPass());
    312 
    313         passes.run(*soFunc);
    314 
    315         JitManager::DumpToFile(soFunc, "SoFunc_optimized");
    316 
    317         return soFunc;
    318     }
    319 };
    320 
    321 //////////////////////////////////////////////////////////////////////////
    322 /// @brief JITs from streamout shader IR
    323 /// @param hJitMgr - JitManager handle
    324 /// @param func   - LLVM function IR
    325 /// @return PFN_SO_FUNC - pointer to SOS function
    326 PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc)
    327 {
    328     const llvm::Function *func = (const llvm::Function*)hFunc;
    329     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
    330     PFN_SO_FUNC pfnStreamOut;
    331     pfnStreamOut = (PFN_SO_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
    332     // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
    333     pJitMgr->mIsModuleFinalized = true;
    334 
    335     return pfnStreamOut;
    336 }
    337 
    338 //////////////////////////////////////////////////////////////////////////
    339 /// @brief JIT compiles streamout shader
    340 /// @param hJitMgr - JitManager handle
    341 /// @param state   - SO state to build function from
    342 extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitMgr, const STREAMOUT_COMPILE_STATE& state)
    343 {
    344     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
    345 
    346     STREAMOUT_COMPILE_STATE soState = state;
    347     if (soState.offsetAttribs)
    348     {
    349         for (uint32_t i = 0; i < soState.stream.numDecls; ++i)
    350         {
    351             soState.stream.decl[i].attribSlot -= soState.offsetAttribs;
    352         }
    353     }
    354 
    355     pJitMgr->SetupNewModule();
    356 
    357     StreamOutJit theJit(pJitMgr);
    358     HANDLE hFunc = theJit.Create(soState);
    359 
    360     return JitStreamoutFunc(hJitMgr, hFunc);
    361 }
    362