1 /**************************************************************************** 2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file streamout_jit.cpp 24 * 25 * @brief Implementation of the streamout jitter 26 * 27 * Notes: 28 * 29 ******************************************************************************/ 30 #include "jit_pch.hpp" 31 #include "builder.h" 32 #include "jit_api.h" 33 #include "streamout_jit.h" 34 #include "gen_state_llvm.h" 35 36 using namespace llvm; 37 using namespace SwrJit; 38 39 ////////////////////////////////////////////////////////////////////////// 40 /// Interface to Jitting a fetch shader 41 ////////////////////////////////////////////////////////////////////////// 42 struct StreamOutJit : public Builder 43 { 44 StreamOutJit(JitManager* pJitMgr) : Builder(pJitMgr){}; 45 46 // returns pointer to SWR_STREAMOUT_BUFFER 47 Value* getSOBuffer(Value* pSoCtx, uint32_t buffer) 48 { 49 return LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer }); 50 } 51 52 53 ////////////////////////////////////////////////////////////////////////// 54 // @brief checks if streamout buffer is oob 55 // @return <i1> true/false 56 Value* oob(const STREAMOUT_COMPILE_STATE& state, Value* pSoCtx, uint32_t buffer) 57 { 58 Value* returnMask = C(false); 59 60 Value* pBuf = getSOBuffer(pSoCtx, buffer); 61 62 // load enable 63 // @todo bool data types should generate <i1> llvm type 64 Value* enabled = TRUNC(LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_enable }), IRB()->getInt1Ty()); 65 66 // load buffer size 67 Value* bufferSize = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_bufferSize }); 68 69 // load current streamOffset 70 Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); 71 72 // load buffer pitch 73 Value* pitch = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch }); 74 75 // buffer is considered oob if in use in a decl but not enabled 76 returnMask = OR(returnMask, NOT(enabled)); 77 78 // buffer is oob if cannot fit a prims worth of verts 79 Value* newOffset = ADD(streamOffset, MUL(pitch, C(state.numVertsPerPrim))); 80 returnMask = OR(returnMask, ICMP_SGT(newOffset, bufferSize)); 81 82 return returnMask; 83 } 84 85 86 ////////////////////////////////////////////////////////////////////////// 87 // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector, 88 // packing the active mask bits 89 // ex. bitmask 0011 -> (0, 1, 0, 0) 90 // bitmask 1000 -> (3, 0, 0, 0) 91 // bitmask 1100 -> (2, 3, 0, 0) 92 Value* PackMask(uint32_t bitmask) 93 { 94 std::vector<Constant*> indices(4, C(0)); 95 DWORD index; 96 uint32_t elem = 0; 97 while (_BitScanForward(&index, bitmask)) 98 { 99 indices[elem++] = C((int)index); 100 bitmask &= ~(1 << index); 101 } 102 103 return ConstantVector::get(indices); 104 } 105 106 ////////////////////////////////////////////////////////////////////////// 107 // @brief convert scalar bitmask to <4xfloat> bitmask 108 Value* ToMask(uint32_t bitmask) 109 { 110 std::vector<Constant*> indices; 111 for (uint32_t i = 0; i < 4; ++i) 112 { 113 if (bitmask & (1 << i)) 114 { 115 indices.push_back(C(-1.0f)); 116 } 117 else 118 { 119 indices.push_back(C(0.0f)); 120 } 121 } 122 return ConstantVector::get(indices); 123 } 124 125 ////////////////////////////////////////////////////////////////////////// 126 // @brief processes a single decl from the streamout stream. Reads 4 components from the input 127 // stream and writes N components to the output buffer given the componentMask or if 128 // a hole, just increments the buffer pointer 129 // @param pStream - pointer to current attribute 130 // @param pOutBuffers - pointers to the current location of each output buffer 131 // @param decl - input decl 132 void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl) 133 { 134 // @todo add this to x86 macros 135 Function* maskStore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskstore_ps); 136 137 uint32_t numComponents = _mm_popcnt_u32(decl.componentMask); 138 uint32_t packedMask = (1 << numComponents) - 1; 139 if (!decl.hole) 140 { 141 // increment stream pointer to correct slot 142 Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot)); 143 144 // load 4 components from stream 145 Type* simd4Ty = VectorType::get(IRB()->getFloatTy(), 4); 146 Type* simd4PtrTy = PointerType::get(simd4Ty, 0); 147 pAttrib = BITCAST(pAttrib, simd4PtrTy); 148 Value *vattrib = LOAD(pAttrib); 149 150 // shuffle/pack enabled components 151 Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask)); 152 153 // store to output buffer 154 // cast SO buffer to i8*, needed by maskstore 155 Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(mInt8Ty, 0)); 156 157 // cast input to <4xfloat> 158 Value* src = BITCAST(vpackedAttrib, simd4Ty); 159 160 // cast mask to <4xint> 161 Value* mask = ToMask(packedMask); 162 mask = BITCAST(mask, VectorType::get(IRB()->getInt32Ty(), 4)); 163 CALL(maskStore, {pOut, mask, src}); 164 } 165 166 // increment SO buffer 167 pOutBuffers[decl.bufferIndex] = GEP(pOutBuffers[decl.bufferIndex], C(numComponents)); 168 } 169 170 ////////////////////////////////////////////////////////////////////////// 171 // @brief builds a single vertex worth of data for the given stream 172 // @param streamState - state for this stream 173 // @param pCurVertex - pointer to src stream vertex data 174 // @param pOutBuffer - pointers to up to 4 SO buffers 175 void buildVertex(const STREAMOUT_STREAM& streamState, Value* pCurVertex, Value* pOutBuffer[4]) 176 { 177 for (uint32_t d = 0; d < streamState.numDecls; ++d) 178 { 179 const STREAMOUT_DECL& decl = streamState.decl[d]; 180 buildDecl(pCurVertex, pOutBuffer, decl); 181 } 182 } 183 184 void buildStream(const STREAMOUT_COMPILE_STATE& state, const STREAMOUT_STREAM& streamState, Value* pSoCtx, BasicBlock* returnBB, Function* soFunc) 185 { 186 // get list of active SO buffers 187 std::unordered_set<uint32_t> activeSOBuffers; 188 for (uint32_t d = 0; d < streamState.numDecls; ++d) 189 { 190 const STREAMOUT_DECL& decl = streamState.decl[d]; 191 activeSOBuffers.insert(decl.bufferIndex); 192 } 193 194 // always increment numPrimStorageNeeded 195 Value *numPrimStorageNeeded = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded }); 196 numPrimStorageNeeded = ADD(numPrimStorageNeeded, C(1)); 197 STORE(numPrimStorageNeeded, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded }); 198 199 // check OOB on active SO buffers. If any buffer is out of bound, don't write 200 // the primitive to any buffer 201 Value* oobMask = C(false); 202 for (uint32_t buffer : activeSOBuffers) 203 { 204 oobMask = OR(oobMask, oob(state, pSoCtx, buffer)); 205 } 206 207 BasicBlock* validBB = BasicBlock::Create(JM()->mContext, "valid", soFunc); 208 209 // early out if OOB 210 COND_BR(oobMask, returnBB, validBB); 211 212 IRB()->SetInsertPoint(validBB); 213 214 Value* numPrimsWritten = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten }); 215 numPrimsWritten = ADD(numPrimsWritten, C(1)); 216 STORE(numPrimsWritten, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten }); 217 218 // compute start pointer for each output buffer 219 Value* pOutBuffer[4]; 220 Value* pOutBufferStartVertex[4]; 221 Value* outBufferPitch[4]; 222 for (uint32_t b: activeSOBuffers) 223 { 224 Value* pBuf = getSOBuffer(pSoCtx, b); 225 Value* pData = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pBuffer }); 226 Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); 227 pOutBuffer[b] = GEP(pData, streamOffset); 228 pOutBufferStartVertex[b] = pOutBuffer[b]; 229 230 outBufferPitch[b] = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch }); 231 } 232 233 // loop over the vertices of the prim 234 Value* pStreamData = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pPrimData }); 235 for (uint32_t v = 0; v < state.numVertsPerPrim; ++v) 236 { 237 buildVertex(streamState, pStreamData, pOutBuffer); 238 239 // increment stream and output buffer pointers 240 // stream verts are always 32*4 dwords apart 241 pStreamData = GEP(pStreamData, C(SWR_VTX_NUM_SLOTS * 4)); 242 243 // output buffers offset using pitch in buffer state 244 for (uint32_t b : activeSOBuffers) 245 { 246 pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]); 247 pOutBuffer[b] = pOutBufferStartVertex[b]; 248 } 249 } 250 251 // update each active buffer's streamOffset 252 for (uint32_t b : activeSOBuffers) 253 { 254 Value* pBuf = getSOBuffer(pSoCtx, b); 255 Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); 256 streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b])); 257 STORE(streamOffset, pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); 258 } 259 } 260 261 Function* Create(const STREAMOUT_COMPILE_STATE& state) 262 { 263 std::stringstream fnName("SO_", std::ios_base::in | std::ios_base::out | std::ios_base::ate); 264 fnName << ComputeCRC(0, &state, sizeof(state)); 265 266 // SO function signature 267 // typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT*) 268 269 std::vector<Type*> args{ 270 PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT* 271 }; 272 273 FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false); 274 Function* soFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); 275 276 soFunc->getParent()->setModuleIdentifier(soFunc->getName()); 277 278 // create return basic block 279 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", soFunc); 280 BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc); 281 282 IRB()->SetInsertPoint(entry); 283 284 // arguments 285 auto argitr = soFunc->arg_begin(); 286 Value* pSoCtx = &*argitr++; 287 pSoCtx->setName("pSoCtx"); 288 289 const STREAMOUT_STREAM& streamState = state.stream; 290 buildStream(state, streamState, pSoCtx, returnBB, soFunc); 291 292 BR(returnBB); 293 294 IRB()->SetInsertPoint(returnBB); 295 RET_VOID(); 296 297 JitManager::DumpToFile(soFunc, "SoFunc"); 298 299 ::FunctionPassManager passes(JM()->mpCurrentModule); 300 301 passes.add(createBreakCriticalEdgesPass()); 302 passes.add(createCFGSimplificationPass()); 303 passes.add(createEarlyCSEPass()); 304 passes.add(createPromoteMemoryToRegisterPass()); 305 passes.add(createCFGSimplificationPass()); 306 passes.add(createEarlyCSEPass()); 307 passes.add(createInstructionCombiningPass()); 308 passes.add(createInstructionSimplifierPass()); 309 passes.add(createConstantPropagationPass()); 310 passes.add(createSCCPPass()); 311 passes.add(createAggressiveDCEPass()); 312 313 passes.run(*soFunc); 314 315 JitManager::DumpToFile(soFunc, "SoFunc_optimized"); 316 317 return soFunc; 318 } 319 }; 320 321 ////////////////////////////////////////////////////////////////////////// 322 /// @brief JITs from streamout shader IR 323 /// @param hJitMgr - JitManager handle 324 /// @param func - LLVM function IR 325 /// @return PFN_SO_FUNC - pointer to SOS function 326 PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc) 327 { 328 const llvm::Function *func = (const llvm::Function*)hFunc; 329 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); 330 PFN_SO_FUNC pfnStreamOut; 331 pfnStreamOut = (PFN_SO_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); 332 // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module 333 pJitMgr->mIsModuleFinalized = true; 334 335 return pfnStreamOut; 336 } 337 338 ////////////////////////////////////////////////////////////////////////// 339 /// @brief JIT compiles streamout shader 340 /// @param hJitMgr - JitManager handle 341 /// @param state - SO state to build function from 342 extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitMgr, const STREAMOUT_COMPILE_STATE& state) 343 { 344 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); 345 346 STREAMOUT_COMPILE_STATE soState = state; 347 if (soState.offsetAttribs) 348 { 349 for (uint32_t i = 0; i < soState.stream.numDecls; ++i) 350 { 351 soState.stream.decl[i].attribSlot -= soState.offsetAttribs; 352 } 353 } 354 355 pJitMgr->SetupNewModule(); 356 357 StreamOutJit theJit(pJitMgr); 358 HANDLE hFunc = theJit.Create(soState); 359 360 return JitStreamoutFunc(hJitMgr, hFunc); 361 } 362