/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file fetch_jit.cpp
*
* @brief Implementation of the fetch jitter
*
* Notes:
*
******************************************************************************/
#include "jit_api.h"
#include "fetch_jit.h"
#include "builder.h"
#include "state_llvm.h"
#include <sstream>
#include <tuple>

//#define FETCH_DUMP_VERTEX 1
using namespace llvm;
using namespace SwrJit;

bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);

enum ConversionType
{
    CONVERT_NONE,
    CONVERT_NORMALIZED,
    CONVERT_USCALED,
    CONVERT_SSCALED,
    CONVERT_SFIXED,
};

//////////////////////////////////////////////////////////////////////////
/// Interface to Jitting a fetch shader
//////////////////////////////////////////////////////////////////////////
struct FetchJit : public Builder
{
    FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};

    Function* Create(const FETCH_COMPILE_STATE& fetchState);
    Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
    Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
    Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);

    // package up Shuffle*bpcGatherd args into a tuple for convenience
    typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
        uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
        const uint32_t(&)[4]> Shuffle8bpcArgs;
    void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);

    typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
        uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
    void Shuffle16bpcGather(Shuffle16bpcArgs &args);

    void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);

    Value* GenerateCompCtrlVector(const ComponentControl ctrl);

    void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
    void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
    bool IsOddFormat(SWR_FORMAT format);
    bool IsUniformFormat(SWR_FORMAT format);
    void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
    void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
    void ConvertFormat(SWR_FORMAT format, Value *texels[4]);

    Value* mpFetchInfo;
};

Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
{
    static std::size_t fetchNum = 0;

    std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
    fnName << fetchNum++;

    Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
    BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);

    IRB()->SetInsertPoint(entry);

    auto argitr = fetch->arg_begin();

    // Fetch shader arguments
    mpFetchInfo = &*argitr; ++argitr;
    mpFetchInfo->setName("fetchInfo");
    Value* pVtxOut = &*argitr;
    pVtxOut->setName("vtxOutput");
    // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex
    // index 0 (just the pointer to the simdvertex structure)
    // index 1 (which element of the simdvertex structure to offset to, in this case 0)
    // so the indices being i32's doesn't matter
    // TODO: generate this GEP with a VECTOR structure type so this makes sense
    std::vector<Value*> vtxInputIndices(2, C(0));
    // GEP
    pVtxOut = GEP(pVtxOut, C(0));
    pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));

    // SWR_FETCH_CONTEXT::pStreams
    Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
    streams->setName("pStreams");

    // SWR_FETCH_CONTEXT::pIndices
    Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
    indices->setName("pIndices");

    // SWR_FETCH_CONTEXT::pLastIndex
    Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
    pLastIndex->setName("pLastIndex");

    Value* vIndices;
    switch(fetchState.indexType)
    {
        case R8_UINT:
            indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
            if(fetchState.bDisableIndexOOBCheck){
                vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
                vIndices = Z_EXT(vIndices, mSimdInt32Ty);
            }
            else{
                pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
                vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
            }
            break;
        case R16_UINT:
            indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
            if(fetchState.bDisableIndexOOBCheck){
                vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
                vIndices = Z_EXT(vIndices, mSimdInt32Ty);
            }
            else{
                pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
                vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
            }
            break;
        case R32_UINT:
            (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty, 0)), {(uint32_t)0})
                                               : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
            break; // incoming type is already 32bit int
        default:
            SWR_ASSERT(0, "Unsupported index type");
            vIndices = nullptr;
            break;
    }

    Value* vVertexId = vIndices;
    if (fetchState.bVertexIDOffsetEnable)
    {
        // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
        Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
        Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
        vVertexId = ADD(vIndices, vBaseVertex);
        vVertexId = ADD(vVertexId, vStartVertex);
    }

    // store out vertex IDs
    STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));

    // store out cut mask if enabled
    if (fetchState.bEnableCutIndex)
    {
        Value* vCutIndex = VIMMED1(fetchState.cutIndex);
        Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
        STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
    }

    // Fetch attributes from memory and output to a simdvertex struct
    // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
    (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
                                 : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);

    RET_VOID();

    JitManager::DumpToFile(fetch, "src");

#if defined(_DEBUG)
    verifyFunction(*fetch);
#endif

    ::FunctionPassManager setupPasses(JM()->mpCurrentModule);

    ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
    setupPasses.add(createBreakCriticalEdgesPass());
    setupPasses.add(createCFGSimplificationPass());
    setupPasses.add(createEarlyCSEPass());
    setupPasses.add(createPromoteMemoryToRegisterPass());

    setupPasses.run(*fetch);

    JitManager::DumpToFile(fetch, "se");

    ::FunctionPassManager optPasses(JM()->mpCurrentModule);

    ///@todo Haven't touched these either. Need to remove some of these and add others.
    optPasses.add(createCFGSimplificationPass());
    optPasses.add(createEarlyCSEPass());
    optPasses.add(createInstructionCombiningPass());
    optPasses.add(createInstructionSimplifierPass());
    optPasses.add(createConstantPropagationPass());
    optPasses.add(createSCCPPass());
    optPasses.add(createAggressiveDCEPass());

    optPasses.run(*fetch);
    optPasses.run(*fetch);

    JitManager::DumpToFile(fetch, "opt");

    return fetch;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Loads attributes from memory using LOADs, shuffling the
/// components into SOA form.
/// *Note* currently does not support component control,
/// component packing, instancing
/// @param fetchState - info about attributes to be fetched from memory
/// @param streams - value pointer to the current vertex stream
/// @param vIndices - vector value of indices to load
/// @param pVtxOut - value pointer to output simdvertex struct
void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
{
    // Zack shuffles; a variant of the Charleston.
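    // Illustrative overview (not part of the original comments): for each attribute,
    // each SIMD lane loads its vertex's 4 components as a 4-wide vector, the 4-wide
    // vectors are widened to SIMD width, and a series of shuffles transposes them
    // from AOS (xyzw per lane) to SOA (all x's, all y's, ...) before the store.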

    std::vector<Value*> vectors(16);
    std::vector<Constant*> pMask(mVWidth);
    for(uint32_t i = 0; i < mVWidth; ++i)
    {
        pMask[i] = (C(i < 4 ? i : 4));
    }
    Constant* promoteMask = ConstantVector::get(pMask);
    Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));

    Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
    Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
    Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
    Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
    curInstance->setName("curInstance");

    for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
    {
        Value* elements[4] = {0};
        const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
        const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
        SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
        uint32_t numComponents = info.numComps;
        uint32_t bpc = info.bpp / info.numComps;  ///@todo Code below assumes all components are same size. Need to fix.

        // load path doesn't support component packing
        SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");

        vectors.clear();

        Value *vCurIndices;
        Value *startOffset;
        if(ied.InstanceEnable)
        {
            Value* stepRate = C(ied.InstanceDataStepRate);

            // prevent a div by 0 for 0 step rate
            Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
            stepRate = SELECT(isNonZeroStep, stepRate, C(1));

            // calc the current offset into instanced data buffer
            Value* calcInstance = UDIV(curInstance, stepRate);

            // if step rate is 0, every instance gets instance 0
            calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));

            vCurIndices = VBROADCAST(calcInstance);

            startOffset = startInstance;
        }
        else
        {
            // offset indices by baseVertex
            vCurIndices = ADD(vIndices, vBaseVertex);

            startOffset = startVertex;
        }

        // load SWR_VERTEX_BUFFER_STATE::pData
        Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});

        // load SWR_VERTEX_BUFFER_STATE::pitch
        Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
        stride = Z_EXT(stride, mInt64Ty);

        // load SWR_VERTEX_BUFFER_STATE::size
        Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
        size = Z_EXT(size, mInt64Ty);

        Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);

        // Load from the stream.
        for(uint32_t lane = 0; lane < mVWidth; ++lane)
        {
            // Get index
            Value* index = VEXTRACT(vCurIndices, C(lane));
            index = Z_EXT(index, mInt64Ty);

            Value* offset = MUL(index, stride);
            offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
            offset = ADD(offset, startVertexOffset);

            if (!fetchState.bDisableIndexOOBCheck) {
                // check for out of bound access, including partial OOB, and mask them to 0
                Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
                Value *oob = ICMP_ULE(endOffset, size);
                offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
            }
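
            // Worked example (illustrative values, not taken from the state above):
            // with an assumed stride of 20 bytes, AlignedByteOffset 0,
            // startVertexOffset 0, info.Bpp 16 and a buffer size of 100 bytes,
            // index 5 gives offset = 100 and endOffset = 116 > 100, so this lane
            // is redirected to offset 0 rather than reading past the buffer end.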

            Value* pointer = GEP(stream, offset);
            // We use a full-lane, but don't actually care.
            Value* vptr = 0;

            // get a pointer to a 4 component attrib in default address space
            switch(bpc)
            {
                case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
                case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
                case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
                default: SWR_ASSERT(false, "Unsupported underlying bpp!");
            }

            // load 4 components of attribute
            Value* vec = ALIGNED_LOAD(vptr, 1, false);

            // Convert To FP32 internally
            switch(info.type[0])
            {
                case SWR_TYPE_UNORM:
                    switch(bpc)
                    {
                        case 8:
                            vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
                            vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
                            break;
                        case 16:
                            vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
                            vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
                            break;
                        default:
                            SWR_ASSERT(false, "Unsupported underlying type!");
                            break;
                    }
                    break;
                case SWR_TYPE_SNORM:
                    switch(bpc)
                    {
                        case 8:
                            vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
                            vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
                            break;
                        case 16:
                            vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
                            vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
                            break;
                        default:
                            SWR_ASSERT(false, "Unsupported underlying type!");
                            break;
                    }
                    break;
                case SWR_TYPE_UINT:
                    // Zero extend uint32_t types.
                    switch(bpc)
                    {
                        case 8:
                        case 16:
                            vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
                            vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
                            break;
                        case 32:
                            break; // Pass through unchanged.
                        default:
                            SWR_ASSERT(false, "Unsupported underlying type!");
                            break;
                    }
                    break;
                case SWR_TYPE_SINT:
                    // Sign extend SINT types.
                    switch(bpc)
                    {
                        case 8:
                        case 16:
                            vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
                            vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
                            break;
                        case 32:
                            break; // Pass through unchanged.
                        default:
                            SWR_ASSERT(false, "Unsupported underlying type!");
                            break;
                    }
                    break;
                case SWR_TYPE_FLOAT:
                    switch(bpc)
                    {
                        case 32:
                            break; // Pass through unchanged.
                        default:
                            SWR_ASSERT(false, "Unsupported underlying type!");
                    }
                    break;
                case SWR_TYPE_USCALED:
                    vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
                    break;
                case SWR_TYPE_SSCALED:
                    vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
                    break;
                case SWR_TYPE_SFIXED:
                    vec = FMUL(SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)), VBROADCAST(C(1/65536.0f)));
                    break;
                case SWR_TYPE_UNKNOWN:
                case SWR_TYPE_UNUSED:
                    SWR_ASSERT(false, "Unsupported type %d!", info.type[0]);
            }

            // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
            // uwvec: 4 x F32, undef value
            Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
            vectors.push_back(wvec);
        }

        std::vector<Constant*> v01Mask(mVWidth);
        std::vector<Constant*> v23Mask(mVWidth);
        std::vector<Constant*> v02Mask(mVWidth);
        std::vector<Constant*> v13Mask(mVWidth);

        // Concatenate the vectors together.
        elements[0] = VUNDEF_F();
        elements[1] = VUNDEF_F();
        elements[2] = VUNDEF_F();
        elements[3] = VUNDEF_F();
        for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
        {
            v01Mask[4 * b + 0] = C(0 + 4 * b);
            v01Mask[4 * b + 1] = C(1 + 4 * b);
            v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
            v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);

            v23Mask[4 * b + 0] = C(2 + 4 * b);
            v23Mask[4 * b + 1] = C(3 + 4 * b);
            v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
            v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);

            v02Mask[4 * b + 0] = C(0 + 4 * b);
            v02Mask[4 * b + 1] = C(2 + 4 * b);
            v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
            v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);

            v13Mask[4 * b + 0] = C(1 + 4 * b);
            v13Mask[4 * b + 1] = C(3 + 4 * b);
            v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
            v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);

            std::vector<Constant*> iMask(mVWidth);
            for(uint32_t i = 0; i < mVWidth; ++i)
            {
                if(((4 * b) <= i) && (i < (4 * (b + 1))))
                {
                    iMask[i] = C(i % 4 + mVWidth);
                }
                else
                {
                    iMask[i] = C(i);
                }
            }
            Constant* insertMask = ConstantVector::get(iMask);
            elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
            elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
            elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
            elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
        }

        Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
        Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
        Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
        Value* z2w2z3w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
        elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
        elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
        elements[2] = VSHUFFLE(z0w0z1w1, z2w2z3w3, ConstantVector::get(v02Mask));
        elements[3] = VSHUFFLE(z0w0z1w1, z2w2z3w3, ConstantVector::get(v13Mask));

        // fill in defaults for any components not present in the format;
        // cases intentionally fall through
        switch(numComponents + 1)
        {
            case 1: elements[0] = VIMMED1(0.0f);
            case 2: elements[1] = VIMMED1(0.0f);
            case 3: elements[2] = VIMMED1(0.0f);
            case 4: elements[3] = VIMMED1(1.0f);
        }

        for(uint32_t c = 0; c < 4; ++c)
        {
            Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
            STORE(elements[c], dest);
        }
    }
}

// returns true for odd formats that require special gather handling
bool FetchJit::IsOddFormat(SWR_FORMAT format)
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
    {
        return true;
    }
    return false;
}

// format is uniform if all components are the same size and type
bool FetchJit::IsUniformFormat(SWR_FORMAT format)
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    uint32_t bpc0 = info.bpc[0];
    uint32_t type0 = info.type[0];

    for (uint32_t c = 1; c < info.numComps; ++c)
    {
        if (bpc0 != info.bpc[c] || type0 != info.type[c])
        {
            return false;
        }
    }
    return true;
}

// unpacks components based on format
// foreach component in the pixel
//   mask off everything but this component
//   shift component to LSB
void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
{
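    // e.g. (illustrative) for an 11_11_10 packed format: component 0 occupies
    // bits [10:0], component 1 bits [21:11], component 2 bits [31:22]; each pass
    // of the loop below ANDs out one field, shifts it down to bit 0, and stores
    // it at its swizzled destination.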
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);

    uint32_t bitOffset = 0;
    for (uint32_t c = 0; c < info.numComps; ++c)
    {
        uint32_t swizzledIndex = info.swizzle[c];
        uint32_t compBits = info.bpc[c];
        uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
        Value* comp = AND(vInput, bitmask);
        comp = LSHR(comp, bitOffset);

        result[swizzledIndex] = comp;
        bitOffset += compBits;
    }
}

// gather for odd component size formats
// gather SIMD full pixels per lane then shift/mask to move each component to their
// own vector
void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4])
{
    const SWR_FORMAT_INFO &info = GetFormatInfo(format);

    // only works if pixel size is <= 32bits
    SWR_ASSERT(info.bpp <= 32);

    Value* gather = VUNDEF_I();

    // assign defaults
    for (uint32_t comp = 0; comp < 4; ++comp)
    {
        result[comp] = VIMMED1((int)info.defaults[comp]);
    }

    // load the proper amount of data based on component size
    PointerType* pLoadTy = nullptr;
    switch (info.bpp)
    {
        case 8: pLoadTy = Type::getInt8PtrTy(JM()->mContext); break;
        case 16: pLoadTy = Type::getInt16PtrTy(JM()->mContext); break;
        case 24:
        case 32: pLoadTy = Type::getInt32PtrTy(JM()->mContext); break;
        default: SWR_ASSERT(0);
    }

    // allocate temporary memory for masked off lanes
    Value* pTmp = ALLOCA(pLoadTy->getElementType());

    // gather SIMD pixels
    for (uint32_t e = 0; e < JM()->mVWidth; ++e)
    {
        Value* pElemOffset = VEXTRACT(offsets, C(e));
        Value* pLoad = GEP(pBase, pElemOffset);
        Value* pLaneMask = VEXTRACT(pMask, C(e));

        pLoad = POINTER_CAST(pLoad, pLoadTy);

        // mask in tmp pointer for disabled lanes
        pLoad = SELECT(pLaneMask, pLoad, pTmp);

        // load pixel
        Value *val = LOAD(pLoad);

        // zero extend to 32bit integer
        val = INT_CAST(val, mInt32Ty, false);

        // store in simd lane
        gather = VINSERT(gather, val, C(e));
    }

    UnpackComponents(format, gather, result);

    // cast to fp32
    result[0] = BITCAST(result[0], mSimdFP32Ty);
    result[1] = BITCAST(result[1], mSimdFP32Ty);
    result[2] = BITCAST(result[2], mSimdFP32Ty);
    result[3] = BITCAST(result[3], mSimdFP32Ty);
}

void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
{
    const SWR_FORMAT_INFO &info = GetFormatInfo(format);

    for (uint32_t c = 0; c < info.numComps; ++c)
    {
        uint32_t compIndex = info.swizzle[c];

        // skip any conversion on UNUSED components
        if (info.type[c] == SWR_TYPE_UNUSED)
        {
            continue;
        }

        if (info.isNormalized[c])
        {
            if (info.type[c] == SWR_TYPE_SNORM)
            {
                /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
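                /// (illustrative: with the 1/(2^(n-1) - 1) scale used below, an 8-bit
                /// SNORM value of 127 maps to 1.0f and -127 to -1.0f, while -128 lands
                /// slightly below -1.0f until the clamp noted above is implemented)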

                /// result = c * (1.0f / (2^(n-1) - 1))
                uint32_t n = info.bpc[c];
                uint32_t pow2 = 1 << (n - 1);
                float scale = 1.0f / (float)(pow2 - 1);
                Value *vScale = VIMMED1(scale);
                texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
                texels[compIndex] = FMUL(texels[compIndex], vScale);
            }
            else
            {
                SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);

                /// result = c * (1.0f / (2^n - 1))
                uint32_t n = info.bpc[c];
                uint32_t pow2 = 1 << n;
                // special case 24bit unorm format, which requires a full divide to meet ULP requirement
                if (n == 24)
                {
                    float scale = (float)(pow2 - 1);
                    Value* vScale = VIMMED1(scale);
                    texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                    texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
                    texels[compIndex] = FDIV(texels[compIndex], vScale);
                }
                else
                {
                    float scale = 1.0f / (float)(pow2 - 1);
                    Value *vScale = VIMMED1(scale);
                    texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                    texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
                    texels[compIndex] = FMUL(texels[compIndex], vScale);
                }
            }
            continue;
        }
    }
}
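
// Illustrative flow for an odd format (assuming e.g. R10G10B10A2_UNORM, where
// bpc[0] == 10): JitGatherVertices detects IsOddFormat(), CreateGatherOddFormats()
// loads one 32-bit pixel per lane, UnpackComponents() shifts each field down to
// bit 0, and ConvertFormat() above scales each component by 1/(2^n - 1)
// (1/1023 for the 10-bit components, 1/3 for the 2-bit alpha).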

//////////////////////////////////////////////////////////////////////////
/// @brief Loads attributes from memory using AVX2 GATHER(s)
/// @param fetchState - info about attributes to be fetched from memory
/// @param streams - value pointer to the current vertex stream
/// @param vIndices - vector value of indices to gather
/// @param pVtxOut - value pointer to output simdvertex struct
void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                                 Value* streams, Value* vIndices, Value* pVtxOut)
{
    uint32_t currentVertexElement = 0;
    uint32_t outputElt = 0;
    Value* vVertexElements[4];

    Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
    Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
    Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
    Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
    curInstance->setName("curInstance");

    for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
    {
        const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];

        // skip element if all components are disabled
        if (ied.ComponentPacking == ComponentEnable::NONE)
        {
            continue;
        }

        const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
        SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
        uint32_t bpc = info.bpp / info.numComps;  ///@todo Code below assumes all components are same size. Need to fix.

        Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});

        // VGATHER* takes an *i8 src pointer
        Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));

        Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
        Value *vStride = VBROADCAST(stride);

        // max vertex index that is fully in bounds
        Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
        maxVertex = LOAD(maxVertex);

        Value *vCurIndices;
        Value *startOffset;
        if(ied.InstanceEnable)
        {
            Value* stepRate = C(ied.InstanceDataStepRate);

            // prevent a div by 0 for 0 step rate
            Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
            stepRate = SELECT(isNonZeroStep, stepRate, C(1));

            // calc the current offset into instanced data buffer
            Value* calcInstance = UDIV(curInstance, stepRate);

            // if step rate is 0, every instance gets instance 0
            calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));

            vCurIndices = VBROADCAST(calcInstance);

            startOffset = startInstance;
        }
        else
        {
            // offset indices by baseVertex
            vCurIndices = ADD(vIndices, vBaseVertex);

            startOffset = startVertex;
        }

        // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
        // do 64bit address offset calculations.

        // calculate byte offset to the start of the VB
        Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
        pStreamBase = GEP(pStreamBase, baseOffset);

        // if we have a start offset, subtract from max vertex. Used for OOB check
        maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
        Value* neg = ICMP_SLT(maxVertex, C((int64_t)0));
        // if we have a negative value, we're already OOB. clamp at 0.
        maxVertex = SELECT(neg, C(0), TRUNC(maxVertex, mInt32Ty));

        // Load the in bounds size of a partially valid vertex
        Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
        partialInboundsSize = LOAD(partialInboundsSize);
        Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
        Value* vBpp = VBROADCAST(C(info.Bpp));
        Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));

        // is the element <= the partially valid size
        Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));

        // override cur indices with 0 if pitch is 0
        Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
        vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);

        // are vertices partially OOB?
        Value* vMaxVertex = VBROADCAST(maxVertex);
        Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
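
        // Worked example (illustrative numbers): if maxVertex == 100 and the SIMD
        // indices are {98, 99, 100, 101, ...}, then 98/99 gather normally, 100 is
        // the partially valid vertex and only gathers if this element fits within
        // partialInboundsSize, and 101+ are masked off entirely below.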

        // are vertices fully in bounds?
        Value* vGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);

        // blend in any partially OOB indices that have valid elements
        vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
        Value* pMask = vGatherMask;
        vGatherMask = VMASK(vGatherMask);

        // calculate the actual offsets into the VB
        Value* vOffsets = MUL(vCurIndices, vStride);
        vOffsets = ADD(vOffsets, vAlignmentOffsets);

        // Packing and component control
        ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
        const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
                                             (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};

        // Special gather/conversion for formats without equal component sizes
        if (IsOddFormat((SWR_FORMAT)ied.Format))
        {
            Value* pResults[4];
            CreateGatherOddFormats((SWR_FORMAT)ied.Format, pMask, pStreamBase, vOffsets, pResults);
            ConvertFormat((SWR_FORMAT)ied.Format, pResults);

            for (uint32_t c = 0; c < 4; ++c)
            {
                if (isComponentEnabled(compMask, c))
                {
                    vVertexElements[currentVertexElement++] = pResults[c];
                    if (currentVertexElement > 3)
                    {
                        StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                        // reset to the next vVertexElement to output
                        currentVertexElement = 0;
                    }
                }
            }
        }
        else if(info.type[0] == SWR_TYPE_FLOAT)
        {
            ///@todo: support 64 bit vb accesses
            Value* gatherSrc = VIMMED1(0.0f);

            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
                "Unsupported format for standard gather fetch.");

            // Gather components from memory to store in a simdvertex structure
            switch(bpc)
            {
                case 16:
                {
                    Value* vGatherResult[2];
                    Value *vMask;

                    // if we have at least one component out of x or y to fetch
                    if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
                        // save mask as it is zero'd out after each gather
                        vMask = vGatherMask;

                        vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
                        // e.g. result of first 8x32bit integer gather for 16bit components
                        // 256i - 0    1    2    3    4    5    6    7
                        //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
                        //
                    }

                    // if we have at least one component out of z or w to fetch
                    if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
                        // offset base to the next components(zw) in the vertex to gather
                        pStreamBase = GEP(pStreamBase, C((char)4));
                        vMask = vGatherMask;

                        vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
                        // e.g. result of second 8x32bit integer gather for 16bit components
                        // 256i - 0    1    2    3    4    5    6    7
                        //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                        //
                    }

                    // if we have at least one component to shuffle into place
                    if(compMask){
                        Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);

                        // Shuffle gathered components into place in simdvertex struct
                        Shuffle16bpcGather(args);  // outputs to vVertexElements ref
                    }
                }
                break;
                case 32:
                {
                    for (uint32_t i = 0; i < 4; i++)
                    {
                        if (isComponentEnabled(compMask, i))
                        {
                            // if we need to gather the component
                            if (compCtrl[i] == StoreSrc)
                            {
                                // save mask as it is zero'd out after each gather
                                Value *vMask = vGatherMask;

                                // Gather a SIMD of vertices
                                vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
                            }
                            else
                            {
                                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                            }

                            if (currentVertexElement > 3)
                            {
                                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                                // reset to the next vVertexElement to output
                                currentVertexElement = 0;
                            }
                        }

                        // offset base to the next component in the vertex to gather
                        pStreamBase = GEP(pStreamBase, C((char)4));
                    }
                }
                break;
                case 64:
                {
                    for (uint32_t i = 0; i < 4; i++)
                    {
                        if (isComponentEnabled(compMask, i))
                        {
                            // if we need to gather the component
                            if (compCtrl[i] == StoreSrc)
                            {
                                Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
                                Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
                                vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4));
                                vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4));
                                vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4));
                                vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4));

                                Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
                                Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));

                                Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));

                                Value* pGatherLo = GATHERPD(vZeroDouble,
                                                            pStreamBase, vOffsetsLo, vMaskLo, C((char)1));
                                Value* pGatherHi = GATHERPD(vZeroDouble,
                                                            pStreamBase, vOffsetsHi, vMaskHi, C((char)1));

                                pGatherLo = VCVTPD2PS(pGatherLo);
                                pGatherHi = VCVTPD2PS(pGatherHi);

                                Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));

                                vVertexElements[currentVertexElement++] = pGather;
                            }
                            else
                            {
                                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                            }

                            if (currentVertexElement > 3)
                            {
                                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                                // reset to the next vVertexElement to output
                                currentVertexElement = 0;
                            }
                        }

                        // offset base to the next component in the vertex to gather
                        pStreamBase = GEP(pStreamBase, C((char)8));
                    }
                }
                break;
                default:
                    SWR_ASSERT(0, "Tried to fetch invalid FP format");
                    break;
            }
        }
        else
        {
            Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
            ConversionType conversionType = CONVERT_NONE;

            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
                "Unsupported format for standard gather fetch.");

            switch(info.type[0])
            {
                case SWR_TYPE_UNORM:
                    conversionType = CONVERT_NORMALIZED;
                    // intentional fall-through: UNORM shares the zero-extend path with UINT
                case SWR_TYPE_UINT:
                    extendCastType = Instruction::CastOps::ZExt;
                    break;
                case SWR_TYPE_SNORM:
                    conversionType = CONVERT_NORMALIZED;
                    // intentional fall-through: SNORM shares the sign-extend path with SINT
                case SWR_TYPE_SINT:
                    extendCastType = Instruction::CastOps::SExt;
                    break;
                case SWR_TYPE_USCALED:
                    conversionType = CONVERT_USCALED;
                    extendCastType = Instruction::CastOps::UIToFP;
                    break;
                case SWR_TYPE_SSCALED:
                    conversionType = CONVERT_SSCALED;
                    extendCastType = Instruction::CastOps::SIToFP;
                    break;
                case SWR_TYPE_SFIXED:
                    conversionType = CONVERT_SFIXED;
                    extendCastType = Instruction::CastOps::SExt;
                    break;
                default:
                    break;
            }

            // value substituted when component of gather is masked
            Value* gatherSrc = VIMMED1(0);

            // Gather components from memory to store in a simdvertex structure
            switch (bpc)
            {
                case 8:
                {
                    // if we have at least one component to fetch
                    if(compMask)
                    {
                        Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
                        // e.g. result of an 8x32bit integer gather for 8bit components
                        // 256i - 0    1    2    3    4    5    6    7
                        //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

                        Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);

                        // Shuffle gathered components into place in simdvertex struct
                        Shuffle8bpcGatherd(args);  // outputs to vVertexElements ref
                    }
                }
                break;
                case 16:
                {
                    Value* vGatherResult[2];
                    Value *vMask;

                    // if we have at least one component out of x or y to fetch
                    if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
                        // save mask as it is zero'd out after each gather
                        vMask = vGatherMask;

                        vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
                        // e.g. result of first 8x32bit integer gather for 16bit components
                        // 256i - 0    1    2    3    4    5    6    7
                        //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
                        //
                    }

                    // if we have at least one component out of z or w to fetch
                    if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
                        // offset base to the next components(zw) in the vertex to gather
                        pStreamBase = GEP(pStreamBase, C((char)4));
                        vMask = vGatherMask;

                        vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
                        // e.g. result of second 8x32bit integer gather for 16bit components
                        // 256i - 0    1    2    3    4    5    6    7
                        //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                        //
                    }

                    // if we have at least one component to shuffle into place
                    if(compMask){
                        Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);

                        // Shuffle gathered components into place in simdvertex struct
                        Shuffle16bpcGather(args);  // outputs to vVertexElements ref
                    }
                }
                break;
                case 32:
                {
                    // Gather components into place in simdvertex struct
                    for (uint32_t i = 0; i < 4; i++)
                    {
                        if (isComponentEnabled(compMask, i))
                        {
                            // if we need to gather the component
                            if (compCtrl[i] == StoreSrc)
                            {
                                // save mask as it is zero'd out after each gather
                                Value *vMask = vGatherMask;

                                Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));

                                if (conversionType == CONVERT_USCALED)
                                {
                                    pGather = UI_TO_FP(pGather, mSimdFP32Ty);
                                }
                                else if (conversionType == CONVERT_SSCALED)
                                {
                                    pGather = SI_TO_FP(pGather, mSimdFP32Ty);
                                }
                                else if (conversionType == CONVERT_SFIXED)
                                {
                                    pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
                                }

                                vVertexElements[currentVertexElement++] = pGather;
                                // e.g. result of a single 8x32bit integer gather for 32bit components
                                // 256i - 0    1    2    3    4    5    6    7
                                //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
                            }
                            else
                            {
                                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                            }

                            if (currentVertexElement > 3)
                            {
                                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                                // reset to the next vVertexElement to output
                                currentVertexElement = 0;
                            }
                        }

                        // offset base to the next component in the vertex to gather
                        pStreamBase = GEP(pStreamBase, C((char)4));
                    }
                }
                break;
            }
        }
    }

    // if we have a partially filled vVertexElement struct, output it
    if(currentVertexElement > 0){
        StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Loads a simd of valid indices. OOB indices are set to 0
/// *Note* have to do 8bit index checking in scalar until we have AVX-512
/// support
/// @param pIndices - pointer to 8 bit indices
/// @param pLastIndex - pointer to last valid index
Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
{
    // can fit 4 8 bit integers per vWidth lane
    Value* vIndices = VUNDEF_I();

    // store 0 index on stack to be used to conditionally load from if index address is OOB
    Value* pZeroIndex = ALLOCA(mInt8Ty);
    STORE(C((uint8_t)0), pZeroIndex);

    // Load a SIMD of index pointers
    for(int64_t lane = 0; lane < mVWidth; lane++)
    {
        // Calculate the address of the requested index
        Value *pIndex = GEP(pIndices, C(lane));

        // check if the address is less than the max index
        Value* mask = ICMP_ULT(pIndex, pLastIndex);

        // if valid, load the index. if not, load 0 from the stack
        Value* pValid = SELECT(mask, pIndex, pZeroIndex);
        Value *index = LOAD(pValid, "valid index");

        // zero extend index to 32 bits and insert into the correct simd lane
        index = Z_EXT(index, mInt32Ty);
        vIndices = VINSERT(vIndices, index, lane);
    }
    return vIndices;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Loads a simd of valid indices. OOB indices are set to 0
/// *Note* have to do 16bit index checking in scalar until we have AVX-512
/// support
/// @param pIndices - pointer to 16 bit indices
/// @param pLastIndex - pointer to last valid index
Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
{
    // can fit 2 16 bit integers per vWidth lane
    Value* vIndices = VUNDEF_I();

    // store 0 index on stack to be used to conditionally load from if index address is OOB
    Value* pZeroIndex = ALLOCA(mInt16Ty);
    STORE(C((uint16_t)0), pZeroIndex);

    // Load a SIMD of index pointers
    for(int64_t lane = 0; lane < mVWidth; lane++)
    {
        // Calculate the address of the requested index
        Value *pIndex = GEP(pIndices, C(lane));

        // check if the address is less than the max index
        Value* mask = ICMP_ULT(pIndex, pLastIndex);

        // if valid, load the index. if not, load 0 from the stack
        Value* pValid = SELECT(mask, pIndex, pZeroIndex);
        Value *index = LOAD(pValid, "valid index");

        // zero extend index to 32 bits and insert into the correct simd lane
        index = Z_EXT(index, mInt32Ty);
        vIndices = VINSERT(vIndices, index, lane);
    }
    return vIndices;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Loads a simd of valid indices. OOB indices are set to 0
/// @param pIndices - pointer to 32 bit indices
/// @param pLastIndex - pointer to last valid index
Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
{
    DataLayout dL(JM()->mpCurrentModule);
    unsigned int ptrSize = dL.getPointerSize() * 8;  // ptr size in bits
    Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
    Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));

    // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
    Value* numIndicesLeft = SUB(iLastIndex, iIndices);
    numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
    numIndicesLeft = SDIV(numIndicesLeft, C(4));

    // create a vector of index counts from the base index ptr passed into the fetch
    const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
    Constant* vIndexOffsets = ConstantVector::get(vecIndices);

    // compare index count to the max valid index
    // e.g vMaxIndex      4 4 4 4 4 4 4 4 : 4 indices left to load
    //     vIndexOffsets  0 1 2 3 4 5 6 7
    //     ------------------------------
    //     vIndexMask    -1-1-1-1 0 0 0 0 : offsets < max pass
    //     vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
    Value* vMaxIndex = VBROADCAST(numIndicesLeft);
    Value* vIndexMask = VPCMPGTD(vMaxIndex, vIndexOffsets);

    // VMASKLOAD takes an *i8 src pointer
    pIndices = BITCAST(pIndices, PointerType::get(mInt8Ty, 0));

    // Load the indices; OOB loads 0
    return MASKLOADD(pIndices, vIndexMask);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
/// denormalizes if needed, converts to F32 if needed, and positions in
/// the proper SIMD rows to be output to the simdvertex structure
/// @param args: (tuple of args, listed below)
/// @param vGatherResult - 8 gathered 8bpc vertices
/// @param pVtxOut - base pointer to output simdvertex struct
/// @param extendType - sign extend or zero extend
/// @param conversionType - conversion to apply (normalized, scaled, etc.)
/// @param currentVertexElement - reference to the current vVertexElement
/// @param outputElt - reference to the current offset from simdvertex we're outputting to
/// @param compMask - component packing mask
/// @param compCtrl - component control val
/// @param vVertexElements[4] - vertex components to output
/// @param swizzle[4] - component swizzle location
void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
{
    // Unpack tuple args
    Value*& vGatherResult = std::get<0>(args);
    Value* pVtxOut = std::get<1>(args);
    const Instruction::CastOps extendType = std::get<2>(args);
    const ConversionType conversionType = std::get<3>(args);
    uint32_t &currentVertexElement = std::get<4>(args);
    uint32_t &outputElt = std::get<5>(args);
    const ComponentEnable compMask = std::get<6>(args);
    const ComponentControl (&compCtrl)[4] = std::get<7>(args);
    Value* (&vVertexElements)[4] = std::get<8>(args);
    const uint32_t (&swizzle)[4] = std::get<9>(args);

    // cast types
    Type* vGatherTy = mSimdInt32Ty;
    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4);  // vwidth is units of 32 bits

    // have to do extra work for sign extending
    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
        Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2);  // 16x8bit ints in a 128bit lane
        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4);  // vwidth is units of 32 bits

        // shuffle mask, including any swizzling
        const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
        const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
        Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
                                     char(y), char(y+4), char(y+8), char(y+12),
                                     char(z), char(z+4), char(z+8), char(z+12),
                                     char(w), char(w+4), char(w+8), char(w+12),
                                     char(x), char(x+4), char(x+8), char(x+12),
                                     char(y), char(y+4), char(y+8), char(y+12),
                                     char(z), char(z+4), char(z+8), char(z+12),
                                     char(w), char(w+4), char(w+8), char(w+12)});

        Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
        // after pshufb: group components together in each 128bit lane
        // 256i - 0    1    2    3    4    5    6    7
        //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

        Value* vi128XY = nullptr;
        if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
            vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
        }

        // do the same for zw components
        Value* vi128ZW = nullptr;
        if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
            vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
        }

        // init denormalize variables if needed
        Instruction::CastOps fpCast;
        Value* conversionFactor;

        switch (conversionType)
        {
            case CONVERT_NORMALIZED:
                fpCast = Instruction::CastOps::SIToFP;
                conversionFactor = VIMMED1((float)(1.0 / 127.0));
                break;
            case CONVERT_SSCALED:
                fpCast = Instruction::CastOps::SIToFP;
                conversionFactor = VIMMED1((float)(1.0));
                break;
            case CONVERT_USCALED:
                SWR_ASSERT(0, "Type should not be sign extended!");
                conversionFactor = nullptr;
                break;
            default:
                SWR_ASSERT(conversionType == CONVERT_NONE);
                conversionFactor = nullptr;
                break;
        }
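
        // Pipeline summary (illustrative): PSHUFB grouped the bytes of each component
        // within each 128-bit half, PERMD packed x/y (and z/w) into their own 128-bit
        // lanes, and the loop below PMOVSXBD-extends each component to 32 bits,
        // optionally multiplying by conversionFactor (e.g. 1/127 for SNORM8).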
extended!"); 1327 conversionFactor = nullptr; 1328 break; 1329 default: 1330 SWR_ASSERT(conversionType == CONVERT_NONE); 1331 conversionFactor = nullptr; 1332 break; 1333 } 1334 1335 // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex 1336 for (uint32_t i = 0; i < 4; i++) 1337 { 1338 if (isComponentEnabled(compMask, i)) 1339 { 1340 if (compCtrl[i] == ComponentControl::StoreSrc) 1341 { 1342 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 1343 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; 1344 // if x or y, use vi128XY permute result, else use vi128ZW 1345 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; 1346 1347 // sign extend 1348 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty)); 1349 1350 // denormalize if needed 1351 if (conversionType != CONVERT_NONE) 1352 { 1353 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); 1354 } 1355 currentVertexElement++; 1356 } 1357 else 1358 { 1359 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); 1360 } 1361 1362 if (currentVertexElement > 3) 1363 { 1364 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); 1365 // reset to the next vVertexElement to output 1366 currentVertexElement = 0; 1367 } 1368 } 1369 } 1370 } 1371 // else zero extend 1372 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP)) 1373 { 1374 // init denormalize variables if needed 1375 Instruction::CastOps fpCast; 1376 Value* conversionFactor; 1377 1378 switch (conversionType) 1379 { 1380 case CONVERT_NORMALIZED: 1381 fpCast = Instruction::CastOps::UIToFP; 1382 conversionFactor = VIMMED1((float)(1.0 / 255.0)); 1383 break; 1384 case CONVERT_USCALED: 1385 fpCast = Instruction::CastOps::UIToFP; 1386 conversionFactor = VIMMED1((float)(1.0)); 1387 break; 1388 case CONVERT_SSCALED: 1389 SWR_ASSERT(0, "Type should not be zero extended!"); 1390 conversionFactor = nullptr; 1391 break; 1392 default: 1393 SWR_ASSERT(conversionType == CONVERT_NONE); 1394 conversionFactor = nullptr; 1395 break; 1396 } 1397 1398 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits 1399 for (uint32_t i = 0; i < 4; i++) 1400 { 1401 if (isComponentEnabled(compMask, i)) 1402 { 1403 if (compCtrl[i] == ComponentControl::StoreSrc) 1404 { 1405 // pshufb masks for each component 1406 Value* vConstMask; 1407 switch (swizzle[i]) 1408 { 1409 case 0: 1410 // x shuffle mask 1411 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, 1412 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 }); 1413 break; 1414 case 1: 1415 // y shuffle mask 1416 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, 1417 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 }); 1418 break; 1419 case 2: 1420 // z shuffle mask 1421 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, 1422 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 }); 1423 break; 1424 case 3: 1425 // w shuffle mask 1426 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, 1427 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 }); 1428 break; 1429 default: 1430 vConstMask = nullptr; 1431 break; 1432 } 1433 1434 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, 

                    vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
                    // after pshufb for x channel
                    // 256i - 0    1    2    3    4    5    6    7
                    //        x000 x000 x000 x000 x000 x000 x000 x000

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
                    }
                    currentVertexElement++;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_ASSERT(0, "Unsupported conversion type");
    }
}
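
// Usage note (mirrors the call site in JitGatherVertices): the tuple exists so the
// helper can update the caller's currentVertexElement/outputElt through references:
//   Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType,
//       conversionType, currentVertexElement, outputElt, compMask, compCtrl,
//       vVertexElements, info.swizzle);
//   Shuffle8bpcGatherd(args);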

//////////////////////////////////////////////////////////////////////////
/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
/// denormalizes if needed, converts to F32 if needed, and positions in
/// the proper SIMD rows to be output to the simdvertex structure
/// @param args: (tuple of args, listed below)
/// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
/// @param pVtxOut - base pointer to output simdvertex struct
/// @param extendType - sign extend or zero extend
/// @param conversionType - conversion to apply (normalized, scaled, etc.)
/// @param currentVertexElement - reference to the current vVertexElement
/// @param outputElt - reference to the current offset from simdvertex we're outputting to
/// @param compMask - component packing mask
/// @param compCtrl - component control val
/// @param vVertexElements[4] - vertex components to output
void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
{
    // Unpack tuple args
    Value* (&vGatherResult)[2] = std::get<0>(args);
    Value* pVtxOut = std::get<1>(args);
    const Instruction::CastOps extendType = std::get<2>(args);
    const ConversionType conversionType = std::get<3>(args);
    uint32_t &currentVertexElement = std::get<4>(args);
    uint32_t &outputElt = std::get<5>(args);
    const ComponentEnable compMask = std::get<6>(args);
    const ComponentControl(&compCtrl)[4] = std::get<7>(args);
    Value* (&vVertexElements)[4] = std::get<8>(args);

    // cast types
    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4);  // vwidth is units of 32 bits

    // have to do extra work for sign extending
    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) ||
        (extendType == Instruction::CastOps::FPExt))
    {
        // is this a half-precision (FP16) float?
        bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;

        Type* v8x16Ty = VectorType::get(mInt16Ty, 8);  // 8x16bit in a 128bit lane
        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4);  // vwidth is units of 32 bits

        // shuffle mask
        Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                     0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
        Value* vi128XY = nullptr;
        if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
        }

        // do the same for zw components
        Value* vi128ZW = nullptr;
        if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
            vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
        }

        // init denormalize variables if needed
        Instruction::CastOps IntToFpCast;
        Value* conversionFactor;

        switch (conversionType)
        {
            case CONVERT_NORMALIZED:
                IntToFpCast = Instruction::CastOps::SIToFP;
                conversionFactor = VIMMED1((float)(1.0 / 32767.0));
                break;
            case CONVERT_SSCALED:
                IntToFpCast = Instruction::CastOps::SIToFP;
                conversionFactor = VIMMED1((float)(1.0));
                break;
            case CONVERT_USCALED:
                SWR_ASSERT(0, "Type should not be sign extended!");
                conversionFactor = nullptr;
                break;
            default:
                SWR_ASSERT(conversionType == CONVERT_NONE);
                conversionFactor = nullptr;
                break;
        }

        // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                    if (bFP) {
                        // extract 128 bit lanes and convert each half-float component to full float
                        vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
                    }
                    else {
                        // extract 128 bit lanes to sign extend each component
                        vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));

                        // denormalize if needed
                        if (conversionType != CONVERT_NONE) {
                            vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
                        }
                    }
                    currentVertexElement++;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    // else zero extend
    else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
    {
        // pshufb masks for each component
        Value* vConstMask[2];
        if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
            // x/z shuffle mask
            vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                                     0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
        }

        if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
        }

        // init denormalize variables if needed
        Instruction::CastOps fpCast;
        Value* conversionFactor;

        switch (conversionType)
        {
            case CONVERT_NORMALIZED:
                fpCast = Instruction::CastOps::UIToFP;
                conversionFactor = VIMMED1((float)(1.0 / 65535.0));
                break;
            case CONVERT_USCALED:
                fpCast = Instruction::CastOps::UIToFP;
                conversionFactor = VIMMED1((float)(1.0f));
                break;
            case CONVERT_SSCALED:
                SWR_ASSERT(0, "Type should not be zero extended!");
                conversionFactor = nullptr;
                break;
            default:
                SWR_ASSERT(conversionType == CONVERT_NONE);
                conversionFactor = nullptr;
                break;
        }
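
        // e.g. (illustrative) for an assumed R16G16_UNORM attribute this path uses
        // conversionFactor = 1/65535, so a raw 16-bit value of 65535 becomes 1.0f
        // after the FMUL in the loop below.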
        // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use the vi128XY permute result, else use vi128ZW
                    Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                    if (bFP)
                    {
                        // extract 128 bit lanes and widen each half-float component to float
                        vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
                    }
                    else
                    {
                        // extract 128 bit lanes to sign extend each component
                        vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));

                        // denormalize if needed
                        if (conversionType != CONVERT_NONE)
                        {
                            vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
                        }
                    }
                    currentVertexElement++;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    // else zero extend
    else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
    {
        // pshufb masks for each component
        Value* vConstMask[2];
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
        {
            // x/z shuffle mask
            vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                                     0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1});
        }

        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
        {
            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
        }

        // init denormalize variables if needed
        Instruction::CastOps fpCast;
        Value* conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
            break;
        case CONVERT_USCALED:
            fpCast = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0f));
            break;
        case CONVERT_SSCALED:
            SWR_ASSERT(0, "Type should not be zero extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }
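
        // Worked example of the x/z mask above: within each 128-bit lane,
        // PSHUFB copies source bytes (0,1) of the gathered dword into
        // destination bytes (0,1), and the -1 entries (high bit set) write
        // zero bytes, so a gathered 16-bit value 0xAABB becomes the dword
        // 0x0000AABB with no extra shift or mask needed.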
        // shuffle enabled components into the lower word of each 32bit lane, zero extending to 32 bits
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // select the correct constMask for the x/z or y/w pshufb
                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use the first gather result, else use the second
                    uint32_t selectedGather = (i < 2) ? 0 : 1;

                    vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
                    // after pshufb mask for x channel; z uses the same shuffle from the second gather
                    // 256i - 0    1    2    3    4    5    6    7
                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
                    }
                    currentVertexElement++;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_ASSERT(0, "Unsupported conversion type");
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Output a simdvertex worth of elements to the current outputElt
/// @param pVtxOut - base address of VIN output struct
/// @param outputElt - simdvertex offset in VIN to write to
/// @param numEltsToStore - number of simdvertex rows to write out
/// @param vVertexElements - LLVM Value*[] simdvertex to write out
void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
{
    SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");

    for (uint32_t c = 0; c < numEltsToStore; ++c)
    {
        // STORE expects FP32 x vWidth type, just bitcast if needed
        if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
        {
#if FETCH_DUMP_VERTEX
            PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
#endif
            vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
        }
#if FETCH_DUMP_VERTEX
        else
        {
            PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
        }
#endif
        // outputElt * 4 = offsetting by the size of a simdvertex
        // + c offsets to a 32bit x vWidth row within the current vertex
        Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
        STORE(vVertexElements[c], dest);
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Generates a constant vector of values based on the
/// ComponentControl value
/// @param ctrl - ComponentControl value
Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
{
    switch (ctrl)
    {
    case NoStore: return VUNDEF_I();
    case Store0: return VIMMED1(0);
    case Store1Fp: return VIMMED1(1.0f);
    case Store1Int: return VIMMED1(1);
    case StoreVertexId:
    {
        Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
        return VBROADCAST(pId);
    }
    case StoreInstanceId:
    {
        Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
        return VBROADCAST(pId);
    }
    case StoreSrc:
    default: SWR_ASSERT(0, "Invalid component control"); return VUNDEF_I();
    }
}
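
//////////////////////////////////////////////////////////////////////////
// Usage note: the non-StoreSrc controls above supply default values for
// components the vertex buffer does not provide; a common convention is
// Store0 for missing y/z and Store1Fp (or Store1Int for integer formats)
// for a missing w, giving attributes the customary (0, 0, 0, 1) default.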
//////////////////////////////////////////////////////////////////////////
/// @brief Returns true if the specified component is enabled in the mask.
/// @param enableMask - enable bits
/// @param component - component to check if enabled.
bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
{
    switch (component)
    {
    // X
    case 0: return (enableMask & ComponentEnable::X) != 0;
    // Y
    case 1: return (enableMask & ComponentEnable::Y) != 0;
    // Z
    case 2: return (enableMask & ComponentEnable::Z) != 0;
    // W
    case 3: return (enableMask & ComponentEnable::W) != 0;

    default: return false;
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief JITs the fetch shader IR into callable machine code
/// @param hJitMgr - JitManager handle
/// @param func - LLVM function IR
/// @return PFN_FETCH_FUNC - pointer to fetch code
PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
{
    const llvm::Function* func = (const llvm::Function*)hFunc;
    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
    PFN_FETCH_FUNC pfnFetch;

    pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
    // MCJIT finalizes modules the first time code is JITted from them; once finalized, no new IR can be added to the module
    pJitMgr->mIsModuleFinalized = true;

#if defined(KNOB_SWRC_TRACING)
    char fName[1024];
    const char *funcName = func->getName().data();
    sprintf(fName, "%s.bin", funcName);
    FILE *fd = fopen(fName, "wb");
    fwrite((void *)pfnFetch, 1, 2048, fd);
    fclose(fd);
#endif

    pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");

    return pfnFetch;
}

//////////////////////////////////////////////////////////////////////////
/// @brief JIT compiles fetch shader
/// @param hJitMgr - JitManager handle
/// @param state - fetch state to build function from
extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
{
    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);

    pJitMgr->SetupNewModule();

    FetchJit theJit(pJitMgr);
    HANDLE hFunc = theJit.Create(state);

    return JitFetchFunc(hJitMgr, hFunc);
}
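
//////////////////////////////////////////////////////////////////////////
// Usage sketch (illustrative only; the FETCH_COMPILE_STATE fields a real
// driver would populate are omitted):
//
//     FETCH_COMPILE_STATE state = {};   // describe index type, layouts, etc.
//     PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, state);
//     // pfnFetch can then be invoked per draw to gather, convert, and
//     // store a simdvertex worth of attributes for the downstream pipeline.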