1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This provides a class for CUDA code generation targeting the NVIDIA CUDA 11 // runtime library. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "CGCUDARuntime.h" 16 #include "CodeGenFunction.h" 17 #include "CodeGenModule.h" 18 #include "clang/AST/Decl.h" 19 #include "llvm/IR/BasicBlock.h" 20 #include "llvm/IR/CallSite.h" 21 #include "llvm/IR/Constants.h" 22 #include "llvm/IR/DerivedTypes.h" 23 24 using namespace clang; 25 using namespace CodeGen; 26 27 namespace { 28 29 class CGNVCUDARuntime : public CGCUDARuntime { 30 31 private: 32 llvm::Type *IntTy, *SizeTy, *VoidTy; 33 llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy; 34 35 /// Convenience reference to LLVM Context 36 llvm::LLVMContext &Context; 37 /// Convenience reference to the current module 38 llvm::Module &TheModule; 39 /// Keeps track of kernel launch stubs emitted in this module 40 llvm::SmallVector<llvm::Function *, 16> EmittedKernels; 41 /// Keeps track of variables containing handles of GPU binaries. Populated by 42 /// ModuleCtorFunction() and used to create corresponding cleanup calls in 43 /// ModuleDtorFunction() 44 llvm::SmallVector<llvm::GlobalVariable *, 16> GpuBinaryHandles; 45 46 llvm::Constant *getSetupArgumentFn() const; 47 llvm::Constant *getLaunchFn() const; 48 49 /// Creates a function to register all kernel stubs generated in this module. 50 llvm::Function *makeRegisterKernelsFn(); 51 52 /// Helper function that generates a constant string and returns a pointer to 53 /// the start of the string. The result of this function can be used anywhere 54 /// where the C code specifies const char*. 55 llvm::Constant *makeConstantString(const std::string &Str, 56 const std::string &Name = "", 57 unsigned Alignment = 0) { 58 llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0), 59 llvm::ConstantInt::get(SizeTy, 0)}; 60 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); 61 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), 62 ConstStr.getPointer(), Zeros); 63 } 64 65 void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args); 66 67 public: 68 CGNVCUDARuntime(CodeGenModule &CGM); 69 70 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; 71 /// Creates module constructor function 72 llvm::Function *makeModuleCtorFunction() override; 73 /// Creates module destructor function 74 llvm::Function *makeModuleDtorFunction() override; 75 }; 76 77 } 78 79 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) 80 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), 81 TheModule(CGM.getModule()) { 82 CodeGen::CodeGenTypes &Types = CGM.getTypes(); 83 ASTContext &Ctx = CGM.getContext(); 84 85 IntTy = Types.ConvertType(Ctx.IntTy); 86 SizeTy = Types.ConvertType(Ctx.getSizeType()); 87 VoidTy = llvm::Type::getVoidTy(Context); 88 89 CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy)); 90 VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy)); 91 VoidPtrPtrTy = VoidPtrTy->getPointerTo(); 92 } 93 94 llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const { 95 // cudaError_t cudaSetupArgument(void *, size_t, size_t) 96 std::vector<llvm::Type*> Params; 97 Params.push_back(VoidPtrTy); 98 Params.push_back(SizeTy); 99 Params.push_back(SizeTy); 100 return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, 101 Params, false), 102 "cudaSetupArgument"); 103 } 104 105 llvm::Constant *CGNVCUDARuntime::getLaunchFn() const { 106 // cudaError_t cudaLaunch(char *) 107 return CGM.CreateRuntimeFunction( 108 llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); 109 } 110 111 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, 112 FunctionArgList &Args) { 113 EmittedKernels.push_back(CGF.CurFn); 114 emitDeviceStubBody(CGF, Args); 115 } 116 117 void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF, 118 FunctionArgList &Args) { 119 // Build the argument value list and the argument stack struct type. 120 SmallVector<llvm::Value *, 16> ArgValues; 121 std::vector<llvm::Type *> ArgTypes; 122 for (FunctionArgList::const_iterator I = Args.begin(), E = Args.end(); 123 I != E; ++I) { 124 llvm::Value *V = CGF.GetAddrOfLocalVar(*I).getPointer(); 125 ArgValues.push_back(V); 126 assert(isa<llvm::PointerType>(V->getType()) && "Arg type not PointerType"); 127 ArgTypes.push_back(cast<llvm::PointerType>(V->getType())->getElementType()); 128 } 129 llvm::StructType *ArgStackTy = llvm::StructType::get(Context, ArgTypes); 130 131 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 132 133 // Emit the calls to cudaSetupArgument 134 llvm::Constant *cudaSetupArgFn = getSetupArgumentFn(); 135 for (unsigned I = 0, E = Args.size(); I != E; ++I) { 136 llvm::Value *Args[3]; 137 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); 138 Args[0] = CGF.Builder.CreatePointerCast(ArgValues[I], VoidPtrTy); 139 Args[1] = CGF.Builder.CreateIntCast( 140 llvm::ConstantExpr::getSizeOf(ArgTypes[I]), 141 SizeTy, false); 142 Args[2] = CGF.Builder.CreateIntCast( 143 llvm::ConstantExpr::getOffsetOf(ArgStackTy, I), 144 SizeTy, false); 145 llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); 146 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); 147 llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero); 148 CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock); 149 CGF.EmitBlock(NextBlock); 150 } 151 152 // Emit the call to cudaLaunch 153 llvm::Constant *cudaLaunchFn = getLaunchFn(); 154 llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); 155 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); 156 CGF.EmitBranch(EndBlock); 157 158 CGF.EmitBlock(EndBlock); 159 } 160 161 /// Creates internal function to register all kernel stubs generated in this 162 /// module with the CUDA runtime. 163 /// \code 164 /// void __cuda_register_kernels(void** GpuBinaryHandle) { 165 /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); 166 /// ... 167 /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); 168 /// } 169 /// \endcode 170 llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() { 171 llvm::Function *RegisterKernelsFunc = llvm::Function::Create( 172 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 173 llvm::GlobalValue::InternalLinkage, "__cuda_register_kernels", &TheModule); 174 llvm::BasicBlock *EntryBB = 175 llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); 176 CGBuilderTy Builder(CGM, Context); 177 Builder.SetInsertPoint(EntryBB); 178 179 // void __cudaRegisterFunction(void **, const char *, char *, const char *, 180 // int, uint3*, uint3*, dim3*, dim3*, int*) 181 std::vector<llvm::Type *> RegisterFuncParams = { 182 VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, 183 VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; 184 llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction( 185 llvm::FunctionType::get(IntTy, RegisterFuncParams, false), 186 "__cudaRegisterFunction"); 187 188 // Extract GpuBinaryHandle passed as the first argument passed to 189 // __cuda_register_kernels() and generate __cudaRegisterFunction() call for 190 // each emitted kernel. 191 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); 192 for (llvm::Function *Kernel : EmittedKernels) { 193 llvm::Constant *KernelName = makeConstantString(Kernel->getName()); 194 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); 195 llvm::Value *args[] = { 196 &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy), 197 KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr, 198 NullPtr, NullPtr, NullPtr, 199 llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; 200 Builder.CreateCall(RegisterFunc, args); 201 } 202 203 Builder.CreateRetVoid(); 204 return RegisterKernelsFunc; 205 } 206 207 /// Creates a global constructor function for the module: 208 /// \code 209 /// void __cuda_module_ctor(void*) { 210 /// Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0); 211 /// __cuda_register_kernels(Handle0); 212 /// ... 213 /// HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN); 214 /// __cuda_register_kernels(HandleN); 215 /// } 216 /// \endcode 217 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { 218 // void __cuda_register_kernels(void* handle); 219 llvm::Function *RegisterKernelsFunc = makeRegisterKernelsFn(); 220 // void ** __cudaRegisterFatBinary(void *); 221 llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction( 222 llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), 223 "__cudaRegisterFatBinary"); 224 // struct { int magic, int version, void * gpu_binary, void * dont_care }; 225 llvm::StructType *FatbinWrapperTy = 226 llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy, nullptr); 227 228 llvm::Function *ModuleCtorFunc = llvm::Function::Create( 229 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 230 llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule); 231 llvm::BasicBlock *CtorEntryBB = 232 llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); 233 CGBuilderTy CtorBuilder(CGM, Context); 234 235 CtorBuilder.SetInsertPoint(CtorEntryBB); 236 237 // For each GPU binary, register it with the CUDA runtime and store returned 238 // handle in a global variable and save the handle in GpuBinaryHandles vector 239 // to be cleaned up in destructor on exit. Then associate all known kernels 240 // with the GPU binary handle so CUDA runtime can figure out what to call on 241 // the GPU side. 242 for (const std::string &GpuBinaryFileName : 243 CGM.getCodeGenOpts().CudaGpuBinaryFileNames) { 244 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr = 245 llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName); 246 if (std::error_code EC = GpuBinaryOrErr.getError()) { 247 CGM.getDiags().Report(diag::err_cannot_open_file) << GpuBinaryFileName 248 << EC.message(); 249 continue; 250 } 251 252 // Create initialized wrapper structure that points to the loaded GPU binary 253 llvm::Constant *Values[] = { 254 llvm::ConstantInt::get(IntTy, 0x466243b1), // Fatbin wrapper magic. 255 llvm::ConstantInt::get(IntTy, 1), // Fatbin version. 256 makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "", 16), // Data. 257 llvm::ConstantPointerNull::get(VoidPtrTy)}; // Unused in fatbin v1. 258 llvm::GlobalVariable *FatbinWrapper = new llvm::GlobalVariable( 259 TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage, 260 llvm::ConstantStruct::get(FatbinWrapperTy, Values), 261 "__cuda_fatbin_wrapper"); 262 263 // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); 264 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 265 RegisterFatbinFunc, 266 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 267 llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable( 268 TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, 269 llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); 270 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, 271 CGM.getPointerAlign()); 272 273 // Call __cuda_register_kernels(GpuBinaryHandle); 274 CtorBuilder.CreateCall(RegisterKernelsFunc, RegisterFatbinCall); 275 276 // Save GpuBinaryHandle so we can unregister it in destructor. 277 GpuBinaryHandles.push_back(GpuBinaryHandle); 278 } 279 280 CtorBuilder.CreateRetVoid(); 281 return ModuleCtorFunc; 282 } 283 284 /// Creates a global destructor function that unregisters all GPU code blobs 285 /// registered by constructor. 286 /// \code 287 /// void __cuda_module_dtor(void*) { 288 /// __cudaUnregisterFatBinary(Handle0); 289 /// ... 290 /// __cudaUnregisterFatBinary(HandleN); 291 /// } 292 /// \endcode 293 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { 294 // void __cudaUnregisterFatBinary(void ** handle); 295 llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction( 296 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 297 "__cudaUnregisterFatBinary"); 298 299 llvm::Function *ModuleDtorFunc = llvm::Function::Create( 300 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 301 llvm::GlobalValue::InternalLinkage, "__cuda_module_dtor", &TheModule); 302 llvm::BasicBlock *DtorEntryBB = 303 llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); 304 CGBuilderTy DtorBuilder(CGM, Context); 305 DtorBuilder.SetInsertPoint(DtorEntryBB); 306 307 for (llvm::GlobalVariable *GpuBinaryHandle : GpuBinaryHandles) { 308 auto HandleValue = 309 DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign()); 310 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 311 } 312 313 DtorBuilder.CreateRetVoid(); 314 return ModuleDtorFunc; 315 } 316 317 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { 318 return new CGNVCUDARuntime(CGM); 319 } 320