Home | History | Annotate | Download | only in lib
      1 /*
      2  * Copyright 2012, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "Assert.h"
     18 #include "Log.h"
     19 #include "RSTransforms.h"
     20 #include "RSUtils.h"
     21 
     22 #include "bcc/Config.h"
     23 #include "bcinfo/MetadataExtractor.h"
     24 
     25 #include "slang_version.h"
     26 
     27 #include <cstdlib>
     28 #include <functional>
     29 #include <unordered_set>
     30 
     31 #include <llvm/IR/DerivedTypes.h>
     32 #include <llvm/IR/Function.h>
     33 #include <llvm/IR/Instructions.h>
     34 #include <llvm/IR/IRBuilder.h>
     35 #include <llvm/IR/MDBuilder.h>
     36 #include <llvm/IR/Module.h>
     37 #include <llvm/Pass.h>
     38 #include <llvm/Support/raw_ostream.h>
     39 #include <llvm/IR/DataLayout.h>
     40 #include <llvm/IR/Function.h>
     41 #include <llvm/IR/Type.h>
     42 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
     43 
     44 #ifndef __DISABLE_ASSERTS
     45 // Only used in bccAssert()
     46 const int kNumExpandedForeachParams = 4;
     47 const int kNumExpandedReduceAccumulatorParams = 4;
     48 #endif
     49 
     50 const char kRenderScriptTBAARootName[] = "RenderScript Distinct TBAA";
     51 const char kRenderScriptTBAANodeName[] = "RenderScript TBAA";
     52 
     53 using namespace bcc;
     54 
     55 namespace {
     56 
     57 static const bool gEnableRsTbaa = true;
     58 
     59 /* RSKernelExpandPass
     60  *
     61  * This pass generates functions used to implement calls via
     62  * rsForEach(), "foreach_<NAME>", or "reduce_<NAME>". We create an
     63  * inner loop for the function to be invoked over the appropriate data
     64  * cells of the input/output allocations (adjusting other relevant
     65  * parameters as we go). We support doing this for any forEach or
     66  * reduce style compute kernels.
     67  *
     68  * In the case of a foreach kernel or a simple reduction kernel, the
     69  * new function name is the original function name "<NAME>" followed
     70  * by ".expand" -- "<NAME>.expand".
     71  *
     72  * In the case of a general reduction kernel, the kernel's accumulator
     73  * function is the one transformed, and the new function name is the
     74  * original accumulator function name "<ACCUMFN>" followed by
     75  * ".expand" -- "<ACCUMFN>.expand". Using the name "<ACCUMFN>.expand"
     76  * for the function generated from the accumulator should not
     77  * introduce any possibility for name clashes today: The accumulator
     78  * function <ACCUMFN> must be static, so it cannot also serve as a
     79  * foreach kernel; and the code for <ACCUMFN>.expand depends only on
     80  * <ACCUMFN>, not on any other properties of the reduction kernel, so
     81  * any reduction kernels that share the accumulator <ACCUMFN> can
     82  * share <ACCUMFN>.expand also.
     83  *
     84  * Note that this pass does not delete the original function <NAME> or
     85  * <ACCUMFN>. However, if it is inlined into the newly-generated
     86  * function and not otherwise referenced, then a subsequent pass may
     87  * delete it.
     88  */
     89 class RSKernelExpandPass : public llvm::ModulePass {
     90 public:
     91   static char ID;
     92 
     93 private:
     94   static const size_t RS_KERNEL_INPUT_LIMIT = 8;  // see frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h
     95 
     96   typedef std::unordered_set<llvm::Function *> FunctionSet;
     97 
     98   enum RsLaunchDimensionsField {
     99     RsLaunchDimensionsFieldX,
    100     RsLaunchDimensionsFieldY,
    101     RsLaunchDimensionsFieldZ,
    102     RsLaunchDimensionsFieldLod,
    103     RsLaunchDimensionsFieldFace,
    104     RsLaunchDimensionsFieldArray,
    105 
    106     RsLaunchDimensionsFieldCount
    107   };
    108 
    109   enum RsExpandKernelDriverInfoPfxField {
    110     RsExpandKernelDriverInfoPfxFieldInPtr,
    111     RsExpandKernelDriverInfoPfxFieldInStride,
    112     RsExpandKernelDriverInfoPfxFieldInLen,
    113     RsExpandKernelDriverInfoPfxFieldOutPtr,
    114     RsExpandKernelDriverInfoPfxFieldOutStride,
    115     RsExpandKernelDriverInfoPfxFieldOutLen,
    116     RsExpandKernelDriverInfoPfxFieldDim,
    117     RsExpandKernelDriverInfoPfxFieldCurrent,
    118     RsExpandKernelDriverInfoPfxFieldUsr,
    119     RsExpandKernelDriverInfoPfxFieldUsLenr,
    120 
    121     RsExpandKernelDriverInfoPfxFieldCount
    122   };
    123 
    124   llvm::Module *Module;
    125   llvm::LLVMContext *Context;
    126 
  /*
   * Pointers to LLVM type information for the function signatures
   * of expanded functions. These must be re-calculated for each module
   * the pass is run on.
   */
    132   llvm::FunctionType *ExpandedForEachType;
    133   llvm::Type *RsExpandKernelDriverInfoPfxTy;
    134 
    135   // Initialized when we begin to process each Module
    136   bool mStructExplicitlyPaddedBySlang;
    137   uint32_t mExportForEachCount;
    138   const char **mExportForEachNameList;
    139   const uint32_t *mExportForEachSignatureList;
    140 
    141   // Turns on optimization of allocation stride values.
    142   bool mEnableStepOpt;
    143 
    144   uint32_t getRootSignature(llvm::Function *Function) {
    145     const llvm::NamedMDNode *ExportForEachMetadata =
    146         Module->getNamedMetadata("#rs_export_foreach");
    147 
    148     if (!ExportForEachMetadata) {
    149       llvm::SmallVector<llvm::Type*, 8> RootArgTys;
    150       for (llvm::Function::arg_iterator B = Function->arg_begin(),
    151                                         E = Function->arg_end();
    152            B != E;
    153            ++B) {
    154         RootArgTys.push_back(B->getType());
    155       }
    156 
    157       // For pre-ICS bitcode, we may not have signature information. In that
    158       // case, we use the size of the RootArgTys to select the number of
    159       // arguments.
    160       return (1 << RootArgTys.size()) - 1;
    161     }
    162 
    163     if (ExportForEachMetadata->getNumOperands() == 0) {
    164       return 0;
    165     }
    166 
    167     bccAssert(ExportForEachMetadata->getNumOperands() > 0);
    168 
    169     // We only handle the case for legacy root() functions here, so this is
    170     // hard-coded to look at only the first such function.
    171     llvm::MDNode *SigNode = ExportForEachMetadata->getOperand(0);
    172     if (SigNode != nullptr && SigNode->getNumOperands() == 1) {
    173       llvm::Metadata *SigMD = SigNode->getOperand(0);
    174       if (llvm::MDString *SigS = llvm::dyn_cast<llvm::MDString>(SigMD)) {
    175         llvm::StringRef SigString = SigS->getString();
    176         uint32_t Signature = 0;
    177         if (SigString.getAsInteger(10, Signature)) {
    178           ALOGE("Non-integer signature value '%s'", SigString.str().c_str());
    179           return 0;
    180         }
    181         return Signature;
    182       }
    183     }
    184 
    185     return 0;
    186   }
    187 
    188   bool isStepOptSupported(llvm::Type *AllocType) {
    189 
    190     llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType);
    191     llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context);
    192 
    193     if (mEnableStepOpt) {
    194       return false;
    195     }
    196 
    197     if (AllocType == VoidPtrTy) {
    198       return false;
    199     }
    200 
    201     if (!PT) {
    202       return false;
    203     }
    204 
    205     // remaining conditions are 64-bit only
    206     if (VoidPtrTy->getPrimitiveSizeInBits() == 32) {
    207       return true;
    208     }
    209 
    210     // coerce suggests an upconverted struct type, which we can't support
    211     if (AllocType->getStructName().find("coerce") != llvm::StringRef::npos) {
    212       return false;
    213     }
    214 
    215     // 2xi64 and i128 suggest an upconverted struct type, which are also unsupported
    216     llvm::Type *V2xi64Ty = llvm::VectorType::get(llvm::Type::getInt64Ty(*Context), 2);
    217     llvm::Type *Int128Ty = llvm::Type::getIntNTy(*Context, 128);
    218     if (AllocType == V2xi64Ty || AllocType == Int128Ty) {
    219       return false;
    220     }
    221 
    222     return true;
    223   }
    224 
    225   // Get the actual value we should use to step through an allocation.
    226   //
    227   // Normally the value we use to step through an allocation is given to us by
    228   // the driver. However, for certain primitive data types, we can derive an
    229   // integer constant for the step value. We use this integer constant whenever
    230   // possible to allow further compiler optimizations to take place.
    231   //
  // DL - Target data size/layout information.
  // AllocType - Type of allocation (should be a pointer).
  // OrigStep - Original step increment (root.expand() input from driver).
    235   llvm::Value *getStepValue(llvm::DataLayout *DL, llvm::Type *AllocType,
    236                             llvm::Value *OrigStep) {
    237     bccAssert(DL);
    238     bccAssert(AllocType);
    239     bccAssert(OrigStep);
    240     llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType);
    241     if (isStepOptSupported(AllocType)) {
    242       llvm::Type *ET = PT->getElementType();
    243       uint64_t ETSize = DL->getTypeAllocSize(ET);
    244       llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
    245       return llvm::ConstantInt::get(Int32Ty, ETSize);
    246     } else {
    247       return OrigStep;
    248     }
    249   }
    250 
    251   /// Builds the types required by the pass for the given context.
    252   void buildTypes(void) {
    253     // Create the RsLaunchDimensionsTy and RsExpandKernelDriverInfoPfxTy structs.
    254 
    255     llvm::Type *Int8Ty                   = llvm::Type::getInt8Ty(*Context);
    256     llvm::Type *Int8PtrTy                = Int8Ty->getPointerTo();
    257     llvm::Type *Int8PtrArrayInputLimitTy = llvm::ArrayType::get(Int8PtrTy, RS_KERNEL_INPUT_LIMIT);
    258     llvm::Type *Int32Ty                  = llvm::Type::getInt32Ty(*Context);
    259     llvm::Type *Int32ArrayInputLimitTy   = llvm::ArrayType::get(Int32Ty, RS_KERNEL_INPUT_LIMIT);
    260     llvm::Type *VoidPtrTy                = llvm::Type::getInt8PtrTy(*Context);
    261     llvm::Type *Int32Array4Ty            = llvm::ArrayType::get(Int32Ty, 4);
    262 
    263     /* Defined in frameworks/base/libs/rs/cpu_ref/rsCpuCore.h:
    264      *
    265      * struct RsLaunchDimensions {
    266      *   uint32_t x;
    267      *   uint32_t y;
    268      *   uint32_t z;
    269      *   uint32_t lod;
    270      *   uint32_t face;
    271      *   uint32_t array[4];
    272      * };
    273      */
    274     llvm::SmallVector<llvm::Type*, RsLaunchDimensionsFieldCount> RsLaunchDimensionsTypes;
    275     RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t x
    276     RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t y
    277     RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t z
    278     RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t lod
    279     RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t face
    280     RsLaunchDimensionsTypes.push_back(Int32Array4Ty); // uint32_t array[4]
    281     llvm::StructType *RsLaunchDimensionsTy =
    282         llvm::StructType::create(RsLaunchDimensionsTypes, "RsLaunchDimensions");
    283 
    284     /* Defined as the beginning of RsExpandKernelDriverInfo in frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h:
    285      *
    286      * struct RsExpandKernelDriverInfoPfx {
    287      *     const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT];
    288      *     uint32_t inStride[RS_KERNEL_INPUT_LIMIT];
    289      *     uint32_t inLen;
    290      *
    291      *     uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT];
    292      *     uint32_t outStride[RS_KERNEL_INPUT_LIMIT];
    293      *     uint32_t outLen;
    294      *
    295      *     // Dimension of the launch
    296      *     RsLaunchDimensions dim;
    297      *
    298      *     // The walking iterator of the launch
    299      *     RsLaunchDimensions current;
    300      *
    301      *     const void *usr;
    302      *     uint32_t usrLen;
    303      *
    304      *     // Items below this line are not used by the compiler and can be change in the driver.
    305      *     // So the compiler must assume there are an unknown number of fields of unknown type
    306      *     // beginning here.
    307      * };
    308      *
    309      * The name "RsExpandKernelDriverInfoPfx" is known to RSInvariantPass (RSInvariant.cpp).
    310      */
    311     llvm::SmallVector<llvm::Type*, RsExpandKernelDriverInfoPfxFieldCount> RsExpandKernelDriverInfoPfxTypes;
    312     RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT]
    313     RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy);   // uint32_t inStride[RS_KERNEL_INPUT_LIMIT]
    314     RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t inLen
    315     RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT]
    316     RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy);   // uint32_t outStride[RS_KERNEL_INPUT_LIMIT]
    317     RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t outLen
    318     RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy);     // RsLaunchDimensions dim
    319     RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy);     // RsLaunchDimensions current
    320     RsExpandKernelDriverInfoPfxTypes.push_back(VoidPtrTy);                // const void *usr
    321     RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t usrLen
    322     RsExpandKernelDriverInfoPfxTy =
    323         llvm::StructType::create(RsExpandKernelDriverInfoPfxTypes, "RsExpandKernelDriverInfoPfx");
    324 
    325     // Create the function type for expanded kernels.
    326     llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);
    327 
    328     llvm::Type *RsExpandKernelDriverInfoPfxPtrTy = RsExpandKernelDriverInfoPfxTy->getPointerTo();
    329     // void (const RsExpandKernelDriverInfoPfxTy *p, uint32_t x1, uint32_t x2, uint32_t outstep)
    330     ExpandedForEachType = llvm::FunctionType::get(VoidTy,
    331         {RsExpandKernelDriverInfoPfxPtrTy, Int32Ty, Int32Ty, Int32Ty}, false);
    332   }
    333 
  /// @brief Create skeleton of the expanded foreach kernel.
  ///
  /// This creates a function with the following signature:
  ///
  ///   void (const RsExpandKernelDriverInfoPfx *p, uint32_t x1, uint32_t x2,
  ///         uint32_t outstep)
  ///
    341   llvm::Function *createEmptyExpandedForEachKernel(llvm::StringRef OldName) {
    342     llvm::Function *ExpandedFunction =
    343       llvm::Function::Create(ExpandedForEachType,
    344                              llvm::GlobalValue::ExternalLinkage,
    345                              OldName + ".expand", Module);
    346     bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
    347     llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();
    348     (AI++)->setName("p");
    349     (AI++)->setName("x1");
    350     (AI++)->setName("x2");
    351     (AI++)->setName("arg_outstep");
    352     llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
    353                                                        ExpandedFunction);
    354     llvm::IRBuilder<> Builder(Begin);
    355     Builder.CreateRetVoid();
    356     return ExpandedFunction;
    357   }
    358 
    359   // Create skeleton of a general reduce kernel's expanded accumulator.
    360   //
    361   // This creates a function with the following signature:
    362   //
    363   //  void @func.expand(%RsExpandKernelDriverInfoPfx* nocapture %p,
    364   //                    i32 %x1, i32 %x2, accumType* nocapture %accum)
    365   //
    366   llvm::Function *createEmptyExpandedReduceAccumulator(llvm::StringRef OldName,
    367                                                        llvm::Type *AccumArgTy) {
    368     llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
    369     llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);
    370     llvm::FunctionType *ExpandedReduceAccumulatorType =
    371         llvm::FunctionType::get(VoidTy,
    372                                 {RsExpandKernelDriverInfoPfxTy->getPointerTo(),
    373                                  Int32Ty, Int32Ty, AccumArgTy}, false);
    374     llvm::Function *FnExpandedAccumulator =
    375       llvm::Function::Create(ExpandedReduceAccumulatorType,
    376                              llvm::GlobalValue::ExternalLinkage,
    377                              OldName + ".expand", Module);
    378     bccAssert(FnExpandedAccumulator->arg_size() == kNumExpandedReduceAccumulatorParams);
    379 
    380     llvm::Function::arg_iterator AI = FnExpandedAccumulator->arg_begin();
    381 
    382     using llvm::Attribute;
    383 
    384     llvm::Argument *Arg_p = &(*AI++);
    385     Arg_p->setName("p");
    386     Arg_p->addAttr(llvm::AttributeSet::get(*Context, Arg_p->getArgNo() + 1,
    387                                            llvm::makeArrayRef(Attribute::NoCapture)));
    388 
    389     llvm::Argument *Arg_x1 = &(*AI++);
    390     Arg_x1->setName("x1");
    391 
    392     llvm::Argument *Arg_x2 = &(*AI++);
    393     Arg_x2->setName("x2");
    394 
    395     llvm::Argument *Arg_accum = &(*AI++);
    396     Arg_accum->setName("accum");
    397     Arg_accum->addAttr(llvm::AttributeSet::get(*Context, Arg_accum->getArgNo() + 1,
    398                                                llvm::makeArrayRef(Attribute::NoCapture)));
    399 
    400     llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
    401                                                        FnExpandedAccumulator);
    402     llvm::IRBuilder<> Builder(Begin);
    403     Builder.CreateRetVoid();
    404 
    405     return FnExpandedAccumulator;
    406   }
    407 
    408   /// @brief Create an empty loop
    409   ///
    410   /// Create a loop of the form:
    411   ///
    412   /// for (i = LowerBound; i < UpperBound; i++)
    413   ///   ;
    414   ///
    415   /// After the loop has been created, the builder is set such that
    416   /// instructions can be added to the loop body.
    417   ///
    418   /// @param Builder The builder to use to build this loop. The current
    419   ///                position of the builder is the position the loop
    420   ///                will be inserted.
    421   /// @param LowerBound The first value of the loop iterator
    422   /// @param UpperBound The maximal value of the loop iterator
    423   /// @param LoopIV A reference that will be set to the loop iterator.
    424   /// @return The BasicBlock that will be executed after the loop.
    425   llvm::BasicBlock *createLoop(llvm::IRBuilder<> &Builder,
    426                                llvm::Value *LowerBound,
    427                                llvm::Value *UpperBound,
    428                                llvm::Value **LoopIV) {
    429     bccAssert(LowerBound->getType() == UpperBound->getType());
    430 
    431     llvm::BasicBlock *CondBB, *AfterBB, *HeaderBB;
    432     llvm::Value *Cond, *IVNext, *IV, *IVVar;
    433 
    434     CondBB = Builder.GetInsertBlock();
    435     AfterBB = llvm::SplitBlock(CondBB, &*Builder.GetInsertPoint(), nullptr, nullptr);
    436     HeaderBB = llvm::BasicBlock::Create(*Context, "Loop", CondBB->getParent());
    437 
    438     CondBB->getTerminator()->eraseFromParent();
    439     Builder.SetInsertPoint(CondBB);
    440 
    441     // decltype(LowerBound) *ivvar = alloca(sizeof(int))
    442     // *ivvar = LowerBound
    443     IVVar = Builder.CreateAlloca(LowerBound->getType(), nullptr, BCC_INDEX_VAR_NAME);
    444     Builder.CreateStore(LowerBound, IVVar);
    445 
    446     // if (LowerBound < Upperbound)
    447     //   goto LoopHeader
    448     // else
    449     //   goto AfterBB
    450     Cond = Builder.CreateICmpULT(LowerBound, UpperBound);
    451     Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
    452 
    453     // LoopHeader:
    454     //   iv = *ivvar
    455     //   <insertion point here>
    456     //   iv.next = iv + 1
    457     //   *ivvar = iv.next
    458     //   if (iv.next < Upperbound)
    459     //     goto LoopHeader
    460     //   else
    461     //     goto AfterBB
    462     // AfterBB:
    463     Builder.SetInsertPoint(HeaderBB);
    464     IV = Builder.CreateLoad(IVVar, "X");
    465     IVNext = Builder.CreateNUWAdd(IV, Builder.getInt32(1));
    466     Builder.CreateStore(IVNext, IVVar);
    467     Cond = Builder.CreateICmpULT(IVNext, UpperBound);
    468     Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
    469     AfterBB->setName("Exit");
    470     Builder.SetInsertPoint(llvm::cast<llvm::Instruction>(IVNext));
    471 
    472     // Record information about this loop.
    473     *LoopIV = IV;
    474     return AfterBB;
    475   }
    476 
    477   // Finish building the outgoing argument list for calling a ForEach-able function.
    478   //
    479   // ArgVector - on input, the non-special arguments
    480   //             on output, the non-special arguments combined with the special arguments
    481   //               from SpecialArgVector
    482   // SpecialArgVector - special arguments (from ExpandSpecialArguments())
    483   // SpecialArgContextIdx - return value of ExpandSpecialArguments()
    484   //                          (position of context argument in SpecialArgVector)
    485   // CalleeFunction - the ForEach-able function being called
    486   // Builder - for inserting code into the caller function
    487   template<unsigned int ArgVectorLen, unsigned int SpecialArgVectorLen>
    488   void finishArgList(      llvm::SmallVector<llvm::Value *, ArgVectorLen>        &ArgVector,
    489                      const llvm::SmallVector<llvm::Value *, SpecialArgVectorLen> &SpecialArgVector,
    490                      const int SpecialArgContextIdx,
    491                      const llvm::Function &CalleeFunction,
    492                      llvm::IRBuilder<> &CallerBuilder) {
    493     /* The context argument (if any) is a pointer to an opaque user-visible type that differs from
    494      * the RsExpandKernelDriverInfoPfx type used in the function we are generating (although the
    495      * two types represent the same thing).  Therefore, we must introduce a pointer cast when
    496      * generating a call to the kernel function.
    497      */
    498     const int ArgContextIdx =
    499         SpecialArgContextIdx >= 0 ? (ArgVector.size() + SpecialArgContextIdx) : SpecialArgContextIdx;
    500     ArgVector.append(SpecialArgVector.begin(), SpecialArgVector.end());
    501     if (ArgContextIdx >= 0) {
    502       llvm::Type *ContextArgType = nullptr;
    503       int ArgIdx = ArgContextIdx;
    504       for (const auto &Arg : CalleeFunction.getArgumentList()) {
    505         if (!ArgIdx--) {
    506           ContextArgType = Arg.getType();
    507           break;
    508         }
    509       }
    510       bccAssert(ContextArgType);
    511       ArgVector[ArgContextIdx] = CallerBuilder.CreatePointerCast(ArgVector[ArgContextIdx], ContextArgType);
    512     }
    513   }
    514 
    515   // GEPHelper() returns a SmallVector of values suitable for passing
    516   // to IRBuilder::CreateGEP(), and SmallGEPIndices is a typedef for
    517   // the returned data type. It is sized so that the SmallVector
    518   // returned by GEPHelper() never needs to do a heap allocation for
    519   // any list of GEP indices it encounters in the code.
    520   typedef llvm::SmallVector<llvm::Value *, 3> SmallGEPIndices;
    521 
    522   // Helper for turning a list of constant integer GEP indices into a
    523   // SmallVector of llvm::Value*. The return value is suitable for
    524   // passing to a GetElementPtrInst constructor or IRBuilder::CreateGEP().
    525   //
    526   // Inputs:
    527   //   I32Args should be integers which represent the index arguments
    528   //   to a GEP instruction.
    529   //
    530   // Returns:
    531   //   Returns a SmallVector of ConstantInts.
    532   SmallGEPIndices GEPHelper(const std::initializer_list<int32_t> I32Args) {
    533     SmallGEPIndices Out(I32Args.size());
    534     llvm::IntegerType *I32Ty = llvm::Type::getInt32Ty(*Context);
    535     std::transform(I32Args.begin(), I32Args.end(), Out.begin(),
    536                    [I32Ty](int32_t Arg) { return llvm::ConstantInt::get(I32Ty, Arg); });
    537     return Out;
    538   }
    539 
    540 public:
    541   explicit RSKernelExpandPass(bool pEnableStepOpt = true)
    542       : ModulePass(ID), Module(nullptr), Context(nullptr),
    543         mEnableStepOpt(pEnableStepOpt) {
    544 
    545   }
    546 
    547   virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
    548     // This pass does not use any other analysis passes, but it does
    549     // add/wrap the existing functions in the module (thus altering the CFG).
    550   }
    551 
    552   // Build contribution to outgoing argument list for calling a
    553   // ForEach-able function or a general reduction accumulator
    554   // function, based on the special parameters of that function.
    555   //
    556   // Signature - metadata bits for the signature of the callee
    557   // X, Arg_p - values derived directly from expanded function,
    558   //            suitable for computing arguments for the callee
    559   // CalleeArgs - contribution is accumulated here
    560   // Bump - invoked once for each contributed outgoing argument
    561   // LoopHeaderInsertionPoint - an Instruction in the loop header, before which
    562   //                            this function can insert loop-invariant loads
    563   //
    564   // Return value is the (zero-based) position of the context (Arg_p)
    565   // argument in the CalleeArgs vector, or a negative value if the
    566   // context argument is not placed in the CalleeArgs vector.
    567   int ExpandSpecialArguments(uint32_t Signature,
    568                              llvm::Value *X,
    569                              llvm::Value *Arg_p,
    570                              llvm::IRBuilder<> &Builder,
    571                              llvm::SmallVector<llvm::Value*, 8> &CalleeArgs,
    572                              const std::function<void ()> &Bump,
    573                              llvm::Instruction *LoopHeaderInsertionPoint) {
    574 
    575     bccAssert(CalleeArgs.empty());
    576 
    577     int Return = -1;
    578     if (bcinfo::MetadataExtractor::hasForEachSignatureCtxt(Signature)) {
    579       CalleeArgs.push_back(Arg_p);
    580       Bump();
    581       Return = CalleeArgs.size() - 1;
    582     }
    583 
    584     if (bcinfo::MetadataExtractor::hasForEachSignatureX(Signature)) {
    585       CalleeArgs.push_back(X);
    586       Bump();
    587     }
    588 
    589     if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature) ||
    590         bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
    591       bccAssert(LoopHeaderInsertionPoint);
    592 
    593       // Y and Z are loop invariant, so they can be hoisted out of the
    594       // loop. Set the IRBuilder insertion point to the loop header.
    595       auto OldInsertionPoint = Builder.saveIP();
    596       Builder.SetInsertPoint(LoopHeaderInsertionPoint);
    597 
    598       if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature)) {
    599         SmallGEPIndices YValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent,
    600           RsLaunchDimensionsFieldY}));
    601         llvm::Value *YAddr = Builder.CreateInBoundsGEP(Arg_p, YValueGEP, "Y.gep");
    602         CalleeArgs.push_back(Builder.CreateLoad(YAddr, "Y"));
    603         Bump();
    604       }
    605 
    606       if (bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
    607         SmallGEPIndices ZValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent,
    608           RsLaunchDimensionsFieldZ}));
    609         llvm::Value *ZAddr = Builder.CreateInBoundsGEP(Arg_p, ZValueGEP, "Z.gep");
    610         CalleeArgs.push_back(Builder.CreateLoad(ZAddr, "Z"));
    611         Bump();
    612       }
    613 
    614       Builder.restoreIP(OldInsertionPoint);
    615     }
    616 
    617     return Return;
    618   }
    619 
    620   // Generate loop-invariant input processing setup code for an expanded
    621   // ForEach-able function or an expanded general reduction accumulator
    622   // function.
    623   //
    624   // LoopHeader - block at the end of which the setup code will be inserted
    625   // Arg_p - RSKernelDriverInfo pointer passed to the expanded function
    626   // TBAAPointer - metadata for marking loads of pointer values out of RSKernelDriverInfo
    627   // ArgIter - iterator pointing to first input of the UNexpanded function
    628   // NumInputs - number of inputs (NOT number of ARGUMENTS)
    629   //
    630   // InTypes[] - this function saves input type, they will be used in ExpandInputsBody().
    631   // InBufPtrs[] - this function sets each array element to point to the first cell / byte
    632   //               (byte for x86, cell for other platforms) of the corresponding input allocation
    633   // InStructTempSlots[] - this function sets each array element either to nullptr
    634   //                       or to the result of an alloca (for the case where the
    635   //                       calling convention dictates that a value must be passed
    636   //                       by reference, and so we need a stacked temporary to hold
    637   //                       a copy of that value)
    638   void ExpandInputsLoopInvariant(llvm::IRBuilder<> &Builder, llvm::BasicBlock *LoopHeader,
    639                                  llvm::Value *Arg_p,
    640                                  llvm::MDNode *TBAAPointer,
    641                                  llvm::Function::arg_iterator ArgIter,
    642                                  const size_t NumInputs,
    643                                  llvm::SmallVectorImpl<llvm::Type *> &InTypes,
    644                                  llvm::SmallVectorImpl<llvm::Value *> &InBufPtrs,
    645                                  llvm::SmallVectorImpl<llvm::Value *> &InStructTempSlots) {
    646     bccAssert(NumInputs <= RS_KERNEL_INPUT_LIMIT);
    647 
    648     // Extract information about input slots. The work done
    649     // here is loop-invariant, so we can hoist the operations out of the loop.
    650     auto OldInsertionPoint = Builder.saveIP();
    651     Builder.SetInsertPoint(LoopHeader->getTerminator());
    652 
    653     for (size_t InputIndex = 0; InputIndex < NumInputs; ++InputIndex, ArgIter++) {
    654       llvm::Type *InType = ArgIter->getType();
    655 
    656       /*
    657        * AArch64 calling conventions dictate that structs of sufficient size
    658        * get passed by pointer instead of passed by value.  This, combined
    659        * with the fact that we don't allow kernels to operate on pointer
    660        * data means that if we see a kernel with a pointer parameter we know
    661        * that it is a struct input that has been promoted.  As such we don't
    662        * need to convert its type to a pointer.  Later we will need to know
    663        * to create a temporary copy on the stack, so we save this information
    664        * in InStructTempSlots.
    665        */
    666       if (auto PtrType = llvm::dyn_cast<llvm::PointerType>(InType)) {
    667         llvm::Type *ElementType = PtrType->getElementType();
    668         InStructTempSlots.push_back(Builder.CreateAlloca(ElementType, nullptr,
    669                                                          "input_struct_slot"));
    670       } else {
    671         InType = InType->getPointerTo();
    672         InStructTempSlots.push_back(nullptr);
    673       }
    674 
    675       SmallGEPIndices InBufPtrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr,
    676                                              static_cast<int32_t>(InputIndex)}));
    677       llvm::Value    *InBufPtrAddr = Builder.CreateInBoundsGEP(Arg_p, InBufPtrGEP, "input_buf.gep");
    678       llvm::LoadInst *InBufPtr = Builder.CreateLoad(InBufPtrAddr, "input_buf");
    679 
    680       llvm::Value *CastInBufPtr = nullptr;
    681       if (mStructExplicitlyPaddedBySlang || (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING)) {
    682         CastInBufPtr = Builder.CreatePointerCast(InBufPtr, InType, "casted_in");
    683       } else {
    684         // The disagreement between module and x86 target machine datalayout
    685         // causes mismatched input/output data offset between slang reflected
    686         // code and bcc codegen for GetElementPtr. To solve this issue, skip the
    687         // cast to InType and leave CastInBufPtr as an int8_t*.  The buffer is
    688         // later indexed with an explicit byte offset computed based on
    689         // X86_CUSTOM_DL_STRING and then bitcast to actual input type.
    690         CastInBufPtr = InBufPtr;
    691       }
    692 
    693       if (gEnableRsTbaa) {
    694         InBufPtr->setMetadata("tbaa", TBAAPointer);
    695       }
    696 
    697       InTypes.push_back(InType);
    698       InBufPtrs.push_back(CastInBufPtr);
    699     }
    700 
    701     Builder.restoreIP(OldInsertionPoint);
    702   }
    703 
    704   // Generate loop-varying input processing code for an expanded ForEach-able function
    705   // or an expanded general reduction accumulator function.  Also, for the call to the
    706   // UNexpanded function, collect the portion of the argument list corresponding to the
    707   // inputs.
    708   //
    709   // Arg_x1 - first X coordinate to be processed by the expanded function
    710   // TBAAAllocation - metadata for marking loads of input values out of allocations
    711   // NumInputs -- number of inputs (NOT number of ARGUMENTS)
    712   // InTypes[] - this function uses the input types saved by ExpandInputsLoopInvariant()
    713   //             to convert the byte pointer InPtr to its real type.
    714   // InBufPtrs[] - this function consumes the information produced by ExpandInputsLoopInvariant()
    715   // InStructTempSlots[] - this function consumes the information produced by ExpandInputsLoopInvariant()
    716   // IndVar - value of loop induction variable (X coordinate) for a given loop iteration
    717   //
    718   // RootArgs - this function sets this to the list of outgoing argument values corresponding
    719   //            to the inputs
    720   void ExpandInputsBody(llvm::IRBuilder<> &Builder,
    721                         llvm::Value *Arg_x1,
    722                         llvm::MDNode *TBAAAllocation,
    723                         const size_t NumInputs,
    724                         const llvm::SmallVectorImpl<llvm::Type *> &InTypes,
    725                         const llvm::SmallVectorImpl<llvm::Value *> &InBufPtrs,
    726                         const llvm::SmallVectorImpl<llvm::Value *> &InStructTempSlots,
    727                         llvm::Value *IndVar,
    728                         llvm::SmallVectorImpl<llvm::Value *> &RootArgs) {
    729     llvm::Value *Offset = Builder.CreateSub(IndVar, Arg_x1);
    730     llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
    731 
    732     for (size_t Index = 0; Index < NumInputs; ++Index) {
    733 
    734       llvm::Value *InPtr = nullptr;
    735       if (mStructExplicitlyPaddedBySlang || (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING)) {
    736         InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], Offset);
    737       } else {
    738         // Treat x86 input buffer as byte[], get indexed pointer with explicit
    739         // byte offset computed using a datalayout based on
    740         // X86_CUSTOM_DL_STRING, then bitcast it to actual input type.
    741         llvm::DataLayout DL(X86_CUSTOM_DL_STRING);
    742         llvm::Type *InTy = InTypes[Index];
    743         uint64_t InStep = DL.getTypeAllocSize(InTy->getPointerElementType());
    744         llvm::Value *OffsetInBytes = Builder.CreateMul(Offset, llvm::ConstantInt::get(Int32Ty, InStep));
    745         InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], OffsetInBytes);
    746         InPtr = Builder.CreatePointerCast(InPtr, InTy);
    747       }
    748 
    749       llvm::Value *Input;
    750       llvm::LoadInst *InputLoad = Builder.CreateLoad(InPtr, "input");
    751 
    752       if (gEnableRsTbaa) {
    753         InputLoad->setMetadata("tbaa", TBAAAllocation);
    754       }
    755 
    756       if (llvm::Value *TemporarySlot = InStructTempSlots[Index]) {
    757         // Pass a pointer to a temporary on the stack, rather than
    758         // passing a pointer to the original value. We do not want
    759         // the kernel to potentially modify the input data.
    760 
    761         // Note: don't annotate with TBAA, since the kernel might
    762         // have its own TBAA annotations for the pointer argument.
    763         Builder.CreateStore(InputLoad, TemporarySlot);
    764         Input = TemporarySlot;
    765       } else {
    766         Input = InputLoad;
    767       }
    768 
    769       RootArgs.push_back(Input);
    770     }
    771   }
    772 
    773   /* Performs the actual optimization on a selected function. On success, the
    774    * Module will contain a new function of the name "<NAME>.expand" that
    775    * invokes <NAME>() in a loop with the appropriate parameters.
    776    */
    777   bool ExpandOldStyleForEach(llvm::Function *Function, uint32_t Signature) {
    778     ALOGV("Expanding ForEach-able Function %s",
    779           Function->getName().str().c_str());
    780 
    781     if (!Signature) {
    782       Signature = getRootSignature(Function);
    783       if (!Signature) {
    784         // We couldn't determine how to expand this function based on its
    785         // function signature.
    786         return false;
    787       }
    788     }
    789 
    790     llvm::DataLayout DL(Module);
    791     if (!mStructExplicitlyPaddedBySlang && (Module->getTargetTriple() == DEFAULT_X86_TRIPLE_STRING)) {
    792       DL.reset(X86_CUSTOM_DL_STRING);
    793     }
    794 
    795     llvm::Function *ExpandedFunction =
    796       createEmptyExpandedForEachKernel(Function->getName());
    797 
    798     /*
    799      * Extract the expanded function's parameters.  It is guaranteed by
    800      * createEmptyExpandedForEachKernel that there will be four parameters.
    801      */
    802 
    803     bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
    804 
    805     llvm::Function::arg_iterator ExpandedFunctionArgIter =
    806       ExpandedFunction->arg_begin();
    807 
    808     llvm::Value *Arg_p       = &*(ExpandedFunctionArgIter++);
    809     llvm::Value *Arg_x1      = &*(ExpandedFunctionArgIter++);
    810     llvm::Value *Arg_x2      = &*(ExpandedFunctionArgIter++);
    811     llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter);
    812 
    813     llvm::Value *InStep  = nullptr;
    814     llvm::Value *OutStep = nullptr;
    815 
    816     // Construct the actual function body.
    817     llvm::IRBuilder<> Builder(&*ExpandedFunction->getEntryBlock().begin());
    818 
    819     // Collect and construct the arguments for the kernel().
    820     // Note that we load any loop-invariant arguments before entering the Loop.
    821     llvm::Function::arg_iterator FunctionArgIter = Function->arg_begin();
    822 
    823     llvm::Type  *InTy      = nullptr;
    824     llvm::Value *InBufPtr = nullptr;
    825     if (bcinfo::MetadataExtractor::hasForEachSignatureIn(Signature)) {
    826       SmallGEPIndices InStepGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInStride, 0}));
    827       llvm::LoadInst *InStepArg  = Builder.CreateLoad(
    828         Builder.CreateInBoundsGEP(Arg_p, InStepGEP, "instep_addr.gep"), "instep_addr");
    829 
    830       InTy = (FunctionArgIter++)->getType();
    831       InStep = getStepValue(&DL, InTy, InStepArg);
    832 
    833       InStep->setName("instep");
    834 
    835       SmallGEPIndices InputAddrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr, 0}));
    836       InBufPtr = Builder.CreateLoad(
    837         Builder.CreateInBoundsGEP(Arg_p, InputAddrGEP, "input_buf.gep"), "input_buf");
    838     }
    839 
    840     llvm::Type *OutTy = nullptr;
    841     llvm::Value *OutBasePtr = nullptr;
    842     if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
    843       OutTy = (FunctionArgIter++)->getType();
    844       OutStep = getStepValue(&DL, OutTy, Arg_outstep);
    845       OutStep->setName("outstep");
    846       SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
    847       OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));
    848     }
    849 
    850     llvm::Value *UsrData = nullptr;
    851     if (bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature)) {
    852       llvm::Type *UsrDataTy = (FunctionArgIter++)->getType();
    853       llvm::Value *UsrDataPointerAddr = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldUsr);
    854       UsrData = Builder.CreatePointerCast(Builder.CreateLoad(UsrDataPointerAddr), UsrDataTy);
    855       UsrData->setName("UsrData");
    856     }
    857 
    858     llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
    859     llvm::Value *IV;
    860     createLoop(Builder, Arg_x1, Arg_x2, &IV);
    861 
    862     llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
    863     const int CalleeArgsContextIdx = ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
    864                                                             [&FunctionArgIter]() { FunctionArgIter++; },
    865                                                             LoopHeader->getTerminator());
    866 
    867     bccAssert(FunctionArgIter == Function->arg_end());
    868 
    869     // Populate the actual call to kernel().
    870     llvm::SmallVector<llvm::Value*, 8> RootArgs;
    871 
    872     llvm::Value *InPtr  = nullptr;
    873     llvm::Value *OutPtr = nullptr;
    874 
    875     // Calculate the current input and output pointers
    876     //
    877     // We always calculate the input/output pointers with a GEP operating on i8
    878     // values and only cast at the very end to OutTy. This is because the step
    879     // between two values is given in bytes.
    880     //
    881     // TODO: We could further optimize the output by using a GEP operation of
    882     // type 'OutTy' in cases where the element type of the allocation allows.
    883     if (OutBasePtr) {
    884       llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
    885       OutOffset = Builder.CreateMul(OutOffset, OutStep);
    886       OutPtr = Builder.CreateInBoundsGEP(OutBasePtr, OutOffset);
    887       OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
    888     }
    889 
    890     if (InBufPtr) {
    891       llvm::Value *InOffset = Builder.CreateSub(IV, Arg_x1);
    892       InOffset = Builder.CreateMul(InOffset, InStep);
    893       InPtr = Builder.CreateInBoundsGEP(InBufPtr, InOffset);
    894       InPtr = Builder.CreatePointerCast(InPtr, InTy);
    895     }
    896 
    897     if (InPtr) {
    898       RootArgs.push_back(InPtr);
    899     }
    900 
    901     if (OutPtr) {
    902       RootArgs.push_back(OutPtr);
    903     }
    904 
    905     if (UsrData) {
    906       RootArgs.push_back(UsrData);
    907     }
    908 
    909     finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);
    910 
    911     Builder.CreateCall(Function, RootArgs);
    912 
    913     return true;
    914   }
    915 
    916   /* Expand a pass-by-value foreach kernel.
    917    */
    918   bool ExpandForEach(llvm::Function *Function, uint32_t Signature) {
    919     bccAssert(bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature));
    920     ALOGV("Expanding kernel Function %s", Function->getName().str().c_str());
    921 
    922     // TODO: Refactor this to share functionality with ExpandOldStyleForEach.
    923     llvm::DataLayout DL(Module);
    924     if (!mStructExplicitlyPaddedBySlang && (Module->getTargetTriple() == DEFAULT_X86_TRIPLE_STRING)) {
    925       DL.reset(X86_CUSTOM_DL_STRING);
    926     }
    927     llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
    928 
    929     llvm::Function *ExpandedFunction =
    930       createEmptyExpandedForEachKernel(Function->getName());
    931 
    932     /*
    933      * Extract the expanded function's parameters.  It is guaranteed by
    934      * createEmptyExpandedForEachKernel that there will be four parameters.
    935      */
    936 
    937     bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
    938 
    939     llvm::Function::arg_iterator ExpandedFunctionArgIter =
    940       ExpandedFunction->arg_begin();
    941 
    942     llvm::Value *Arg_p       = &*(ExpandedFunctionArgIter++);
    943     llvm::Value *Arg_x1      = &*(ExpandedFunctionArgIter++);
    944     llvm::Value *Arg_x2      = &*(ExpandedFunctionArgIter++);
    945     // Arg_outstep is not used by expanded new-style forEach kernels.
    946 
    947     // Construct the actual function body.
    948     llvm::IRBuilder<> Builder(&*ExpandedFunction->getEntryBlock().begin());
    949 
    950     // Create TBAA meta-data.
    951     llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript,
    952                  *TBAAAllocation, *TBAAPointer;
    953     llvm::MDBuilder MDHelper(*Context);
    954 
    955     TBAARenderScriptDistinct =
    956       MDHelper.createTBAARoot(kRenderScriptTBAARootName);
    957     TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
    958         TBAARenderScriptDistinct);
    959     TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
    960                                                        TBAARenderScript);
    961     TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
    962                                                       TBAAAllocation, 0);
    963     TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer",
    964                                                     TBAARenderScript);
    965     TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0);
    966 
    967     /*
    968      * Collect and construct the arguments for the kernel().
    969      *
    970      * Note that we load any loop-invariant arguments before entering the Loop.
    971      */
    972     size_t NumRemainingInputs = Function->arg_size();
    973 
    974     // No usrData parameter on kernels.
    975     bccAssert(
    976         !bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature));
    977 
    978     llvm::Function::arg_iterator ArgIter = Function->arg_begin();
    979 
    980     // Check the return type
    981     llvm::Type     *OutTy            = nullptr;
    982     llvm::LoadInst *OutBasePtr       = nullptr;
    983     llvm::Value    *CastedOutBasePtr = nullptr;
    984 
    985     bool PassOutByPointer = false;
    986 
    987     if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
    988       llvm::Type *OutBaseTy = Function->getReturnType();
    989 
    990       if (OutBaseTy->isVoidTy()) {
    991         PassOutByPointer = true;
    992         OutTy = ArgIter->getType();
    993 
    994         ArgIter++;
    995         --NumRemainingInputs;
    996       } else {
    997         // We don't increment Args, since we are using the actual return type.
    998         OutTy = OutBaseTy->getPointerTo();
    999       }
   1000 
   1001       SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
   1002       OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));
   1003 
   1004       if (gEnableRsTbaa) {
   1005         OutBasePtr->setMetadata("tbaa", TBAAPointer);
   1006       }
   1007 
   1008       if (mStructExplicitlyPaddedBySlang || (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING)) {
   1009         CastedOutBasePtr = Builder.CreatePointerCast(OutBasePtr, OutTy, "casted_out");
   1010       } else {
   1011         // The disagreement between module and x86 target machine datalayout
   1012         // causes mismatched input/output data offset between slang reflected
   1013         // code and bcc codegen for GetElementPtr. To solve this issue, skip the
   1014         // cast to OutTy and leave CastedOutBasePtr as an int8_t*.  The buffer
   1015         // is later indexed with an explicit byte offset computed based on
   1016         // X86_CUSTOM_DL_STRING and then bitcast to actual output type.
   1017         CastedOutBasePtr = OutBasePtr;
   1018       }
   1019     }
   1020 
   1021     llvm::SmallVector<llvm::Type*,  8> InTypes;
   1022     llvm::SmallVector<llvm::Value*, 8> InBufPtrs;
   1023     llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;
   1024 
   1025     bccAssert(NumRemainingInputs <= RS_KERNEL_INPUT_LIMIT);
   1026 
   1027     // Create the loop structure.
   1028     llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
   1029     llvm::Value *IV;
   1030     createLoop(Builder, Arg_x1, Arg_x2, &IV);
   1031 
   1032     llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
   1033     const int CalleeArgsContextIdx =
   1034       ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
   1035                              [&NumRemainingInputs]() { --NumRemainingInputs; },
   1036                              LoopHeader->getTerminator());
   1037 
   1038     // After ExpandSpecialArguments() gets called, NumRemainingInputs
   1039     // counts the number of arguments to the kernel that correspond to
   1040     // an array entry from the InPtr field of the DriverInfo
   1041     // structure.
   1042     const size_t NumInPtrArguments = NumRemainingInputs;
   1043 
   1044     if (NumInPtrArguments > 0) {
   1045       ExpandInputsLoopInvariant(Builder, LoopHeader, Arg_p, TBAAPointer, ArgIter, NumInPtrArguments,
   1046                                 InTypes, InBufPtrs, InStructTempSlots);
   1047     }
   1048 
   1049     // Populate the actual call to kernel().
   1050     llvm::SmallVector<llvm::Value*, 8> RootArgs;
   1051 
   1052     // Calculate the current input and output pointers.
   1053 
   1054     // Output
   1055 
   1056     llvm::Value *OutPtr = nullptr;
   1057     if (CastedOutBasePtr) {
   1058       llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
   1059 
   1060       if (mStructExplicitlyPaddedBySlang || (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING)) {
   1061         OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffset);
   1062       } else {
   1063         // Treat x86 output buffer as byte[], get indexed pointer with explicit
   1064         // byte offset computed using a datalayout based on
   1065         // X86_CUSTOM_DL_STRING, then bitcast it to actual output type.
   1066         uint64_t OutStep = DL.getTypeAllocSize(OutTy->getPointerElementType());
   1067         llvm::Value *OutOffsetInBytes = Builder.CreateMul(OutOffset, llvm::ConstantInt::get(Int32Ty, OutStep));
   1068         OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffsetInBytes);
   1069         OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
   1070       }
   1071 
   1072       if (PassOutByPointer) {
   1073         RootArgs.push_back(OutPtr);
   1074       }
   1075     }
   1076 
   1077     // Inputs
   1078 
   1079     if (NumInPtrArguments > 0) {
   1080       ExpandInputsBody(Builder, Arg_x1, TBAAAllocation, NumInPtrArguments,
   1081                        InTypes, InBufPtrs, InStructTempSlots, IV, RootArgs);
   1082     }
   1083 
   1084     finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);
   1085 
   1086     llvm::Value *RetVal = Builder.CreateCall(Function, RootArgs);
   1087 
   1088     if (OutPtr && !PassOutByPointer) {
   1089       RetVal->setName("call.result");
   1090       llvm::StoreInst *Store = Builder.CreateStore(RetVal, OutPtr);
   1091       if (gEnableRsTbaa) {
   1092         Store->setMetadata("tbaa", TBAAAllocation);
   1093       }
   1094     }
   1095 
   1096     return true;
   1097   }
   1098 
   1099   // Certain categories of functions that make up a general
   1100   // reduce-style kernel are called directly from the driver with no
   1101   // expansion needed.  For a function in such a category, we need to
   1102   // promote linkage from static to external, to ensure that the
   1103   // function is visible to the driver in the dynamic symbol table.
   1104   // This promotion is safe because we don't have any kind of cross
   1105   // translation unit linkage model (except for linking against
   1106   // RenderScript libraries), so we do not risk name clashes.
   1107   bool PromoteReduceFunction(const char *Name, FunctionSet &PromotedFunctions) {
   1108     if (!Name)  // a presumably-optional function that is not present
   1109       return false;
   1110 
   1111     llvm::Function *Fn = Module->getFunction(Name);
   1112     bccAssert(Fn != nullptr);
   1113     if (PromotedFunctions.insert(Fn).second) {
   1114       bccAssert(Fn->getLinkage() == llvm::GlobalValue::InternalLinkage);
   1115       Fn->setLinkage(llvm::GlobalValue::ExternalLinkage);
   1116       return true;
   1117     }
   1118 
   1119     return false;
   1120   }
   1121 
   1122   // Expand the accumulator function for a general reduce-style kernel.
   1123   //
   1124   // The input is a function of the form
   1125   //
   1126   //   define void @func(accumType* %accum, foo1 in1[, ... fooN inN] [, special arguments])
   1127   //
   1128   // where all arguments except the first are the same as for a foreach kernel.
   1129   //
   1130   // The input accumulator function gets expanded into a function of the form
   1131   //
   1132   //   define void @func.expand(%RsExpandKernelDriverInfoPfx* %p, i32 %x1, i32 %x2, accumType* %accum)
   1133   //
   1134   // which performs a serial accumulation of elements [x1, x2) into *%accum.
   1135   //
   1136   // In pseudocode, @func.expand does:
   1137   //
   1138   //   for (i = %x1; i < %x2; ++i) {
   1139   //     func(%accum,
   1140   //          *((foo1 *)p->inPtr[0] + i)[, ... *((fooN *)p->inPtr[N-1] + i)]
   1141   //          [, p] [, i] [, p->current.y] [, p->current.z]);
   1142   //   }
   1143   //
   1144   // This is very similar to foreach kernel expansion with no output.
   1145   bool ExpandReduceAccumulator(llvm::Function *FnAccumulator, uint32_t Signature, size_t NumInputs) {
   1146     ALOGV("Expanding accumulator %s for general reduce kernel",
   1147           FnAccumulator->getName().str().c_str());
   1148 
   1149     // Create TBAA meta-data.
   1150     llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript,
   1151                  *TBAAAllocation, *TBAAPointer;
   1152     llvm::MDBuilder MDHelper(*Context);
   1153     TBAARenderScriptDistinct =
   1154       MDHelper.createTBAARoot(kRenderScriptTBAARootName);
   1155     TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
   1156         TBAARenderScriptDistinct);
   1157     TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
   1158                                                        TBAARenderScript);
   1159     TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
   1160                                                       TBAAAllocation, 0);
   1161     TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer",
   1162                                                     TBAARenderScript);
   1163     TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0);
   1164 
   1165     auto AccumulatorArgIter = FnAccumulator->arg_begin();
   1166 
   1167     // Create empty accumulator function.
   1168     llvm::Function *FnExpandedAccumulator =
   1169         createEmptyExpandedReduceAccumulator(FnAccumulator->getName(),
   1170                                              (AccumulatorArgIter++)->getType());
   1171 
   1172     // Extract the expanded accumulator's parameters.  It is
   1173     // guaranteed by createEmptyExpandedReduceAccumulator that
   1174     // there will be 4 parameters.
   1175     bccAssert(FnExpandedAccumulator->arg_size() == kNumExpandedReduceAccumulatorParams);
   1176     auto ExpandedAccumulatorArgIter = FnExpandedAccumulator->arg_begin();
   1177     llvm::Value *Arg_p     = &*(ExpandedAccumulatorArgIter++);
   1178     llvm::Value *Arg_x1    = &*(ExpandedAccumulatorArgIter++);
   1179     llvm::Value *Arg_x2    = &*(ExpandedAccumulatorArgIter++);
   1180     llvm::Value *Arg_accum = &*(ExpandedAccumulatorArgIter++);
   1181 
   1182     // Construct the actual function body.
   1183     llvm::IRBuilder<> Builder(&*FnExpandedAccumulator->getEntryBlock().begin());
   1184 
   1185     // Create the loop structure.
   1186     llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
   1187     llvm::Value *IndVar;
   1188     createLoop(Builder, Arg_x1, Arg_x2, &IndVar);
   1189 
   1190     llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
   1191     const int CalleeArgsContextIdx =
   1192         ExpandSpecialArguments(Signature, IndVar, Arg_p, Builder, CalleeArgs,
   1193                                [](){}, LoopHeader->getTerminator());
   1194 
   1195     llvm::SmallVector<llvm::Type*,  8> InTypes;
   1196     llvm::SmallVector<llvm::Value*, 8> InBufPtrs;
   1197     llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;
   1198     ExpandInputsLoopInvariant(Builder, LoopHeader, Arg_p, TBAAPointer, AccumulatorArgIter, NumInputs,
   1199                               InTypes, InBufPtrs, InStructTempSlots);
   1200 
   1201     // Populate the actual call to the original accumulator.
   1202     llvm::SmallVector<llvm::Value*, 8> RootArgs;
   1203     RootArgs.push_back(Arg_accum);
   1204     ExpandInputsBody(Builder, Arg_x1, TBAAAllocation, NumInputs, InTypes, InBufPtrs, InStructTempSlots,
   1205                      IndVar, RootArgs);
   1206     finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *FnAccumulator, Builder);
   1207     Builder.CreateCall(FnAccumulator, RootArgs);
   1208 
   1209     return true;
   1210   }
   1211 
   1212   // Create a combiner function for a general reduce-style kernel that lacks one,
   1213   // by calling the accumulator function.
   1214   //
   1215   // The accumulator function must be of the form
   1216   //
   1217   //   define void @accumFn(accumType* %accum, accumType %in)
   1218   //
   1219   // A combiner function will be generated of the form
   1220   //
   1221   //   define void @accumFn.combiner(accumType* %accum, accumType* %other) {
   1222   //     %1 = load accumType, accumType* %other
   1223   //     call void @accumFn(accumType* %accum, accumType %1);
   1224   //   }
   1225   bool CreateReduceCombinerFromAccumulator(llvm::Function *FnAccumulator) {
   1226     ALOGV("Creating combiner from accumulator %s for general reduce kernel",
   1227           FnAccumulator->getName().str().c_str());
   1228 
   1229     using llvm::Attribute;
   1230 
   1231     bccAssert(FnAccumulator->arg_size() == 2);
   1232     auto AccumulatorArgIter = FnAccumulator->arg_begin();
   1233     llvm::Value *AccumulatorArg_accum = &*(AccumulatorArgIter++);
   1234     llvm::Value *AccumulatorArg_in    = &*(AccumulatorArgIter++);
   1235     llvm::Type *AccumulatorArgType = AccumulatorArg_accum->getType();
   1236     bccAssert(AccumulatorArgType->isPointerTy());
   1237 
   1238     llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);
   1239     llvm::FunctionType *CombinerType =
   1240         llvm::FunctionType::get(VoidTy, { AccumulatorArgType, AccumulatorArgType }, false);
   1241     llvm::Function *FnCombiner =
   1242         llvm::Function::Create(CombinerType, llvm::GlobalValue::ExternalLinkage,
   1243                                nameReduceCombinerFromAccumulator(FnAccumulator->getName()),
   1244                                Module);
   1245 
   1246     auto CombinerArgIter = FnCombiner->arg_begin();
   1247 
   1248     llvm::Argument *CombinerArg_accum = &(*CombinerArgIter++);
   1249     CombinerArg_accum->setName("accum");
   1250     CombinerArg_accum->addAttr(llvm::AttributeSet::get(*Context, CombinerArg_accum->getArgNo() + 1,
   1251                                                        llvm::makeArrayRef(Attribute::NoCapture)));
   1252 
   1253     llvm::Argument *CombinerArg_other = &(*CombinerArgIter++);
   1254     CombinerArg_other->setName("other");
   1255     CombinerArg_other->addAttr(llvm::AttributeSet::get(*Context, CombinerArg_other->getArgNo() + 1,
   1256                                                        llvm::makeArrayRef(Attribute::NoCapture)));
   1257 
   1258     llvm::BasicBlock *BB = llvm::BasicBlock::Create(*Context, "BB", FnCombiner);
   1259     llvm::IRBuilder<> Builder(BB);
   1260 
   1261     if (AccumulatorArg_in->getType()->isPointerTy()) {
   1262       // Types of sufficient size get passed by pointer-to-copy rather
   1263       // than passed by value.  An accumulator cannot take a pointer
   1264       // at the user level; so if we see a pointer here, we know that
   1265       // we have a pass-by-pointer-to-copy case.
   1266       llvm::Type *ElementType = AccumulatorArg_in->getType()->getPointerElementType();
   1267       llvm::Value *TempMem = Builder.CreateAlloca(ElementType, nullptr, "caller_copy");
   1268       Builder.CreateStore(Builder.CreateLoad(CombinerArg_other), TempMem);
   1269       Builder.CreateCall(FnAccumulator, { CombinerArg_accum, TempMem });
   1270     } else {
   1271       llvm::Value *TypeAdjustedOther = CombinerArg_other;
   1272       if (AccumulatorArgType->getPointerElementType() != AccumulatorArg_in->getType()) {
   1273         // Call lowering by frontend has done some type coercion
   1274         TypeAdjustedOther = Builder.CreatePointerCast(CombinerArg_other,
   1275                                                       AccumulatorArg_in->getType()->getPointerTo(),
   1276                                                       "cast");
   1277       }
   1278       llvm::Value *DerefOther = Builder.CreateLoad(TypeAdjustedOther);
   1279       Builder.CreateCall(FnAccumulator, { CombinerArg_accum, DerefOther });
   1280     }
   1281     Builder.CreateRetVoid();
   1282 
   1283     return true;
   1284   }
   1285 
   1286   /// @brief Checks if pointers to allocation internals are exposed
   1287   ///
   1288   /// This function verifies if through the parameters passed to the kernel
   1289   /// or through calls to the runtime library the script gains access to
   1290   /// pointers pointing to data within a RenderScript Allocation.
   1291   /// If we know we control all loads from and stores to data within
   1292   /// RenderScript allocations and if we know the run-time internal accesses
   1293   /// are all annotated with RenderScript TBAA metadata, only then we
   1294   /// can safely use TBAA to distinguish between generic and from-allocation
   1295   /// pointers.
   1296   bool allocPointersExposed(llvm::Module &Module) {
   1297     // Old style kernel function can expose pointers to elements within
   1298     // allocations.
   1299     // TODO: Extend analysis to allow simple cases of old-style kernels.
   1300     for (size_t i = 0; i < mExportForEachCount; ++i) {
   1301       const char *Name = mExportForEachNameList[i];
   1302       uint32_t Signature = mExportForEachSignatureList[i];
   1303       if (Module.getFunction(Name) &&
   1304           !bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature)) {
   1305         return true;
   1306       }
   1307     }
   1308 
   1309     // Check for library functions that expose a pointer to an Allocation or
   1310     // that are not yet annotated with RenderScript-specific tbaa information.
   1311     static const std::vector<const char *> Funcs{
   1312       // rsGetElementAt(...)
   1313       "_Z14rsGetElementAt13rs_allocationj",
   1314       "_Z14rsGetElementAt13rs_allocationjj",
   1315       "_Z14rsGetElementAt13rs_allocationjjj",
   1316 
   1317       // rsSetElementAt()
   1318       "_Z14rsSetElementAt13rs_allocationPvj",
   1319       "_Z14rsSetElementAt13rs_allocationPvjj",
   1320       "_Z14rsSetElementAt13rs_allocationPvjjj",
   1321 
   1322       // rsGetElementAtYuv_uchar_Y()
   1323       "_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj",
   1324 
   1325       // rsGetElementAtYuv_uchar_U()
   1326       "_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj",
   1327 
   1328       // rsGetElementAtYuv_uchar_V()
   1329       "_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj",
   1330     };
   1331 
   1332     for (auto FI : Funcs) {
   1333       llvm::Function *Function = Module.getFunction(FI);
   1334 
   1335       if (!Function) {
   1336         ALOGE("Missing run-time function '%s'", FI);
   1337         return true;
   1338       }
   1339 
   1340       if (Function->getNumUses() > 0) {
   1341         return true;
   1342       }
   1343     }
   1344 
   1345     return false;
   1346   }
   1347 
   1348   /// @brief Connect RenderScript TBAA metadata to C/C++ metadata
   1349   ///
   1350   /// The TBAA metadata used to annotate loads/stores from RenderScript
   1351   /// Allocations is generated in a separate TBAA tree with a
   1352   /// "RenderScript Distinct TBAA" root node. LLVM does assume may-alias for
   1353   /// all nodes in unrelated alias analysis trees. This function makes the
   1354   /// "RenderScript TBAA" node (which is parented by the Distinct TBAA root),
   1355   /// a subtree of the normal C/C++ TBAA tree aside of normal C/C++ types. With
   1356   /// the connected trees every access to an Allocation is resolved to
   1357   /// must-alias if compared to a normal C/C++ access.
   1358   void connectRenderScriptTBAAMetadata(llvm::Module &Module) {
   1359     llvm::MDBuilder MDHelper(*Context);
   1360     llvm::MDNode *TBAARenderScriptDistinct =
   1361       MDHelper.createTBAARoot("RenderScript Distinct TBAA");
   1362     llvm::MDNode *TBAARenderScript = MDHelper.createTBAANode(
   1363         "RenderScript TBAA", TBAARenderScriptDistinct);
   1364     llvm::MDNode *TBAARoot     = MDHelper.createTBAARoot("Simple C/C++ TBAA");
   1365     TBAARenderScript->replaceOperandWith(1, TBAARoot);
   1366   }
   1367 
   1368   virtual bool runOnModule(llvm::Module &Module) {
   1369     bool Changed  = false;
   1370     this->Module  = &Module;
   1371     Context = &Module.getContext();
   1372 
   1373     buildTypes();
   1374 
   1375     bcinfo::MetadataExtractor me(&Module);
   1376     if (!me.extract()) {
   1377       ALOGE("Could not extract metadata from module!");
   1378       return false;
   1379     }
   1380 
   1381     mStructExplicitlyPaddedBySlang = (me.getCompilerVersion() >= SlangVersion::N_STRUCT_EXPLICIT_PADDING);
   1382 
   1383     // Expand forEach_* style kernels.
   1384     mExportForEachCount = me.getExportForEachSignatureCount();
   1385     mExportForEachNameList = me.getExportForEachNameList();
   1386     mExportForEachSignatureList = me.getExportForEachSignatureList();
   1387 
   1388     for (size_t i = 0; i < mExportForEachCount; ++i) {
   1389       const char *name = mExportForEachNameList[i];
   1390       uint32_t signature = mExportForEachSignatureList[i];
   1391       llvm::Function *kernel = Module.getFunction(name);
   1392       if (kernel) {
   1393         if (bcinfo::MetadataExtractor::hasForEachSignatureKernel(signature)) {
   1394           Changed |= ExpandForEach(kernel, signature);
   1395           kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
   1396         } else if (kernel->getReturnType()->isVoidTy()) {
   1397           Changed |= ExpandOldStyleForEach(kernel, signature);
   1398           kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
   1399         } else {
   1400           // There are some graphics root functions that are not
   1401           // expanded, but that will be called directly. For those
   1402           // functions, we can not set the linkage to internal.
   1403         }
   1404       }
   1405     }
   1406 
   1407     // Process general reduce_* style functions.
   1408     const size_t ExportReduceCount = me.getExportReduceCount();
   1409     const bcinfo::MetadataExtractor::Reduce *ExportReduceList = me.getExportReduceList();
   1410     //   Note that functions can be shared between kernels
   1411     FunctionSet PromotedFunctions, ExpandedAccumulators, AccumulatorsForCombiners;
   1412 
   1413     for (size_t i = 0; i < ExportReduceCount; ++i) {
   1414       Changed |= PromoteReduceFunction(ExportReduceList[i].mInitializerName, PromotedFunctions);
   1415       Changed |= PromoteReduceFunction(ExportReduceList[i].mCombinerName, PromotedFunctions);
   1416       Changed |= PromoteReduceFunction(ExportReduceList[i].mOutConverterName, PromotedFunctions);
   1417 
   1418       // Accumulator
   1419       llvm::Function *accumulator = Module.getFunction(ExportReduceList[i].mAccumulatorName);
   1420       bccAssert(accumulator != nullptr);
   1421       if (ExpandedAccumulators.insert(accumulator).second)
   1422         Changed |= ExpandReduceAccumulator(accumulator,
   1423                                            ExportReduceList[i].mSignature,
   1424                                            ExportReduceList[i].mInputCount);
   1425       if (!ExportReduceList[i].mCombinerName) {
   1426         if (AccumulatorsForCombiners.insert(accumulator).second)
   1427           Changed |= CreateReduceCombinerFromAccumulator(accumulator);
   1428       }
   1429     }
   1430 
   1431     if (gEnableRsTbaa && !allocPointersExposed(Module)) {
   1432       connectRenderScriptTBAAMetadata(Module);
   1433     }
   1434 
   1435     return Changed;
   1436   }
   1437 
   1438   virtual const char *getPassName() const {
   1439     return "forEach_* and reduce_* function expansion";
   1440   }
   1441 
   1442 }; // end RSKernelExpandPass
   1443 
   1444 } // end anonymous namespace
   1445 
   1446 char RSKernelExpandPass::ID = 0;
   1447 static llvm::RegisterPass<RSKernelExpandPass> X("kernelexp", "Kernel Expand Pass");
   1448 
   1449 namespace bcc {
   1450 
   1451 const char BCC_INDEX_VAR_NAME[] = "rsIndex";
   1452 
   1453 llvm::ModulePass *
   1454 createRSKernelExpandPass(bool pEnableStepOpt) {
   1455   return new RSKernelExpandPass(pEnableStepOpt);
   1456 }
   1457 
   1458 } // end namespace bcc
   1459