Home | History | Annotate | Download | only in optimizing
      1 /*
      2  * Copyright (C) 2016 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
     18 #define ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
     19 
     20 #include "base/scoped_arena_allocator.h"
     21 #include "base/scoped_arena_containers.h"
     22 #include "induction_var_range.h"
     23 #include "nodes.h"
     24 #include "optimization.h"
     25 
     26 namespace art {
     27 
     28 class CompilerDriver;
     29 
     30 /**
     31  * Loop optimizations. Builds a loop hierarchy and applies optimizations to
     32  * the detected nested loops, such as removal of dead induction and empty loops
     33  * and inner loop vectorization.
     34  */
     35 class HLoopOptimization : public HOptimization {
     36  public:
     37   HLoopOptimization(HGraph* graph,
     38                     CompilerDriver* compiler_driver,
     39                     HInductionVarAnalysis* induction_analysis,
     40                     OptimizingCompilerStats* stats,
     41                     const char* name = kLoopOptimizationPassName);
     42 
     43   void Run() OVERRIDE;
     44 
     45   static constexpr const char* kLoopOptimizationPassName = "loop_optimization";
     46 
     47  private:
     48   /**
     49    * A single loop inside the loop hierarchy representation.
     50    */
     51   struct LoopNode : public ArenaObject<kArenaAllocLoopOptimization> {
     52     explicit LoopNode(HLoopInformation* lp_info)
     53         : loop_info(lp_info),
     54           outer(nullptr),
     55           inner(nullptr),
     56           previous(nullptr),
     57           next(nullptr) {}
     58     HLoopInformation* loop_info;
     59     LoopNode* outer;
     60     LoopNode* inner;
     61     LoopNode* previous;
     62     LoopNode* next;
     63   };
     64 
     65   /*
     66    * Vectorization restrictions (bit mask).
     67    */
     68   enum VectorRestrictions {
     69     kNone            = 0,        // no restrictions
     70     kNoMul           = 1 << 0,   // no multiplication
     71     kNoDiv           = 1 << 1,   // no division
     72     kNoShift         = 1 << 2,   // no shift
     73     kNoShr           = 1 << 3,   // no arithmetic shift right
     74     kNoHiBits        = 1 << 4,   // "wider" operations cannot bring in higher order bits
     75     kNoSignedHAdd    = 1 << 5,   // no signed halving add
     76     kNoUnroundedHAdd = 1 << 6,   // no unrounded halving add
     77     kNoAbs           = 1 << 7,   // no absolute value
     78     kNoMinMax        = 1 << 8,   // no min/max
     79     kNoStringCharAt  = 1 << 9,   // no StringCharAt
     80     kNoReduction     = 1 << 10,  // no reduction
     81     kNoSAD           = 1 << 11,  // no sum of absolute differences (SAD)
     82     kNoWideSAD       = 1 << 12,  // no sum of absolute differences (SAD) with operand widening
     83   };
     84 
     85   /*
     86    * Vectorization mode during synthesis
     87    * (sequential peeling/cleanup loop or vector loop).
     88    */
     89   enum VectorMode {
     90     kSequential,
     91     kVector
     92   };
     93 
     94   /*
     95    * Representation of a unit-stride array reference.
     96    */
     97   struct ArrayReference {
     98     ArrayReference(HInstruction* b, HInstruction* o, DataType::Type t, bool l, bool c = false)
     99         : base(b), offset(o), type(t), lhs(l), is_string_char_at(c) { }
    100     bool operator<(const ArrayReference& other) const {
    101       return
    102           (base < other.base) ||
    103           (base == other.base &&
    104            (offset < other.offset || (offset == other.offset &&
    105                                       (type < other.type ||
    106                                        (type == other.type &&
    107                                         (lhs < other.lhs ||
    108                                          (lhs == other.lhs &&
    109                                           is_string_char_at < other.is_string_char_at)))))));
    110     }
    111     HInstruction* base;      // base address
    112     HInstruction* offset;    // offset + i
    113     DataType::Type type;     // component type
    114     bool lhs;                // def/use
    115     bool is_string_char_at;  // compressed string read
    116   };
    117 
    118   //
    119   // Loop setup and traversal.
    120   //
    121 
    122   void LocalRun();
    123   void AddLoop(HLoopInformation* loop_info);
    124   void RemoveLoop(LoopNode* node);
    125 
    126   // Traverses all loops inner to outer to perform simplifications and optimizations.
    127   // Returns true if loops nested inside current loop (node) have changed.
    128   bool TraverseLoopsInnerToOuter(LoopNode* node);
    129 
    130   //
    131   // Optimization.
    132   //
    133 
    134   void SimplifyInduction(LoopNode* node);
    135   void SimplifyBlocks(LoopNode* node);
    136 
    137   // Performs optimizations specific to inner loop (empty loop removal,
    138   // unrolling, vectorization). Returns true if anything changed.
    139   bool OptimizeInnerLoop(LoopNode* node);
    140 
    141   //
    142   // Vectorization analysis and synthesis.
    143   //
    144 
    145   bool ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count);
    146   void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count);
    147   void GenerateNewLoop(LoopNode* node,
    148                        HBasicBlock* block,
    149                        HBasicBlock* new_preheader,
    150                        HInstruction* lo,
    151                        HInstruction* hi,
    152                        HInstruction* step,
    153                        uint32_t unroll);
    154   bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code);
    155   bool VectorizeUse(LoopNode* node,
    156                     HInstruction* instruction,
    157                     bool generate_code,
    158                     DataType::Type type,
    159                     uint64_t restrictions);
    160   uint32_t GetVectorSizeInBytes();
    161   bool TrySetVectorType(DataType::Type type, /*out*/ uint64_t* restrictions);
    162   bool TrySetVectorLength(uint32_t length);
    163   void GenerateVecInv(HInstruction* org, DataType::Type type);
    164   void GenerateVecSub(HInstruction* org, HInstruction* offset);
    165   void GenerateVecMem(HInstruction* org,
    166                       HInstruction* opa,
    167                       HInstruction* opb,
    168                       HInstruction* offset,
    169                       DataType::Type type);
    170   void GenerateVecReductionPhi(HPhi* phi);
    171   void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction);
    172   HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction);
    173   void GenerateVecOp(HInstruction* org,
    174                      HInstruction* opa,
    175                      HInstruction* opb,
    176                      DataType::Type type,
    177                      bool is_unsigned = false);
    178 
    179   // Vectorization idioms.
    180   bool VectorizeHalvingAddIdiom(LoopNode* node,
    181                                 HInstruction* instruction,
    182                                 bool generate_code,
    183                                 DataType::Type type,
    184                                 uint64_t restrictions);
    185   bool VectorizeSADIdiom(LoopNode* node,
    186                          HInstruction* instruction,
    187                          bool generate_code,
    188                          DataType::Type type,
    189                          uint64_t restrictions);
    190 
    191   // Vectorization heuristics.
    192   Alignment ComputeAlignment(HInstruction* offset,
    193                              DataType::Type type,
    194                              bool is_string_char_at,
    195                              uint32_t peeling = 0);
    196   void SetAlignmentStrategy(uint32_t peeling_votes[],
    197                             const ArrayReference* peeling_candidate);
    198   uint32_t MaxNumberPeeled();
    199   bool IsVectorizationProfitable(int64_t trip_count);
    200   uint32_t GetUnrollingFactor(HBasicBlock* block, int64_t trip_count);
    201 
    202   //
    203   // Helpers.
    204   //
    205 
    206   bool TrySetPhiInduction(HPhi* phi, bool restrict_uses);
    207   bool TrySetPhiReduction(HPhi* phi);
    208 
    209   // Detects loop header with a single induction (returned in main_phi), possibly
    210   // other phis for reductions, but no other side effects. Returns true on success.
    211   bool TrySetSimpleLoopHeader(HBasicBlock* block, /*out*/ HPhi** main_phi);
    212 
    213   bool IsEmptyBody(HBasicBlock* block);
    214   bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info,
    215                            HInstruction* instruction,
    216                            bool collect_loop_uses,
    217                            /*out*/ uint32_t* use_count);
    218   bool IsUsedOutsideLoop(HLoopInformation* loop_info,
    219                          HInstruction* instruction);
    220   bool TryReplaceWithLastValue(HLoopInformation* loop_info,
    221                                HInstruction* instruction,
    222                                HBasicBlock* block);
    223   bool TryAssignLastValue(HLoopInformation* loop_info,
    224                           HInstruction* instruction,
    225                           HBasicBlock* block,
    226                           bool collect_loop_uses);
    227   void RemoveDeadInstructions(const HInstructionList& list);
    228   bool CanRemoveCycle();  // Whether the current 'iset_' is removable.
    229 
    230   // Compiler driver (to query ISA features).
    231   const CompilerDriver* compiler_driver_;
    232 
    233   // Range information based on prior induction variable analysis.
    234   InductionVarRange induction_range_;
    235 
    236   // Phase-local heap memory allocator for the loop optimizer. Storage obtained
    237   // through this allocator is immediately released when the loop optimizer is done.
    238   ScopedArenaAllocator* loop_allocator_;
    239 
    240   // Global heap memory allocator. Used to build HIR.
    241   ArenaAllocator* global_allocator_;
    242 
    243   // Entries into the loop hierarchy representation. The hierarchy resides
    244   // in phase-local heap memory.
    245   LoopNode* top_loop_;
    246   LoopNode* last_loop_;
    247 
    248   // Temporary bookkeeping of a set of instructions.
    249   // Contents reside in phase-local heap memory.
    250   ScopedArenaSet<HInstruction*>* iset_;
    251 
    252   // Temporary bookkeeping of reduction instructions. Mapping is two-fold:
    253   // (1) reductions in the loop-body are mapped back to their phi definition,
    254   // (2) phi definitions are mapped to their initial value (updated during
    255   //     code generation to feed the proper values into the new chain).
    256   // Contents reside in phase-local heap memory.
    257   ScopedArenaSafeMap<HInstruction*, HInstruction*>* reductions_;
    258 
    259   // Flag that tracks if any simplifications have occurred.
    260   bool simplified_;
    261 
    262   // Number of "lanes" for selected packed type.
    263   uint32_t vector_length_;
    264 
    265   // Set of array references in the vector loop.
    266   // Contents reside in phase-local heap memory.
    267   ScopedArenaSet<ArrayReference>* vector_refs_;
    268 
    269   // Static or dynamic loop peeling for alignment.
    270   uint32_t vector_static_peeling_factor_;
    271   const ArrayReference* vector_dynamic_peeling_candidate_;
    272 
    273   // Dynamic data dependence test of the form a != b.
    274   HInstruction* vector_runtime_test_a_;
    275   HInstruction* vector_runtime_test_b_;
    276 
    277   // Mapping used during vectorization synthesis for both the scalar peeling/cleanup
    278   // loop (mode is kSequential) and the actual vector loop (mode is kVector). The data
    279   // structure maps original instructions into the new instructions.
    280   // Contents reside in phase-local heap memory.
    281   ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_map_;
    282 
    283   // Permanent mapping used during vectorization synthesis.
    284   // Contents reside in phase-local heap memory.
    285   ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_permanent_map_;
    286 
    287   // Temporary vectorization bookkeeping.
    288   VectorMode vector_mode_;  // synthesis mode
    289   HBasicBlock* vector_preheader_;  // preheader of the new loop
    290   HBasicBlock* vector_header_;  // header of the new loop
    291   HBasicBlock* vector_body_;  // body of the new loop
    292   HInstruction* vector_index_;  // normalized index of the new loop
    293 
    294   friend class LoopOptimizationTest;
    295 
    296   DISALLOW_COPY_AND_ASSIGN(HLoopOptimization);
    297 };
    298 
    299 }  // namespace art
    300 
    301 #endif  // ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
    302