Home | History | Annotate | Download | only in optimizing
      1 /*
      2  * Copyright (C) 2016 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
     18 #define ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
     19 
     20 #include "base/scoped_arena_allocator.h"
     21 #include "base/scoped_arena_containers.h"
     22 #include "induction_var_range.h"
     23 #include "loop_analysis.h"
     24 #include "nodes.h"
     25 #include "optimization.h"
     26 #include "superblock_cloner.h"
     27 
     28 namespace art {
     29 
     30 class CompilerOptions;
     31 class ArchNoOptsLoopHelper;
     32 
     33 /**
     34  * Loop optimizations. Builds a loop hierarchy and applies optimizations to
     35  * the detected nested loops, such as removal of dead induction and empty loops
     36  * and inner loop vectorization.
     37  */
     38 class HLoopOptimization : public HOptimization {
     39  public:
     40   HLoopOptimization(HGraph* graph,
     41                     const CompilerOptions* compiler_options,
     42                     HInductionVarAnalysis* induction_analysis,
     43                     OptimizingCompilerStats* stats,
     44                     const char* name = kLoopOptimizationPassName);
     45 
     46   bool Run() override;
     47 
     48   static constexpr const char* kLoopOptimizationPassName = "loop_optimization";
     49 
     50  private:
     51   /**
     52    * A single loop inside the loop hierarchy representation.
     53    */
     54   struct LoopNode : public ArenaObject<kArenaAllocLoopOptimization> {
     55     explicit LoopNode(HLoopInformation* lp_info)
     56         : loop_info(lp_info),
     57           outer(nullptr),
     58           inner(nullptr),
     59           previous(nullptr),
     60           next(nullptr) {}
     61     HLoopInformation* loop_info;
     62     LoopNode* outer;
     63     LoopNode* inner;
     64     LoopNode* previous;
     65     LoopNode* next;
     66   };
     67 
     68   /*
     69    * Vectorization restrictions (bit mask).
     70    */
     71   enum VectorRestrictions {
     72     kNone            = 0,        // no restrictions
     73     kNoMul           = 1 << 0,   // no multiplication
     74     kNoDiv           = 1 << 1,   // no division
     75     kNoShift         = 1 << 2,   // no shift
     76     kNoShr           = 1 << 3,   // no arithmetic shift right
     77     kNoHiBits        = 1 << 4,   // "wider" operations cannot bring in higher order bits
     78     kNoSignedHAdd    = 1 << 5,   // no signed halving add
     79     kNoUnroundedHAdd = 1 << 6,   // no unrounded halving add
     80     kNoAbs           = 1 << 7,   // no absolute value
     81     kNoStringCharAt  = 1 << 8,   // no StringCharAt
     82     kNoReduction     = 1 << 9,   // no reduction
     83     kNoSAD           = 1 << 10,  // no sum of absolute differences (SAD)
     84     kNoWideSAD       = 1 << 11,  // no sum of absolute differences (SAD) with operand widening
     85     kNoDotProd       = 1 << 12,  // no dot product
     86   };
     87 
     88   /*
     89    * Vectorization mode during synthesis
     90    * (sequential peeling/cleanup loop or vector loop).
     91    */
     92   enum VectorMode {
     93     kSequential,
     94     kVector
     95   };
     96 
     97   /*
     98    * Representation of a unit-stride array reference.
     99    */
    100   struct ArrayReference {
    101     ArrayReference(HInstruction* b, HInstruction* o, DataType::Type t, bool l, bool c = false)
    102         : base(b), offset(o), type(t), lhs(l), is_string_char_at(c) { }
    103     bool operator<(const ArrayReference& other) const {
    104       return
    105           (base < other.base) ||
    106           (base == other.base &&
    107            (offset < other.offset || (offset == other.offset &&
    108                                       (type < other.type ||
    109                                        (type == other.type &&
    110                                         (lhs < other.lhs ||
    111                                          (lhs == other.lhs &&
    112                                           is_string_char_at < other.is_string_char_at)))))));
    113     }
    114     HInstruction* base;      // base address
    115     HInstruction* offset;    // offset + i
    116     DataType::Type type;     // component type
    117     bool lhs;                // def/use
    118     bool is_string_char_at;  // compressed string read
    119   };
    120 
    121   //
    122   // Loop setup and traversal.
    123   //
    124 
    125   bool LocalRun();
    126   void AddLoop(HLoopInformation* loop_info);
    127   void RemoveLoop(LoopNode* node);
    128 
    129   // Traverses all loops inner to outer to perform simplifications and optimizations.
    130   // Returns true if loops nested inside current loop (node) have changed.
    131   bool TraverseLoopsInnerToOuter(LoopNode* node);
    132 
    133   //
    134   // Optimization.
    135   //
    136 
    137   void SimplifyInduction(LoopNode* node);
    138   void SimplifyBlocks(LoopNode* node);
    139 
    140   // Performs optimizations specific to inner loop with finite header logic (empty loop removal,
    141   // unrolling, vectorization). Returns true if anything changed.
    142   bool TryOptimizeInnerLoopFinite(LoopNode* node);
    143 
    144   // Performs optimizations specific to inner loop. Returns true if anything changed.
    145   bool OptimizeInnerLoop(LoopNode* node);
    146 
    147   // Tries to apply loop unrolling for branch penalty reduction and better instruction scheduling
    148   // opportunities. Returns whether transformation happened. 'generate_code' determines whether the
    149   // optimization should be actually applied.
    150   bool TryUnrollingForBranchPenaltyReduction(LoopAnalysisInfo* analysis_info,
    151                                              bool generate_code = true);
    152 
    153   // Tries to apply loop peeling for loop invariant exits elimination. Returns whether
    154   // transformation happened. 'generate_code' determines whether the optimization should be
    155   // actually applied.
    156   bool TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info,
    157                                                   bool generate_code = true);
    158 
    159   // Tries to perform whole loop unrolling for a small loop with a small trip count to eliminate
    160   // the loop check overhead and to have more opportunities for inter-iteration optimizations.
    161   // Returns whether transformation happened. 'generate_code' determines whether the optimization
    162   // should be actually applied.
    163   bool TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code = true);
    164 
    165   // Tries to apply scalar loop peeling and unrolling.
    166   bool TryPeelingAndUnrolling(LoopNode* node);
    167 
    168   //
    169   // Vectorization analysis and synthesis.
    170   //
    171 
    172   bool ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count);
    173   void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count);
    174   void GenerateNewLoop(LoopNode* node,
    175                        HBasicBlock* block,
    176                        HBasicBlock* new_preheader,
    177                        HInstruction* lo,
    178                        HInstruction* hi,
    179                        HInstruction* step,
    180                        uint32_t unroll);
    181   bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code);
    182   bool VectorizeUse(LoopNode* node,
    183                     HInstruction* instruction,
    184                     bool generate_code,
    185                     DataType::Type type,
    186                     uint64_t restrictions);
    187   uint32_t GetVectorSizeInBytes();
    188   bool TrySetVectorType(DataType::Type type, /*out*/ uint64_t* restrictions);
    189   bool TrySetVectorLength(uint32_t length);
    190   void GenerateVecInv(HInstruction* org, DataType::Type type);
    191   void GenerateVecSub(HInstruction* org, HInstruction* offset);
    192   void GenerateVecMem(HInstruction* org,
    193                       HInstruction* opa,
    194                       HInstruction* opb,
    195                       HInstruction* offset,
    196                       DataType::Type type);
    197   void GenerateVecReductionPhi(HPhi* phi);
    198   void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction);
    199   HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction);
    200   void GenerateVecOp(HInstruction* org,
    201                      HInstruction* opa,
    202                      HInstruction* opb,
    203                      DataType::Type type);
    204 
    205   // Vectorization idioms.
    206   bool VectorizeSaturationIdiom(LoopNode* node,
    207                                 HInstruction* instruction,
    208                                 bool generate_code,
    209                                 DataType::Type type,
    210                                 uint64_t restrictions);
    211   bool VectorizeHalvingAddIdiom(LoopNode* node,
    212                                 HInstruction* instruction,
    213                                 bool generate_code,
    214                                 DataType::Type type,
    215                                 uint64_t restrictions);
    216   bool VectorizeSADIdiom(LoopNode* node,
    217                          HInstruction* instruction,
    218                          bool generate_code,
    219                          DataType::Type type,
    220                          uint64_t restrictions);
    221   bool VectorizeDotProdIdiom(LoopNode* node,
    222                              HInstruction* instruction,
    223                              bool generate_code,
    224                              DataType::Type type,
    225                              uint64_t restrictions);
    226 
    227   // Vectorization heuristics.
    228   Alignment ComputeAlignment(HInstruction* offset,
    229                              DataType::Type type,
    230                              bool is_string_char_at,
    231                              uint32_t peeling = 0);
    232   void SetAlignmentStrategy(uint32_t peeling_votes[],
    233                             const ArrayReference* peeling_candidate);
    234   uint32_t MaxNumberPeeled();
    235   bool IsVectorizationProfitable(int64_t trip_count);
    236 
    237   //
    238   // Helpers.
    239   //
    240 
    241   bool TrySetPhiInduction(HPhi* phi, bool restrict_uses);
    242   bool TrySetPhiReduction(HPhi* phi);
    243 
    244   // Detects loop header with a single induction (returned in main_phi), possibly
    245   // other phis for reductions, but no other side effects. Returns true on success.
    246   bool TrySetSimpleLoopHeader(HBasicBlock* block, /*out*/ HPhi** main_phi);
    247 
    248   bool IsEmptyBody(HBasicBlock* block);
    249   bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info,
    250                            HInstruction* instruction,
    251                            bool collect_loop_uses,
    252                            /*out*/ uint32_t* use_count);
    253   bool IsUsedOutsideLoop(HLoopInformation* loop_info,
    254                          HInstruction* instruction);
    255   bool TryReplaceWithLastValue(HLoopInformation* loop_info,
    256                                HInstruction* instruction,
    257                                HBasicBlock* block);
    258   bool TryAssignLastValue(HLoopInformation* loop_info,
    259                           HInstruction* instruction,
    260                           HBasicBlock* block,
    261                           bool collect_loop_uses);
    262   void RemoveDeadInstructions(const HInstructionList& list);
    263   bool CanRemoveCycle();  // Whether the current 'iset_' is removable.
    264 
    265   // Compiler options (to query ISA features).
    266   const CompilerOptions* compiler_options_;
    267 
    268   // Range information based on prior induction variable analysis.
    269   InductionVarRange induction_range_;
    270 
    271   // Phase-local heap memory allocator for the loop optimizer. Storage obtained
    272   // through this allocator is immediately released when the loop optimizer is done.
    273   ScopedArenaAllocator* loop_allocator_;
    274 
    275   // Global heap memory allocator. Used to build HIR.
    276   ArenaAllocator* global_allocator_;
    277 
    278   // Entries into the loop hierarchy representation. The hierarchy resides
    279   // in phase-local heap memory.
    280   LoopNode* top_loop_;
    281   LoopNode* last_loop_;
    282 
    283   // Temporary bookkeeping of a set of instructions.
    284   // Contents reside in phase-local heap memory.
    285   ScopedArenaSet<HInstruction*>* iset_;
    286 
    287   // Temporary bookkeeping of reduction instructions. Mapping is two-fold:
    288   // (1) reductions in the loop-body are mapped back to their phi definition,
    289   // (2) phi definitions are mapped to their initial value (updated during
    290   //     code generation to feed the proper values into the new chain).
    291   // Contents reside in phase-local heap memory.
    292   ScopedArenaSafeMap<HInstruction*, HInstruction*>* reductions_;
    293 
    294   // Flag that tracks if any simplifications have occurred.
    295   bool simplified_;
    296 
    297   // Number of "lanes" for selected packed type.
    298   uint32_t vector_length_;
    299 
    300   // Set of array references in the vector loop.
    301   // Contents reside in phase-local heap memory.
    302   ScopedArenaSet<ArrayReference>* vector_refs_;
    303 
    304   // Static or dynamic loop peeling for alignment.
    305   uint32_t vector_static_peeling_factor_;
    306   const ArrayReference* vector_dynamic_peeling_candidate_;
    307 
    308   // Dynamic data dependence test of the form a != b.
    309   HInstruction* vector_runtime_test_a_;
    310   HInstruction* vector_runtime_test_b_;
    311 
    312   // Mapping used during vectorization synthesis for both the scalar peeling/cleanup
    313   // loop (mode is kSequential) and the actual vector loop (mode is kVector). The data
    314   // structure maps original instructions into the new instructions.
    315   // Contents reside in phase-local heap memory.
    316   ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_map_;
    317 
    318   // Permanent mapping used during vectorization synthesis.
    319   // Contents reside in phase-local heap memory.
    320   ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_permanent_map_;
    321 
    322   // Temporary vectorization bookkeeping.
    323   VectorMode vector_mode_;  // synthesis mode
    324   HBasicBlock* vector_preheader_;  // preheader of the new loop
    325   HBasicBlock* vector_header_;  // header of the new loop
    326   HBasicBlock* vector_body_;  // body of the new loop
    327   HInstruction* vector_index_;  // normalized index of the new loop
    328 
    329   // Helper for target-specific behaviour for loop optimizations.
    330   ArchNoOptsLoopHelper* arch_loop_helper_;
    331 
    332   friend class LoopOptimizationTest;
    333 
    334   DISALLOW_COPY_AND_ASSIGN(HLoopOptimization);
    335 };
    336 
    337 }  // namespace art
    338 
    339 #endif  // ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
    340