      1 /*
      2  * Copyright (C) 2016 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "loop_optimization.h"
     18 
     19 #include "arch/instruction_set.h"
     20 #include "arch/arm/instruction_set_features_arm.h"
     21 #include "arch/arm64/instruction_set_features_arm64.h"
     22 #include "arch/mips/instruction_set_features_mips.h"
     23 #include "arch/mips64/instruction_set_features_mips64.h"
     24 #include "arch/x86/instruction_set_features_x86.h"
     25 #include "arch/x86_64/instruction_set_features_x86_64.h"
     26 #include "driver/compiler_driver.h"
     27 #include "linear_order.h"
     28 
     29 namespace art {
     30 
     31 // Enables vectorization (SIMDization) in the loop optimizer.
     32 static constexpr bool kEnableVectorization = true;
     33 
     34 // All current SIMD targets want 16-byte alignment.
     35 static constexpr size_t kAlignedBase = 16;
     36 
     37 // Remove the instruction from the graph. A bit more elaborate than the usual
     38 // instruction removal, since there may be a cycle in the use structure.
     39 static void RemoveFromCycle(HInstruction* instruction) {
     40   instruction->RemoveAsUserOfAllInputs();
     41   instruction->RemoveEnvironmentUsers();
     42   instruction->GetBlock()->RemoveInstructionOrPhi(instruction, /*ensure_safety=*/ false);
     43 }
     44 
      45 // Detect a goto block and set succ to the single successor.
     46 static bool IsGotoBlock(HBasicBlock* block, /*out*/ HBasicBlock** succ) {
     47   if (block->GetPredecessors().size() == 1 &&
     48       block->GetSuccessors().size() == 1 &&
     49       block->IsSingleGoto()) {
     50     *succ = block->GetSingleSuccessor();
     51     return true;
     52   }
     53   return false;
     54 }
     55 
     56 // Detect an early exit loop.
     57 static bool IsEarlyExit(HLoopInformation* loop_info) {
     58   HBlocksInLoopReversePostOrderIterator it_loop(*loop_info);
     59   for (it_loop.Advance(); !it_loop.Done(); it_loop.Advance()) {
     60     for (HBasicBlock* successor : it_loop.Current()->GetSuccessors()) {
     61       if (!loop_info->Contains(*successor)) {
     62         return true;
     63       }
     64     }
     65   }
     66   return false;
     67 }
     68 
     69 // Detect a sign extension from the given type. Returns the promoted operand on success.
     70 static bool IsSignExtensionAndGet(HInstruction* instruction,
     71                                   Primitive::Type type,
     72                                   /*out*/ HInstruction** operand) {
     73   // Accept any already wider constant that would be handled properly by sign
     74   // extension when represented in the *width* of the given narrower data type
     75   // (the fact that char normally zero extends does not matter here).
     76   int64_t value = 0;
     77   if (IsInt64AndGet(instruction, /*out*/ &value)) {
     78     switch (type) {
     79       case Primitive::kPrimByte:
     80         if (std::numeric_limits<int8_t>::min() <= value &&
     81             std::numeric_limits<int8_t>::max() >= value) {
     82           *operand = instruction;
     83           return true;
     84         }
     85         return false;
     86       case Primitive::kPrimChar:
     87       case Primitive::kPrimShort:
     88         if (std::numeric_limits<int16_t>::min() <= value &&
      89             std::numeric_limits<int16_t>::max() >= value) {
     90           *operand = instruction;
     91           return true;
     92         }
     93         return false;
     94       default:
     95         return false;
     96     }
     97   }
     98   // An implicit widening conversion of a signed integer to an integral type sign-extends
     99   // the two's-complement representation of the integer value to fill the wider format.
    100   if (instruction->GetType() == type && (instruction->IsArrayGet() ||
    101                                          instruction->IsStaticFieldGet() ||
    102                                          instruction->IsInstanceFieldGet())) {
    103     switch (type) {
    104       case Primitive::kPrimByte:
    105       case Primitive::kPrimShort:
    106         *operand = instruction;
    107         return true;
    108       default:
    109         return false;
    110     }
    111   }
    112   // TODO: perhaps explicit conversions later too?
    113   //       (this may return something different from instruction)
    114   return false;
    115 }
    116 
    117 // Detect a zero extension from the given type. Returns the promoted operand on success.
    118 static bool IsZeroExtensionAndGet(HInstruction* instruction,
    119                                   Primitive::Type type,
    120                                   /*out*/ HInstruction** operand) {
    121   // Accept any already wider constant that would be handled properly by zero
    122   // extension when represented in the *width* of the given narrower data type
    123   // (the fact that byte/short normally sign extend does not matter here).
    124   int64_t value = 0;
    125   if (IsInt64AndGet(instruction, /*out*/ &value)) {
    126     switch (type) {
    127       case Primitive::kPrimByte:
    128         if (std::numeric_limits<uint8_t>::min() <= value &&
    129             std::numeric_limits<uint8_t>::max() >= value) {
    130           *operand = instruction;
    131           return true;
    132         }
    133         return false;
    134       case Primitive::kPrimChar:
    135       case Primitive::kPrimShort:
    136         if (std::numeric_limits<uint16_t>::min() <= value &&
     137             std::numeric_limits<uint16_t>::max() >= value) {
    138           *operand = instruction;
    139           return true;
    140         }
    141         return false;
    142       default:
    143         return false;
    144     }
    145   }
    146   // An implicit widening conversion of a char to an integral type zero-extends
    147   // the representation of the char value to fill the wider format.
    148   if (instruction->GetType() == type && (instruction->IsArrayGet() ||
    149                                          instruction->IsStaticFieldGet() ||
    150                                          instruction->IsInstanceFieldGet())) {
    151     if (type == Primitive::kPrimChar) {
    152       *operand = instruction;
    153       return true;
    154     }
    155   }
    156   // A sign (or zero) extension followed by an explicit removal of just the
    157   // higher sign bits is equivalent to a zero extension of the underlying operand.
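           // For example, a short-typed array load ANDed with 0xffff is handled as a zero extension of that load.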
    158   if (instruction->IsAnd()) {
    159     int64_t mask = 0;
    160     HInstruction* a = instruction->InputAt(0);
    161     HInstruction* b = instruction->InputAt(1);
    162     // In (a & b) find (mask & b) or (a & mask) with sign or zero extension on the non-mask.
    163     if ((IsInt64AndGet(a, /*out*/ &mask) && (IsSignExtensionAndGet(b, type, /*out*/ operand) ||
    164                                              IsZeroExtensionAndGet(b, type, /*out*/ operand))) ||
    165         (IsInt64AndGet(b, /*out*/ &mask) && (IsSignExtensionAndGet(a, type, /*out*/ operand) ||
    166                                              IsZeroExtensionAndGet(a, type, /*out*/ operand)))) {
    167       switch ((*operand)->GetType()) {
    168         case Primitive::kPrimByte:  return mask == std::numeric_limits<uint8_t>::max();
    169         case Primitive::kPrimChar:
    170         case Primitive::kPrimShort: return mask == std::numeric_limits<uint16_t>::max();
    171         default: return false;
    172       }
    173     }
    174   }
    175   // TODO: perhaps explicit conversions later too?
    176   return false;
    177 }
    178 
    179 // Detect situations with same-extension narrower operands.
    180 // Returns true on success and sets is_unsigned accordingly.
    181 static bool IsNarrowerOperands(HInstruction* a,
    182                                HInstruction* b,
    183                                Primitive::Type type,
    184                                /*out*/ HInstruction** r,
    185                                /*out*/ HInstruction** s,
    186                                /*out*/ bool* is_unsigned) {
    187   if (IsSignExtensionAndGet(a, type, r) && IsSignExtensionAndGet(b, type, s)) {
    188     *is_unsigned = false;
    189     return true;
    190   } else if (IsZeroExtensionAndGet(a, type, r) && IsZeroExtensionAndGet(b, type, s)) {
    191     *is_unsigned = true;
    192     return true;
    193   }
    194   return false;
    195 }
    196 
    197 // As above, single operand.
    198 static bool IsNarrowerOperand(HInstruction* a,
    199                               Primitive::Type type,
    200                               /*out*/ HInstruction** r,
    201                               /*out*/ bool* is_unsigned) {
    202   if (IsSignExtensionAndGet(a, type, r)) {
    203     *is_unsigned = false;
    204     return true;
    205   } else if (IsZeroExtensionAndGet(a, type, r)) {
    206     *is_unsigned = true;
    207     return true;
    208   }
    209   return false;
    210 }
    211 
     212 // Detect up to two instructions a and b, and an accumulated constant c.
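         // For example, the expression (x + 3) + (y + 7) yields a = x, b = y, and c = 10.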
    213 static bool IsAddConstHelper(HInstruction* instruction,
    214                              /*out*/ HInstruction** a,
    215                              /*out*/ HInstruction** b,
    216                              /*out*/ int64_t* c,
    217                              int32_t depth) {
    218   static constexpr int32_t kMaxDepth = 8;  // don't search too deep
    219   int64_t value = 0;
    220   if (IsInt64AndGet(instruction, &value)) {
    221     *c += value;
    222     return true;
    223   } else if (instruction->IsAdd() && depth <= kMaxDepth) {
    224     return IsAddConstHelper(instruction->InputAt(0), a, b, c, depth + 1) &&
    225            IsAddConstHelper(instruction->InputAt(1), a, b, c, depth + 1);
    226   } else if (*a == nullptr) {
    227     *a = instruction;
    228     return true;
    229   } else if (*b == nullptr) {
    230     *b = instruction;
    231     return true;
    232   }
    233   return false;  // too many non-const operands
    234 }
    235 
    236 // Detect a + b + c for an optional constant c.
    237 static bool IsAddConst(HInstruction* instruction,
    238                        /*out*/ HInstruction** a,
    239                        /*out*/ HInstruction** b,
    240                        /*out*/ int64_t* c) {
    241   if (instruction->IsAdd()) {
    242     // Try to find a + b and accumulated c.
    243     if (IsAddConstHelper(instruction->InputAt(0), a, b, c, /*depth*/ 0) &&
    244         IsAddConstHelper(instruction->InputAt(1), a, b, c, /*depth*/ 0) &&
    245         *b != nullptr) {
    246       return true;
    247     }
    248     // Found a + b.
    249     *a = instruction->InputAt(0);
    250     *b = instruction->InputAt(1);
    251     *c = 0;
    252     return true;
    253   }
    254   return false;
    255 }
    256 
    257 // Test vector restrictions.
    258 static bool HasVectorRestrictions(uint64_t restrictions, uint64_t tested) {
    259   return (restrictions & tested) != 0;
    260 }
    261 
    262 // Insert an instruction.
    263 static HInstruction* Insert(HBasicBlock* block, HInstruction* instruction) {
    264   DCHECK(block != nullptr);
    265   DCHECK(instruction != nullptr);
    266   block->InsertInstructionBefore(instruction, block->GetLastInstruction());
    267   return instruction;
    268 }
    269 
    270 //
    271 // Class methods.
    272 //
    273 
    274 HLoopOptimization::HLoopOptimization(HGraph* graph,
    275                                      CompilerDriver* compiler_driver,
    276                                      HInductionVarAnalysis* induction_analysis)
    277     : HOptimization(graph, kLoopOptimizationPassName),
    278       compiler_driver_(compiler_driver),
    279       induction_range_(induction_analysis),
    280       loop_allocator_(nullptr),
    281       global_allocator_(graph_->GetArena()),
    282       top_loop_(nullptr),
    283       last_loop_(nullptr),
    284       iset_(nullptr),
    285       induction_simplication_count_(0),
    286       simplified_(false),
    287       vector_length_(0),
    288       vector_refs_(nullptr),
    289       vector_peeling_candidate_(nullptr),
    290       vector_runtime_test_a_(nullptr),
    291       vector_runtime_test_b_(nullptr),
    292       vector_map_(nullptr) {
    293 }
    294 
    295 void HLoopOptimization::Run() {
    296   // Skip if there is no loop or the graph has try-catch/irreducible loops.
    297   // TODO: make this less of a sledgehammer.
    298   if (!graph_->HasLoops() || graph_->HasTryCatch() || graph_->HasIrreducibleLoops()) {
    299     return;
    300   }
    301 
    302   // Phase-local allocator that draws from the global pool. Since the allocator
    303   // itself resides on the stack, it is destructed on exiting Run(), which
    304   // implies its underlying memory is released immediately.
    305   ArenaAllocator allocator(global_allocator_->GetArenaPool());
    306   loop_allocator_ = &allocator;
    307 
    308   // Perform loop optimizations.
    309   LocalRun();
    310   if (top_loop_ == nullptr) {
    311     graph_->SetHasLoops(false);  // no more loops
    312   }
    313 
    314   // Detach.
    315   loop_allocator_ = nullptr;
    316   last_loop_ = top_loop_ = nullptr;
    317 }
    318 
    319 void HLoopOptimization::LocalRun() {
    320   // Build the linear order using the phase-local allocator. This step enables building
    321   // a loop hierarchy that properly reflects the outer-inner and previous-next relation.
    322   ArenaVector<HBasicBlock*> linear_order(loop_allocator_->Adapter(kArenaAllocLinearOrder));
    323   LinearizeGraph(graph_, loop_allocator_, &linear_order);
    324 
    325   // Build the loop hierarchy.
    326   for (HBasicBlock* block : linear_order) {
    327     if (block->IsLoopHeader()) {
    328       AddLoop(block->GetLoopInformation());
    329     }
    330   }
    331 
    332   // Traverse the loop hierarchy inner-to-outer and optimize. Traversal can use
     333   // temporary data structures drawn from the phase-local allocator. All new HIR
    334   // should use the global allocator.
    335   if (top_loop_ != nullptr) {
    336     ArenaSet<HInstruction*> iset(loop_allocator_->Adapter(kArenaAllocLoopOptimization));
    337     ArenaSet<ArrayReference> refs(loop_allocator_->Adapter(kArenaAllocLoopOptimization));
    338     ArenaSafeMap<HInstruction*, HInstruction*> map(
    339         std::less<HInstruction*>(), loop_allocator_->Adapter(kArenaAllocLoopOptimization));
    340     // Attach.
    341     iset_ = &iset;
    342     vector_refs_ = &refs;
    343     vector_map_ = &map;
    344     // Traverse.
    345     TraverseLoopsInnerToOuter(top_loop_);
    346     // Detach.
    347     iset_ = nullptr;
    348     vector_refs_ = nullptr;
    349     vector_map_ = nullptr;
    350   }
    351 }
    352 
    353 void HLoopOptimization::AddLoop(HLoopInformation* loop_info) {
    354   DCHECK(loop_info != nullptr);
    355   LoopNode* node = new (loop_allocator_) LoopNode(loop_info);
    356   if (last_loop_ == nullptr) {
    357     // First loop.
    358     DCHECK(top_loop_ == nullptr);
    359     last_loop_ = top_loop_ = node;
    360   } else if (loop_info->IsIn(*last_loop_->loop_info)) {
    361     // Inner loop.
    362     node->outer = last_loop_;
    363     DCHECK(last_loop_->inner == nullptr);
    364     last_loop_ = last_loop_->inner = node;
    365   } else {
    366     // Subsequent loop.
    367     while (last_loop_->outer != nullptr && !loop_info->IsIn(*last_loop_->outer->loop_info)) {
    368       last_loop_ = last_loop_->outer;
    369     }
    370     node->outer = last_loop_->outer;
    371     node->previous = last_loop_;
    372     DCHECK(last_loop_->next == nullptr);
    373     last_loop_ = last_loop_->next = node;
    374   }
    375 }
    376 
    377 void HLoopOptimization::RemoveLoop(LoopNode* node) {
    378   DCHECK(node != nullptr);
    379   DCHECK(node->inner == nullptr);
    380   if (node->previous != nullptr) {
    381     // Within sequence.
    382     node->previous->next = node->next;
    383     if (node->next != nullptr) {
    384       node->next->previous = node->previous;
    385     }
    386   } else {
    387     // First of sequence.
    388     if (node->outer != nullptr) {
    389       node->outer->inner = node->next;
    390     } else {
    391       top_loop_ = node->next;
    392     }
    393     if (node->next != nullptr) {
    394       node->next->outer = node->outer;
    395       node->next->previous = nullptr;
    396     }
    397   }
    398 }
    399 
    400 void HLoopOptimization::TraverseLoopsInnerToOuter(LoopNode* node) {
    401   for ( ; node != nullptr; node = node->next) {
    402     // Visit inner loops first.
    403     uint32_t current_induction_simplification_count = induction_simplication_count_;
    404     if (node->inner != nullptr) {
    405       TraverseLoopsInnerToOuter(node->inner);
    406     }
    407     // Recompute induction information of this loop if the induction
    408     // of any inner loop has been simplified.
    409     if (current_induction_simplification_count != induction_simplication_count_) {
    410       induction_range_.ReVisit(node->loop_info);
    411     }
    412     // Repeat simplifications in the loop-body until no more changes occur.
    413     // Note that since each simplification consists of eliminating code (without
    414     // introducing new code), this process is always finite.
    415     do {
    416       simplified_ = false;
    417       SimplifyInduction(node);
    418       SimplifyBlocks(node);
    419     } while (simplified_);
    420     // Optimize inner loop.
    421     if (node->inner == nullptr) {
    422       OptimizeInnerLoop(node);
    423     }
    424   }
    425 }
    426 
    427 //
    428 // Optimization.
    429 //
    430 
    431 void HLoopOptimization::SimplifyInduction(LoopNode* node) {
    432   HBasicBlock* header = node->loop_info->GetHeader();
    433   HBasicBlock* preheader = node->loop_info->GetPreHeader();
    434   // Scan the phis in the header to find opportunities to simplify an induction
    435   // cycle that is only used outside the loop. Replace these uses, if any, with
    436   // the last value and remove the induction cycle.
    437   // Examples: for (int i = 0; x != null;   i++) { .... no i .... }
    438   //           for (int i = 0; i < 10; i++, k++) { .... no k .... } return k;
    439   for (HInstructionIterator it(header->GetPhis()); !it.Done(); it.Advance()) {
    440     HPhi* phi = it.Current()->AsPhi();
    441     iset_->clear();  // prepare phi induction
    442     if (TrySetPhiInduction(phi, /*restrict_uses*/ true) &&
    443         TryAssignLastValue(node->loop_info, phi, preheader, /*collect_loop_uses*/ false)) {
    444       // Note that it's ok to have replaced uses after the loop with the last value, without
    445       // being able to remove the cycle. Environment uses (which are the reason we may not be
    446       // able to remove the cycle) within the loop will still hold the right value.
    447       if (CanRemoveCycle()) {
    448         for (HInstruction* i : *iset_) {
    449           RemoveFromCycle(i);
    450         }
    451         simplified_ = true;
    452       }
    453     }
    454   }
    455 }
    456 
    457 void HLoopOptimization::SimplifyBlocks(LoopNode* node) {
    458   // Iterate over all basic blocks in the loop-body.
    459   for (HBlocksInLoopIterator it(*node->loop_info); !it.Done(); it.Advance()) {
    460     HBasicBlock* block = it.Current();
    461     // Remove dead instructions from the loop-body.
    462     RemoveDeadInstructions(block->GetPhis());
    463     RemoveDeadInstructions(block->GetInstructions());
    464     // Remove trivial control flow blocks from the loop-body.
    465     if (block->GetPredecessors().size() == 1 &&
    466         block->GetSuccessors().size() == 1 &&
    467         block->GetSingleSuccessor()->GetPredecessors().size() == 1) {
    468       simplified_ = true;
    469       block->MergeWith(block->GetSingleSuccessor());
    470     } else if (block->GetSuccessors().size() == 2) {
    471       // Trivial if block can be bypassed to either branch.
    472       HBasicBlock* succ0 = block->GetSuccessors()[0];
    473       HBasicBlock* succ1 = block->GetSuccessors()[1];
    474       HBasicBlock* meet0 = nullptr;
    475       HBasicBlock* meet1 = nullptr;
    476       if (succ0 != succ1 &&
    477           IsGotoBlock(succ0, &meet0) &&
    478           IsGotoBlock(succ1, &meet1) &&
    479           meet0 == meet1 &&  // meets again
    480           meet0 != block &&  // no self-loop
    481           meet0->GetPhis().IsEmpty()) {  // not used for merging
    482         simplified_ = true;
    483         succ0->DisconnectAndDelete();
    484         if (block->Dominates(meet0)) {
    485           block->RemoveDominatedBlock(meet0);
    486           succ1->AddDominatedBlock(meet0);
    487           meet0->SetDominator(succ1);
    488         }
    489       }
    490     }
    491   }
    492 }
    493 
    494 void HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
    495   HBasicBlock* header = node->loop_info->GetHeader();
    496   HBasicBlock* preheader = node->loop_info->GetPreHeader();
    497   // Ensure loop header logic is finite.
    498   int64_t trip_count = 0;
    499   if (!induction_range_.IsFinite(node->loop_info, &trip_count)) {
    500     return;
    501   }
    502 
    503   // Ensure there is only a single loop-body (besides the header).
    504   HBasicBlock* body = nullptr;
    505   for (HBlocksInLoopIterator it(*node->loop_info); !it.Done(); it.Advance()) {
    506     if (it.Current() != header) {
    507       if (body != nullptr) {
    508         return;
    509       }
    510       body = it.Current();
    511     }
    512   }
    513   CHECK(body != nullptr);
    514   // Ensure there is only a single exit point.
    515   if (header->GetSuccessors().size() != 2) {
    516     return;
    517   }
    518   HBasicBlock* exit = (header->GetSuccessors()[0] == body)
    519       ? header->GetSuccessors()[1]
    520       : header->GetSuccessors()[0];
    521   // Ensure exit can only be reached by exiting loop.
    522   if (exit->GetPredecessors().size() != 1) {
    523     return;
    524   }
    525   // Detect either an empty loop (no side effects other than plain iteration) or
    526   // a trivial loop (just iterating once). Replace subsequent index uses, if any,
    527   // with the last value and remove the loop, possibly after unrolling its body.
    528   HInstruction* phi = header->GetFirstPhi();
    529   iset_->clear();  // prepare phi induction
    530   if (TrySetSimpleLoopHeader(header)) {
    531     bool is_empty = IsEmptyBody(body);
    532     if ((is_empty || trip_count == 1) &&
    533         TryAssignLastValue(node->loop_info, phi, preheader, /*collect_loop_uses*/ true)) {
    534       if (!is_empty) {
     535         // Unroll the loop-body, which sees the initial value of the index.
    536         phi->ReplaceWith(phi->InputAt(0));
    537         preheader->MergeInstructionsWith(body);
    538       }
    539       body->DisconnectAndDelete();
    540       exit->RemovePredecessor(header);
    541       header->RemoveSuccessor(exit);
    542       header->RemoveDominatedBlock(exit);
    543       header->DisconnectAndDelete();
    544       preheader->AddSuccessor(exit);
    545       preheader->AddInstruction(new (global_allocator_) HGoto());
    546       preheader->AddDominatedBlock(exit);
    547       exit->SetDominator(preheader);
    548       RemoveLoop(node);  // update hierarchy
    549       return;
    550     }
    551   }
    552 
    553   // Vectorize loop, if possible and valid.
    554   if (kEnableVectorization) {
    555     iset_->clear();  // prepare phi induction
    556     if (TrySetSimpleLoopHeader(header) &&
    557         ShouldVectorize(node, body, trip_count) &&
    558         TryAssignLastValue(node->loop_info, phi, preheader, /*collect_loop_uses*/ true)) {
    559       Vectorize(node, body, exit, trip_count);
    560       graph_->SetHasSIMD(true);  // flag SIMD usage
    561       return;
    562     }
    563   }
    564 }
    565 
    566 //
    567 // Loop vectorization. The implementation is based on the book by Aart J.C. Bik:
    568 // "The Software Vectorization Handbook. Applying Multimedia Extensions for Maximum Performance."
    569 // Intel Press, June, 2004 (http://www.aartbik.com/).
    570 //
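         //
         // Roughly, a loop that passes ShouldVectorize() is rewritten by Vectorize() into
         // (with ptc, vtc, and stc denoting the peeling, vector, and scalar trip counts):
         //    for ( ; i < ptc; i += 1)      <loop-body>              (optional alignment peeling)
         //    for ( ; i < vtc; i += chunk)  <vectorized-loop-body>   (main vector loop)
         //    for ( ; i < stc; i += 1)      <loop-body>              (optional cleanup loop)
         //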
    571 
    572 bool HLoopOptimization::ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count) {
    573   // Reset vector bookkeeping.
    574   vector_length_ = 0;
    575   vector_refs_->clear();
    576   vector_peeling_candidate_ = nullptr;
    577   vector_runtime_test_a_ =
     578   vector_runtime_test_b_ = nullptr;
    579 
    580   // Phis in the loop-body prevent vectorization.
    581   if (!block->GetPhis().IsEmpty()) {
    582     return false;
    583   }
    584 
    585   // Scan the loop-body, starting a right-hand-side tree traversal at each left-hand-side
     586   // occurrence, which allows passing attributes down the use tree.
    587   for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
    588     if (!VectorizeDef(node, it.Current(), /*generate_code*/ false)) {
    589       return false;  // failure to vectorize a left-hand-side
    590     }
    591   }
    592 
    593   // Does vectorization seem profitable?
    594   if (!IsVectorizationProfitable(trip_count)) {
    595     return false;
    596   }
    597 
    598   // Data dependence analysis. Find each pair of references with same type, where
    599   // at least one is a write. Each such pair denotes a possible data dependence.
    600   // This analysis exploits the property that differently typed arrays cannot be
    601   // aliased, as well as the property that references either point to the same
    602   // array or to two completely disjoint arrays, i.e., no partial aliasing.
     603   // Other than a few simple heuristics, no detailed subscript analysis is done.
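           // For example, a[i] = a[i] + 1 is accepted (same offsets), while a[i] = a[i + 1] + 1 is rejected.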
    604   for (auto i = vector_refs_->begin(); i != vector_refs_->end(); ++i) {
    605     for (auto j = i; ++j != vector_refs_->end(); ) {
    606       if (i->type == j->type && (i->lhs || j->lhs)) {
    607         // Found same-typed a[i+x] vs. b[i+y], where at least one is a write.
    608         HInstruction* a = i->base;
    609         HInstruction* b = j->base;
    610         HInstruction* x = i->offset;
    611         HInstruction* y = j->offset;
    612         if (a == b) {
    613           // Found a[i+x] vs. a[i+y]. Accept if x == y (loop-independent data dependence).
    614           // Conservatively assume a loop-carried data dependence otherwise, and reject.
    615           if (x != y) {
    616             return false;
    617           }
    618         } else {
    619           // Found a[i+x] vs. b[i+y]. Accept if x == y (at worst loop-independent data dependence).
    620           // Conservatively assume a potential loop-carried data dependence otherwise, avoided by
    621           // generating an explicit a != b disambiguation runtime test on the two references.
    622           if (x != y) {
    623             // To avoid excessive overhead, we only accept one a != b test.
    624             if (vector_runtime_test_a_ == nullptr) {
    625               // First test found.
    626               vector_runtime_test_a_ = a;
    627               vector_runtime_test_b_ = b;
    628             } else if ((vector_runtime_test_a_ != a || vector_runtime_test_b_ != b) &&
    629                        (vector_runtime_test_a_ != b || vector_runtime_test_b_ != a)) {
    630               return false;  // second test would be needed
    631             }
    632           }
    633         }
    634       }
    635     }
    636   }
    637 
    638   // Consider dynamic loop peeling for alignment.
    639   SetPeelingCandidate(trip_count);
    640 
    641   // Success!
    642   return true;
    643 }
    644 
    645 void HLoopOptimization::Vectorize(LoopNode* node,
    646                                   HBasicBlock* block,
    647                                   HBasicBlock* exit,
    648                                   int64_t trip_count) {
    649   Primitive::Type induc_type = Primitive::kPrimInt;
    650   HBasicBlock* header = node->loop_info->GetHeader();
    651   HBasicBlock* preheader = node->loop_info->GetPreHeader();
    652 
    653   // Pick a loop unrolling factor for the vector loop.
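           // The chunk is the number of scalar iterations covered by one iteration of the vector loop.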
    654   uint32_t unroll = GetUnrollingFactor(block, trip_count);
    655   uint32_t chunk = vector_length_ * unroll;
    656 
    657   // A cleanup loop is needed, at least, for any unknown trip count or
    658   // for a known trip count with remainder iterations after vectorization.
    659   bool needs_cleanup = trip_count == 0 || (trip_count % chunk) != 0;
    660 
    661   // Adjust vector bookkeeping.
    662   iset_->clear();  // prepare phi induction
    663   bool is_simple_loop_header = TrySetSimpleLoopHeader(header);  // fills iset_
    664   DCHECK(is_simple_loop_header);
    665   vector_header_ = header;
    666   vector_body_ = block;
    667 
    668   // Generate dynamic loop peeling trip count, if needed:
    669   // ptc = <peeling-needed-for-candidate>
    670   HInstruction* ptc = nullptr;
    671   if (vector_peeling_candidate_ != nullptr) {
    672     DCHECK_LT(vector_length_, trip_count) << "dynamic peeling currently requires known trip count";
    673     //
    674     // TODO: Implement this. Compute address of first access memory location and
    675     //       compute peeling factor to obtain kAlignedBase alignment.
    676     //
    677     needs_cleanup = true;
    678   }
    679 
    680   // Generate loop control:
    681   // stc = <trip-count>;
    682   // vtc = stc - (stc - ptc) % chunk;
    683   // i = 0;
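           // Since chunk is a power of two, the modulo is computed below with a mask (chunk - 1).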
    684   HInstruction* stc = induction_range_.GenerateTripCount(node->loop_info, graph_, preheader);
    685   HInstruction* vtc = stc;
    686   if (needs_cleanup) {
    687     DCHECK(IsPowerOfTwo(chunk));
    688     HInstruction* diff = stc;
    689     if (ptc != nullptr) {
    690       diff = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, ptc));
    691     }
    692     HInstruction* rem = Insert(
    693         preheader, new (global_allocator_) HAnd(induc_type,
    694                                                 diff,
    695                                                 graph_->GetIntConstant(chunk - 1)));
    696     vtc = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, rem));
    697   }
    698   vector_index_ = graph_->GetIntConstant(0);
    699 
    700   // Generate runtime disambiguation test:
    701   // vtc = a != b ? vtc : 0;
    702   if (vector_runtime_test_a_ != nullptr) {
    703     HInstruction* rt = Insert(
    704         preheader,
    705         new (global_allocator_) HNotEqual(vector_runtime_test_a_, vector_runtime_test_b_));
    706     vtc = Insert(preheader,
    707                  new (global_allocator_) HSelect(rt, vtc, graph_->GetIntConstant(0), kNoDexPc));
    708     needs_cleanup = true;
    709   }
    710 
    711   // Generate dynamic peeling loop for alignment, if needed:
    712   // for ( ; i < ptc; i += 1)
    713   //    <loop-body>
    714   if (ptc != nullptr) {
    715     vector_mode_ = kSequential;
    716     GenerateNewLoop(node,
    717                     block,
    718                     graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
    719                     vector_index_,
    720                     ptc,
    721                     graph_->GetIntConstant(1),
    722                     /*unroll*/ 1);
    723   }
    724 
    725   // Generate vector loop, possibly further unrolled:
    726   // for ( ; i < vtc; i += chunk)
    727   //    <vectorized-loop-body>
    728   vector_mode_ = kVector;
    729   GenerateNewLoop(node,
    730                   block,
    731                   graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
    732                   vector_index_,
    733                   vtc,
    734                   graph_->GetIntConstant(vector_length_),  // increment per unroll
    735                   unroll);
    736   HLoopInformation* vloop = vector_header_->GetLoopInformation();
    737 
    738   // Generate cleanup loop, if needed:
    739   // for ( ; i < stc; i += 1)
    740   //    <loop-body>
    741   if (needs_cleanup) {
    742     vector_mode_ = kSequential;
    743     GenerateNewLoop(node,
    744                     block,
    745                     graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
    746                     vector_index_,
    747                     stc,
    748                     graph_->GetIntConstant(1),
    749                     /*unroll*/ 1);
    750   }
    751 
    752   // Remove the original loop by disconnecting the body block
    753   // and removing all instructions from the header.
    754   block->DisconnectAndDelete();
    755   while (!header->GetFirstInstruction()->IsGoto()) {
    756     header->RemoveInstruction(header->GetFirstInstruction());
    757   }
    758   // Update loop hierarchy: the old header now resides in the same outer loop
    759   // as the old preheader. Note that we don't bother putting sequential
    760   // loops back in the hierarchy at this point.
    761   header->SetLoopInformation(preheader->GetLoopInformation());  // outward
    762   node->loop_info = vloop;
    763 }
    764 
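         // Generate a new loop 'for (i = lo; i < hi; i += step)' from the instruction map built for block,
         // emitting the mapped body 'unroll' times, each copy followed by an index increment of step.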
    765 void HLoopOptimization::GenerateNewLoop(LoopNode* node,
    766                                         HBasicBlock* block,
    767                                         HBasicBlock* new_preheader,
    768                                         HInstruction* lo,
    769                                         HInstruction* hi,
    770                                         HInstruction* step,
    771                                         uint32_t unroll) {
    772   DCHECK(unroll == 1 || vector_mode_ == kVector);
    773   Primitive::Type induc_type = Primitive::kPrimInt;
    774   // Prepare new loop.
     775   vector_preheader_ = new_preheader;
    776   vector_header_ = vector_preheader_->GetSingleSuccessor();
    777   vector_body_ = vector_header_->GetSuccessors()[1];
    778   HPhi* phi = new (global_allocator_) HPhi(global_allocator_,
    779                                            kNoRegNumber,
    780                                            0,
    781                                            HPhi::ToPhiType(induc_type));
    782   // Generate header and prepare body.
    783   // for (i = lo; i < hi; i += step)
    784   //    <loop-body>
    785   HInstruction* cond = new (global_allocator_) HAboveOrEqual(phi, hi);
    786   vector_header_->AddPhi(phi);
    787   vector_header_->AddInstruction(cond);
    788   vector_header_->AddInstruction(new (global_allocator_) HIf(cond));
    789   vector_index_ = phi;
    790   for (uint32_t u = 0; u < unroll; u++) {
     791     // Clear the map; when unrolling, keep the loop invariants already set up.
    792     if (u == 0) {
    793       vector_map_->clear();
    794     } else {
    795       for (auto i = vector_map_->begin(); i != vector_map_->end(); ) {
    796         if (i->second->IsVecReplicateScalar()) {
    797           DCHECK(node->loop_info->IsDefinedOutOfTheLoop(i->first));
    798           ++i;
    799         } else {
    800           i = vector_map_->erase(i);
    801         }
    802       }
    803     }
    804     // Generate instruction map.
    805     for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
    806       bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true);
    807       DCHECK(vectorized_def);
    808     }
    809     // Generate body from the instruction map, but in original program order.
    810     HEnvironment* env = vector_header_->GetFirstInstruction()->GetEnvironment();
    811     for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
    812       auto i = vector_map_->find(it.Current());
    813       if (i != vector_map_->end() && !i->second->IsInBlock()) {
    814         Insert(vector_body_, i->second);
    815         // Deal with instructions that need an environment, such as the scalar intrinsics.
    816         if (i->second->NeedsEnvironment()) {
    817           i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_);
    818         }
    819       }
    820     }
    821     vector_index_ = new (global_allocator_) HAdd(induc_type, vector_index_, step);
    822     Insert(vector_body_, vector_index_);
    823   }
    824   // Finalize phi for the loop index.
    825   phi->AddInput(lo);
    826   phi->AddInput(vector_index_);
    827   vector_index_ = phi;
    828 }
    829 
    830 // TODO: accept reductions at left-hand-side, mixed-type store idioms, etc.
    831 bool HLoopOptimization::VectorizeDef(LoopNode* node,
    832                                      HInstruction* instruction,
    833                                      bool generate_code) {
    834   // Accept a left-hand-side array base[index] for
    835   // (1) supported vector type,
    836   // (2) loop-invariant base,
    837   // (3) unit stride index,
    838   // (4) vectorizable right-hand-side value.
    839   uint64_t restrictions = kNone;
    840   if (instruction->IsArraySet()) {
    841     Primitive::Type type = instruction->AsArraySet()->GetComponentType();
    842     HInstruction* base = instruction->InputAt(0);
    843     HInstruction* index = instruction->InputAt(1);
    844     HInstruction* value = instruction->InputAt(2);
    845     HInstruction* offset = nullptr;
    846     if (TrySetVectorType(type, &restrictions) &&
    847         node->loop_info->IsDefinedOutOfTheLoop(base) &&
    848         induction_range_.IsUnitStride(instruction, index, graph_, &offset) &&
    849         VectorizeUse(node, value, generate_code, type, restrictions)) {
    850       if (generate_code) {
    851         GenerateVecSub(index, offset);
    852         GenerateVecMem(instruction, vector_map_->Get(index), vector_map_->Get(value), offset, type);
    853       } else {
    854         vector_refs_->insert(ArrayReference(base, offset, type, /*lhs*/ true));
    855       }
    856       return true;
    857     }
    858     return false;
    859   }
    860   // Branch back okay.
    861   if (instruction->IsGoto()) {
    862     return true;
    863   }
    864   // Otherwise accept only expressions with no effects outside the immediate loop-body.
    865   // Note that actual uses are inspected during right-hand-side tree traversal.
    866   return !IsUsedOutsideLoop(node->loop_info, instruction) && !instruction->DoesAnyWrite();
    867 }
    868 
    869 // TODO: saturation arithmetic.
    870 bool HLoopOptimization::VectorizeUse(LoopNode* node,
    871                                      HInstruction* instruction,
    872                                      bool generate_code,
    873                                      Primitive::Type type,
    874                                      uint64_t restrictions) {
    875   // Accept anything for which code has already been generated.
    876   if (generate_code) {
    877     if (vector_map_->find(instruction) != vector_map_->end()) {
    878       return true;
    879     }
    880   }
    881   // Continue the right-hand-side tree traversal, passing in proper
    882   // types and vector restrictions along the way. During code generation,
    883   // all new nodes are drawn from the global allocator.
    884   if (node->loop_info->IsDefinedOutOfTheLoop(instruction)) {
    885     // Accept invariant use, using scalar expansion.
    886     if (generate_code) {
    887       GenerateVecInv(instruction, type);
    888     }
    889     return true;
    890   } else if (instruction->IsArrayGet()) {
    891     // Deal with vector restrictions.
    892     if (instruction->AsArrayGet()->IsStringCharAt() &&
    893         HasVectorRestrictions(restrictions, kNoStringCharAt)) {
    894       return false;
    895     }
    896     // Accept a right-hand-side array base[index] for
    897     // (1) exact matching vector type,
    898     // (2) loop-invariant base,
    899     // (3) unit stride index,
    901     HInstruction* base = instruction->InputAt(0);
    902     HInstruction* index = instruction->InputAt(1);
    903     HInstruction* offset = nullptr;
    904     if (type == instruction->GetType() &&
    905         node->loop_info->IsDefinedOutOfTheLoop(base) &&
    906         induction_range_.IsUnitStride(instruction, index, graph_, &offset)) {
    907       if (generate_code) {
    908         GenerateVecSub(index, offset);
    909         GenerateVecMem(instruction, vector_map_->Get(index), nullptr, offset, type);
    910       } else {
    911         vector_refs_->insert(ArrayReference(base, offset, type, /*lhs*/ false));
    912       }
    913       return true;
    914     }
    915   } else if (instruction->IsTypeConversion()) {
    916     // Accept particular type conversions.
    917     HTypeConversion* conversion = instruction->AsTypeConversion();
    918     HInstruction* opa = conversion->InputAt(0);
    919     Primitive::Type from = conversion->GetInputType();
    920     Primitive::Type to = conversion->GetResultType();
    921     if ((to == Primitive::kPrimByte ||
    922          to == Primitive::kPrimChar ||
    923          to == Primitive::kPrimShort) && from == Primitive::kPrimInt) {
    924       // Accept a "narrowing" type conversion from a "wider" computation for
    925       // (1) conversion into final required type,
    926       // (2) vectorizable operand,
    927       // (3) "wider" operations cannot bring in higher order bits.
    928       if (to == type && VectorizeUse(node, opa, generate_code, type, restrictions | kNoHiBits)) {
    929         if (generate_code) {
    930           if (vector_mode_ == kVector) {
    931             vector_map_->Put(instruction, vector_map_->Get(opa));  // operand pass-through
    932           } else {
    933             GenerateVecOp(instruction, vector_map_->Get(opa), nullptr, type);
    934           }
    935         }
    936         return true;
    937       }
    938     } else if (to == Primitive::kPrimFloat && from == Primitive::kPrimInt) {
    939       DCHECK_EQ(to, type);
    940       // Accept int to float conversion for
    941       // (1) supported int,
    942       // (2) vectorizable operand.
    943       if (TrySetVectorType(from, &restrictions) &&
    944           VectorizeUse(node, opa, generate_code, from, restrictions)) {
    945         if (generate_code) {
    946           GenerateVecOp(instruction, vector_map_->Get(opa), nullptr, type);
    947         }
    948         return true;
    949       }
    950     }
    951     return false;
    952   } else if (instruction->IsNeg() || instruction->IsNot() || instruction->IsBooleanNot()) {
    953     // Accept unary operator for vectorizable operand.
    954     HInstruction* opa = instruction->InputAt(0);
    955     if (VectorizeUse(node, opa, generate_code, type, restrictions)) {
    956       if (generate_code) {
    957         GenerateVecOp(instruction, vector_map_->Get(opa), nullptr, type);
    958       }
    959       return true;
    960     }
    961   } else if (instruction->IsAdd() || instruction->IsSub() ||
    962              instruction->IsMul() || instruction->IsDiv() ||
    963              instruction->IsAnd() || instruction->IsOr()  || instruction->IsXor()) {
    964     // Deal with vector restrictions.
    965     if ((instruction->IsMul() && HasVectorRestrictions(restrictions, kNoMul)) ||
    966         (instruction->IsDiv() && HasVectorRestrictions(restrictions, kNoDiv))) {
    967       return false;
    968     }
    969     // Accept binary operator for vectorizable operands.
    970     HInstruction* opa = instruction->InputAt(0);
    971     HInstruction* opb = instruction->InputAt(1);
    972     if (VectorizeUse(node, opa, generate_code, type, restrictions) &&
    973         VectorizeUse(node, opb, generate_code, type, restrictions)) {
    974       if (generate_code) {
    975         GenerateVecOp(instruction, vector_map_->Get(opa), vector_map_->Get(opb), type);
    976       }
    977       return true;
    978     }
    979   } else if (instruction->IsShl() || instruction->IsShr() || instruction->IsUShr()) {
    980     // Recognize vectorization idioms.
    981     if (VectorizeHalvingAddIdiom(node, instruction, generate_code, type, restrictions)) {
    982       return true;
    983     }
    984     // Deal with vector restrictions.
    985     HInstruction* opa = instruction->InputAt(0);
    986     HInstruction* opb = instruction->InputAt(1);
    987     HInstruction* r = opa;
    988     bool is_unsigned = false;
    989     if ((HasVectorRestrictions(restrictions, kNoShift)) ||
    990         (instruction->IsShr() && HasVectorRestrictions(restrictions, kNoShr))) {
    991       return false;  // unsupported instruction
    992     } else if (HasVectorRestrictions(restrictions, kNoHiBits)) {
    993       // Shifts right need extra care to account for higher order bits.
     994       // TODO: less likely, but shr/unsigned and ushr/signed could be handled by flipping signedness.
    995       if (instruction->IsShr() &&
    996           (!IsNarrowerOperand(opa, type, &r, &is_unsigned) || is_unsigned)) {
    997         return false;  // reject, unless all operands are sign-extension narrower
    998       } else if (instruction->IsUShr() &&
    999                  (!IsNarrowerOperand(opa, type, &r, &is_unsigned) || !is_unsigned)) {
   1000         return false;  // reject, unless all operands are zero-extension narrower
   1001       }
   1002     }
   1003     // Accept shift operator for vectorizable/invariant operands.
   1004     // TODO: accept symbolic, albeit loop invariant shift factors.
   1005     DCHECK(r != nullptr);
   1006     if (generate_code && vector_mode_ != kVector) {  // de-idiom
   1007       r = opa;
   1008     }
   1009     int64_t distance = 0;
   1010     if (VectorizeUse(node, r, generate_code, type, restrictions) &&
   1011         IsInt64AndGet(opb, /*out*/ &distance)) {
   1012       // Restrict shift distance to packed data type width.
   1013       int64_t max_distance = Primitive::ComponentSize(type) * 8;
   1014       if (0 <= distance && distance < max_distance) {
   1015         if (generate_code) {
   1016           GenerateVecOp(instruction, vector_map_->Get(r), opb, type);
   1017         }
   1018         return true;
   1019       }
   1020     }
   1021   } else if (instruction->IsInvokeStaticOrDirect()) {
   1022     // Accept particular intrinsics.
   1023     HInvokeStaticOrDirect* invoke = instruction->AsInvokeStaticOrDirect();
   1024     switch (invoke->GetIntrinsic()) {
   1025       case Intrinsics::kMathAbsInt:
   1026       case Intrinsics::kMathAbsLong:
   1027       case Intrinsics::kMathAbsFloat:
   1028       case Intrinsics::kMathAbsDouble: {
   1029         // Deal with vector restrictions.
   1030         HInstruction* opa = instruction->InputAt(0);
   1031         HInstruction* r = opa;
   1032         bool is_unsigned = false;
   1033         if (HasVectorRestrictions(restrictions, kNoAbs)) {
   1034           return false;
   1035         } else if (HasVectorRestrictions(restrictions, kNoHiBits) &&
   1036                    (!IsNarrowerOperand(opa, type, &r, &is_unsigned) || is_unsigned)) {
   1037           return false;  // reject, unless operand is sign-extension narrower
   1038         }
   1039         // Accept ABS(x) for vectorizable operand.
   1040         DCHECK(r != nullptr);
   1041         if (generate_code && vector_mode_ != kVector) {  // de-idiom
   1042           r = opa;
   1043         }
   1044         if (VectorizeUse(node, r, generate_code, type, restrictions)) {
   1045           if (generate_code) {
   1046             GenerateVecOp(instruction, vector_map_->Get(r), nullptr, type);
   1047           }
   1048           return true;
   1049         }
   1050         return false;
   1051       }
   1052       case Intrinsics::kMathMinIntInt:
   1053       case Intrinsics::kMathMinLongLong:
   1054       case Intrinsics::kMathMinFloatFloat:
   1055       case Intrinsics::kMathMinDoubleDouble:
   1056       case Intrinsics::kMathMaxIntInt:
   1057       case Intrinsics::kMathMaxLongLong:
   1058       case Intrinsics::kMathMaxFloatFloat:
   1059       case Intrinsics::kMathMaxDoubleDouble: {
   1060         // Deal with vector restrictions.
   1061         HInstruction* opa = instruction->InputAt(0);
   1062         HInstruction* opb = instruction->InputAt(1);
   1063         HInstruction* r = opa;
   1064         HInstruction* s = opb;
   1065         bool is_unsigned = false;
   1066         if (HasVectorRestrictions(restrictions, kNoMinMax)) {
   1067           return false;
   1068         } else if (HasVectorRestrictions(restrictions, kNoHiBits) &&
   1069                    !IsNarrowerOperands(opa, opb, type, &r, &s, &is_unsigned)) {
   1070           return false;  // reject, unless all operands are same-extension narrower
   1071         }
   1072         // Accept MIN/MAX(x, y) for vectorizable operands.
   1073         DCHECK(r != nullptr && s != nullptr);
   1074         if (generate_code && vector_mode_ != kVector) {  // de-idiom
   1075           r = opa;
   1076           s = opb;
   1077         }
   1078         if (VectorizeUse(node, r, generate_code, type, restrictions) &&
   1079             VectorizeUse(node, s, generate_code, type, restrictions)) {
   1080           if (generate_code) {
   1081             GenerateVecOp(
   1082                 instruction, vector_map_->Get(r), vector_map_->Get(s), type, is_unsigned);
   1083           }
   1084           return true;
   1085         }
   1086         return false;
   1087       }
   1088       default:
   1089         return false;
   1090     }  // switch
   1091   }
   1092   return false;
   1093 }
   1094 
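         // Query target support for the given packed type: on success, the corresponding vector length
         // (in lanes) is recorded and *restrictions accumulates the operations that cannot be vectorized.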
   1095 bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restrictions) {
   1096   const InstructionSetFeatures* features = compiler_driver_->GetInstructionSetFeatures();
   1097   switch (compiler_driver_->GetInstructionSet()) {
   1098     case kArm:
   1099     case kThumb2:
   1100       // Allow vectorization for all ARM devices, because Android assumes that
   1101       // ARM 32-bit always supports advanced SIMD.
   1102       switch (type) {
   1103         case Primitive::kPrimBoolean:
   1104         case Primitive::kPrimByte:
   1105           *restrictions |= kNoDiv;
   1106           return TrySetVectorLength(8);
   1107         case Primitive::kPrimChar:
   1108         case Primitive::kPrimShort:
   1109           *restrictions |= kNoDiv | kNoStringCharAt;
   1110           return TrySetVectorLength(4);
   1111         case Primitive::kPrimInt:
   1112           *restrictions |= kNoDiv;
   1113           return TrySetVectorLength(2);
   1114         default:
   1115           break;
   1116       }
   1117       return false;
   1118     case kArm64:
   1119       // Allow vectorization for all ARM devices, because Android assumes that
   1120       // ARMv8 AArch64 always supports advanced SIMD.
   1121       switch (type) {
   1122         case Primitive::kPrimBoolean:
   1123         case Primitive::kPrimByte:
   1124           *restrictions |= kNoDiv;
   1125           return TrySetVectorLength(16);
   1126         case Primitive::kPrimChar:
   1127         case Primitive::kPrimShort:
   1128           *restrictions |= kNoDiv;
   1129           return TrySetVectorLength(8);
   1130         case Primitive::kPrimInt:
   1131           *restrictions |= kNoDiv;
   1132           return TrySetVectorLength(4);
   1133         case Primitive::kPrimLong:
   1134           *restrictions |= kNoDiv | kNoMul | kNoMinMax;
   1135           return TrySetVectorLength(2);
   1136         case Primitive::kPrimFloat:
   1137           return TrySetVectorLength(4);
   1138         case Primitive::kPrimDouble:
   1139           return TrySetVectorLength(2);
   1140         default:
   1141           return false;
   1142       }
   1143     case kX86:
   1144     case kX86_64:
    1145       // Allow vectorization for SSE4.1-enabled X86 devices only (128-bit vectors).
   1146       if (features->AsX86InstructionSetFeatures()->HasSSE4_1()) {
   1147         switch (type) {
   1148           case Primitive::kPrimBoolean:
   1149           case Primitive::kPrimByte:
   1150             *restrictions |= kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd;
   1151             return TrySetVectorLength(16);
   1152           case Primitive::kPrimChar:
   1153           case Primitive::kPrimShort:
   1154             *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd;
   1155             return TrySetVectorLength(8);
   1156           case Primitive::kPrimInt:
   1157             *restrictions |= kNoDiv;
   1158             return TrySetVectorLength(4);
   1159           case Primitive::kPrimLong:
   1160             *restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs | kNoMinMax;
   1161             return TrySetVectorLength(2);
   1162           case Primitive::kPrimFloat:
   1163             *restrictions |= kNoMinMax;  // -0.0 vs +0.0
   1164             return TrySetVectorLength(4);
   1165           case Primitive::kPrimDouble:
   1166             *restrictions |= kNoMinMax;  // -0.0 vs +0.0
   1167             return TrySetVectorLength(2);
   1168           default:
   1169             break;
   1170         }  // switch type
   1171       }
   1172       return false;
   1173     case kMips:
   1174       if (features->AsMipsInstructionSetFeatures()->HasMsa()) {
   1175         switch (type) {
   1176           case Primitive::kPrimBoolean:
   1177           case Primitive::kPrimByte:
   1178             *restrictions |= kNoDiv;
   1179             return TrySetVectorLength(16);
   1180           case Primitive::kPrimChar:
   1181           case Primitive::kPrimShort:
   1182             *restrictions |= kNoDiv | kNoStringCharAt;
   1183             return TrySetVectorLength(8);
   1184           case Primitive::kPrimInt:
   1185             *restrictions |= kNoDiv;
   1186             return TrySetVectorLength(4);
   1187           case Primitive::kPrimLong:
   1188             *restrictions |= kNoDiv;
   1189             return TrySetVectorLength(2);
   1190           case Primitive::kPrimFloat:
   1191             *restrictions |= kNoMinMax;  // min/max(x, NaN)
   1192             return TrySetVectorLength(4);
   1193           case Primitive::kPrimDouble:
   1194             *restrictions |= kNoMinMax;  // min/max(x, NaN)
   1195             return TrySetVectorLength(2);
   1196           default:
   1197             break;
   1198         }  // switch type
   1199       }
   1200       return false;
   1201     case kMips64:
   1202       if (features->AsMips64InstructionSetFeatures()->HasMsa()) {
   1203         switch (type) {
   1204           case Primitive::kPrimBoolean:
   1205           case Primitive::kPrimByte:
   1206             *restrictions |= kNoDiv;
   1207             return TrySetVectorLength(16);
   1208           case Primitive::kPrimChar:
   1209           case Primitive::kPrimShort:
   1210             *restrictions |= kNoDiv | kNoStringCharAt;
   1211             return TrySetVectorLength(8);
   1212           case Primitive::kPrimInt:
   1213             *restrictions |= kNoDiv;
   1214             return TrySetVectorLength(4);
   1215           case Primitive::kPrimLong:
   1216             *restrictions |= kNoDiv;
   1217             return TrySetVectorLength(2);
   1218           case Primitive::kPrimFloat:
   1219             *restrictions |= kNoMinMax;  // min/max(x, NaN)
   1220             return TrySetVectorLength(4);
   1221           case Primitive::kPrimDouble:
   1222             *restrictions |= kNoMinMax;  // min/max(x, NaN)
   1223             return TrySetVectorLength(2);
   1224           default:
   1225             break;
   1226         }  // switch type
   1227       }
   1228       return false;
   1229     default:
   1230       return false;
   1231   }  // switch instruction set
   1232 }
   1233 
   1234 bool HLoopOptimization::TrySetVectorLength(uint32_t length) {
   1235   DCHECK(IsPowerOfTwo(length) && length >= 2u);
   1236   // First time set?
   1237   if (vector_length_ == 0) {
   1238     vector_length_ = length;
   1239   }
   1240   // Different types are acceptable within a loop-body, as long as all the corresponding vector
   1241   // lengths match exactly to obtain a uniform traversal through the vector iteration space
   1242   // (idiomatic exceptions to this rule can be handled by further unrolling sub-expressions).
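           // For example (an illustrative note, not in the original source): on a 128-bit SIMD
           // target, mixing int (length 4) and float (length 4) operations in one loop body is
           // acceptable, whereas mixing byte (length 16) and int (length 4) operations is rejected
           // by the length check below.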
   1243   return vector_length_ == length;
   1244 }
   1245 
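         // Illustrative sketch of the cases handled below (assumed example, not from the original
         // source): for a loop-invariant scalar x in something like
         //   for (int i = 0; i < n; i++) a[i] += x;
         // sequential (scalar) code keeps using x itself, while vector code broadcasts x once in
         // the loop preheader via HVecReplicateScalar so that every lane sees the same value.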
   1246 void HLoopOptimization::GenerateVecInv(HInstruction* org, Primitive::Type type) {
   1247   if (vector_map_->find(org) == vector_map_->end()) {
   1248     // In scalar code, just use a self pass-through for scalar invariants
   1249     // (viz. expression remains itself).
   1250     if (vector_mode_ == kSequential) {
   1251       vector_map_->Put(org, org);
   1252       return;
   1253     }
   1254     // In vector code, explicit scalar expansion is needed.
   1255     HInstruction* vector = new (global_allocator_) HVecReplicateScalar(
   1256         global_allocator_, org, type, vector_length_);
   1257     vector_map_->Put(org, Insert(vector_preheader_, vector));
   1258   }
   1259 }
   1260 
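         // Illustrative sketch (assumed example, not from the original source): for a subscript
         // with a nonzero invariant offset, e.g. a[i + base], the generated subscript is the
         // current vector induction plus that offset, roughly
         //   subscript = HAdd(kPrimInt, vector_index_, offset);
         // a constant zero offset simply reuses vector_index_ directly.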
   1261 void HLoopOptimization::GenerateVecSub(HInstruction* org, HInstruction* offset) {
   1262   if (vector_map_->find(org) == vector_map_->end()) {
   1263     HInstruction* subscript = vector_index_;
   1264     int64_t value = 0;
   1265     if (!IsInt64AndGet(offset, &value) || value != 0) {
   1266       subscript = new (global_allocator_) HAdd(Primitive::kPrimInt, subscript, offset);
   1267       if (org->IsPhi()) {
   1268         Insert(vector_body_, subscript);  // lacks layout placeholder
   1269       }
   1270     }
   1271     vector_map_->Put(org, subscript);
   1272   }
   1273 }
   1274 
   1275 void HLoopOptimization::GenerateVecMem(HInstruction* org,
   1276                                        HInstruction* opa,
   1277                                        HInstruction* opb,
   1278                                        HInstruction* offset,
   1279                                        Primitive::Type type) {
   1280   HInstruction* vector = nullptr;
   1281   if (vector_mode_ == kVector) {
   1282     // Vector store or load.
   1283     HInstruction* base = org->InputAt(0);
   1284     if (opb != nullptr) {
   1285       vector = new (global_allocator_) HVecStore(
   1286           global_allocator_, base, opa, opb, type, vector_length_);
   1287     } else  {
   1288       bool is_string_char_at = org->AsArrayGet()->IsStringCharAt();
   1289       vector = new (global_allocator_) HVecLoad(
   1290           global_allocator_, base, opa, type, vector_length_, is_string_char_at);
   1291     }
   1292     // Known dynamically enforced alignment?
   1293     // TODO: detect offset + constant differences.
    1294     // TODO: in the long run, static alignment analysis?
   1295     if (vector_peeling_candidate_ != nullptr &&
   1296         vector_peeling_candidate_->base == base &&
   1297         vector_peeling_candidate_->offset == offset) {
   1298       vector->AsVecMemoryOperation()->SetAlignment(Alignment(kAlignedBase, 0));
   1299     }
   1300   } else {
   1301     // Scalar store or load.
   1302     DCHECK(vector_mode_ == kSequential);
   1303     if (opb != nullptr) {
   1304       vector = new (global_allocator_) HArraySet(org->InputAt(0), opa, opb, type, kNoDexPc);
   1305     } else  {
   1306       bool is_string_char_at = org->AsArrayGet()->IsStringCharAt();
   1307       vector = new (global_allocator_) HArrayGet(
   1308           org->InputAt(0), opa, type, kNoDexPc, is_string_char_at);
   1309     }
   1310   }
   1311   vector_map_->Put(org, vector);
   1312 }
   1313 
   1314 #define GENERATE_VEC(x, y) \
   1315   if (vector_mode_ == kVector) { \
   1316     vector = (x); \
   1317   } else { \
   1318     DCHECK(vector_mode_ == kSequential); \
   1319     vector = (y); \
   1320   } \
   1321   break;
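         // For instance (illustration only), a use such as
         //   GENERATE_VEC(new (...) HVecAdd(...), new (...) HAdd(...));
         // expands to an if/else that assigns the vector form under kVector and the scalar form
         // under kSequential, followed by a break out of the enclosing switch in GenerateVecOp.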
   1322 
   1323 void HLoopOptimization::GenerateVecOp(HInstruction* org,
   1324                                       HInstruction* opa,
   1325                                       HInstruction* opb,
   1326                                       Primitive::Type type,
   1327                                       bool is_unsigned) {
   1328   if (vector_mode_ == kSequential) {
   1329     // Non-converting scalar code follows implicit integral promotion.
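             // (For example, a byte-typed add from the original loop is re-generated as an int add
             // here, since Java arithmetic on byte operands is performed in int; only an explicit
             // HTypeConversion keeps its narrower destination type.)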
   1330     if (!org->IsTypeConversion() && (type == Primitive::kPrimBoolean ||
   1331                                      type == Primitive::kPrimByte ||
   1332                                      type == Primitive::kPrimChar ||
   1333                                      type == Primitive::kPrimShort)) {
   1334       type = Primitive::kPrimInt;
   1335     }
   1336   }
   1337   HInstruction* vector = nullptr;
   1338   switch (org->GetKind()) {
   1339     case HInstruction::kNeg:
   1340       DCHECK(opb == nullptr);
   1341       GENERATE_VEC(
   1342           new (global_allocator_) HVecNeg(global_allocator_, opa, type, vector_length_),
   1343           new (global_allocator_) HNeg(type, opa));
   1344     case HInstruction::kNot:
   1345       DCHECK(opb == nullptr);
   1346       GENERATE_VEC(
   1347           new (global_allocator_) HVecNot(global_allocator_, opa, type, vector_length_),
   1348           new (global_allocator_) HNot(type, opa));
   1349     case HInstruction::kBooleanNot:
   1350       DCHECK(opb == nullptr);
   1351       GENERATE_VEC(
   1352           new (global_allocator_) HVecNot(global_allocator_, opa, type, vector_length_),
   1353           new (global_allocator_) HBooleanNot(opa));
   1354     case HInstruction::kTypeConversion:
   1355       DCHECK(opb == nullptr);
   1356       GENERATE_VEC(
   1357           new (global_allocator_) HVecCnv(global_allocator_, opa, type, vector_length_),
   1358           new (global_allocator_) HTypeConversion(type, opa, kNoDexPc));
   1359     case HInstruction::kAdd:
   1360       GENERATE_VEC(
   1361           new (global_allocator_) HVecAdd(global_allocator_, opa, opb, type, vector_length_),
   1362           new (global_allocator_) HAdd(type, opa, opb));
   1363     case HInstruction::kSub:
   1364       GENERATE_VEC(
   1365           new (global_allocator_) HVecSub(global_allocator_, opa, opb, type, vector_length_),
   1366           new (global_allocator_) HSub(type, opa, opb));
   1367     case HInstruction::kMul:
   1368       GENERATE_VEC(
   1369           new (global_allocator_) HVecMul(global_allocator_, opa, opb, type, vector_length_),
   1370           new (global_allocator_) HMul(type, opa, opb));
   1371     case HInstruction::kDiv:
   1372       GENERATE_VEC(
   1373           new (global_allocator_) HVecDiv(global_allocator_, opa, opb, type, vector_length_),
   1374           new (global_allocator_) HDiv(type, opa, opb, kNoDexPc));
   1375     case HInstruction::kAnd:
   1376       GENERATE_VEC(
   1377           new (global_allocator_) HVecAnd(global_allocator_, opa, opb, type, vector_length_),
   1378           new (global_allocator_) HAnd(type, opa, opb));
   1379     case HInstruction::kOr:
   1380       GENERATE_VEC(
   1381           new (global_allocator_) HVecOr(global_allocator_, opa, opb, type, vector_length_),
   1382           new (global_allocator_) HOr(type, opa, opb));
   1383     case HInstruction::kXor:
   1384       GENERATE_VEC(
   1385           new (global_allocator_) HVecXor(global_allocator_, opa, opb, type, vector_length_),
   1386           new (global_allocator_) HXor(type, opa, opb));
   1387     case HInstruction::kShl:
   1388       GENERATE_VEC(
   1389           new (global_allocator_) HVecShl(global_allocator_, opa, opb, type, vector_length_),
   1390           new (global_allocator_) HShl(type, opa, opb));
   1391     case HInstruction::kShr:
   1392       GENERATE_VEC(
   1393           new (global_allocator_) HVecShr(global_allocator_, opa, opb, type, vector_length_),
   1394           new (global_allocator_) HShr(type, opa, opb));
   1395     case HInstruction::kUShr:
   1396       GENERATE_VEC(
   1397           new (global_allocator_) HVecUShr(global_allocator_, opa, opb, type, vector_length_),
   1398           new (global_allocator_) HUShr(type, opa, opb));
   1399     case HInstruction::kInvokeStaticOrDirect: {
   1400       HInvokeStaticOrDirect* invoke = org->AsInvokeStaticOrDirect();
   1401       if (vector_mode_ == kVector) {
   1402         switch (invoke->GetIntrinsic()) {
   1403           case Intrinsics::kMathAbsInt:
   1404           case Intrinsics::kMathAbsLong:
   1405           case Intrinsics::kMathAbsFloat:
   1406           case Intrinsics::kMathAbsDouble:
   1407             DCHECK(opb == nullptr);
   1408             vector = new (global_allocator_) HVecAbs(global_allocator_, opa, type, vector_length_);
   1409             break;
   1410           case Intrinsics::kMathMinIntInt:
   1411           case Intrinsics::kMathMinLongLong:
   1412           case Intrinsics::kMathMinFloatFloat:
   1413           case Intrinsics::kMathMinDoubleDouble: {
   1414             vector = new (global_allocator_)
   1415                 HVecMin(global_allocator_, opa, opb, type, vector_length_, is_unsigned);
   1416             break;
   1417           }
   1418           case Intrinsics::kMathMaxIntInt:
   1419           case Intrinsics::kMathMaxLongLong:
   1420           case Intrinsics::kMathMaxFloatFloat:
   1421           case Intrinsics::kMathMaxDoubleDouble: {
   1422             vector = new (global_allocator_)
   1423                 HVecMax(global_allocator_, opa, opb, type, vector_length_, is_unsigned);
   1424             break;
   1425           }
   1426           default:
   1427             LOG(FATAL) << "Unsupported SIMD intrinsic";
   1428             UNREACHABLE();
   1429         }  // switch invoke
   1430       } else {
   1431         // In scalar code, simply clone the method invoke, and replace its operands with the
   1432         // corresponding new scalar instructions in the loop. The instruction will get an
   1433         // environment while being inserted from the instruction map in original program order.
   1434         DCHECK(vector_mode_ == kSequential);
   1435         size_t num_args = invoke->GetNumberOfArguments();
   1436         HInvokeStaticOrDirect* new_invoke = new (global_allocator_) HInvokeStaticOrDirect(
   1437             global_allocator_,
   1438             num_args,
   1439             invoke->GetType(),
   1440             invoke->GetDexPc(),
   1441             invoke->GetDexMethodIndex(),
   1442             invoke->GetResolvedMethod(),
   1443             invoke->GetDispatchInfo(),
   1444             invoke->GetInvokeType(),
   1445             invoke->GetTargetMethod(),
   1446             invoke->GetClinitCheckRequirement());
   1447         HInputsRef inputs = invoke->GetInputs();
   1448         size_t num_inputs = inputs.size();
   1449         DCHECK_LE(num_args, num_inputs);
   1450         DCHECK_EQ(num_inputs, new_invoke->GetInputs().size());  // both invokes agree
   1451         for (size_t index = 0; index < num_inputs; ++index) {
   1452           HInstruction* new_input = index < num_args
   1453               ? vector_map_->Get(inputs[index])
   1454               : inputs[index];  // beyond arguments: just pass through
   1455           new_invoke->SetArgumentAt(index, new_input);
   1456         }
   1457         new_invoke->SetIntrinsic(invoke->GetIntrinsic(),
   1458                                  kNeedsEnvironmentOrCache,
   1459                                  kNoSideEffects,
   1460                                  kNoThrow);
   1461         vector = new_invoke;
   1462       }
   1463       break;
   1464     }
   1465     default:
   1466       break;
   1467   }  // switch
   1468   CHECK(vector != nullptr) << "Unsupported SIMD operator";
   1469   vector_map_->Put(org, vector);
   1470 }
   1471 
   1472 #undef GENERATE_VEC
   1473 
   1474 //
   1475 // Vectorization idioms.
   1476 //
   1477 
   1478 // Method recognizes the following idioms:
   1479 //   rounding halving add (a + b + 1) >> 1 for unsigned/signed operands a, b
   1480 //   regular  halving add (a + b)     >> 1 for unsigned/signed operands a, b
   1481 // Provided that the operands are promoted to a wider form to do the arithmetic and
    1482 // then cast back to the narrower form, the idioms can be mapped into an efficient SIMD
    1483 // implementation that operates directly on the narrower form (plus one extra bit).
   1484 // TODO: current version recognizes implicit byte/short/char widening only;
   1485 //       explicit widening from int to long could be added later.
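         // For example (illustration only, assuming byte arrays a, b, c in the original source):
         //   c[i] = (byte) ((a[i] + b[i] + 1) >> 1);   // rounding halving add
         //   c[i] = (byte) ((a[i] + b[i])     >> 1);   // regular  halving add
         // both map to a single HVecHalvingAdd over packed byte lanes in vectorized code.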
   1486 bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node,
   1487                                                  HInstruction* instruction,
   1488                                                  bool generate_code,
   1489                                                  Primitive::Type type,
   1490                                                  uint64_t restrictions) {
    1491   // Test for a top-level arithmetic shift right x >> 1 or logical shift right x >>> 1
    1492   // (note that whether the sign bit in wider precision is shifted in has no effect
    1493   // on the narrower precision computed by the idiom).
   1494   int64_t distance = 0;
   1495   if ((instruction->IsShr() ||
   1496        instruction->IsUShr()) &&
   1497       IsInt64AndGet(instruction->InputAt(1), /*out*/ &distance) && distance == 1) {
   1498     // Test for (a + b + c) >> 1 for optional constant c.
   1499     HInstruction* a = nullptr;
   1500     HInstruction* b = nullptr;
   1501     int64_t       c = 0;
   1502     if (IsAddConst(instruction->InputAt(0), /*out*/ &a, /*out*/ &b, /*out*/ &c)) {
   1503       DCHECK(a != nullptr && b != nullptr);
   1504       // Accept c == 1 (rounded) or c == 0 (not rounded).
   1505       bool is_rounded = false;
   1506       if (c == 1) {
   1507         is_rounded = true;
   1508       } else if (c != 0) {
   1509         return false;
   1510       }
   1511       // Accept consistent zero or sign extension on operands a and b.
   1512       HInstruction* r = nullptr;
   1513       HInstruction* s = nullptr;
   1514       bool is_unsigned = false;
   1515       if (!IsNarrowerOperands(a, b, type, &r, &s, &is_unsigned)) {
   1516         return false;
   1517       }
   1518       // Deal with vector restrictions.
   1519       if ((!is_unsigned && HasVectorRestrictions(restrictions, kNoSignedHAdd)) ||
   1520           (!is_rounded && HasVectorRestrictions(restrictions, kNoUnroundedHAdd))) {
   1521         return false;
   1522       }
   1523       // Accept recognized halving add for vectorizable operands. Vectorized code uses the
   1524       // shorthand idiomatic operation. Sequential code uses the original scalar expressions.
   1525       DCHECK(r != nullptr && s != nullptr);
   1526       if (generate_code && vector_mode_ != kVector) {  // de-idiom
   1527         r = instruction->InputAt(0);
   1528         s = instruction->InputAt(1);
   1529       }
   1530       if (VectorizeUse(node, r, generate_code, type, restrictions) &&
   1531           VectorizeUse(node, s, generate_code, type, restrictions)) {
   1532         if (generate_code) {
   1533           if (vector_mode_ == kVector) {
   1534             vector_map_->Put(instruction, new (global_allocator_) HVecHalvingAdd(
   1535                 global_allocator_,
   1536                 vector_map_->Get(r),
   1537                 vector_map_->Get(s),
   1538                 type,
   1539                 vector_length_,
   1540                 is_unsigned,
   1541                 is_rounded));
   1542           } else {
   1543             GenerateVecOp(instruction, vector_map_->Get(r), vector_map_->Get(s), type);
   1544           }
   1545         }
   1546         return true;
   1547       }
   1548     }
   1549   }
   1550   return false;
   1551 }
   1552 
   1553 //
   1554 // Vectorization heuristics.
   1555 //
   1556 
   1557 bool HLoopOptimization::IsVectorizationProfitable(int64_t trip_count) {
   1558   // Current heuristic: non-empty body with sufficient number
   1559   // of iterations (if known).
   1560   // TODO: refine by looking at e.g. operation count, alignment, etc.
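           // For example (illustration): a known trip count of 3 with a selected vector length of 4
           // is rejected, since not even one full vector iteration would execute.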
   1561   if (vector_length_ == 0) {
   1562     return false;  // nothing found
   1563   } else if (0 < trip_count && trip_count < vector_length_) {
   1564     return false;  // insufficient iterations
   1565   }
   1566   return true;
   1567 }
   1568 
   1569 void HLoopOptimization::SetPeelingCandidate(int64_t trip_count ATTRIBUTE_UNUSED) {
   1570   // Current heuristic: none.
   1571   // TODO: implement
   1572 }
   1573 
   1574 uint32_t HLoopOptimization::GetUnrollingFactor(HBasicBlock* block, int64_t trip_count) {
   1575   // Current heuristic: unroll by 2 on ARM64/X86 for large known trip
   1576   // counts and small loop bodies.
   1577   // TODO: refine with operation count, remaining iterations, etc.
   1578   //       Artem had some really cool ideas for this already.
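           // For example (illustration): with a selected vector length of 4, a body of at most 10
           // instructions and a known trip count of at least 16 yields an unrolling factor of 2 on
           // the targets below; everything else stays at 1.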
   1579   switch (compiler_driver_->GetInstructionSet()) {
   1580     case kArm64:
   1581     case kX86:
   1582     case kX86_64: {
   1583       size_t num_instructions = block->GetInstructions().CountSize();
   1584       if (num_instructions <= 10 && trip_count >= 4 * vector_length_) {
   1585         return 2;
   1586       }
   1587       return 1;
   1588     }
   1589     default:
   1590       return 1;
   1591   }
   1592 }
   1593 
   1594 //
   1595 // Helpers.
   1596 //
   1597 
   1598 bool HLoopOptimization::TrySetPhiInduction(HPhi* phi, bool restrict_uses) {
    1599   // Special case Phis that have an equivalent in a debuggable setup. Our graph checker isn't
   1600   // smart enough to follow strongly connected components (and it's probably not worth
   1601   // it to make it so). See b/33775412.
   1602   if (graph_->IsDebuggable() && phi->HasEquivalentPhi()) {
   1603     return false;
   1604   }
   1605   DCHECK(iset_->empty());
   1606   ArenaSet<HInstruction*>* set = induction_range_.LookupCycle(phi);
   1607   if (set != nullptr) {
   1608     for (HInstruction* i : *set) {
    1609       // Check that, other than instructions that are no longer in the graph (removed earlier),
    1610       // each instruction is removable and, when restricted uses are requested, that all uses of
    1611       // each instruction other than the phi are contained within the cycle.
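               // For example (illustration): in the cycle { i = Phi(0, i + 1); i + 1 }, an extra
               // use of i + 1 outside the cycle causes rejection when restricted uses are requested.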
   1612       if (!i->IsInBlock()) {
   1613         continue;
   1614       } else if (!i->IsRemovable()) {
   1615         return false;
   1616       } else if (i != phi && restrict_uses) {
   1617         for (const HUseListNode<HInstruction*>& use : i->GetUses()) {
   1618           if (set->find(use.GetUser()) == set->end()) {
   1619             return false;
   1620           }
   1621         }
   1622       }
   1623       iset_->insert(i);  // copy
   1624     }
   1625     return true;
   1626   }
   1627   return false;
   1628 }
   1629 
   1630 // Find: phi: Phi(init, addsub)
   1631 //       s:   SuspendCheck
   1632 //       c:   Condition(phi, bound)
   1633 //       i:   If(c)
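         // For example (illustration), the header of a simple counted loop like
         //   for (int i = 0; i < n; i++) { ... }
         // typically has exactly this shape: the induction phi, a suspend check, the condition
         // i < n, and the if that consumes it.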
    1634 // TODO: Find a less pattern-matching approach?
   1635 bool HLoopOptimization::TrySetSimpleLoopHeader(HBasicBlock* block) {
   1636   DCHECK(iset_->empty());
   1637   HInstruction* phi = block->GetFirstPhi();
   1638   if (phi != nullptr &&
   1639       phi->GetNext() == nullptr &&
   1640       TrySetPhiInduction(phi->AsPhi(), /*restrict_uses*/ false)) {
   1641     HInstruction* s = block->GetFirstInstruction();
   1642     if (s != nullptr && s->IsSuspendCheck()) {
   1643       HInstruction* c = s->GetNext();
   1644       if (c != nullptr &&
   1645           c->IsCondition() &&
   1646           c->GetUses().HasExactlyOneElement() &&  // only used for termination
   1647           !c->HasEnvironmentUses()) {  // unlikely, but not impossible
   1648         HInstruction* i = c->GetNext();
   1649         if (i != nullptr && i->IsIf() && i->InputAt(0) == c) {
   1650           iset_->insert(c);
   1651           iset_->insert(s);
   1652           return true;
   1653         }
   1654       }
   1655     }
   1656   }
   1657   return false;
   1658 }
   1659 
   1660 bool HLoopOptimization::IsEmptyBody(HBasicBlock* block) {
   1661   if (!block->GetPhis().IsEmpty()) {
   1662     return false;
   1663   }
   1664   for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
   1665     HInstruction* instruction = it.Current();
   1666     if (!instruction->IsGoto() && iset_->find(instruction) == iset_->end()) {
   1667       return false;
   1668     }
   1669   }
   1670   return true;
   1671 }
   1672 
   1673 bool HLoopOptimization::IsUsedOutsideLoop(HLoopInformation* loop_info,
   1674                                           HInstruction* instruction) {
   1675   for (const HUseListNode<HInstruction*>& use : instruction->GetUses()) {
   1676     if (use.GetUser()->GetBlock()->GetLoopInformation() != loop_info) {
   1677       return true;
   1678     }
   1679   }
   1680   return false;
   1681 }
   1682 
   1683 bool HLoopOptimization::IsOnlyUsedAfterLoop(HLoopInformation* loop_info,
   1684                                             HInstruction* instruction,
   1685                                             bool collect_loop_uses,
   1686                                             /*out*/ int32_t* use_count) {
   1687   for (const HUseListNode<HInstruction*>& use : instruction->GetUses()) {
   1688     HInstruction* user = use.GetUser();
   1689     if (iset_->find(user) == iset_->end()) {  // not excluded?
   1690       HLoopInformation* other_loop_info = user->GetBlock()->GetLoopInformation();
   1691       if (other_loop_info != nullptr && other_loop_info->IsIn(*loop_info)) {
   1692         // If collect_loop_uses is set, simply keep adding those uses to the set.
   1693         // Otherwise, reject uses inside the loop that were not already in the set.
   1694         if (collect_loop_uses) {
   1695           iset_->insert(user);
   1696           continue;
   1697         }
   1698         return false;
   1699       }
   1700       ++*use_count;
   1701     }
   1702   }
   1703   return true;
   1704 }
   1705 
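         // Illustrative example (assumed, not from the original source): for a loop such as
         //   for (i = 0; i < n; i++) { ... }  use(i);
         // the use of i after the loop can be rewritten to use the last value n provided by the
         // induction analysis, so the induction no longer needs to live past the loop.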
   1706 bool HLoopOptimization::TryReplaceWithLastValue(HLoopInformation* loop_info,
   1707                                                 HInstruction* instruction,
   1708                                                 HBasicBlock* block) {
   1709   // Try to replace outside uses with the last value.
   1710   if (induction_range_.CanGenerateLastValue(instruction)) {
   1711     HInstruction* replacement = induction_range_.GenerateLastValue(instruction, graph_, block);
   1712     const HUseList<HInstruction*>& uses = instruction->GetUses();
   1713     for (auto it = uses.begin(), end = uses.end(); it != end;) {
   1714       HInstruction* user = it->GetUser();
   1715       size_t index = it->GetIndex();
   1716       ++it;  // increment before replacing
   1717       if (iset_->find(user) == iset_->end()) {  // not excluded?
   1718         if (kIsDebugBuild) {
   1719           // We have checked earlier in 'IsOnlyUsedAfterLoop' that the use is after the loop.
   1720           HLoopInformation* other_loop_info = user->GetBlock()->GetLoopInformation();
   1721           CHECK(other_loop_info == nullptr || !other_loop_info->IsIn(*loop_info));
   1722         }
   1723         user->ReplaceInput(replacement, index);
   1724         induction_range_.Replace(user, instruction, replacement);  // update induction
   1725       }
   1726     }
   1727     const HUseList<HEnvironment*>& env_uses = instruction->GetEnvUses();
   1728     for (auto it = env_uses.begin(), end = env_uses.end(); it != end;) {
   1729       HEnvironment* user = it->GetUser();
   1730       size_t index = it->GetIndex();
   1731       ++it;  // increment before replacing
   1732       if (iset_->find(user->GetHolder()) == iset_->end()) {  // not excluded?
   1733         // Only update environment uses after the loop.
   1734         HLoopInformation* other_loop_info = user->GetHolder()->GetBlock()->GetLoopInformation();
   1735         if (other_loop_info == nullptr || !other_loop_info->IsIn(*loop_info)) {
   1736           user->RemoveAsUserOfInput(index);
   1737           user->SetRawEnvAt(index, replacement);
   1738           replacement->AddEnvUseAt(user, index);
   1739         }
   1740       }
   1741     }
   1742     induction_simplication_count_++;
   1743     return true;
   1744   }
   1745   return false;
   1746 }
   1747 
   1748 bool HLoopOptimization::TryAssignLastValue(HLoopInformation* loop_info,
   1749                                            HInstruction* instruction,
   1750                                            HBasicBlock* block,
   1751                                            bool collect_loop_uses) {
   1752   // Assigning the last value is always successful if there are no uses.
    1753   // Otherwise, it succeeds in a loop without early exits by generating the
   1754   // proper last value assignment.
   1755   int32_t use_count = 0;
   1756   return IsOnlyUsedAfterLoop(loop_info, instruction, collect_loop_uses, &use_count) &&
   1757       (use_count == 0 ||
   1758        (!IsEarlyExit(loop_info) && TryReplaceWithLastValue(loop_info, instruction, block)));
   1759 }
   1760 
   1761 void HLoopOptimization::RemoveDeadInstructions(const HInstructionList& list) {
   1762   for (HBackwardInstructionIterator i(list); !i.Done(); i.Advance()) {
   1763     HInstruction* instruction = i.Current();
   1764     if (instruction->IsDeadAndRemovable()) {
   1765       simplified_ = true;
   1766       instruction->GetBlock()->RemoveInstructionOrPhi(instruction);
   1767     }
   1768   }
   1769 }
   1770 
   1771 bool HLoopOptimization::CanRemoveCycle() {
   1772   for (HInstruction* i : *iset_) {
   1773     // We can never remove instructions that have environment
   1774     // uses when we compile 'debuggable'.
   1775     if (i->HasEnvironmentUses() && graph_->IsDebuggable()) {
   1776       return false;
   1777     }
   1778     // A deoptimization should never have an environment input removed.
   1779     for (const HUseListNode<HEnvironment*>& use : i->GetEnvUses()) {
   1780       if (use.GetUser()->GetHolder()->IsDeoptimize()) {
   1781         return false;
   1782       }
   1783     }
   1784   }
   1785   return true;
   1786 }
   1787 
   1788 }  // namespace art
   1789