/*
 * Copyright (C) 2016 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ART_COMPILER_OPTIMIZING_SCHEDULER_ARM64_H_
#define ART_COMPILER_OPTIMIZING_SCHEDULER_ARM64_H_

#include "scheduler.h"

namespace art {
namespace arm64 {

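// Approximate instruction latencies that SchedulingLatencyVisitorARM64 below reports to the
// scheduler. The "Internal" values model time spent within the generated code sequence for an
// instruction (its internal latency), while the other values are (roughly) the delay until the
// result becomes available to dependent instructions.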
static constexpr uint32_t kArm64MemoryLoadLatency = 5;
static constexpr uint32_t kArm64MemoryStoreLatency = 3;

static constexpr uint32_t kArm64CallInternalLatency = 10;
static constexpr uint32_t kArm64CallLatency = 5;

// AArch64 instruction latency.
// We currently assume that all arm64 CPUs share the same instruction latency list.
static constexpr uint32_t kArm64IntegerOpLatency = 2;
static constexpr uint32_t kArm64FloatingPointOpLatency = 5;

static constexpr uint32_t kArm64DataProcWithShifterOpLatency = 3;
static constexpr uint32_t kArm64DivDoubleLatency = 30;
static constexpr uint32_t kArm64DivFloatLatency = 15;
static constexpr uint32_t kArm64DivIntegerLatency = 5;
static constexpr uint32_t kArm64LoadStringInternalLatency = 7;
static constexpr uint32_t kArm64MulFloatingPointLatency = 6;
static constexpr uint32_t kArm64MulIntegerLatency = 6;
static constexpr uint32_t kArm64TypeConversionFloatingPointIntegerLatency = 5;
static constexpr uint32_t kArm64BranchLatency = kArm64IntegerOpLatency;

static constexpr uint32_t kArm64SIMDFloatingPointOpLatency = 10;
static constexpr uint32_t kArm64SIMDIntegerOpLatency = 6;
static constexpr uint32_t kArm64SIMDMemoryLoadLatency = 10;
static constexpr uint32_t kArm64SIMDMemoryStoreLatency = 6;
static constexpr uint32_t kArm64SIMDMulFloatingPointLatency = 12;
static constexpr uint32_t kArm64SIMDMulIntegerLatency = 12;
static constexpr uint32_t kArm64SIMDReplicateOpLatency = 16;
static constexpr uint32_t kArm64SIMDDivDoubleLatency = 60;
static constexpr uint32_t kArm64SIMDDivFloatLatency = 30;
static constexpr uint32_t kArm64SIMDTypeConversionInt2FPLatency = 10;

class SchedulingLatencyVisitorARM64 : public SchedulingLatencyVisitor {
 public:
  // Default visitor for instructions not handled specifically below.
  void VisitInstruction(HInstruction* ATTRIBUTE_UNUSED) {
    last_visited_latency_ = kArm64IntegerOpLatency;
  }

// We add a second unused parameter to be able to use this macro like the others
// defined in `nodes.h`.
#define FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(M)     \
  M(ArrayGet             , unused)                   \
  M(ArrayLength          , unused)                   \
  M(ArraySet             , unused)                   \
  M(BinaryOperation      , unused)                   \
  M(BoundsCheck          , unused)                   \
  M(Div                  , unused)                   \
  M(InstanceFieldGet     , unused)                   \
  M(InstanceOf           , unused)                   \
  M(Invoke               , unused)                   \
  M(LoadString           , unused)                   \
  M(Mul                  , unused)                   \
  M(NewArray             , unused)                   \
  M(NewInstance          , unused)                   \
  M(Rem                  , unused)                   \
  M(StaticFieldGet       , unused)                   \
  M(SuspendCheck         , unused)                   \
  M(TypeConversion       , unused)                   \
  M(VecReplicateScalar   , unused)                   \
  M(VecExtractScalar     , unused)                   \
  M(VecReduce            , unused)                   \
  M(VecCnv               , unused)                   \
  M(VecNeg               , unused)                   \
  M(VecAbs               , unused)                   \
  M(VecNot               , unused)                   \
  M(VecAdd               , unused)                   \
  M(VecHalvingAdd        , unused)                   \
  M(VecSub               , unused)                   \
  M(VecMul               , unused)                   \
  M(VecDiv               , unused)                   \
  M(VecMin               , unused)                   \
  M(VecMax               , unused)                   \
  M(VecAnd               , unused)                   \
  M(VecAndNot            , unused)                   \
  M(VecOr                , unused)                   \
  M(VecXor               , unused)                   \
  M(VecShl               , unused)                   \
  M(VecShr               , unused)                   \
  M(VecUShr              , unused)                   \
  M(VecSetScalars        , unused)                   \
  M(VecMultiplyAccumulate, unused)                   \
  M(VecLoad              , unused)                   \
  M(VecStore             , unused)

#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \
  M(BitwiseNegatedRight, unused)                 \
  M(MultiplyAccumulate, unused)                  \
  M(IntermediateAddress, unused)                 \
  M(IntermediateAddressIndex, unused)            \
  M(DataProcWithShifterOp, unused)

#define DECLARE_VISIT_INSTRUCTION(type, unused)  \
  void Visit##type(H##type* instruction) OVERRIDE;

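// For example, applying DECLARE_VISIT_INSTRUCTION to the `ArrayGet` entry of
// FOR_EACH_SCHEDULED_COMMON_INSTRUCTION declares:
//
//   void VisitArrayGet(HArrayGet* instruction) OVERRIDE;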
  FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
  FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
  FOR_EACH_CONCRETE_INSTRUCTION_ARM64(DECLARE_VISIT_INSTRUCTION)

#undef DECLARE_VISIT_INSTRUCTION

 private:
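  // Shared helpers for computing the latency of simple SIMD arithmetic operations and of the
  // address computation performed by vector memory operations.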
  void HandleSimpleArithmeticSIMD(HVecOperation* instr);
  void HandleVecAddress(HVecMemoryOperation* instruction, size_t size);
};

class HSchedulerARM64 : public HScheduler {
 public:
  HSchedulerARM64(ScopedArenaAllocator* allocator, SchedulingNodeSelector* selector)
      : HScheduler(allocator, &arm64_latency_visitor_, selector) {}
  ~HSchedulerARM64() OVERRIDE {}

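  // An instruction is schedulable if it is one of the shared, ARM64-specific or common
  // instructions enumerated above (i.e. the latency visitor knows how to cost it), or if the
  // generic HScheduler already accepts it.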
  bool IsSchedulable(const HInstruction* instruction) const OVERRIDE {
#define CASE_INSTRUCTION_KIND(type, unused) case \
  HInstruction::InstructionKind::k##type:
    switch (instruction->GetKind()) {
      FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(CASE_INSTRUCTION_KIND)
        return true;
      FOR_EACH_CONCRETE_INSTRUCTION_ARM64(CASE_INSTRUCTION_KIND)
        return true;
      FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(CASE_INSTRUCTION_KIND)
        return true;
      default:
        return HScheduler::IsSchedulable(instruction);
    }
#undef CASE_INSTRUCTION_KIND
  }

  // Treat as scheduling barriers those vector instructions whose live ranges exceed the
  // vectorized loop boundaries. This is a workaround for the lack of a notion of SIMD registers
  // in the compiler; around a call we have to save/restore all live SIMD&FP registers (only the
  // lower 64 bits of SIMD&FP registers are callee-saved), so such vector instructions must not
  // be reordered.
  //
  // TODO: remove this when proper support for SIMD registers is introduced in the compiler.
  bool IsSchedulingBarrier(const HInstruction* instr) const OVERRIDE {
    return HScheduler::IsSchedulingBarrier(instr) ||
           instr->IsVecReduce() ||
           instr->IsVecExtractScalar() ||
           instr->IsVecSetScalars() ||
           instr->IsVecReplicateScalar();
  }

 private:
  SchedulingLatencyVisitorARM64 arm64_latency_visitor_;
  DISALLOW_COPY_AND_ASSIGN(HSchedulerARM64);
};
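
// A minimal usage sketch (assuming the CriticalPathSchedulingNodeSelector and the
// HScheduler::Schedule(HGraph*) entry point declared in scheduler.h, and a ScopedArenaAllocator
// named `allocator`):
//
//   CriticalPathSchedulingNodeSelector selector;
//   HSchedulerARM64 scheduler(&allocator, &selector);
//   scheduler.Schedule(graph);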

}  // namespace arm64
}  // namespace art

#endif  // ART_COMPILER_OPTIMIZING_SCHEDULER_ARM64_H_