      1 /*
      2  * Copyright (C) 2016 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "intrinsics_arm_vixl.h"
     18 
     19 #include "arch/arm/instruction_set_features_arm.h"
     20 #include "art_method.h"
     21 #include "code_generator_arm_vixl.h"
     22 #include "common_arm.h"
     23 #include "heap_poisoning.h"
     24 #include "lock_word.h"
     25 #include "mirror/array-inl.h"
     26 #include "mirror/object_array-inl.h"
     27 #include "mirror/reference.h"
     28 #include "mirror/string.h"
     29 #include "scoped_thread_state_change-inl.h"
     30 #include "thread-current-inl.h"
     31 
     32 #include "aarch32/constants-aarch32.h"
     33 
     34 namespace art {
     35 namespace arm {
     36 
     37 #define __ assembler->GetVIXLAssembler()->
     38 
     39 using helpers::DRegisterFrom;
     40 using helpers::HighRegisterFrom;
     41 using helpers::InputDRegisterAt;
     42 using helpers::InputRegisterAt;
     43 using helpers::InputSRegisterAt;
     44 using helpers::InputVRegisterAt;
     45 using helpers::Int32ConstantFrom;
     46 using helpers::LocationFrom;
     47 using helpers::LowRegisterFrom;
     48 using helpers::LowSRegisterFrom;
     49 using helpers::HighSRegisterFrom;
     50 using helpers::OutputDRegister;
     51 using helpers::OutputSRegister;
     52 using helpers::OutputRegister;
     53 using helpers::OutputVRegister;
     54 using helpers::RegisterFrom;
     55 using helpers::SRegisterFrom;
     56 using helpers::DRegisterFromS;
     57 
     58 using namespace vixl::aarch32;  // NOLINT(build/namespaces)
     59 
     60 using vixl::ExactAssemblyScope;
     61 using vixl::CodeBufferCheckScope;
     62 
     63 ArmVIXLAssembler* IntrinsicCodeGeneratorARMVIXL::GetAssembler() {
     64   return codegen_->GetAssembler();
     65 }
     66 
     67 ArenaAllocator* IntrinsicCodeGeneratorARMVIXL::GetAllocator() {
     68   return codegen_->GetGraph()->GetAllocator();
     69 }
     70 
     71 // Default slow-path for fallback (calling the managed code to handle the intrinsic) in an
     72 // intrinsified call. This will copy the arguments into the positions for a regular call.
     73 //
     74 // Note: The actual parameters are required to be in the locations given by the invoke's location
     75 //       summary. If an intrinsic modifies those locations before a slowpath call, they must be
     76 //       restored!
     77 //
     78 // Note: If an invoke wasn't sharpened, we will put down an invoke-virtual here. That's potentially
     79 //       sub-optimal (compared to a direct pointer call), but this is a slow-path.
     80 
     81 class IntrinsicSlowPathARMVIXL : public SlowPathCodeARMVIXL {
     82  public:
     83   explicit IntrinsicSlowPathARMVIXL(HInvoke* invoke)
     84       : SlowPathCodeARMVIXL(invoke), invoke_(invoke) {}
     85 
     86   Location MoveArguments(CodeGenerator* codegen) {
     87     InvokeDexCallingConventionVisitorARMVIXL calling_convention_visitor;
     88     IntrinsicVisitor::MoveArguments(invoke_, codegen, &calling_convention_visitor);
     89     return calling_convention_visitor.GetMethodLocation();
     90   }
     91 
     92   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     93     ArmVIXLAssembler* assembler = down_cast<ArmVIXLAssembler*>(codegen->GetAssembler());
     94     __ Bind(GetEntryLabel());
     95 
     96     SaveLiveRegisters(codegen, invoke_->GetLocations());
     97 
     98     Location method_loc = MoveArguments(codegen);
     99 
    100     if (invoke_->IsInvokeStaticOrDirect()) {
    101       codegen->GenerateStaticOrDirectCall(invoke_->AsInvokeStaticOrDirect(), method_loc, this);
    102     } else {
    103       codegen->GenerateVirtualCall(invoke_->AsInvokeVirtual(), method_loc, this);
    104     }
    105 
    106     // Copy the result back to the expected output.
    107     Location out = invoke_->GetLocations()->Out();
    108     if (out.IsValid()) {
    109       DCHECK(out.IsRegister());  // TODO: Replace this when we support output in memory.
    110       DCHECK(!invoke_->GetLocations()->GetLiveRegisters()->ContainsCoreRegister(out.reg()));
    111       codegen->MoveFromReturnRegister(out, invoke_->GetType());
    112     }
    113 
    114     RestoreLiveRegisters(codegen, invoke_->GetLocations());
    115     __ B(GetExitLabel());
    116   }
    117 
    118   const char* GetDescription() const OVERRIDE { return "IntrinsicSlowPath"; }
    119 
    120  private:
    121   // The instruction where this slow path is happening.
    122   HInvoke* const invoke_;
    123 
    124   DISALLOW_COPY_AND_ASSIGN(IntrinsicSlowPathARMVIXL);
    125 };
    126 
    127 // Compute base address for the System.arraycopy intrinsic in `base`.
    128 static void GenSystemArrayCopyBaseAddress(ArmVIXLAssembler* assembler,
    129                                           DataType::Type type,
    130                                           const vixl32::Register& array,
    131                                           const Location& pos,
    132                                           const vixl32::Register& base) {
     133   // This routine is currently only used by the SystemArrayCopy intrinsic, so `type` must be
     134   // DataType::Type::kReference. Allowing other element types here (e.g. DataType::Type::kUint16)
     135   // would let it also serve the SystemArrayCopyChar intrinsic.
    136   DCHECK_EQ(type, DataType::Type::kReference);
    137   const int32_t element_size = DataType::Size(type);
    138   const uint32_t element_size_shift = DataType::SizeShift(type);
    139   const uint32_t data_offset = mirror::Array::DataOffset(element_size).Uint32Value();
    140 
    141   if (pos.IsConstant()) {
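           // The computed base is array + data_offset + pos * element_size.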
    142     int32_t constant = Int32ConstantFrom(pos);
    143     __ Add(base, array, element_size * constant + data_offset);
    144   } else {
    145     __ Add(base, array, Operand(RegisterFrom(pos), vixl32::LSL, element_size_shift));
    146     __ Add(base, base, data_offset);
    147   }
    148 }
    149 
    150 // Compute end address for the System.arraycopy intrinsic in `end`.
    151 static void GenSystemArrayCopyEndAddress(ArmVIXLAssembler* assembler,
    152                                          DataType::Type type,
    153                                          const Location& copy_length,
    154                                          const vixl32::Register& base,
    155                                          const vixl32::Register& end) {
     156   // This routine is currently only used by the SystemArrayCopy intrinsic, so `type` must be
     157   // DataType::Type::kReference. Allowing other element types here (e.g. DataType::Type::kUint16)
     158   // would let it also serve the SystemArrayCopyChar intrinsic.
    159   DCHECK_EQ(type, DataType::Type::kReference);
    160   const int32_t element_size = DataType::Size(type);
    161   const uint32_t element_size_shift = DataType::SizeShift(type);
    162 
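           // The computed end is base + copy_length * element_size.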
    163   if (copy_length.IsConstant()) {
    164     int32_t constant = Int32ConstantFrom(copy_length);
    165     __ Add(end, base, element_size * constant);
    166   } else {
    167     __ Add(end, base, Operand(RegisterFrom(copy_length), vixl32::LSL, element_size_shift));
    168   }
    169 }
    170 
    171 // Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers.
    172 class ReadBarrierSystemArrayCopySlowPathARMVIXL : public SlowPathCodeARMVIXL {
    173  public:
    174   explicit ReadBarrierSystemArrayCopySlowPathARMVIXL(HInstruction* instruction)
    175       : SlowPathCodeARMVIXL(instruction) {
    176     DCHECK(kEmitCompilerReadBarrier);
    177     DCHECK(kUseBakerReadBarrier);
    178   }
    179 
    180   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
    181     CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen);
    182     ArmVIXLAssembler* assembler = arm_codegen->GetAssembler();
    183     LocationSummary* locations = instruction_->GetLocations();
    184     DCHECK(locations->CanCall());
    185     DCHECK(instruction_->IsInvokeStaticOrDirect())
    186         << "Unexpected instruction in read barrier arraycopy slow path: "
    187         << instruction_->DebugName();
    188     DCHECK(instruction_->GetLocations()->Intrinsified());
    189     DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy);
    190 
    191     DataType::Type type = DataType::Type::kReference;
    192     const int32_t element_size = DataType::Size(type);
    193 
    194     vixl32::Register dest = InputRegisterAt(instruction_, 2);
    195     Location dest_pos = locations->InAt(3);
    196     vixl32::Register src_curr_addr = RegisterFrom(locations->GetTemp(0));
    197     vixl32::Register dst_curr_addr = RegisterFrom(locations->GetTemp(1));
    198     vixl32::Register src_stop_addr = RegisterFrom(locations->GetTemp(2));
    199     vixl32::Register tmp = RegisterFrom(locations->GetTemp(3));
    200 
    201     __ Bind(GetEntryLabel());
    202     // Compute the base destination address in `dst_curr_addr`.
    203     GenSystemArrayCopyBaseAddress(assembler, type, dest, dest_pos, dst_curr_addr);
    204 
    205     vixl32::Label loop;
    206     __ Bind(&loop);
    207     __ Ldr(tmp, MemOperand(src_curr_addr, element_size, PostIndex));
    208     assembler->MaybeUnpoisonHeapReference(tmp);
    209     // TODO: Inline the mark bit check before calling the runtime?
    210     // tmp = ReadBarrier::Mark(tmp);
    211     // No need to save live registers; it's taken care of by the
    212     // entrypoint. Also, there is no need to update the stack mask,
    213     // as this runtime call will not trigger a garbage collection.
    214     // (See ReadBarrierMarkSlowPathARM::EmitNativeCode for more
    215     // explanations.)
    216     DCHECK(!tmp.IsSP());
    217     DCHECK(!tmp.IsLR());
    218     DCHECK(!tmp.IsPC());
    219     // IP is used internally by the ReadBarrierMarkRegX entry point
    220     // as a temporary (and not preserved).  It thus cannot be used by
    221     // any live register in this slow path.
    222     DCHECK(!src_curr_addr.Is(ip));
    223     DCHECK(!dst_curr_addr.Is(ip));
    224     DCHECK(!src_stop_addr.Is(ip));
    225     DCHECK(!tmp.Is(ip));
    226     DCHECK(tmp.IsRegister()) << tmp;
    227     // TODO: Load the entrypoint once before the loop, instead of
    228     // loading it at every iteration.
    229     int32_t entry_point_offset =
    230         Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(tmp.GetCode());
    231     // This runtime call does not require a stack map.
    232     arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
    233     assembler->MaybePoisonHeapReference(tmp);
    234     __ Str(tmp, MemOperand(dst_curr_addr, element_size, PostIndex));
    235     __ Cmp(src_curr_addr, src_stop_addr);
    236     __ B(ne, &loop, /* far_target */ false);
    237     __ B(GetExitLabel());
    238   }
    239 
    240   const char* GetDescription() const OVERRIDE {
    241     return "ReadBarrierSystemArrayCopySlowPathARMVIXL";
    242   }
    243 
    244  private:
    245   DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathARMVIXL);
    246 };
    247 
    248 IntrinsicLocationsBuilderARMVIXL::IntrinsicLocationsBuilderARMVIXL(CodeGeneratorARMVIXL* codegen)
    249     : allocator_(codegen->GetGraph()->GetAllocator()),
    250       codegen_(codegen),
    251       assembler_(codegen->GetAssembler()),
    252       features_(codegen->GetInstructionSetFeatures()) {}
    253 
    254 bool IntrinsicLocationsBuilderARMVIXL::TryDispatch(HInvoke* invoke) {
    255   Dispatch(invoke);
    256   LocationSummary* res = invoke->GetLocations();
    257   if (res == nullptr) {
    258     return false;
    259   }
    260   return res->Intrinsified();
    261 }
    262 
    263 static void CreateFPToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
    264   LocationSummary* locations =
    265       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    266   locations->SetInAt(0, Location::RequiresFpuRegister());
    267   locations->SetOut(Location::RequiresRegister());
    268 }
    269 
    270 static void CreateIntToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) {
    271   LocationSummary* locations =
    272       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    273   locations->SetInAt(0, Location::RequiresRegister());
    274   locations->SetOut(Location::RequiresFpuRegister());
    275 }
    276 
    277 static void MoveFPToInt(LocationSummary* locations, bool is64bit, ArmVIXLAssembler* assembler) {
    278   Location input = locations->InAt(0);
    279   Location output = locations->Out();
    280   if (is64bit) {
    281     __ Vmov(LowRegisterFrom(output), HighRegisterFrom(output), DRegisterFrom(input));
    282   } else {
    283     __ Vmov(RegisterFrom(output), SRegisterFrom(input));
    284   }
    285 }
    286 
    287 static void MoveIntToFP(LocationSummary* locations, bool is64bit, ArmVIXLAssembler* assembler) {
    288   Location input = locations->InAt(0);
    289   Location output = locations->Out();
    290   if (is64bit) {
    291     __ Vmov(DRegisterFrom(output), LowRegisterFrom(input), HighRegisterFrom(input));
    292   } else {
    293     __ Vmov(SRegisterFrom(output), RegisterFrom(input));
    294   }
    295 }
    296 
    297 void IntrinsicLocationsBuilderARMVIXL::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
    298   CreateFPToIntLocations(allocator_, invoke);
    299 }
    300 void IntrinsicLocationsBuilderARMVIXL::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
    301   CreateIntToFPLocations(allocator_, invoke);
    302 }
    303 
    304 void IntrinsicCodeGeneratorARMVIXL::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
    305   MoveFPToInt(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
    306 }
    307 void IntrinsicCodeGeneratorARMVIXL::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
    308   MoveIntToFP(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
    309 }
    310 
    311 void IntrinsicLocationsBuilderARMVIXL::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
    312   CreateFPToIntLocations(allocator_, invoke);
    313 }
    314 void IntrinsicLocationsBuilderARMVIXL::VisitFloatIntBitsToFloat(HInvoke* invoke) {
    315   CreateIntToFPLocations(allocator_, invoke);
    316 }
    317 
    318 void IntrinsicCodeGeneratorARMVIXL::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
    319   MoveFPToInt(invoke->GetLocations(), /* is64bit */ false, GetAssembler());
    320 }
    321 void IntrinsicCodeGeneratorARMVIXL::VisitFloatIntBitsToFloat(HInvoke* invoke) {
    322   MoveIntToFP(invoke->GetLocations(), /* is64bit */ false, GetAssembler());
    323 }
    324 
    325 static void CreateIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
    326   LocationSummary* locations =
    327       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    328   locations->SetInAt(0, Location::RequiresRegister());
    329   locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
    330 }
    331 
    332 static void CreateLongToLongLocationsWithOverlap(ArenaAllocator* allocator, HInvoke* invoke) {
    333   LocationSummary* locations =
    334       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    335   locations->SetInAt(0, Location::RequiresRegister());
    336   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
    337 }
    338 
    339 static void CreateFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) {
    340   LocationSummary* locations =
    341       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    342   locations->SetInAt(0, Location::RequiresFpuRegister());
    343   locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
    344 }
    345 
    346 static void GenNumberOfLeadingZeros(HInvoke* invoke,
    347                                     DataType::Type type,
    348                                     CodeGeneratorARMVIXL* codegen) {
    349   ArmVIXLAssembler* assembler = codegen->GetAssembler();
    350   LocationSummary* locations = invoke->GetLocations();
    351   Location in = locations->InAt(0);
    352   vixl32::Register out = RegisterFrom(locations->Out());
    353 
    354   DCHECK((type == DataType::Type::kInt32) || (type == DataType::Type::kInt64));
    355 
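           // For a 64-bit input, if the high word is non-zero its CLZ is the answer;
           // otherwise the result is 32 + CLZ(low word).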
    356   if (type == DataType::Type::kInt64) {
    357     vixl32::Register in_reg_lo = LowRegisterFrom(in);
    358     vixl32::Register in_reg_hi = HighRegisterFrom(in);
    359     vixl32::Label end;
    360     vixl32::Label* final_label = codegen->GetFinalLabel(invoke, &end);
    361     __ Clz(out, in_reg_hi);
    362     __ CompareAndBranchIfNonZero(in_reg_hi, final_label, /* far_target */ false);
    363     __ Clz(out, in_reg_lo);
    364     __ Add(out, out, 32);
    365     if (end.IsReferenced()) {
    366       __ Bind(&end);
    367     }
    368   } else {
    369     __ Clz(out, RegisterFrom(in));
    370   }
    371 }
    372 
    373 void IntrinsicLocationsBuilderARMVIXL::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
    374   CreateIntToIntLocations(allocator_, invoke);
    375 }
    376 
    377 void IntrinsicCodeGeneratorARMVIXL::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
    378   GenNumberOfLeadingZeros(invoke, DataType::Type::kInt32, codegen_);
    379 }
    380 
    381 void IntrinsicLocationsBuilderARMVIXL::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
    382   CreateLongToLongLocationsWithOverlap(allocator_, invoke);
    383 }
    384 
    385 void IntrinsicCodeGeneratorARMVIXL::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
    386   GenNumberOfLeadingZeros(invoke, DataType::Type::kInt64, codegen_);
    387 }
    388 
    389 static void GenNumberOfTrailingZeros(HInvoke* invoke,
    390                                      DataType::Type type,
    391                                      CodeGeneratorARMVIXL* codegen) {
    392   DCHECK((type == DataType::Type::kInt32) || (type == DataType::Type::kInt64));
    393 
    394   ArmVIXLAssembler* assembler = codegen->GetAssembler();
    395   LocationSummary* locations = invoke->GetLocations();
    396   vixl32::Register out = RegisterFrom(locations->Out());
    397 
    398   if (type == DataType::Type::kInt64) {
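           // ARM has no CTZ instruction, so trailing zeros are counted as CLZ(RBIT(x)).
           // For a 64-bit input the low word is handled first; if it is zero, the result
           // is 32 + CTZ(high word).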
    399     vixl32::Register in_reg_lo = LowRegisterFrom(locations->InAt(0));
    400     vixl32::Register in_reg_hi = HighRegisterFrom(locations->InAt(0));
    401     vixl32::Label end;
    402     vixl32::Label* final_label = codegen->GetFinalLabel(invoke, &end);
    403     __ Rbit(out, in_reg_lo);
    404     __ Clz(out, out);
    405     __ CompareAndBranchIfNonZero(in_reg_lo, final_label, /* far_target */ false);
    406     __ Rbit(out, in_reg_hi);
    407     __ Clz(out, out);
    408     __ Add(out, out, 32);
    409     if (end.IsReferenced()) {
    410       __ Bind(&end);
    411     }
    412   } else {
    413     vixl32::Register in = RegisterFrom(locations->InAt(0));
    414     __ Rbit(out, in);
    415     __ Clz(out, out);
    416   }
    417 }
    418 
    419 void IntrinsicLocationsBuilderARMVIXL::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
    420   CreateIntToIntLocations(allocator_, invoke);
    421 }
    422 
    423 void IntrinsicCodeGeneratorARMVIXL::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
    424   GenNumberOfTrailingZeros(invoke, DataType::Type::kInt32, codegen_);
    425 }
    426 
    427 void IntrinsicLocationsBuilderARMVIXL::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
    428   CreateLongToLongLocationsWithOverlap(allocator_, invoke);
    429 }
    430 
    431 void IntrinsicCodeGeneratorARMVIXL::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
    432   GenNumberOfTrailingZeros(invoke, DataType::Type::kInt64, codegen_);
    433 }
    434 
    435 static void MathAbsFP(HInvoke* invoke, ArmVIXLAssembler* assembler) {
    436   __ Vabs(OutputVRegister(invoke), InputVRegisterAt(invoke, 0));
    437 }
    438 
    439 void IntrinsicLocationsBuilderARMVIXL::VisitMathAbsDouble(HInvoke* invoke) {
    440   CreateFPToFPLocations(allocator_, invoke);
    441 }
    442 
    443 void IntrinsicCodeGeneratorARMVIXL::VisitMathAbsDouble(HInvoke* invoke) {
    444   MathAbsFP(invoke, GetAssembler());
    445 }
    446 
    447 void IntrinsicLocationsBuilderARMVIXL::VisitMathAbsFloat(HInvoke* invoke) {
    448   CreateFPToFPLocations(allocator_, invoke);
    449 }
    450 
    451 void IntrinsicCodeGeneratorARMVIXL::VisitMathAbsFloat(HInvoke* invoke) {
    452   MathAbsFP(invoke, GetAssembler());
    453 }
    454 
    455 static void CreateIntToIntPlusTemp(ArenaAllocator* allocator, HInvoke* invoke) {
    456   LocationSummary* locations =
    457       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    458   locations->SetInAt(0, Location::RequiresRegister());
    459   locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
    460 
    461   locations->AddTemp(Location::RequiresRegister());
    462 }
    463 
    464 static void GenAbsInteger(LocationSummary* locations,
    465                           bool is64bit,
    466                           ArmVIXLAssembler* assembler) {
    467   Location in = locations->InAt(0);
    468   Location output = locations->Out();
    469 
    470   vixl32::Register mask = RegisterFrom(locations->GetTemp(0));
    471 
    472   if (is64bit) {
    473     vixl32::Register in_reg_lo = LowRegisterFrom(in);
    474     vixl32::Register in_reg_hi = HighRegisterFrom(in);
    475     vixl32::Register out_reg_lo = LowRegisterFrom(output);
    476     vixl32::Register out_reg_hi = HighRegisterFrom(output);
    477 
    478     DCHECK(!out_reg_lo.Is(in_reg_hi)) << "Diagonal overlap unexpected.";
    479 
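             // Branch-free abs: `mask` is the sign extension of the input (all ones if negative),
             // so (x + mask) ^ mask negates negative values; the addition is carried across the
             // register pair with Adds/Adc.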
    480     __ Asr(mask, in_reg_hi, 31);
    481     __ Adds(out_reg_lo, in_reg_lo, mask);
    482     __ Adc(out_reg_hi, in_reg_hi, mask);
    483     __ Eor(out_reg_lo, mask, out_reg_lo);
    484     __ Eor(out_reg_hi, mask, out_reg_hi);
    485   } else {
    486     vixl32::Register in_reg = RegisterFrom(in);
    487     vixl32::Register out_reg = RegisterFrom(output);
    488 
    489     __ Asr(mask, in_reg, 31);
    490     __ Add(out_reg, in_reg, mask);
    491     __ Eor(out_reg, mask, out_reg);
    492   }
    493 }
    494 
    495 void IntrinsicLocationsBuilderARMVIXL::VisitMathAbsInt(HInvoke* invoke) {
    496   CreateIntToIntPlusTemp(allocator_, invoke);
    497 }
    498 
    499 void IntrinsicCodeGeneratorARMVIXL::VisitMathAbsInt(HInvoke* invoke) {
    500   GenAbsInteger(invoke->GetLocations(), /* is64bit */ false, GetAssembler());
    501 }
    502 
    503 
    504 void IntrinsicLocationsBuilderARMVIXL::VisitMathAbsLong(HInvoke* invoke) {
    505   CreateIntToIntPlusTemp(allocator_, invoke);
    506 }
    507 
    508 void IntrinsicCodeGeneratorARMVIXL::VisitMathAbsLong(HInvoke* invoke) {
    509   GenAbsInteger(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
    510 }
    511 
    512 static void GenMinMaxFloat(HInvoke* invoke, bool is_min, CodeGeneratorARMVIXL* codegen) {
    513   ArmVIXLAssembler* assembler = codegen->GetAssembler();
    514   Location op1_loc = invoke->GetLocations()->InAt(0);
    515   Location op2_loc = invoke->GetLocations()->InAt(1);
    516   Location out_loc = invoke->GetLocations()->Out();
    517 
    518   // Optimization: don't generate any code if inputs are the same.
    519   if (op1_loc.Equals(op2_loc)) {
    520     DCHECK(out_loc.Equals(op1_loc));  // out_loc is set as SameAsFirstInput() in location builder.
    521     return;
    522   }
    523 
    524   vixl32::SRegister op1 = SRegisterFrom(op1_loc);
    525   vixl32::SRegister op2 = SRegisterFrom(op2_loc);
    526   vixl32::SRegister out = OutputSRegister(invoke);
    527   UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
    528   const vixl32::Register temp1 = temps.Acquire();
    529   vixl32::Register temp2 = RegisterFrom(invoke->GetLocations()->GetTemp(0));
    530   vixl32::Label nan, done;
    531   vixl32::Label* final_label = codegen->GetFinalLabel(invoke, &done);
    532 
    533   DCHECK(op1.Is(out));
    534 
    535   __ Vcmp(op1, op2);
    536   __ Vmrs(RegisterOrAPSR_nzcv(kPcCode), FPSCR);
    537   __ B(vs, &nan, /* far_target */ false);  // if un-ordered, go to NaN handling.
    538 
    539   // op1 <> op2
    540   vixl32::ConditionType cond = is_min ? gt : lt;
    541   {
    542     ExactAssemblyScope it_scope(assembler->GetVIXLAssembler(),
    543                                 2 * kMaxInstructionSizeInBytes,
    544                                 CodeBufferCheckScope::kMaximumSize);
    545     __ it(cond);
    546     __ vmov(cond, F32, out, op2);
    547   }
     548   // If op1 != op2, the min/max has already been selected above; branch to the end.
    549   __ B(ne, final_label, /* far_target */ false);
    550 
    551   // handle op1 == op2, max(+0.0,-0.0), min(+0.0,-0.0).
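           // +0.0 and -0.0 differ only in the sign bit, so OR-ing the raw bits yields -0.0
           // (the min) while AND-ing yields +0.0 (the max).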
    552   __ Vmov(temp1, op1);
    553   __ Vmov(temp2, op2);
    554   if (is_min) {
    555     __ Orr(temp1, temp1, temp2);
    556   } else {
    557     __ And(temp1, temp1, temp2);
    558   }
    559   __ Vmov(out, temp1);
    560   __ B(final_label);
    561 
    562   // handle NaN input.
    563   __ Bind(&nan);
    564   __ Movt(temp1, High16Bits(kNanFloat));  // 0x7FC0xxxx is a NaN.
    565   __ Vmov(out, temp1);
    566 
    567   if (done.IsReferenced()) {
    568     __ Bind(&done);
    569   }
    570 }
    571 
    572 static void CreateFPFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) {
    573   LocationSummary* locations =
    574       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    575   locations->SetInAt(0, Location::RequiresFpuRegister());
    576   locations->SetInAt(1, Location::RequiresFpuRegister());
    577   locations->SetOut(Location::SameAsFirstInput());
    578 }
    579 
    580 void IntrinsicLocationsBuilderARMVIXL::VisitMathMinFloatFloat(HInvoke* invoke) {
    581   CreateFPFPToFPLocations(allocator_, invoke);
    582   invoke->GetLocations()->AddTemp(Location::RequiresRegister());
    583 }
    584 
    585 void IntrinsicCodeGeneratorARMVIXL::VisitMathMinFloatFloat(HInvoke* invoke) {
    586   GenMinMaxFloat(invoke, /* is_min */ true, codegen_);
    587 }
    588 
    589 void IntrinsicLocationsBuilderARMVIXL::VisitMathMaxFloatFloat(HInvoke* invoke) {
    590   CreateFPFPToFPLocations(allocator_, invoke);
    591   invoke->GetLocations()->AddTemp(Location::RequiresRegister());
    592 }
    593 
    594 void IntrinsicCodeGeneratorARMVIXL::VisitMathMaxFloatFloat(HInvoke* invoke) {
    595   GenMinMaxFloat(invoke, /* is_min */ false, codegen_);
    596 }
    597 
    598 static void GenMinMaxDouble(HInvoke* invoke, bool is_min, CodeGeneratorARMVIXL* codegen) {
    599   ArmVIXLAssembler* assembler = codegen->GetAssembler();
    600   Location op1_loc = invoke->GetLocations()->InAt(0);
    601   Location op2_loc = invoke->GetLocations()->InAt(1);
    602   Location out_loc = invoke->GetLocations()->Out();
    603 
    604   // Optimization: don't generate any code if inputs are the same.
    605   if (op1_loc.Equals(op2_loc)) {
     606     DCHECK(out_loc.Equals(op1_loc));  // out_loc is set as SameAsFirstInput() in location builder.
    607     return;
    608   }
    609 
    610   vixl32::DRegister op1 = DRegisterFrom(op1_loc);
    611   vixl32::DRegister op2 = DRegisterFrom(op2_loc);
    612   vixl32::DRegister out = OutputDRegister(invoke);
    613   vixl32::Label handle_nan_eq, done;
    614   vixl32::Label* final_label = codegen->GetFinalLabel(invoke, &done);
    615 
    616   DCHECK(op1.Is(out));
    617 
    618   __ Vcmp(op1, op2);
    619   __ Vmrs(RegisterOrAPSR_nzcv(kPcCode), FPSCR);
    620   __ B(vs, &handle_nan_eq, /* far_target */ false);  // if un-ordered, go to NaN handling.
    621 
    622   // op1 <> op2
    623   vixl32::ConditionType cond = is_min ? gt : lt;
    624   {
    625     ExactAssemblyScope it_scope(assembler->GetVIXLAssembler(),
    626                                 2 * kMaxInstructionSizeInBytes,
    627                                 CodeBufferCheckScope::kMaximumSize);
    628     __ it(cond);
    629     __ vmov(cond, F64, out, op2);
    630   }
     631   // If op1 != op2, the min/max has already been selected above; branch to the end.
    632   __ B(ne, final_label, /* far_target */ false);
    633 
    634   // handle op1 == op2, max(+0.0,-0.0).
    635   if (!is_min) {
    636     __ Vand(F64, out, op1, op2);
    637     __ B(final_label);
    638   }
    639 
    640   // handle op1 == op2, min(+0.0,-0.0), NaN input.
    641   __ Bind(&handle_nan_eq);
    642   __ Vorr(F64, out, op1, op2);  // assemble op1/-0.0/NaN.
    643 
    644   if (done.IsReferenced()) {
    645     __ Bind(&done);
    646   }
    647 }
    648 
    649 void IntrinsicLocationsBuilderARMVIXL::VisitMathMinDoubleDouble(HInvoke* invoke) {
    650   CreateFPFPToFPLocations(allocator_, invoke);
    651 }
    652 
    653 void IntrinsicCodeGeneratorARMVIXL::VisitMathMinDoubleDouble(HInvoke* invoke) {
    654   GenMinMaxDouble(invoke, /* is_min */ true , codegen_);
    655 }
    656 
    657 void IntrinsicLocationsBuilderARMVIXL::VisitMathMaxDoubleDouble(HInvoke* invoke) {
    658   CreateFPFPToFPLocations(allocator_, invoke);
    659 }
    660 
    661 void IntrinsicCodeGeneratorARMVIXL::VisitMathMaxDoubleDouble(HInvoke* invoke) {
    662   GenMinMaxDouble(invoke, /* is_min */ false, codegen_);
    663 }
    664 
    665 static void GenMinMaxLong(HInvoke* invoke, bool is_min, ArmVIXLAssembler* assembler) {
    666   Location op1_loc = invoke->GetLocations()->InAt(0);
    667   Location op2_loc = invoke->GetLocations()->InAt(1);
    668   Location out_loc = invoke->GetLocations()->Out();
    669 
    670   // Optimization: don't generate any code if inputs are the same.
    671   if (op1_loc.Equals(op2_loc)) {
    672     DCHECK(out_loc.Equals(op1_loc));  // out_loc is set as SameAsFirstInput() in location builder.
    673     return;
    674   }
    675 
    676   vixl32::Register op1_lo = LowRegisterFrom(op1_loc);
    677   vixl32::Register op1_hi = HighRegisterFrom(op1_loc);
    678   vixl32::Register op2_lo = LowRegisterFrom(op2_loc);
    679   vixl32::Register op2_hi = HighRegisterFrom(op2_loc);
    680   vixl32::Register out_lo = LowRegisterFrom(out_loc);
    681   vixl32::Register out_hi = HighRegisterFrom(out_loc);
    682   UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
    683   const vixl32::Register temp = temps.Acquire();
    684 
    685   DCHECK(op1_lo.Is(out_lo));
    686   DCHECK(op1_hi.Is(out_hi));
    687 
    688   // Compare op1 >= op2, or op1 < op2.
    689   __ Cmp(out_lo, op2_lo);
    690   __ Sbcs(temp, out_hi, op2_hi);
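           // The SBCS result in `temp` is discarded; only the flags from the borrow-propagated
           // high-word subtraction are needed.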
    691 
    692   // Now GE/LT condition code is correct for the long comparison.
    693   {
    694     vixl32::ConditionType cond = is_min ? ge : lt;
    695     ExactAssemblyScope it_scope(assembler->GetVIXLAssembler(),
    696                                 3 * kMaxInstructionSizeInBytes,
    697                                 CodeBufferCheckScope::kMaximumSize);
    698     __ itt(cond);
    699     __ mov(cond, out_lo, op2_lo);
    700     __ mov(cond, out_hi, op2_hi);
    701   }
    702 }
    703 
    704 static void CreateLongLongToLongLocations(ArenaAllocator* allocator, HInvoke* invoke) {
    705   LocationSummary* locations =
    706       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    707   locations->SetInAt(0, Location::RequiresRegister());
    708   locations->SetInAt(1, Location::RequiresRegister());
    709   locations->SetOut(Location::SameAsFirstInput());
    710 }
    711 
    712 void IntrinsicLocationsBuilderARMVIXL::VisitMathMinLongLong(HInvoke* invoke) {
    713   CreateLongLongToLongLocations(allocator_, invoke);
    714 }
    715 
    716 void IntrinsicCodeGeneratorARMVIXL::VisitMathMinLongLong(HInvoke* invoke) {
    717   GenMinMaxLong(invoke, /* is_min */ true, GetAssembler());
    718 }
    719 
    720 void IntrinsicLocationsBuilderARMVIXL::VisitMathMaxLongLong(HInvoke* invoke) {
    721   CreateLongLongToLongLocations(allocator_, invoke);
    722 }
    723 
    724 void IntrinsicCodeGeneratorARMVIXL::VisitMathMaxLongLong(HInvoke* invoke) {
    725   GenMinMaxLong(invoke, /* is_min */ false, GetAssembler());
    726 }
    727 
    728 static void GenMinMax(HInvoke* invoke, bool is_min, ArmVIXLAssembler* assembler) {
    729   vixl32::Register op1 = InputRegisterAt(invoke, 0);
    730   vixl32::Register op2 = InputRegisterAt(invoke, 1);
    731   vixl32::Register out = OutputRegister(invoke);
    732 
    733   __ Cmp(op1, op2);
    734 
    735   {
    736     ExactAssemblyScope aas(assembler->GetVIXLAssembler(),
    737                            3 * kMaxInstructionSizeInBytes,
    738                            CodeBufferCheckScope::kMaximumSize);
    739 
    740     __ ite(is_min ? lt : gt);
    741     __ mov(is_min ? lt : gt, out, op1);
    742     __ mov(is_min ? ge : le, out, op2);
    743   }
    744 }
    745 
    746 static void CreateIntIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
    747   LocationSummary* locations =
    748       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    749   locations->SetInAt(0, Location::RequiresRegister());
    750   locations->SetInAt(1, Location::RequiresRegister());
    751   locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
    752 }
    753 
    754 void IntrinsicLocationsBuilderARMVIXL::VisitMathMinIntInt(HInvoke* invoke) {
    755   CreateIntIntToIntLocations(allocator_, invoke);
    756 }
    757 
    758 void IntrinsicCodeGeneratorARMVIXL::VisitMathMinIntInt(HInvoke* invoke) {
    759   GenMinMax(invoke, /* is_min */ true, GetAssembler());
    760 }
    761 
    762 void IntrinsicLocationsBuilderARMVIXL::VisitMathMaxIntInt(HInvoke* invoke) {
    763   CreateIntIntToIntLocations(allocator_, invoke);
    764 }
    765 
    766 void IntrinsicCodeGeneratorARMVIXL::VisitMathMaxIntInt(HInvoke* invoke) {
    767   GenMinMax(invoke, /* is_min */ false, GetAssembler());
    768 }
    769 
    770 void IntrinsicLocationsBuilderARMVIXL::VisitMathSqrt(HInvoke* invoke) {
    771   CreateFPToFPLocations(allocator_, invoke);
    772 }
    773 
    774 void IntrinsicCodeGeneratorARMVIXL::VisitMathSqrt(HInvoke* invoke) {
    775   ArmVIXLAssembler* assembler = GetAssembler();
    776   __ Vsqrt(OutputDRegister(invoke), InputDRegisterAt(invoke, 0));
    777 }
    778 
    779 void IntrinsicLocationsBuilderARMVIXL::VisitMathRint(HInvoke* invoke) {
    780   if (features_.HasARMv8AInstructions()) {
    781     CreateFPToFPLocations(allocator_, invoke);
    782   }
    783 }
    784 
    785 void IntrinsicCodeGeneratorARMVIXL::VisitMathRint(HInvoke* invoke) {
    786   DCHECK(codegen_->GetInstructionSetFeatures().HasARMv8AInstructions());
    787   ArmVIXLAssembler* assembler = GetAssembler();
    788   __ Vrintn(F64, F64, OutputDRegister(invoke), InputDRegisterAt(invoke, 0));
    789 }
    790 
    791 void IntrinsicLocationsBuilderARMVIXL::VisitMathRoundFloat(HInvoke* invoke) {
    792   if (features_.HasARMv8AInstructions()) {
    793     LocationSummary* locations =
    794         new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    795     locations->SetInAt(0, Location::RequiresFpuRegister());
    796     locations->SetOut(Location::RequiresRegister());
    797     locations->AddTemp(Location::RequiresFpuRegister());
    798   }
    799 }
    800 
    801 void IntrinsicCodeGeneratorARMVIXL::VisitMathRoundFloat(HInvoke* invoke) {
    802   DCHECK(codegen_->GetInstructionSetFeatures().HasARMv8AInstructions());
    803 
    804   ArmVIXLAssembler* assembler = GetAssembler();
    805   vixl32::SRegister in_reg = InputSRegisterAt(invoke, 0);
    806   vixl32::Register out_reg = OutputRegister(invoke);
    807   vixl32::SRegister temp1 = LowSRegisterFrom(invoke->GetLocations()->GetTemp(0));
    808   vixl32::SRegister temp2 = HighSRegisterFrom(invoke->GetLocations()->GetTemp(0));
    809   vixl32::Label done;
    810   vixl32::Label* final_label = codegen_->GetFinalLabel(invoke, &done);
    811 
    812   // Round to nearest integer, ties away from zero.
    813   __ Vcvta(S32, F32, temp1, in_reg);
    814   __ Vmov(out_reg, temp1);
    815 
    816   // For positive, zero or NaN inputs, rounding is done.
    817   __ Cmp(out_reg, 0);
    818   __ B(ge, final_label, /* far_target */ false);
    819 
    820   // Handle input < 0 cases.
    821   // If input is negative but not a tie, previous result (round to nearest) is valid.
    822   // If input is a negative tie, change rounding direction to positive infinity, out_reg += 1.
    823   __ Vrinta(F32, F32, temp1, in_reg);
    824   __ Vmov(temp2, 0.5);
    825   __ Vsub(F32, temp1, in_reg, temp1);
    826   __ Vcmp(F32, temp1, temp2);
    827   __ Vmrs(RegisterOrAPSR_nzcv(kPcCode), FPSCR);
    828   {
     829     // Use an ExactAssemblyScope here because we are using an IT block.
    830     ExactAssemblyScope it_scope(assembler->GetVIXLAssembler(),
    831                                 2 * kMaxInstructionSizeInBytes,
    832                                 CodeBufferCheckScope::kMaximumSize);
    833     __ it(eq);
    834     __ add(eq, out_reg, out_reg, 1);
    835   }
    836 
    837   if (done.IsReferenced()) {
    838     __ Bind(&done);
    839   }
    840 }
    841 
    842 void IntrinsicLocationsBuilderARMVIXL::VisitMemoryPeekByte(HInvoke* invoke) {
    843   CreateIntToIntLocations(allocator_, invoke);
    844 }
    845 
    846 void IntrinsicCodeGeneratorARMVIXL::VisitMemoryPeekByte(HInvoke* invoke) {
    847   ArmVIXLAssembler* assembler = GetAssembler();
    848   // Ignore upper 4B of long address.
    849   __ Ldrsb(OutputRegister(invoke), MemOperand(LowRegisterFrom(invoke->GetLocations()->InAt(0))));
    850 }
    851 
    852 void IntrinsicLocationsBuilderARMVIXL::VisitMemoryPeekIntNative(HInvoke* invoke) {
    853   CreateIntToIntLocations(allocator_, invoke);
    854 }
    855 
    856 void IntrinsicCodeGeneratorARMVIXL::VisitMemoryPeekIntNative(HInvoke* invoke) {
    857   ArmVIXLAssembler* assembler = GetAssembler();
    858   // Ignore upper 4B of long address.
    859   __ Ldr(OutputRegister(invoke), MemOperand(LowRegisterFrom(invoke->GetLocations()->InAt(0))));
    860 }
    861 
    862 void IntrinsicLocationsBuilderARMVIXL::VisitMemoryPeekLongNative(HInvoke* invoke) {
    863   CreateIntToIntLocations(allocator_, invoke);
    864 }
    865 
    866 void IntrinsicCodeGeneratorARMVIXL::VisitMemoryPeekLongNative(HInvoke* invoke) {
    867   ArmVIXLAssembler* assembler = GetAssembler();
    868   // Ignore upper 4B of long address.
    869   vixl32::Register addr = LowRegisterFrom(invoke->GetLocations()->InAt(0));
    870   // Worst case: Control register bit SCTLR.A = 0. Then unaligned accesses throw a processor
    871   // exception. So we can't use ldrd as addr may be unaligned.
    872   vixl32::Register lo = LowRegisterFrom(invoke->GetLocations()->Out());
    873   vixl32::Register hi = HighRegisterFrom(invoke->GetLocations()->Out());
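           // If addr aliases the low output register, load the high word first so the address
           // is only clobbered by the final load.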
    874   if (addr.Is(lo)) {
    875     __ Ldr(hi, MemOperand(addr, 4));
    876     __ Ldr(lo, MemOperand(addr));
    877   } else {
    878     __ Ldr(lo, MemOperand(addr));
    879     __ Ldr(hi, MemOperand(addr, 4));
    880   }
    881 }
    882 
    883 void IntrinsicLocationsBuilderARMVIXL::VisitMemoryPeekShortNative(HInvoke* invoke) {
    884   CreateIntToIntLocations(allocator_, invoke);
    885 }
    886 
    887 void IntrinsicCodeGeneratorARMVIXL::VisitMemoryPeekShortNative(HInvoke* invoke) {
    888   ArmVIXLAssembler* assembler = GetAssembler();
    889   // Ignore upper 4B of long address.
    890   __ Ldrsh(OutputRegister(invoke), MemOperand(LowRegisterFrom(invoke->GetLocations()->InAt(0))));
    891 }
    892 
    893 static void CreateIntIntToVoidLocations(ArenaAllocator* allocator, HInvoke* invoke) {
    894   LocationSummary* locations =
    895       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    896   locations->SetInAt(0, Location::RequiresRegister());
    897   locations->SetInAt(1, Location::RequiresRegister());
    898 }
    899 
    900 void IntrinsicLocationsBuilderARMVIXL::VisitMemoryPokeByte(HInvoke* invoke) {
    901   CreateIntIntToVoidLocations(allocator_, invoke);
    902 }
    903 
    904 void IntrinsicCodeGeneratorARMVIXL::VisitMemoryPokeByte(HInvoke* invoke) {
    905   ArmVIXLAssembler* assembler = GetAssembler();
    906   __ Strb(InputRegisterAt(invoke, 1), MemOperand(LowRegisterFrom(invoke->GetLocations()->InAt(0))));
    907 }
    908 
    909 void IntrinsicLocationsBuilderARMVIXL::VisitMemoryPokeIntNative(HInvoke* invoke) {
    910   CreateIntIntToVoidLocations(allocator_, invoke);
    911 }
    912 
    913 void IntrinsicCodeGeneratorARMVIXL::VisitMemoryPokeIntNative(HInvoke* invoke) {
    914   ArmVIXLAssembler* assembler = GetAssembler();
    915   __ Str(InputRegisterAt(invoke, 1), MemOperand(LowRegisterFrom(invoke->GetLocations()->InAt(0))));
    916 }
    917 
    918 void IntrinsicLocationsBuilderARMVIXL::VisitMemoryPokeLongNative(HInvoke* invoke) {
    919   CreateIntIntToVoidLocations(allocator_, invoke);
    920 }
    921 
    922 void IntrinsicCodeGeneratorARMVIXL::VisitMemoryPokeLongNative(HInvoke* invoke) {
    923   ArmVIXLAssembler* assembler = GetAssembler();
    924   // Ignore upper 4B of long address.
    925   vixl32::Register addr = LowRegisterFrom(invoke->GetLocations()->InAt(0));
    926   // Worst case: Control register bit SCTLR.A = 0. Then unaligned accesses throw a processor
     927   // exception. So we can't use strd as addr may be unaligned.
    928   __ Str(LowRegisterFrom(invoke->GetLocations()->InAt(1)), MemOperand(addr));
    929   __ Str(HighRegisterFrom(invoke->GetLocations()->InAt(1)), MemOperand(addr, 4));
    930 }
    931 
    932 void IntrinsicLocationsBuilderARMVIXL::VisitMemoryPokeShortNative(HInvoke* invoke) {
    933   CreateIntIntToVoidLocations(allocator_, invoke);
    934 }
    935 
    936 void IntrinsicCodeGeneratorARMVIXL::VisitMemoryPokeShortNative(HInvoke* invoke) {
    937   ArmVIXLAssembler* assembler = GetAssembler();
    938   __ Strh(InputRegisterAt(invoke, 1), MemOperand(LowRegisterFrom(invoke->GetLocations()->InAt(0))));
    939 }
    940 
    941 void IntrinsicLocationsBuilderARMVIXL::VisitThreadCurrentThread(HInvoke* invoke) {
    942   LocationSummary* locations =
    943       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    944   locations->SetOut(Location::RequiresRegister());
    945 }
    946 
    947 void IntrinsicCodeGeneratorARMVIXL::VisitThreadCurrentThread(HInvoke* invoke) {
    948   ArmVIXLAssembler* assembler = GetAssembler();
    949   __ Ldr(OutputRegister(invoke),
    950          MemOperand(tr, Thread::PeerOffset<kArmPointerSize>().Int32Value()));
    951 }
    952 
    953 static void GenUnsafeGet(HInvoke* invoke,
    954                          DataType::Type type,
    955                          bool is_volatile,
    956                          CodeGeneratorARMVIXL* codegen) {
    957   LocationSummary* locations = invoke->GetLocations();
    958   ArmVIXLAssembler* assembler = codegen->GetAssembler();
    959   Location base_loc = locations->InAt(1);
    960   vixl32::Register base = InputRegisterAt(invoke, 1);     // Object pointer.
    961   Location offset_loc = locations->InAt(2);
    962   vixl32::Register offset = LowRegisterFrom(offset_loc);  // Long offset, lo part only.
    963   Location trg_loc = locations->Out();
    964 
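           // For volatile gets, the DMB ISH issued after the load provides the required
           // acquire ordering.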
    965   switch (type) {
    966     case DataType::Type::kInt32: {
    967       vixl32::Register trg = RegisterFrom(trg_loc);
    968       __ Ldr(trg, MemOperand(base, offset));
    969       if (is_volatile) {
    970         __ Dmb(vixl32::ISH);
    971       }
    972       break;
    973     }
    974 
    975     case DataType::Type::kReference: {
    976       vixl32::Register trg = RegisterFrom(trg_loc);
    977       if (kEmitCompilerReadBarrier) {
    978         if (kUseBakerReadBarrier) {
    979           Location temp = locations->GetTemp(0);
    980           codegen->GenerateReferenceLoadWithBakerReadBarrier(
    981               invoke, trg_loc, base, 0U, offset_loc, TIMES_1, temp, /* needs_null_check */ false);
    982           if (is_volatile) {
    983             __ Dmb(vixl32::ISH);
    984           }
    985         } else {
    986           __ Ldr(trg, MemOperand(base, offset));
    987           if (is_volatile) {
    988             __ Dmb(vixl32::ISH);
    989           }
    990           codegen->GenerateReadBarrierSlow(invoke, trg_loc, trg_loc, base_loc, 0U, offset_loc);
    991         }
    992       } else {
    993         __ Ldr(trg, MemOperand(base, offset));
    994         if (is_volatile) {
    995           __ Dmb(vixl32::ISH);
    996         }
    997         assembler->MaybeUnpoisonHeapReference(trg);
    998       }
    999       break;
   1000     }
   1001 
   1002     case DataType::Type::kInt64: {
   1003       vixl32::Register trg_lo = LowRegisterFrom(trg_loc);
   1004       vixl32::Register trg_hi = HighRegisterFrom(trg_loc);
   1005       if (is_volatile && !codegen->GetInstructionSetFeatures().HasAtomicLdrdAndStrd()) {
   1006         UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
   1007         const vixl32::Register temp_reg = temps.Acquire();
   1008         __ Add(temp_reg, base, offset);
   1009         __ Ldrexd(trg_lo, trg_hi, MemOperand(temp_reg));
   1010       } else {
   1011         __ Ldrd(trg_lo, trg_hi, MemOperand(base, offset));
   1012       }
   1013       if (is_volatile) {
   1014         __ Dmb(vixl32::ISH);
   1015       }
   1016       break;
   1017     }
   1018 
   1019     default:
   1020       LOG(FATAL) << "Unexpected type " << type;
   1021       UNREACHABLE();
   1022   }
   1023 }
   1024 
   1025 static void CreateIntIntIntToIntLocations(ArenaAllocator* allocator,
   1026                                           HInvoke* invoke,
   1027                                           DataType::Type type) {
   1028   bool can_call = kEmitCompilerReadBarrier &&
   1029       (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject ||
   1030        invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile);
   1031   LocationSummary* locations =
   1032       new (allocator) LocationSummary(invoke,
   1033                                       can_call
   1034                                           ? LocationSummary::kCallOnSlowPath
   1035                                           : LocationSummary::kNoCall,
   1036                                       kIntrinsified);
   1037   if (can_call && kUseBakerReadBarrier) {
   1038     locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
   1039   }
   1040   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   1041   locations->SetInAt(1, Location::RequiresRegister());
   1042   locations->SetInAt(2, Location::RequiresRegister());
   1043   locations->SetOut(Location::RequiresRegister(),
   1044                     (can_call ? Location::kOutputOverlap : Location::kNoOutputOverlap));
   1045   if (type == DataType::Type::kReference && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
   1046     // We need a temporary register for the read barrier marking slow
   1047     // path in CodeGeneratorARMVIXL::GenerateReferenceLoadWithBakerReadBarrier.
   1048     locations->AddTemp(Location::RequiresRegister());
   1049   }
   1050 }
   1051 
   1052 void IntrinsicLocationsBuilderARMVIXL::VisitUnsafeGet(HInvoke* invoke) {
   1053   CreateIntIntIntToIntLocations(allocator_, invoke, DataType::Type::kInt32);
   1054 }
   1055 void IntrinsicLocationsBuilderARMVIXL::VisitUnsafeGetVolatile(HInvoke* invoke) {
   1056   CreateIntIntIntToIntLocations(allocator_, invoke, DataType::Type::kInt32);
   1057 }
   1058 void IntrinsicLocationsBuilderARMVIXL::VisitUnsafeGetLong(HInvoke* invoke) {
   1059   CreateIntIntIntToIntLocations(allocator_, invoke, DataType::Type::kInt64);
   1060 }
   1061 void IntrinsicLocationsBuilderARMVIXL::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
   1062   CreateIntIntIntToIntLocations(allocator_, invoke, DataType::Type::kInt64);
   1063 }
   1064 void IntrinsicLocationsBuilderARMVIXL::VisitUnsafeGetObject(HInvoke* invoke) {
   1065   CreateIntIntIntToIntLocations(allocator_, invoke, DataType::Type::kReference);
   1066 }
   1067 void IntrinsicLocationsBuilderARMVIXL::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
   1068   CreateIntIntIntToIntLocations(allocator_, invoke, DataType::Type::kReference);
   1069 }
   1070 
   1071 void IntrinsicCodeGeneratorARMVIXL::VisitUnsafeGet(HInvoke* invoke) {
   1072   GenUnsafeGet(invoke, DataType::Type::kInt32, /* is_volatile */ false, codegen_);
   1073 }
   1074 void IntrinsicCodeGeneratorARMVIXL::VisitUnsafeGetVolatile(HInvoke* invoke) {
   1075   GenUnsafeGet(invoke, DataType::Type::kInt32, /* is_volatile */ true, codegen_);
   1076 }
   1077 void IntrinsicCodeGeneratorARMVIXL::VisitUnsafeGetLong(HInvoke* invoke) {
   1078   GenUnsafeGet(invoke, DataType::Type::kInt64, /* is_volatile */ false, codegen_);
   1079 }
   1080 void IntrinsicCodeGeneratorARMVIXL::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
   1081   GenUnsafeGet(invoke, DataType::Type::kInt64, /* is_volatile */ true, codegen_);
   1082 }
   1083 void IntrinsicCodeGeneratorARMVIXL::VisitUnsafeGetObject(HInvoke* invoke) {
   1084   GenUnsafeGet(invoke, DataType::Type::kReference, /* is_volatile */ false, codegen_);
   1085 }
   1086 void IntrinsicCodeGeneratorARMVIXL::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
   1087   GenUnsafeGet(invoke, DataType::Type::kReference, /* is_volatile */ true, codegen_);
   1088 }
   1089 
   1090 static void CreateIntIntIntIntToVoid(ArenaAllocator* allocator,
   1091                                      const ArmInstructionSetFeatures& features,
   1092                                      DataType::Type type,
   1093                                      bool is_volatile,
   1094                                      HInvoke* invoke) {
   1095   LocationSummary* locations =
   1096       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   1097   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   1098   locations->SetInAt(1, Location::RequiresRegister());
   1099   locations->SetInAt(2, Location::RequiresRegister());
   1100   locations->SetInAt(3, Location::RequiresRegister());
   1101 
   1102   if (type == DataType::Type::kInt64) {
   1103     // Potentially need temps for ldrexd-strexd loop.
   1104     if (is_volatile && !features.HasAtomicLdrdAndStrd()) {
   1105       locations->AddTemp(Location::RequiresRegister());  // Temp_lo.
   1106       locations->AddTemp(Location::RequiresRegister());  // Temp_hi.
   1107     }
   1108   } else if (type == DataType::Type::kReference) {
   1109     // Temps for card-marking.
   1110     locations->AddTemp(Location::RequiresRegister());  // Temp.
   1111     locations->AddTemp(Location::RequiresRegister());  // Card.
   1112   }
   1113 }
   1114 
   1115 void IntrinsicLocationsBuilderARMVIXL::VisitUnsafePut(HInvoke* invoke) {
   1116   CreateIntIntIntIntToVoid(
   1117       allocator_, features_, DataType::Type::kInt32, /* is_volatile */ false, invoke);
   1118 }
   1119 void IntrinsicLocationsBuilderARMVIXL::VisitUnsafePutOrdered(HInvoke* invoke) {
   1120   CreateIntIntIntIntToVoid(
   1121       allocator_, features_, DataType::Type::kInt32, /* is_volatile */ false, invoke);
   1122 }
   1123 void IntrinsicLocationsBuilderARMVIXL::VisitUnsafePutVolatile(HInvoke* invoke) {
   1124   CreateIntIntIntIntToVoid(
   1125       allocator_, features_, DataType::Type::kInt32, /* is_volatile */ true, invoke);
   1126 }
   1127 void IntrinsicLocationsBuilderARMVIXL::VisitUnsafePutObject(HInvoke* invoke) {
   1128   CreateIntIntIntIntToVoid(
   1129       allocator_, features_, DataType::Type::kReference, /* is_volatile */ false, invoke);
   1130 }
   1131 void IntrinsicLocationsBuilderARMVIXL::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
   1132   CreateIntIntIntIntToVoid(
   1133       allocator_, features_, DataType::Type::kReference, /* is_volatile */ false, invoke);
   1134 }
   1135 void IntrinsicLocationsBuilderARMVIXL::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
   1136   CreateIntIntIntIntToVoid(
   1137       allocator_, features_, DataType::Type::kReference, /* is_volatile */ true, invoke);
   1138 }
   1139 void IntrinsicLocationsBuilderARMVIXL::VisitUnsafePutLong(HInvoke* invoke) {
   1140   CreateIntIntIntIntToVoid(
   1141       allocator_, features_, DataType::Type::kInt64, /* is_volatile */ false, invoke);
   1142 }
   1143 void IntrinsicLocationsBuilderARMVIXL::VisitUnsafePutLongOrdered(HInvoke* invoke) {
   1144   CreateIntIntIntIntToVoid(
   1145       allocator_, features_, DataType::Type::kInt64, /* is_volatile */ false, invoke);
   1146 }
   1147 void IntrinsicLocationsBuilderARMVIXL::VisitUnsafePutLongVolatile(HInvoke* invoke) {
   1148   CreateIntIntIntIntToVoid(
   1149       allocator_, features_, DataType::Type::kInt64, /* is_volatile */ true, invoke);
   1150 }
   1151 
   1152 static void GenUnsafePut(LocationSummary* locations,
   1153                          DataType::Type type,
   1154                          bool is_volatile,
   1155                          bool is_ordered,
   1156                          CodeGeneratorARMVIXL* codegen) {
   1157   ArmVIXLAssembler* assembler = codegen->GetAssembler();
   1158 
   1159   vixl32::Register base = RegisterFrom(locations->InAt(1));       // Object pointer.
   1160   vixl32::Register offset = LowRegisterFrom(locations->InAt(2));  // Long offset, lo part only.
   1161   vixl32::Register value;
   1162 
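           // Both ordered and volatile puts need a DMB ISH before the store (release ordering);
           // volatile puts issue a second DMB ISH after the store.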
   1163   if (is_volatile || is_ordered) {
   1164     __ Dmb(vixl32::ISH);
   1165   }
   1166 
   1167   if (type == DataType::Type::kInt64) {
   1168     vixl32::Register value_lo = LowRegisterFrom(locations->InAt(3));
   1169     vixl32::Register value_hi = HighRegisterFrom(locations->InAt(3));
   1170     value = value_lo;
   1171     if (is_volatile && !codegen->GetInstructionSetFeatures().HasAtomicLdrdAndStrd()) {
   1172       vixl32::Register temp_lo = RegisterFrom(locations->GetTemp(0));
   1173       vixl32::Register temp_hi = RegisterFrom(locations->GetTemp(1));
   1174       UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
   1175       const vixl32::Register temp_reg = temps.Acquire();
   1176 
   1177       __ Add(temp_reg, base, offset);
   1178       vixl32::Label loop_head;
   1179       __ Bind(&loop_head);
   1180       __ Ldrexd(temp_lo, temp_hi, MemOperand(temp_reg));
   1181       __ Strexd(temp_lo, value_lo, value_hi, MemOperand(temp_reg));
   1182       __ Cmp(temp_lo, 0);
   1183       __ B(ne, &loop_head, /* far_target */ false);
   1184     } else {
   1185       __ Strd(value_lo, value_hi, MemOperand(base, offset));
   1186     }
   1187   } else {
   1188     value = RegisterFrom(locations->InAt(3));
   1189     vixl32::Register source = value;
   1190     if (kPoisonHeapReferences && type == DataType::Type::kReference) {
   1191       vixl32::Register temp = RegisterFrom(locations->GetTemp(0));
   1192       __ Mov(temp, value);
   1193       assembler->PoisonHeapReference(temp);
   1194       source = temp;
   1195     }
   1196     __ Str(source, MemOperand(base, offset));
   1197   }
   1198 
   1199   if (is_volatile) {
   1200     __ Dmb(vixl32::ISH);
   1201   }
   1202 
   1203   if (type == DataType::Type::kReference) {
   1204     vixl32::Register temp = RegisterFrom(locations->GetTemp(0));
   1205     vixl32::Register card = RegisterFrom(locations->GetTemp(1));
   1206     bool value_can_be_null = true;  // TODO: Worth finding out this information?
   1207     codegen->MarkGCCard(temp, card, base, value, value_can_be_null);
   1208   }
   1209 }
   1210 
   1211 void IntrinsicCodeGeneratorARMVIXL::VisitUnsafePut(HInvoke* invoke) {
   1212   GenUnsafePut(invoke->GetLocations(),
   1213                DataType::Type::kInt32,
   1214                /* is_volatile */ false,
   1215                /* is_ordered */ false,
   1216                codegen_);
   1217 }
   1218 void IntrinsicCodeGeneratorARMVIXL::VisitUnsafePutOrdered(HInvoke* invoke) {
   1219   GenUnsafePut(invoke->GetLocations(),
   1220                DataType::Type::kInt32,
   1221                /* is_volatile */ false,
   1222                /* is_ordered */ true,
   1223                codegen_);
   1224 }
   1225 void IntrinsicCodeGeneratorARMVIXL::VisitUnsafePutVolatile(HInvoke* invoke) {
   1226   GenUnsafePut(invoke->GetLocations(),
   1227                DataType::Type::kInt32,
   1228                /* is_volatile */ true,
   1229                /* is_ordered */ false,
   1230                codegen_);
   1231 }
   1232 void IntrinsicCodeGeneratorARMVIXL::VisitUnsafePutObject(HInvoke* invoke) {
   1233   GenUnsafePut(invoke->GetLocations(),
   1234                DataType::Type::kReference,
   1235                /* is_volatile */ false,
   1236                /* is_ordered */ false,
   1237                codegen_);
   1238 }
   1239 void IntrinsicCodeGeneratorARMVIXL::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
   1240   GenUnsafePut(invoke->GetLocations(),
   1241                DataType::Type::kReference,
   1242                /* is_volatile */ false,
   1243                /* is_ordered */ true,
   1244                codegen_);
   1245 }
   1246 void IntrinsicCodeGeneratorARMVIXL::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
   1247   GenUnsafePut(invoke->GetLocations(),
   1248                DataType::Type::kReference,
   1249                /* is_volatile */ true,
   1250                /* is_ordered */ false,
   1251                codegen_);
   1252 }
   1253 void IntrinsicCodeGeneratorARMVIXL::VisitUnsafePutLong(HInvoke* invoke) {
   1254   GenUnsafePut(invoke->GetLocations(),
   1255                DataType::Type::kInt64,
   1256                /* is_volatile */ false,
   1257                /* is_ordered */ false,
   1258                codegen_);
   1259 }
   1260 void IntrinsicCodeGeneratorARMVIXL::VisitUnsafePutLongOrdered(HInvoke* invoke) {
   1261   GenUnsafePut(invoke->GetLocations(),
   1262                DataType::Type::kInt64,
   1263                /* is_volatile */ false,
   1264                /* is_ordered */ true,
   1265                codegen_);
   1266 }
   1267 void IntrinsicCodeGeneratorARMVIXL::VisitUnsafePutLongVolatile(HInvoke* invoke) {
   1268   GenUnsafePut(invoke->GetLocations(),
   1269                DataType::Type::kInt64,
   1270                /* is_volatile */ true,
   1271                /* is_ordered */ false,
   1272                codegen_);
   1273 }
   1274 
   1275 static void CreateIntIntIntIntIntToIntPlusTemps(ArenaAllocator* allocator,
   1276                                                 HInvoke* invoke,
   1277                                                 DataType::Type type) {
   1278   bool can_call = kEmitCompilerReadBarrier &&
   1279       kUseBakerReadBarrier &&
   1280       (invoke->GetIntrinsic() == Intrinsics::kUnsafeCASObject);
   1281   LocationSummary* locations =
   1282       new (allocator) LocationSummary(invoke,
   1283                                       can_call
   1284                                           ? LocationSummary::kCallOnSlowPath
   1285                                           : LocationSummary::kNoCall,
   1286                                       kIntrinsified);
   1287   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   1288   locations->SetInAt(1, Location::RequiresRegister());
   1289   locations->SetInAt(2, Location::RequiresRegister());
   1290   locations->SetInAt(3, Location::RequiresRegister());
   1291   locations->SetInAt(4, Location::RequiresRegister());
   1292 
   1293   // If heap poisoning is enabled, we don't want the unpoisoning
   1294   // operations to potentially clobber the output. Likewise when
   1295   // emitting a (Baker) read barrier, which may call.
   1296   Location::OutputOverlap overlaps =
   1297       ((kPoisonHeapReferences && type == DataType::Type::kReference) || can_call)
   1298       ? Location::kOutputOverlap
   1299       : Location::kNoOutputOverlap;
   1300   locations->SetOut(Location::RequiresRegister(), overlaps);
   1301 
   1302   // Temporary registers used in CAS. In the object case
   1303   // (UnsafeCASObject intrinsic), these are also used for
   1304   // card-marking, and possibly for (Baker) read barrier.
   1305   locations->AddTemp(Location::RequiresRegister());  // Pointer.
   1306   locations->AddTemp(Location::RequiresRegister());  // Temp 1.
   1307 }
   1308 
   1309 static void GenCas(HInvoke* invoke, DataType::Type type, CodeGeneratorARMVIXL* codegen) {
   1310   DCHECK_NE(type, DataType::Type::kInt64);
   1311 
   1312   ArmVIXLAssembler* assembler = codegen->GetAssembler();
   1313   LocationSummary* locations = invoke->GetLocations();
   1314 
   1315   Location out_loc = locations->Out();
   1316   vixl32::Register out = OutputRegister(invoke);                      // Boolean result.
   1317 
   1318   vixl32::Register base = InputRegisterAt(invoke, 1);                 // Object pointer.
   1319   Location offset_loc = locations->InAt(2);
   1320   vixl32::Register offset = LowRegisterFrom(offset_loc);              // Offset (discard high 4B).
   1321   vixl32::Register expected = InputRegisterAt(invoke, 3);             // Expected.
   1322   vixl32::Register value = InputRegisterAt(invoke, 4);                // Value.
   1323 
   1324   Location tmp_ptr_loc = locations->GetTemp(0);
   1325   vixl32::Register tmp_ptr = RegisterFrom(tmp_ptr_loc);               // Pointer to actual memory.
   1326   vixl32::Register tmp = RegisterFrom(locations->GetTemp(1));         // Value in memory.
   1327 
   1328   if (type == DataType::Type::kReference) {
   1329     // The only read barrier implementation supporting the
   1330     // UnsafeCASObject intrinsic is the Baker-style read barrier.
   1331     DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
   1332 
   1333     // Mark card for object assuming new value is stored. Worst case we will mark an unchanged
   1334     // object and scan the receiver at the next GC for nothing.
   1335     bool value_can_be_null = true;  // TODO: Worth finding out this information?
   1336     codegen->MarkGCCard(tmp_ptr, tmp, base, value, value_can_be_null);
   1337 
   1338     if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
   1339       // Need to make sure the reference stored in the field is a to-space
   1340       // one before attempting the CAS or the CAS could fail incorrectly.
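              // For example, with the concurrent copying collector the field may still hold the
              // from-space address of the object that `expected` references via its to-space
              // address; comparing the raw pointers would then fail even though the references
              // are logically equal.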
   1341       codegen->UpdateReferenceFieldWithBakerReadBarrier(
   1342           invoke,
   1343           out_loc,  // Unused, used only as a "temporary" within the read barrier.
   1344           base,
   1345           /* field_offset */ offset_loc,
   1346           tmp_ptr_loc,
   1347           /* needs_null_check */ false,
   1348           tmp);
   1349     }
   1350   }
   1351 
   1352   // Prevent reordering with prior memory operations.
   1353   // Emit a DMB ISH instruction instead of a DMB ISHST one, as the
   1354   // latter allows a preceding load to be delayed past the STXR
   1355   // instruction below.
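          // (DMB ISHST only orders stores; a load issued before it, such as the read barrier's
          // load of the field above, could then be observed after the STREX, breaking the
          // ordering compareAndSwap must provide.)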
   1356   __ Dmb(vixl32::ISH);
   1357 
   1358   __ Add(tmp_ptr, base, offset);
   1359 
   1360   if (kPoisonHeapReferences && type == DataType::Type::kReference) {
   1361     codegen->GetAssembler()->PoisonHeapReference(expected);
   1362     if (value.Is(expected)) {
   1363       // Do not poison `value`, as it is the same register as
   1364       // `expected`, which has just been poisoned.
   1365     } else {
   1366       codegen->GetAssembler()->PoisonHeapReference(value);
   1367     }
   1368   }
   1369 
   1370   // do {
   1371   //   tmp = [r_ptr] - expected;
   1372   // } while (tmp == 0 && failure([r_ptr] <- r_new_value));
   1373   // result = tmp != 0;
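          //
          // Roughly, the sequence emitted below for the loop and the result materialization is:
          //
          //   loop_head:
          //     ldrex   tmp, [tmp_ptr]
          //     subs    tmp, tmp, expected
          //     itt     eq
          //     strexeq tmp, value, [tmp_ptr]
          //     cmpeq   tmp, #1
          //     beq     loop_head
          //     dmb     ish
          //     rsbs    out, tmp, #1
          //     it      cc
          //     movcc   out, #0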
   1374 
   1375   vixl32::Label loop_head;
   1376   __ Bind(&loop_head);
   1377 
   1378   __ Ldrex(tmp, MemOperand(tmp_ptr));
   1379 
   1380   __ Subs(tmp, tmp, expected);
   1381 
   1382   {
   1383     ExactAssemblyScope aas(assembler->GetVIXLAssembler(),
   1384                            3 * kMaxInstructionSizeInBytes,
   1385                            CodeBufferCheckScope::kMaximumSize);
   1386 
   1387     __ itt(eq);
   1388     __ strex(eq, tmp, value, MemOperand(tmp_ptr));
   1389     __ cmp(eq, tmp, 1);
   1390   }
   1391 
   1392   __ B(eq, &loop_head, /* far_target */ false);
   1393 
   1394   __ Dmb(vixl32::ISH);
   1395 
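          // out = (tmp == 0): RSBS computes out = 1 - tmp and sets the flags; tmp is zero here
          // exactly when the CAS succeeded. For tmp > 1 the subtraction borrows (carry clear),
          // so the conditional MOV below clamps out to 0.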
   1396   __ Rsbs(out, tmp, 1);
   1397 
   1398   {
   1399     ExactAssemblyScope aas(assembler->GetVIXLAssembler(),
   1400                            2 * kMaxInstructionSizeInBytes,
   1401                            CodeBufferCheckScope::kMaximumSize);
   1402 
   1403     __ it(cc);
   1404     __ mov(cc, out, 0);
   1405   }
   1406 
   1407   if (kPoisonHeapReferences && type == DataType::Type::kReference) {
   1408     codegen->GetAssembler()->UnpoisonHeapReference(expected);
   1409     if (value.Is(expected)) {
   1410       // Do not unpoison `value`, as it is the same register as
   1411       // `expected`, which has just been unpoisoned.
   1412     } else {
   1413       codegen->GetAssembler()->UnpoisonHeapReference(value);
   1414     }
   1415   }
   1416 }
   1417 
   1418 void IntrinsicLocationsBuilderARMVIXL::VisitUnsafeCASInt(HInvoke* invoke) {
   1419   CreateIntIntIntIntIntToIntPlusTemps(allocator_, invoke, DataType::Type::kInt32);
   1420 }
   1421 void IntrinsicLocationsBuilderARMVIXL::VisitUnsafeCASObject(HInvoke* invoke) {
   1422   // The only read barrier implementation supporting the
   1423   // UnsafeCASObject intrinsic is the Baker-style read barrier.
   1424   if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
   1425     return;
   1426   }
   1427 
   1428   CreateIntIntIntIntIntToIntPlusTemps(allocator_, invoke, DataType::Type::kReference);
   1429 }
   1430 void IntrinsicCodeGeneratorARMVIXL::VisitUnsafeCASInt(HInvoke* invoke) {
   1431   GenCas(invoke, DataType::Type::kInt32, codegen_);
   1432 }
   1433 void IntrinsicCodeGeneratorARMVIXL::VisitUnsafeCASObject(HInvoke* invoke) {
   1434   // The only read barrier implementation supporting the
   1435   // UnsafeCASObject intrinsic is the Baker-style read barrier.
   1436   DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
   1437 
   1438   GenCas(invoke, DataType::Type::kReference, codegen_);
   1439 }
   1440 
   1441 void IntrinsicLocationsBuilderARMVIXL::VisitStringCompareTo(HInvoke* invoke) {
   1442   // The inputs plus temporaries.
   1443   LocationSummary* locations =
   1444       new (allocator_) LocationSummary(invoke,
   1445                                        invoke->InputAt(1)->CanBeNull()
   1446                                            ? LocationSummary::kCallOnSlowPath
   1447                                            : LocationSummary::kNoCall,
   1448                                        kIntrinsified);
   1449   locations->SetInAt(0, Location::RequiresRegister());
   1450   locations->SetInAt(1, Location::RequiresRegister());
   1451   locations->AddTemp(Location::RequiresRegister());
   1452   locations->AddTemp(Location::RequiresRegister());
   1453   locations->AddTemp(Location::RequiresRegister());
   1454   // Need an extra temporary register for the String compression feature.
   1455   if (mirror::kUseStringCompression) {
   1456     locations->AddTemp(Location::RequiresRegister());
   1457   }
   1458   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
   1459 }
   1460 
   1461 // Forward declaration.
   1462 //
   1463 // The ART build system imposes a size limit (deviceFrameSizeLimit) on the stack frames
   1464 // generated by the compiler for every C++ function, and if this function gets inlined in
   1465 // IntrinsicCodeGeneratorARMVIXL::VisitStringCompareTo, the limit will be exceeded, resulting
   1466 // in a build failure. That is why the NO_INLINE attribute is used.
   1467 static void NO_INLINE GenerateStringCompareToLoop(ArmVIXLAssembler* assembler,
   1468                                                   HInvoke* invoke,
   1469                                                   vixl32::Label* end,
   1470                                                   vixl32::Label* different_compression);
   1471 
   1472 void IntrinsicCodeGeneratorARMVIXL::VisitStringCompareTo(HInvoke* invoke) {
   1473   ArmVIXLAssembler* assembler = GetAssembler();
   1474   LocationSummary* locations = invoke->GetLocations();
   1475 
   1476   const vixl32::Register str = InputRegisterAt(invoke, 0);
   1477   const vixl32::Register arg = InputRegisterAt(invoke, 1);
   1478   const vixl32::Register out = OutputRegister(invoke);
   1479 
   1480   const vixl32::Register temp0 = RegisterFrom(locations->GetTemp(0));
   1481   const vixl32::Register temp1 = RegisterFrom(locations->GetTemp(1));
   1482   const vixl32::Register temp2 = RegisterFrom(locations->GetTemp(2));
   1483   vixl32::Register temp3;
   1484   if (mirror::kUseStringCompression) {
   1485     temp3 = RegisterFrom(locations->GetTemp(3));
   1486   }
   1487 
   1488   vixl32::Label end;
   1489   vixl32::Label different_compression;
   1490 
   1491   // Get offsets of count and value fields within a string object.
   1492   const int32_t count_offset = mirror::String::CountOffset().Int32Value();
   1493 
   1494   // Note that the null check must have been done earlier.
   1495   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
   1496 
   1497   // Take slow path and throw if input can be and is null.
   1498   SlowPathCodeARMVIXL* slow_path = nullptr;
   1499   const bool can_slow_path = invoke->InputAt(1)->CanBeNull();
   1500   if (can_slow_path) {
   1501     slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARMVIXL(invoke);
   1502     codegen_->AddSlowPath(slow_path);
   1503     __ CompareAndBranchIfZero(arg, slow_path->GetEntryLabel());
   1504   }
   1505 
   1506   // Reference equality check, return 0 if same reference.
   1507   __ Subs(out, str, arg);
   1508   __ B(eq, &end);
   1509 
   1510   if (mirror::kUseStringCompression) {
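            // With string compression, `count` holds (length << 1) | flag, where the least
            // significant bit is 0 for compressed (8-bit) and 1 for uncompressed (16-bit) data.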
   1511     // Load `count` fields of this and argument strings.
   1512     __ Ldr(temp3, MemOperand(str, count_offset));
   1513     __ Ldr(temp2, MemOperand(arg, count_offset));
   1514     // Extract lengths from the `count` fields.
   1515     __ Lsr(temp0, temp3, 1u);
   1516     __ Lsr(temp1, temp2, 1u);
   1517   } else {
   1518     // Load lengths of this and argument strings.
   1519     __ Ldr(temp0, MemOperand(str, count_offset));
   1520     __ Ldr(temp1, MemOperand(arg, count_offset));
   1521   }
   1522   // out = length diff.
   1523   __ Subs(out, temp0, temp1);
   1524   // temp0 = min(len(str), len(arg)).
   1525 
   1526   {
   1527     ExactAssemblyScope aas(assembler->GetVIXLAssembler(),
   1528                            2 * kMaxInstructionSizeInBytes,
   1529                            CodeBufferCheckScope::kMaximumSize);
   1530 
   1531     __ it(gt);
   1532     __ mov(gt, temp0, temp1);
   1533   }
   1534 
   1535   // Shorter string is empty?
   1536   // Note that mirror::kUseStringCompression==true introduces lots of instructions,
   1537   // which moves the &end label far away from this branch and makes it not 'CBZ-encodable'.
   1538   __ CompareAndBranchIfZero(temp0, &end, mirror::kUseStringCompression);
   1539 
   1540   if (mirror::kUseStringCompression) {
   1541     // Check whether both strings use the same compression style; only then can this loop be used.
   1542     __ Eors(temp2, temp2, temp3);
   1543     __ Lsrs(temp2, temp2, 1u);
   1544     __ B(cs, &different_compression);
   1545     // For string compression, calculate the number of bytes to compare (not chars).
   1546     // This could in theory exceed INT32_MAX, so treat temp0 as unsigned.
   1547     __ Lsls(temp3, temp3, 31u);  // Extract purely the compression flag.
   1548 
   1549     ExactAssemblyScope aas(assembler->GetVIXLAssembler(),
   1550                            2 * kMaxInstructionSizeInBytes,
   1551                            CodeBufferCheckScope::kMaximumSize);
   1552 
   1553     __ it(ne);
   1554     __ add(ne, temp0, temp0, temp0);
   1555   }
   1556 
   1557 
   1558   GenerateStringCompareToLoop(assembler, invoke, &end, &different_compression);
   1559 
   1560   __ Bind(&end);
   1561 
   1562   if (can_slow_path) {
   1563     __ Bind(slow_path->GetExitLabel());
   1564   }
   1565 }
   1566 
   1567 static void GenerateStringCompareToLoop(ArmVIXLAssembler* assembler,
   1568                                         HInvoke* invoke,
   1569                                         vixl32::Label* end,
   1570                                         vixl32::Label* different_compression) {
   1571   LocationSummary* locations = invoke->GetLocations();
   1572 
   1573   const vixl32::Register str = InputRegisterAt(invoke, 0);
   1574   const vixl32::Register arg = InputRegisterAt(invoke, 1);
   1575   const vixl32::Register out = OutputRegister(invoke);
   1576 
   1577   const vixl32::Register temp0 = RegisterFrom(locations->GetTemp(0));
   1578   const vixl32::Register temp1 = RegisterFrom(locations->GetTemp(1));
   1579   const vixl32::Register temp2 = RegisterFrom(locations->GetTemp(2));
   1580   vixl32::Register temp3;
   1581   if (mirror::kUseStringCompression) {
   1582     temp3 = RegisterFrom(locations->GetTemp(3));
   1583   }
   1584 
   1585   vixl32::Label loop;
   1586   vixl32::Label find_char_diff;
   1587 
   1588   const int32_t value_offset = mirror::String::ValueOffset().Int32Value();
   1589   // Store offset of string value in preparation for comparison loop.
   1590   __ Mov(temp1, value_offset);
   1591 
   1592   // Assertions that must hold in order to compare multiple characters at a time.
   1593   CHECK_ALIGNED(value_offset, 8);
   1594   static_assert(IsAligned<8>(kObjectAlignment),
   1595                 "String data must be 8-byte aligned for unrolled CompareTo loop.");
   1596 
   1597   const unsigned char_size = DataType::Size(DataType::Type::kUint16);
   1598   DCHECK_EQ(char_size, 2u);
   1599 
   1600   UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
   1601 
   1602   vixl32::Label find_char_diff_2nd_cmp;
   1603   // Unrolled loop comparing 4x16-bit chars per iteration (ok because of string data alignment).
   1604   __ Bind(&loop);
   1605   vixl32::Register temp_reg = temps.Acquire();
   1606   __ Ldr(temp_reg, MemOperand(str, temp1));
   1607   __ Ldr(temp2, MemOperand(arg, temp1));
   1608   __ Cmp(temp_reg, temp2);
   1609   __ B(ne, &find_char_diff, /* far_target */ false);
   1610   __ Add(temp1, temp1, char_size * 2);
   1611 
   1612   __ Ldr(temp_reg, MemOperand(str, temp1));
   1613   __ Ldr(temp2, MemOperand(arg, temp1));
   1614   __ Cmp(temp_reg, temp2);
   1615   __ B(ne, &find_char_diff_2nd_cmp, /* far_target */ false);
   1616   __ Add(temp1, temp1, char_size * 2);
   1617   // With string compression, we have compared 8 bytes, otherwise 4 chars.
   1618   __ Subs(temp0, temp0, (mirror::kUseStringCompression ? 8 : 4));
   1619   __ B(hi, &loop, /* far_target */ false);
   1620   __ B(end);
   1621 
   1622   __ Bind(&find_char_diff_2nd_cmp);
   1623   if (mirror::kUseStringCompression) {
   1624     __ Subs(temp0, temp0, 4);  // 4 bytes previously compared.
   1625     __ B(ls, end, /* far_target */ false);  // Was the second comparison fully beyond the end?
   1626   } else {
   1627     // Without string compression, we can start treating temp0 as signed
   1628     // and rely on the signed comparison below.
   1629     __ Sub(temp0, temp0, 2);
   1630   }
   1631 
   1632   // Find the single character difference.
   1633   __ Bind(&find_char_diff);
   1634   // Get the bit position of the first character that differs.
   1635   __ Eor(temp1, temp2, temp_reg);
   1636   __ Rbit(temp1, temp1);
   1637   __ Clz(temp1, temp1);
   1638 
   1639   // temp0 = number of characters remaining to compare.
   1640   // (Without string compression, it could be < 1 if a difference is found by the second CMP
   1641   // in the comparison loop, and after the end of the shorter string data).
   1642 
   1643   // Without string compression (temp1 >> 4) = character where difference occurs between the last
   1644   // two words compared, in the interval [0,1].
   1645   // (0 for low half-word different, 1 for high half-word different).
   1646   // With string compression, (temp1 >> 3) = byte where the difference occurs,
   1647   // in the interval [0,3].
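          // For example, without compression: if the strings first differ in the upper half-word
          // of the last words compared, the lowest set bit of the XOR lies in bits [16, 31], so
          // Rbit+Clz leave temp1 in [16, 31] and (temp1 >> 4) == 1.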
   1648 
   1649   // If temp0 <= (temp1 >> (kUseStringCompression ? 3 : 4)), the difference occurs outside
   1650   // the remaining string data, so just return length diff (out).
   1651   // The comparison is unsigned for string compression, otherwise signed.
   1652   __ Cmp(temp0, Operand(temp1, vixl32::LSR, (mirror::kUseStringCompression ? 3 : 4)));
   1653   __ B((mirror::kUseStringCompression ? ls : le), end, /* far_target */ false);
   1654 
   1655   // Extract the characters and calculate the difference.
   1656   if (mirror::kUseStringCompression) {
   1657     // For compressed strings we need to clear 0x7 from temp1, for uncompressed we need to clear
   1658     // 0xf. We also need to prepare the character extraction mask `uncompressed ? 0xffffu : 0xffu`.
   1659     // The compression flag is now in the highest bit of temp3, so let's play some tricks.
   1660     __ Orr(temp3, temp3, 0xffu << 23);                  // uncompressed ? 0xff800000u : 0x7ff80000u
   1661     __ Bic(temp1, temp1, Operand(temp3, vixl32::LSR, 31 - 3));  // &= ~(uncompressed ? 0xfu : 0x7u)
   1662     __ Asr(temp3, temp3, 7u);                           // uncompressed ? 0xffff0000u : 0xff0000u.
   1663     __ Lsr(temp2, temp2, temp1);                        // Extract second character.
   1664     __ Lsr(temp3, temp3, 16u);                          // uncompressed ? 0xffffu : 0xffu
   1665     __ Lsr(out, temp_reg, temp1);                       // Extract first character.
   1666     __ And(temp2, temp2, temp3);
   1667     __ And(out, out, temp3);
   1668   } else {
   1669     __ Bic(temp1, temp1, 0xf);
   1670     __ Lsr(temp2, temp2, temp1);
   1671     __ Lsr(out, temp_reg, temp1);
   1672     __ Movt(temp2, 0);
   1673     __ Movt(out, 0);
   1674   }
   1675 
   1676   __ Sub(out, out, temp2);
   1677   temps.Release(temp_reg);
   1678 
   1679   if (mirror::kUseStringCompression) {
   1680     __ B(end);
   1681     __ Bind(different_compression);
   1682 
   1683     // Comparison for different compression style.
   1684     const size_t c_char_size = DataType::Size(DataType::Type::kInt8);
   1685     DCHECK_EQ(c_char_size, 1u);
   1686 
   1687     // We want to free up temp3, currently holding `str.count`, for comparison.
   1688     // So, we move it to the bottom bit of the iteration count `temp0`, which we then
   1689     // need to treat as unsigned. Start by freeing the bit with an ADD and continue
   1690     // further down by a LSRS+SBC which will flip the meaning of the flag but allow
   1691     // `subs temp0, #2; bhi different_compression_loop` to serve as the loop condition.
   1692     __ Add(temp0, temp0, temp0);              // Unlike LSL, this ADD is always 16-bit.
   1693     // `temp1` will hold the compressed data pointer, `temp2` the uncompressed data pointer.
   1694     __ Mov(temp1, str);
   1695     __ Mov(temp2, arg);
   1696     __ Lsrs(temp3, temp3, 1u);                // Continue the move of the compression flag.
   1697     {
   1698       ExactAssemblyScope aas(assembler->GetVIXLAssembler(),
   1699                              3 * kMaxInstructionSizeInBytes,
   1700                              CodeBufferCheckScope::kMaximumSize);
   1701       __ itt(cs);                             // Interleave with selection of temp1 and temp2.
   1702       __ mov(cs, temp1, arg);                 // Preserves flags.
   1703       __ mov(cs, temp2, str);                 // Preserves flags.
   1704     }
   1705     __ Sbc(temp0, temp0, 0);                  // Complete the move of the compression flag.
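            // temp0 is now 2 * min_length - (`str` compressed ? 1 : 0); its low bit therefore
            // holds the inverted `str` compression flag, which equals `arg`'s flag since the
            // compression styles differ.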
   1706 
   1707     // Adjust temp1 and temp2 from string pointers to data pointers.
   1708     __ Add(temp1, temp1, value_offset);
   1709     __ Add(temp2, temp2, value_offset);
   1710 
   1711     vixl32::Label different_compression_loop;
   1712     vixl32::Label different_compression_diff;
   1713 
   1714     // Main loop for different compression.
   1715     temp_reg = temps.Acquire();
   1716     __ Bind(&different_compression_loop);
   1717     __ Ldrb(temp_reg, MemOperand(temp1, c_char_size, PostIndex));
   1718     __ Ldrh(temp3, MemOperand(temp2, char_size, PostIndex));
   1719     __ Cmp(temp_reg, temp3);
   1720     __ B(ne, &different_compression_diff, /* far_target */ false);
   1721     __ Subs(temp0, temp0, 2);
   1722     __ B(hi, &different_compression_loop, /* far_target */ false);
   1723     __ B(end);
   1724 
   1725     // Calculate the difference.
   1726     __ Bind(&different_compression_diff);
   1727     __ Sub(out, temp_reg, temp3);
   1728     temps.Release(temp_reg);
   1729     // Flip the difference if the `arg` is compressed.
   1730     // `temp0` contains the inverted `str` compression flag, i.e. the same as `arg`'s flag.
   1731     __ Lsrs(temp0, temp0, 1u);
   1732     static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
   1733                   "Expecting 0=compressed, 1=uncompressed");
   1734 
   1735     ExactAssemblyScope aas(assembler->GetVIXLAssembler(),
   1736                            2 * kMaxInstructionSizeInBytes,
   1737                            CodeBufferCheckScope::kMaximumSize);
   1738     __ it(cc);
   1739     __ rsb(cc, out, out, 0);
   1740   }
   1741 }
   1742 
   1743 // The cutoff for unrolling the loop in the String.equals() intrinsic for const strings.
   1744 // The normal loop plus the pre-header is 9 instructions (18-26 bytes) without string compression
   1745 // and 12 instructions (24-32 bytes) with string compression. We can compare up to 4 bytes in 4
   1746 // instructions (LDR+LDR+CMP+BNE) and up to 8 bytes in 6 instructions (LDRD+LDRD+CMP+BNE+CMP+BNE).
   1747 // Allow up to 12 instructions (32 bytes) for the unrolled loop.
   1748 constexpr size_t kShortConstStringEqualsCutoffInBytes = 16;
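        // For example, at the cutoff a 16-byte compressed (or 8-char uncompressed) const string
        // is compared as two 8-byte groups (LDRD+LDRD+CMP+BNE+CMP+BNE each), i.e. the 12
        // instructions budgeted above.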
   1749 
   1750 static const char* GetConstString(HInstruction* candidate, uint32_t* utf16_length) {
   1751   if (candidate->IsLoadString()) {
   1752     HLoadString* load_string = candidate->AsLoadString();
   1753     const DexFile& dex_file = load_string->GetDexFile();
   1754     return dex_file.StringDataAndUtf16LengthByIdx(load_string->GetStringIndex(), utf16_length);
   1755   }
   1756   return nullptr;
   1757 }
   1758 
   1759 void IntrinsicLocationsBuilderARMVIXL::VisitStringEquals(HInvoke* invoke) {
   1760   if (kEmitCompilerReadBarrier &&
   1761       !StringEqualsOptimizations(invoke).GetArgumentIsString() &&
   1762       !StringEqualsOptimizations(invoke).GetNoReadBarrierForStringClass()) {
   1763     // No support for this odd case (String class is moveable, not in the boot image).
   1764     return;
   1765   }
   1766 
   1767   LocationSummary* locations =
   1768       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   1769   InvokeRuntimeCallingConventionARMVIXL calling_convention;
   1770   locations->SetInAt(0, Location::RequiresRegister());
   1771   locations->SetInAt(1, Location::RequiresRegister());
   1772 
   1773   // Temporary registers to store lengths of strings and for calculations.
   1774   // Using instruction cbz requires a low register, so explicitly set a temp to be R0.
   1775   locations->AddTemp(LocationFrom(r0));
   1776 
   1777   // For the generic implementation and for long const strings we need an extra temporary.
   1778   // We do not need it for short const strings, up to 4 bytes, see code generation below.
   1779   uint32_t const_string_length = 0u;
   1780   const char* const_string = GetConstString(invoke->InputAt(0), &const_string_length);
   1781   if (const_string == nullptr) {
   1782     const_string = GetConstString(invoke->InputAt(1), &const_string_length);
   1783   }
   1784   bool is_compressed =
   1785       mirror::kUseStringCompression &&
   1786       const_string != nullptr &&
   1787       mirror::String::DexFileStringAllASCII(const_string, const_string_length);
   1788   if (const_string == nullptr || const_string_length > (is_compressed ? 4u : 2u)) {
   1789     locations->AddTemp(Location::RequiresRegister());
   1790   }
   1791 
   1792   // TODO: If the String.equals() is used only for an immediately following HIf, we can
   1793   // mark it as emitted-at-use-site and emit branches directly to the appropriate blocks.
   1794   // Then we shall need an extra temporary register instead of the output register.
   1795   locations->SetOut(Location::RequiresRegister());
   1796 }
   1797 
   1798 void IntrinsicCodeGeneratorARMVIXL::VisitStringEquals(HInvoke* invoke) {
   1799   ArmVIXLAssembler* assembler = GetAssembler();
   1800   LocationSummary* locations = invoke->GetLocations();
   1801 
   1802   vixl32::Register str = InputRegisterAt(invoke, 0);
   1803   vixl32::Register arg = InputRegisterAt(invoke, 1);
   1804   vixl32::Register out = OutputRegister(invoke);
   1805 
   1806   vixl32::Register temp = RegisterFrom(locations->GetTemp(0));
   1807 
   1808   vixl32::Label loop;
   1809   vixl32::Label end;
   1810   vixl32::Label return_true;
   1811   vixl32::Label return_false;
   1812   vixl32::Label* final_label = codegen_->GetFinalLabel(invoke, &end);
   1813 
   1814   // Get offsets of count, value, and class fields within a string object.
   1815   const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
   1816   const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
   1817   const uint32_t class_offset = mirror::Object::ClassOffset().Uint32Value();
   1818 
   1819   // Note that the null check must have been done earlier.
   1820   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
   1821 
   1822   StringEqualsOptimizations optimizations(invoke);
   1823   if (!optimizations.GetArgumentNotNull()) {
   1824     // Check if input is null, return false if it is.
   1825     __ CompareAndBranchIfZero(arg, &return_false, /* far_target */ false);
   1826   }
   1827 
   1828   // Reference equality check, return true if same reference.
   1829   __ Cmp(str, arg);
   1830   __ B(eq, &return_true, /* far_target */ false);
   1831 
   1832   if (!optimizations.GetArgumentIsString()) {
   1833     // Instanceof check for the argument by comparing class fields.
   1834     // All string objects must have the same type since String cannot be subclassed.
   1835     // Receiver must be a string object, so its class field is equal to all strings' class fields.
   1836     // If the argument is a string object, its class field must be equal to receiver's class field.
   1837     __ Ldr(temp, MemOperand(str, class_offset));
   1838     __ Ldr(out, MemOperand(arg, class_offset));
   1839     __ Cmp(temp, out);
   1840     __ B(ne, &return_false, /* far_target */ false);
   1841   }
   1842 
   1843   // Check if one of the inputs is a const string. Do not special-case both strings
   1844   // being const, such cases should be handled by constant folding if needed.
   1845   uint32_t const_string_length = 0u;
   1846   const char* const_string = GetConstString(invoke->InputAt(0), &const_string_length);
   1847   if (const_string == nullptr) {
   1848     const_string = GetConstString(invoke->InputAt(1), &const_string_length);
   1849     if (const_string != nullptr) {
   1850       std::swap(str, arg);  // Make sure the const string is in `str`.
   1851     }
   1852   }
   1853   bool is_compressed =
   1854       mirror::kUseStringCompression &&
   1855       const_string != nullptr &&
   1856       mirror::String::DexFileStringAllASCII(const_string, const_string_length);
   1857 
   1858   if (const_string != nullptr) {
   1859     // Load `count` field of the argument string and check if it matches the const string.
   1860     // Also compares the compression style, if differs return false.
   1861     __ Ldr(temp, MemOperand(arg, count_offset));
   1862     __ Cmp(temp, Operand(mirror::String::GetFlaggedCount(const_string_length, is_compressed)));
   1863     __ B(ne, &return_false, /* far_target */ false);
   1864   } else {
   1865     // Load `count` fields of this and argument strings.
   1866     __ Ldr(temp, MemOperand(str, count_offset));
   1867     __ Ldr(out, MemOperand(arg, count_offset));
   1868     // Check if `count` fields are equal, return false if they're not.
   1869     // Also compares the compression style, if differs return false.
   1870     __ Cmp(temp, out);
   1871     __ B(ne, &return_false, /* far_target */ false);
   1872   }
   1873 
   1874   // Assertions that must hold in order to compare strings 4 bytes at a time.
   1875   // Ok to do this because strings are zero-padded to kObjectAlignment.
   1876   DCHECK_ALIGNED(value_offset, 4);
   1877   static_assert(IsAligned<4>(kObjectAlignment), "String data must be aligned for fast compare.");
   1878 
   1879   if (const_string != nullptr &&
   1880       const_string_length <= (is_compressed ? kShortConstStringEqualsCutoffInBytes
   1881                                             : kShortConstStringEqualsCutoffInBytes / 2u)) {
   1882     // Load and compare the contents. Though we know the contents of the short const string
   1883     // at compile time, materializing constants may be more code than loading from memory.
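            // (Materializing an arbitrary 32-bit word generally takes a MOVW/MOVT pair, so
            // loading both sides with LDRD and comparing registers is usually no larger.)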
   1884     int32_t offset = value_offset;
   1885     size_t remaining_bytes =
   1886         RoundUp(is_compressed ? const_string_length : const_string_length * 2u, 4u);
   1887     while (remaining_bytes > sizeof(uint32_t)) {
   1888       vixl32::Register temp1 = RegisterFrom(locations->GetTemp(1));
   1889       UseScratchRegisterScope scratch_scope(assembler->GetVIXLAssembler());
   1890       vixl32::Register temp2 = scratch_scope.Acquire();
   1891       __ Ldrd(temp, temp1, MemOperand(str, offset));
   1892       __ Ldrd(temp2, out, MemOperand(arg, offset));
   1893       __ Cmp(temp, temp2);
   1894       __ B(ne, &return_false, /* far_target */ false);
   1895       __ Cmp(temp1, out);
   1896       __ B(ne, &return_false, /* far_target */ false);
   1897       offset += 2u * sizeof(uint32_t);
   1898       remaining_bytes -= 2u * sizeof(uint32_t);
   1899     }
   1900     if (remaining_bytes != 0u) {
   1901       __ Ldr(temp, MemOperand(str, offset));
   1902       __ Ldr(out, MemOperand(arg, offset));
   1903       __ Cmp(temp, out);
   1904       __ B(ne, &return_false, /* far_target */ false);
   1905     }
   1906   } else {
   1907     // Return true if both strings are empty. Even with string compression `count == 0` means empty.
   1908     static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
   1909                   "Expecting 0=compressed, 1=uncompressed");
   1910     __ CompareAndBranchIfZero(temp, &return_true, /* far_target */ false);
   1911 
   1912     if (mirror::kUseStringCompression) {
   1913       // For string compression, calculate the number of bytes to compare (not chars).
   1914       // This could in theory exceed INT32_MAX, so treat temp as unsigned.
   1915       __ Lsrs(temp, temp, 1u);                        // Extract length and check compression flag.
   1916       ExactAssemblyScope aas(assembler->GetVIXLAssembler(),
   1917                              2 * kMaxInstructionSizeInBytes,
   1918                              CodeBufferCheckScope::kMaximumSize);
   1919       __ it(cs);                                      // If uncompressed,
   1920       __ add(cs, temp, temp, temp);                   //   double the byte count.
   1921     }
   1922 
   1923     vixl32::Register temp1 = RegisterFrom(locations->GetTemp(1));
   1924     UseScratchRegisterScope scratch_scope(assembler->GetVIXLAssembler());
   1925     vixl32::Register temp2 = scratch_scope.Acquire();
   1926 
   1927     // Store offset of string value in preparation for comparison loop.
   1928     __ Mov(temp1, value_offset);
   1929 
   1930     // Loop to compare strings 4 bytes at a time starting at the front of the string.
   1931     __ Bind(&loop);
   1932     __ Ldr(out, MemOperand(str, temp1));
   1933     __ Ldr(temp2, MemOperand(arg, temp1));
   1934     __ Add(temp1, temp1, Operand::From(sizeof(uint32_t)));
   1935     __ Cmp(out, temp2);
   1936     __ B(ne, &return_false, /* far_target */ false);
   1937     // With string compression, we have compared 4 bytes, otherwise 2 chars.
   1938     __ Subs(temp, temp, mirror::kUseStringCompression ? 4 : 2);
   1939     __ B(hi, &loop, /* far_target */ false);
   1940   }
   1941 
   1942   // Return true and exit the function.
   1943   // If loop does not result in returning false, we return true.
   1944   __ Bind(&return_true);
   1945   __ Mov(out, 1);
   1946   __ B(final_label);
   1947 
   1948   // Return false and exit the function.
   1949   __ Bind(&return_false);
   1950   __ Mov(out, 0);
   1951 
   1952   if (end.IsReferenced()) {
   1953     __ Bind(&end);
   1954   }
   1955 }
   1956 
   1957 static void GenerateVisitStringIndexOf(HInvoke* invoke,
   1958                                        ArmVIXLAssembler* assembler,
   1959                                        CodeGeneratorARMVIXL* codegen,
   1960                                        bool start_at_zero) {
   1961   LocationSummary* locations = invoke->GetLocations();
   1962 
   1963   // Note that the null check must have been done earlier.
   1964   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
   1965 
   1966   // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
   1967   // or directly dispatch for a large constant, or omit slow-path for a small constant or a char.
   1968   SlowPathCodeARMVIXL* slow_path = nullptr;
   1969   HInstruction* code_point = invoke->InputAt(1);
   1970   if (code_point->IsIntConstant()) {
   1971     if (static_cast<uint32_t>(Int32ConstantFrom(code_point)) >
   1972         std::numeric_limits<uint16_t>::max()) {
   1973       // Always needs the slow-path. We could directly dispatch to it, but this case should be
   1974       // rare, so for simplicity just put the full slow-path down and branch unconditionally.
   1975       slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathARMVIXL(invoke);
   1976       codegen->AddSlowPath(slow_path);
   1977       __ B(slow_path->GetEntryLabel());
   1978       __ Bind(slow_path->GetExitLabel());
   1979       return;
   1980     }
   1981   } else if (code_point->GetType() != DataType::Type::kUint16) {
   1982     vixl32::Register char_reg = InputRegisterAt(invoke, 1);
   1983     // 0xffff is not a modified immediate but 0x10000 is, so use `>= 0x10000` instead of `> 0xffff`.
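            // (Thumb-2 modified immediates encode an 8-bit value, optionally rotated or
            // replicated; 0x10000 = 0x80 rotated fits that scheme, 0xffff does not.)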
   1984     __ Cmp(char_reg, static_cast<uint32_t>(std::numeric_limits<uint16_t>::max()) + 1);
   1985     slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathARMVIXL(invoke);
   1986     codegen->AddSlowPath(slow_path);
   1987     __ B(hs, slow_path->GetEntryLabel());
   1988   }
   1989 
   1990   if (start_at_zero) {
   1991     vixl32::Register tmp_reg = RegisterFrom(locations->GetTemp(0));
   1992     DCHECK(tmp_reg.Is(r2));
   1993     // Start-index = 0.
   1994     __ Mov(tmp_reg, 0);
   1995   }
   1996 
   1997   codegen->InvokeRuntime(kQuickIndexOf, invoke, invoke->GetDexPc(), slow_path);
   1998   CheckEntrypointTypes<kQuickIndexOf, int32_t, void*, uint32_t, uint32_t>();
   1999 
   2000   if (slow_path != nullptr) {
   2001     __ Bind(slow_path->GetExitLabel());
   2002   }
   2003 }
   2004 
   2005 void IntrinsicLocationsBuilderARMVIXL::VisitStringIndexOf(HInvoke* invoke) {
   2006   LocationSummary* locations = new (allocator_) LocationSummary(
   2007       invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
   2008   // We have a hand-crafted assembly stub that follows the runtime calling convention. So it's
   2009   // best to align the inputs accordingly.
   2010   InvokeRuntimeCallingConventionARMVIXL calling_convention;
   2011   locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
   2012   locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
   2013   locations->SetOut(LocationFrom(r0));
   2014 
   2015   // Need to send start-index=0.
   2016   locations->AddTemp(LocationFrom(calling_convention.GetRegisterAt(2)));
   2017 }
   2018 
   2019 void IntrinsicCodeGeneratorARMVIXL::VisitStringIndexOf(HInvoke* invoke) {
   2020   GenerateVisitStringIndexOf(invoke, GetAssembler(), codegen_, /* start_at_zero */ true);
   2021 }
   2022 
   2023 void IntrinsicLocationsBuilderARMVIXL::VisitStringIndexOfAfter(HInvoke* invoke) {
   2024   LocationSummary* locations = new (allocator_) LocationSummary(
   2025       invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
   2026   // We have a hand-crafted assembly stub that follows the runtime calling convention. So it's
   2027   // best to align the inputs accordingly.
   2028   InvokeRuntimeCallingConventionARMVIXL calling_convention;
   2029   locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
   2030   locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
   2031   locations->SetInAt(2, LocationFrom(calling_convention.GetRegisterAt(2)));
   2032   locations->SetOut(LocationFrom(r0));
   2033 }
   2034 
   2035 void IntrinsicCodeGeneratorARMVIXL::VisitStringIndexOfAfter(HInvoke* invoke) {
   2036   GenerateVisitStringIndexOf(invoke, GetAssembler(), codegen_, /* start_at_zero */ false);
   2037 }
   2038 
   2039 void IntrinsicLocationsBuilderARMVIXL::VisitStringNewStringFromBytes(HInvoke* invoke) {
   2040   LocationSummary* locations = new (allocator_) LocationSummary(
   2041       invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
   2042   InvokeRuntimeCallingConventionARMVIXL calling_convention;
   2043   locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
   2044   locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
   2045   locations->SetInAt(2, LocationFrom(calling_convention.GetRegisterAt(2)));
   2046   locations->SetInAt(3, LocationFrom(calling_convention.GetRegisterAt(3)));
   2047   locations->SetOut(LocationFrom(r0));
   2048 }
   2049 
   2050 void IntrinsicCodeGeneratorARMVIXL::VisitStringNewStringFromBytes(HInvoke* invoke) {
   2051   ArmVIXLAssembler* assembler = GetAssembler();
   2052   vixl32::Register byte_array = InputRegisterAt(invoke, 0);
   2053   __ Cmp(byte_array, 0);
   2054   SlowPathCodeARMVIXL* slow_path =
   2055       new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARMVIXL(invoke);
   2056   codegen_->AddSlowPath(slow_path);
   2057   __ B(eq, slow_path->GetEntryLabel());
   2058 
   2059   codegen_->InvokeRuntime(kQuickAllocStringFromBytes, invoke, invoke->GetDexPc(), slow_path);
   2060   CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>();
   2061   __ Bind(slow_path->GetExitLabel());
   2062 }
   2063 
   2064 void IntrinsicLocationsBuilderARMVIXL::VisitStringNewStringFromChars(HInvoke* invoke) {
   2065   LocationSummary* locations =
   2066       new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
   2067   InvokeRuntimeCallingConventionARMVIXL calling_convention;
   2068   locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
   2069   locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
   2070   locations->SetInAt(2, LocationFrom(calling_convention.GetRegisterAt(2)));
   2071   locations->SetOut(LocationFrom(r0));
   2072 }
   2073 
   2074 void IntrinsicCodeGeneratorARMVIXL::VisitStringNewStringFromChars(HInvoke* invoke) {
   2075   // No need to emit code checking whether `locations->InAt(2)` is a null
   2076   // pointer, as callers of the native method
   2077   //
   2078   //   java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data)
   2079   //
   2080   // all include a null check on `data` before calling that method.
   2081   codegen_->InvokeRuntime(kQuickAllocStringFromChars, invoke, invoke->GetDexPc());
   2082   CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>();
   2083 }
   2084 
   2085 void IntrinsicLocationsBuilderARMVIXL::VisitStringNewStringFromString(HInvoke* invoke) {
   2086   LocationSummary* locations = new (allocator_) LocationSummary(
   2087       invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
   2088   InvokeRuntimeCallingConventionARMVIXL calling_convention;
   2089   locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
   2090   locations->SetOut(LocationFrom(r0));
   2091 }
   2092 
   2093 void IntrinsicCodeGeneratorARMVIXL::VisitStringNewStringFromString(HInvoke* invoke) {
   2094   ArmVIXLAssembler* assembler = GetAssembler();
   2095   vixl32::Register string_to_copy = InputRegisterAt(invoke, 0);
   2096   __ Cmp(string_to_copy, 0);
   2097   SlowPathCodeARMVIXL* slow_path =
   2098       new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARMVIXL(invoke);
   2099   codegen_->AddSlowPath(slow_path);
   2100   __ B(eq, slow_path->GetEntryLabel());
   2101 
   2102   codegen_->InvokeRuntime(kQuickAllocStringFromString, invoke, invoke->GetDexPc(), slow_path);
   2103   CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>();
   2104 
   2105   __ Bind(slow_path->GetExitLabel());
   2106 }
   2107 
   2108 void IntrinsicLocationsBuilderARMVIXL::VisitSystemArrayCopy(HInvoke* invoke) {
   2109   // The only read barrier implementation supporting the
   2110   // SystemArrayCopy intrinsic is the Baker-style read barrier.
   2111   if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
   2112     return;
   2113   }
   2114 
   2115   CodeGenerator::CreateSystemArrayCopyLocationSummary(invoke);
   2116   LocationSummary* locations = invoke->GetLocations();
   2117   if (locations == nullptr) {
   2118     return;
   2119   }
   2120 
   2121   HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstant();
   2122   HIntConstant* dest_pos = invoke->InputAt(3)->AsIntConstant();
   2123   HIntConstant* length = invoke->InputAt(4)->AsIntConstant();
   2124 
   2125   if (src_pos != nullptr && !assembler_->ShifterOperandCanAlwaysHold(src_pos->GetValue())) {
   2126     locations->SetInAt(1, Location::RequiresRegister());
   2127   }
   2128   if (dest_pos != nullptr && !assembler_->ShifterOperandCanAlwaysHold(dest_pos->GetValue())) {
   2129     locations->SetInAt(3, Location::RequiresRegister());
   2130   }
   2131   if (length != nullptr && !assembler_->ShifterOperandCanAlwaysHold(length->GetValue())) {
   2132     locations->SetInAt(4, Location::RequiresRegister());
   2133   }
   2134   if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
   2135     // Temporary register IP cannot be used in
   2136     // ReadBarrierSystemArrayCopySlowPathARM (because that register
   2137     // is clobbered by ReadBarrierMarkRegX entry points). Get an extra
   2138     // temporary register from the register allocator.
   2139     locations->AddTemp(Location::RequiresRegister());
   2140     CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen_);
   2141     arm_codegen->MaybeAddBakerCcEntrypointTempForFields(locations);
   2142   }
   2143 }
   2144 
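        // Emits the position checks for one side of the copy: verifies that `pos` lies within
        // `input` and that at least `length` elements remain after `pos`, branching to
        // `slow_path` otherwise. When `length_is_input_length`, the copy can only succeed if
        // `pos` is zero.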
   2145 static void CheckPosition(ArmVIXLAssembler* assembler,
   2146                           Location pos,
   2147                           vixl32::Register input,
   2148                           Location length,
   2149                           SlowPathCodeARMVIXL* slow_path,
   2150                           vixl32::Register temp,
   2151                           bool length_is_input_length = false) {
   2152   // Where is the length in the Array?
   2153   const uint32_t length_offset = mirror::Array::LengthOffset().Uint32Value();
   2154 
   2155   if (pos.IsConstant()) {
   2156     int32_t pos_const = Int32ConstantFrom(pos);
   2157     if (pos_const == 0) {
   2158       if (!length_is_input_length) {
   2159         // Check that length(input) >= length.
   2160         __ Ldr(temp, MemOperand(input, length_offset));
   2161         if (length.IsConstant()) {
   2162           __ Cmp(temp, Int32ConstantFrom(length));
   2163         } else {
   2164           __ Cmp(temp, RegisterFrom(length));
   2165         }
   2166         __ B(lt, slow_path->GetEntryLabel());
   2167       }
   2168     } else {
   2169       // Check that length(input) >= pos.
   2170       __ Ldr(temp, MemOperand(input, length_offset));
   2171       __ Subs(temp, temp, pos_const);
   2172       __ B(lt, slow_path->GetEntryLabel());
   2173 
   2174       // Check that (length(input) - pos) >= length.
   2175       if (length.IsConstant()) {
   2176         __ Cmp(temp, Int32ConstantFrom(length));
   2177       } else {
   2178         __ Cmp(temp, RegisterFrom(length));
   2179       }
   2180       __ B(lt, slow_path->GetEntryLabel());
   2181     }
   2182   } else if (length_is_input_length) {
   2183     // The only way the copy can succeed is if pos is zero.
   2184     vixl32::Register pos_reg = RegisterFrom(pos);
   2185     __ CompareAndBranchIfNonZero(pos_reg, slow_path->GetEntryLabel());
   2186   } else {
   2187     // Check that pos >= 0.
   2188     vixl32::Register pos_reg = RegisterFrom(pos);
   2189     __ Cmp(pos_reg, 0);
   2190     __ B(lt, slow_path->GetEntryLabel());
   2191 
   2192     // Check that pos <= length(input).
   2193     __ Ldr(temp, MemOperand(input, length_offset));
   2194     __ Subs(temp, temp, pos_reg);
   2195     __ B(lt, slow_path->GetEntryLabel());
   2196 
   2197     // Check that (length(input) - pos) >= length.
   2198     if (length.IsConstant()) {
   2199       __ Cmp(temp, Int32ConstantFrom(length));
   2200     } else {
   2201       __ Cmp(temp, RegisterFrom(length));
   2202     }
   2203     __ B(lt, slow_path->GetEntryLabel());
   2204   }
   2205 }
   2206 
   2207 void IntrinsicCodeGeneratorARMVIXL::VisitSystemArrayCopy(HInvoke* invoke) {
   2208   // The only read barrier implementation supporting the
   2209   // SystemArrayCopy intrinsic is the Baker-style read barrier.
   2210   DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
   2211 
   2212   ArmVIXLAssembler* assembler = GetAssembler();
   2213   LocationSummary* locations = invoke->GetLocations();
   2214 
   2215   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   2216   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
   2217   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
   2218   uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
   2219   uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
   2220 
   2221   vixl32::Register src = InputRegisterAt(invoke, 0);
   2222   Location src_pos = locations->InAt(1);
   2223   vixl32::Register dest = InputRegisterAt(invoke, 2);
   2224   Location dest_pos = locations->InAt(3);
   2225   Location length = locations->InAt(4);
   2226   Location temp1_loc = locations->GetTemp(0);
   2227   vixl32::Register temp1 = RegisterFrom(temp1_loc);
   2228   Location temp2_loc = locations->GetTemp(1);
   2229   vixl32::Register temp2 = RegisterFrom(temp2_loc);
   2230   Location temp3_loc = locations->GetTemp(2);
   2231   vixl32::Register temp3 = RegisterFrom(temp3_loc);
   2232 
   2233   SlowPathCodeARMVIXL* intrinsic_slow_path =
   2234       new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARMVIXL(invoke);
   2235   codegen_->AddSlowPath(intrinsic_slow_path);
   2236 
   2237   vixl32::Label conditions_on_positions_validated;
   2238   SystemArrayCopyOptimizations optimizations(invoke);
   2239 
   2240   // If source and destination are the same, we go to the slow path when the regions may
   2241   // overlap in a way the forward copy cannot handle, i.e. when src_pos < dest_pos.
   2242   if (src_pos.IsConstant()) {
   2243     int32_t src_pos_constant = Int32ConstantFrom(src_pos);
   2244     if (dest_pos.IsConstant()) {
   2245       int32_t dest_pos_constant = Int32ConstantFrom(dest_pos);
   2246       if (optimizations.GetDestinationIsSource()) {
   2247         // Checked when building locations.
   2248         DCHECK_GE(src_pos_constant, dest_pos_constant);
   2249       } else if (src_pos_constant < dest_pos_constant) {
   2250         __ Cmp(src, dest);
   2251         __ B(eq, intrinsic_slow_path->GetEntryLabel());
   2252       }
   2253 
   2254       // Checked when building locations.
   2255       DCHECK(!optimizations.GetDestinationIsSource()
   2256              || (src_pos_constant >= Int32ConstantFrom(dest_pos)));
   2257     } else {
   2258       if (!optimizations.GetDestinationIsSource()) {
   2259         __ Cmp(src, dest);
   2260         __ B(ne, &conditions_on_positions_validated, /* far_target */ false);
   2261       }
   2262       __ Cmp(RegisterFrom(dest_pos), src_pos_constant);
   2263       __ B(gt, intrinsic_slow_path->GetEntryLabel());
   2264     }
   2265   } else {
   2266     if (!optimizations.GetDestinationIsSource()) {
   2267       __ Cmp(src, dest);
   2268       __ B(ne, &conditions_on_positions_validated, /* far_target */ false);
   2269     }
   2270     if (dest_pos.IsConstant()) {
   2271       int32_t dest_pos_constant = Int32ConstantFrom(dest_pos);
   2272       __ Cmp(RegisterFrom(src_pos), dest_pos_constant);
   2273     } else {
   2274       __ Cmp(RegisterFrom(src_pos), RegisterFrom(dest_pos));
   2275     }
   2276     __ B(lt, intrinsic_slow_path->GetEntryLabel());
   2277   }
   2278 
   2279   __ Bind(&conditions_on_positions_validated);
   2280 
   2281   if (!optimizations.GetSourceIsNotNull()) {
   2282     // Bail out if the source is null.
   2283     __ CompareAndBranchIfZero(src, intrinsic_slow_path->GetEntryLabel());
   2284   }
   2285 
   2286   if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) {
   2287     // Bail out if the destination is null.
   2288     __ CompareAndBranchIfZero(dest, intrinsic_slow_path->GetEntryLabel());
   2289   }
   2290 
   2291   // If the length is negative, bail out.
   2292   // We have already checked in the LocationsBuilder for the constant case.
   2293   if (!length.IsConstant() &&
   2294       !optimizations.GetCountIsSourceLength() &&
   2295       !optimizations.GetCountIsDestinationLength()) {
   2296     __ Cmp(RegisterFrom(length), 0);
   2297     __ B(lt, intrinsic_slow_path->GetEntryLabel());
   2298   }
   2299 
   2300   // Validity checks: source.
   2301   CheckPosition(assembler,
   2302                 src_pos,
   2303                 src,
   2304                 length,
   2305                 intrinsic_slow_path,
   2306                 temp1,
   2307                 optimizations.GetCountIsSourceLength());
   2308 
   2309   // Validity checks: dest.
   2310   CheckPosition(assembler,
   2311                 dest_pos,
   2312                 dest,
   2313                 length,
   2314                 intrinsic_slow_path,
   2315                 temp1,
   2316                 optimizations.GetCountIsDestinationLength());
   2317 
   2318   if (!optimizations.GetDoesNotNeedTypeCheck()) {
   2319     // Check whether all elements of the source array are assignable to the component
   2320     // type of the destination array. We do two checks: the classes are the same,
   2321     // or the destination is Object[]. If none of these checks succeed, we go to the
   2322     // slow path.
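            // Roughly, in illustrative pseudo-Java, the checks emitted below amount to:
            //   if (dest.getClass() != src.getClass()
            //       && !(dest is an Object[], i.e. its component type has a null super class)) {
            //     goto slow_path;
            //   }
            // plus, where not known statically, checks that src and dest really are reference
            // (non-primitive) arrays.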
   2323 
   2324     if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
   2325       if (!optimizations.GetSourceIsNonPrimitiveArray()) {
   2326         // /* HeapReference<Class> */ temp1 = src->klass_
   2327         codegen_->GenerateFieldLoadWithBakerReadBarrier(
   2328             invoke, temp1_loc, src, class_offset, temp2_loc, /* needs_null_check */ false);
   2329         // Bail out if the source is not a non-primitive array.
   2330         // /* HeapReference<Class> */ temp1 = temp1->component_type_
   2331         codegen_->GenerateFieldLoadWithBakerReadBarrier(
   2332             invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false);
   2333         __ CompareAndBranchIfZero(temp1, intrinsic_slow_path->GetEntryLabel());
   2334         // If heap poisoning is enabled, `temp1` has been unpoisoned
   2335         // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
   2336         // /* uint16_t */ temp1 = static_cast<uint16_t>(temp1->primitive_type_);
   2337         __ Ldrh(temp1, MemOperand(temp1, primitive_offset));
   2338         static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
   2339         __ CompareAndBranchIfNonZero(temp1, intrinsic_slow_path->GetEntryLabel());
   2340       }
   2341 
   2342       // /* HeapReference<Class> */ temp1 = dest->klass_
   2343       codegen_->GenerateFieldLoadWithBakerReadBarrier(
   2344           invoke, temp1_loc, dest, class_offset, temp2_loc, /* needs_null_check */ false);
   2345 
   2346       if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
   2347         // Bail out if the destination is not a non-primitive array.
   2348         //
   2349         // Register `temp1` is not trashed by the read barrier emitted
   2350         // by GenerateFieldLoadWithBakerReadBarrier below, as that
   2351         // method produces a call to a ReadBarrierMarkRegX entry point,
   2352         // which saves all potentially live registers, including
   2353         // temporaries such as `temp1`.
   2354         // /* HeapReference<Class> */ temp2 = temp1->component_type_
   2355         codegen_->GenerateFieldLoadWithBakerReadBarrier(
   2356             invoke, temp2_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false);
   2357         __ CompareAndBranchIfZero(temp2, intrinsic_slow_path->GetEntryLabel());
   2358         // If heap poisoning is enabled, `temp2` has been unpoisoned
   2359         // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
   2360         // /* uint16_t */ temp2 = static_cast<uint16_t>(temp2->primitive_type_);
   2361         __ Ldrh(temp2, MemOperand(temp2, primitive_offset));
   2362         static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
   2363         __ CompareAndBranchIfNonZero(temp2, intrinsic_slow_path->GetEntryLabel());
   2364       }
   2365 
   2366       // For the same reason given earlier, `temp1` is not trashed by the
   2367       // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below.
   2368       // /* HeapReference<Class> */ temp2 = src->klass_
   2369       codegen_->GenerateFieldLoadWithBakerReadBarrier(
   2370           invoke, temp2_loc, src, class_offset, temp3_loc, /* needs_null_check */ false);
   2371       // Note: if heap poisoning is on, we are comparing two unpoisoned references here.
   2372       __ Cmp(temp1, temp2);
   2373 
   2374       if (optimizations.GetDestinationIsTypedObjectArray()) {
   2375         vixl32::Label do_copy;
   2376         __ B(eq, &do_copy, /* far_target */ false);
   2377         // /* HeapReference<Class> */ temp1 = temp1->component_type_
   2378         codegen_->GenerateFieldLoadWithBakerReadBarrier(
   2379             invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false);
   2380         // /* HeapReference<Class> */ temp1 = temp1->super_class_
   2381         // We do not need to emit a read barrier for the following
   2382         // heap reference load, as `temp1` is only used in a
   2383         // comparison with null below, and this reference is not
   2384         // kept afterwards.
   2385         __ Ldr(temp1, MemOperand(temp1, super_offset));
   2386         __ CompareAndBranchIfNonZero(temp1, intrinsic_slow_path->GetEntryLabel());
   2387         __ Bind(&do_copy);
   2388       } else {
   2389         __ B(ne, intrinsic_slow_path->GetEntryLabel());
   2390       }
   2391     } else {
   2392       // Non read barrier code.
   2393 
   2394       // /* HeapReference<Class> */ temp1 = dest->klass_
   2395       __ Ldr(temp1, MemOperand(dest, class_offset));
   2396       // /* HeapReference<Class> */ temp2 = src->klass_
   2397       __ Ldr(temp2, MemOperand(src, class_offset));
   2398       bool did_unpoison = false;
   2399       if (!optimizations.GetDestinationIsNonPrimitiveArray() ||
   2400           !optimizations.GetSourceIsNonPrimitiveArray()) {
   2401         // One or two of the references need to be unpoisoned. Unpoison them
   2402         // both to make the identity check valid.
   2403         assembler->MaybeUnpoisonHeapReference(temp1);
   2404         assembler->MaybeUnpoisonHeapReference(temp2);
   2405         did_unpoison = true;
   2406       }
   2407 
   2408       if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
   2409         // Bail out if the destination is not a non-primitive array.
   2410         // /* HeapReference<Class> */ temp3 = temp1->component_type_
   2411         __ Ldr(temp3, MemOperand(temp1, component_offset));
   2412         __ CompareAndBranchIfZero(temp3, intrinsic_slow_path->GetEntryLabel());
   2413         assembler->MaybeUnpoisonHeapReference(temp3);
   2414         // /* uint16_t */ temp3 = static_cast<uint16_t>(temp3->primitive_type_);
   2415         __ Ldrh(temp3, MemOperand(temp3, primitive_offset));
   2416         static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
   2417         __ CompareAndBranchIfNonZero(temp3, intrinsic_slow_path->GetEntryLabel());
   2418       }
   2419 
   2420       if (!optimizations.GetSourceIsNonPrimitiveArray()) {
   2421         // Bail out if the source is not a non-primitive array.
   2422         // /* HeapReference<Class> */ temp3 = temp2->component_type_
   2423         __ Ldr(temp3, MemOperand(temp2, component_offset));
   2424         __ CompareAndBranchIfZero(temp3, intrinsic_slow_path->GetEntryLabel());
   2425         assembler->MaybeUnpoisonHeapReference(temp3);
   2426         // /* uint16_t */ temp3 = static_cast<uint16_t>(temp3->primitive_type_);
   2427         __ Ldrh(temp3, MemOperand(temp3, primitive_offset));
   2428         static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
   2429         __ CompareAndBranchIfNonZero(temp3, intrinsic_slow_path->GetEntryLabel());
   2430       }
   2431 
   2432       __ Cmp(temp1, temp2);
   2433 
   2434       if (optimizations.GetDestinationIsTypedObjectArray()) {
   2435         vixl32::Label do_copy;
   2436         __ B(eq, &do_copy, /* far_target */ false);
   2437         if (!did_unpoison) {
   2438           assembler->MaybeUnpoisonHeapReference(temp1);
   2439         }
   2440         // /* HeapReference<Class> */ temp1 = temp1->component_type_
   2441         __ Ldr(temp1, MemOperand(temp1, component_offset));
   2442         assembler->MaybeUnpoisonHeapReference(temp1);
   2443         // /* HeapReference<Class> */ temp1 = temp1->super_class_
   2444         __ Ldr(temp1, MemOperand(temp1, super_offset));
   2445         // No need to unpoison the result, we're comparing against null.
   2446         __ CompareAndBranchIfNonZero(temp1, intrinsic_slow_path->GetEntryLabel());
   2447         __ Bind(&do_copy);
   2448       } else {
   2449         __ B(ne, intrinsic_slow_path->GetEntryLabel());
   2450       }
   2451     }
   2452   } else if (!optimizations.GetSourceIsNonPrimitiveArray()) {
   2453     DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
   2454     // Bail out if the source is not a non-primitive array.
   2455     if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
   2456       // /* HeapReference<Class> */ temp1 = src->klass_
   2457       codegen_->GenerateFieldLoadWithBakerReadBarrier(
   2458           invoke, temp1_loc, src, class_offset, temp2_loc, /* needs_null_check */ false);
   2459       // /* HeapReference<Class> */ temp3 = temp1->component_type_
   2460       codegen_->GenerateFieldLoadWithBakerReadBarrier(
   2461           invoke, temp3_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false);
   2462       __ CompareAndBranchIfZero(temp3, intrinsic_slow_path->GetEntryLabel());
   2463       // If heap poisoning is enabled, `temp3` has been unpoisoned
   2464       // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
   2465     } else {
   2466       // /* HeapReference<Class> */ temp1 = src->klass_
   2467       __ Ldr(temp1, MemOperand(src, class_offset));
   2468       assembler->MaybeUnpoisonHeapReference(temp1);
   2469       // /* HeapReference<Class> */ temp3 = temp1->component_type_
   2470       __ Ldr(temp3, MemOperand(temp1, component_offset));
   2471       __ CompareAndBranchIfZero(temp3, intrinsic_slow_path->GetEntryLabel());
   2472       assembler->MaybeUnpoisonHeapReference(temp3);
   2473     }
   2474     // /* uint16_t */ temp3 = static_cast<uint16_t>(temp3->primitive_type_);
   2475     __ Ldrh(temp3, MemOperand(temp3, primitive_offset));
   2476     static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
   2477     __ CompareAndBranchIfNonZero(temp3, intrinsic_slow_path->GetEntryLabel());
   2478   }
   2479 
   2480   if (length.IsConstant() && Int32ConstantFrom(length) == 0) {
   2481     // Zero constant length: no need to emit the loop code at all.
   2482   } else {
   2483     vixl32::Label done;
   2484     const DataType::Type type = DataType::Type::kReference;
   2485     const int32_t element_size = DataType::Size(type);
   2486 
   2487     if (length.IsRegister()) {
   2488       // Don't enter the copy loop if the length is zero.
   2489       __ CompareAndBranchIfZero(RegisterFrom(length), &done, /* is_far_target */ false);
   2490     }
   2491 
   2492     if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
   2493       // TODO: Also convert this intrinsic to the IsGcMarking strategy?
   2494 
   2495       // SystemArrayCopy implementation for Baker read barriers (see
   2496       // also CodeGeneratorARMVIXL::GenerateReferenceLoadWithBakerReadBarrier):
   2497       //
   2498       //   uint32_t rb_state = LockWord(src->monitor_).ReadBarrierState();
   2499       //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
   2500       //   bool is_gray = (rb_state == ReadBarrier::GrayState());
   2501       //   if (is_gray) {
   2502       //     // Slow-path copy.
   2503       //     do {
   2504       //       *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++)));
   2505       //     } while (src_ptr != end_ptr)
   2506       //   } else {
   2507       //     // Fast-path copy.
   2508       //     do {
   2509       //       *dest_ptr++ = *src_ptr++;
   2510       //     } while (src_ptr != end_ptr)
   2511       //   }
   2512 
   2513       // /* int32_t */ monitor = src->monitor_
   2514       __ Ldr(temp2, MemOperand(src, monitor_offset));
   2515       // /* LockWord */ lock_word = LockWord(monitor)
   2516       static_assert(sizeof(LockWord) == sizeof(int32_t),
   2517                     "art::LockWord and int32_t have different sizes.");
   2518 
   2519       // Introduce a dependency on the lock_word including the rb_state,
   2520       // which shall prevent load-load reordering without using
   2521       // a memory barrier (which would be more expensive).
   2522       // `src` is unchanged by this operation, but its value now depends
   2523       // on `temp2`.
   2524       __ Add(src, src, Operand(temp2, vixl32::LSR, 32));
   2525 
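              // Illustrative note: LSR #32 produces 0 on AArch32, so the instruction above is
              // effectively `src = src + (temp2 >> 32) == src + 0` in pseudo-C; the value of
              // `src` is preserved while gaining a data dependency on the monitor load.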
   2526       // Compute the base source address in `temp1`.
   2527       // Note that `temp1` (the base source address) is computed from
   2528       // `src` (and `src_pos`) here, and thus honors the artificial
   2529       // dependency of `src` on `temp2`.
   2530       GenSystemArrayCopyBaseAddress(GetAssembler(), type, src, src_pos, temp1);
   2531       // Compute the end source address in `temp3`.
   2532       GenSystemArrayCopyEndAddress(GetAssembler(), type, length, temp1, temp3);
   2533       // The base destination address is computed later, as `temp2` is
   2534       // used for intermediate computations.
   2535 
   2536       // Slow path used to copy array when `src` is gray.
   2537       // Note that the base destination address is computed in `temp2`
   2538       // by the slow path code.
   2539       SlowPathCodeARMVIXL* read_barrier_slow_path =
   2540           new (codegen_->GetScopedAllocator()) ReadBarrierSystemArrayCopySlowPathARMVIXL(invoke);
   2541       codegen_->AddSlowPath(read_barrier_slow_path);
   2542 
   2543       // Given the numeric representation, it's enough to check the low bit of the
   2544       // rb_state. We do that by shifting the bit out of the lock word with LSRS,
   2545       // which can be a 16-bit instruction, unlike the TST immediate.
   2546       static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
   2547       static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
   2548       __ Lsrs(temp2, temp2, LockWord::kReadBarrierStateShift + 1);
   2549       // Carry flag is the last bit shifted out by LSRS.
   2550       __ B(cs, read_barrier_slow_path->GetEntryLabel());
   2551 
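              // In illustrative pseudo-C, the LSRS/B(cs) pair above implements:
              //   if ((lock_word >> LockWord::kReadBarrierStateShift) & 1) {  // rb_state is gray.
              //     goto read_barrier_slow_path;
              //   }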
   2552       // Fast-path copy.
   2553       // Compute the base destination address in `temp2`.
   2554       GenSystemArrayCopyBaseAddress(GetAssembler(), type, dest, dest_pos, temp2);
   2555       // Iterate over the arrays and do a raw copy of the objects. We don't need to
   2556       // poison/unpoison.
   2557       vixl32::Label loop;
   2558       __ Bind(&loop);
   2559       {
   2560         UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
   2561         const vixl32::Register temp_reg = temps.Acquire();
   2562         __ Ldr(temp_reg, MemOperand(temp1, element_size, PostIndex));
   2563         __ Str(temp_reg, MemOperand(temp2, element_size, PostIndex));
   2564       }
   2565       __ Cmp(temp1, temp3);
   2566       __ B(ne, &loop, /* far_target */ false);
   2567 
   2568       __ Bind(read_barrier_slow_path->GetExitLabel());
   2569     } else {
   2570       // Non read barrier code.
   2571       // Compute the base source address in `temp1`.
   2572       GenSystemArrayCopyBaseAddress(GetAssembler(), type, src, src_pos, temp1);
   2573       // Compute the base destination address in `temp2`.
   2574       GenSystemArrayCopyBaseAddress(GetAssembler(), type, dest, dest_pos, temp2);
   2575       // Compute the end source address in `temp3`.
   2576       GenSystemArrayCopyEndAddress(GetAssembler(), type, length, temp1, temp3);
   2577       // Iterate over the arrays and do a raw copy of the objects. We don't need to
   2578       // poison/unpoison.
   2579       vixl32::Label loop;
   2580       __ Bind(&loop);
   2581       {
   2582         UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
   2583         const vixl32::Register temp_reg = temps.Acquire();
   2584         __ Ldr(temp_reg, MemOperand(temp1, element_size, PostIndex));
   2585         __ Str(temp_reg, MemOperand(temp2, element_size, PostIndex));
   2586       }
   2587       __ Cmp(temp1, temp3);
   2588       __ B(ne, &loop, /* far_target */ false);
   2589     }
   2590     __ Bind(&done);
   2591   }
   2592 
   2593   // We only need one card marking on the destination array.
   2594   codegen_->MarkGCCard(temp1, temp2, dest, NoReg, /* value_can_be_null */ false);
   2595 
   2596   __ Bind(intrinsic_slow_path->GetExitLabel());
   2597 }
   2598 
   2599 static void CreateFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
   2600   // If the graph is debuggable, all callee-saved floating-point registers are blocked by
   2601   // the code generator. Furthermore, the register allocator creates fixed live intervals
   2602   // for all caller-saved registers because we are doing a function call. As a result, if
   2603   // the input and output locations are unallocated, the register allocator runs out of
   2604   // registers and fails; however, a debuggable graph is not the common case.
   2605   if (invoke->GetBlock()->GetGraph()->IsDebuggable()) {
   2606     return;
   2607   }
   2608 
   2609   DCHECK_EQ(invoke->GetNumberOfArguments(), 1U);
   2610   DCHECK_EQ(invoke->InputAt(0)->GetType(), DataType::Type::kFloat64);
   2611   DCHECK_EQ(invoke->GetType(), DataType::Type::kFloat64);
   2612 
   2613   LocationSummary* const locations =
   2614       new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
   2615   const InvokeRuntimeCallingConventionARMVIXL calling_convention;
   2616 
   2617   locations->SetInAt(0, Location::RequiresFpuRegister());
   2618   locations->SetOut(Location::RequiresFpuRegister());
   2619   // Native code uses the soft float ABI.
   2620   locations->AddTemp(LocationFrom(calling_convention.GetRegisterAt(0)));
   2621   locations->AddTemp(LocationFrom(calling_convention.GetRegisterAt(1)));
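          // These temps model the marshalling done in GenFPToFPCall below; roughly, as an
          // illustrative sketch:
          //   (temp 0, temp 1) <- bits of the double argument;  // Vmov core pair, D register.
          //   call the quick entrypoint;                        // e.g. kQuickCos.
          //   result <- double rebuilt from (temp 0, temp 1);   // Vmov D register, core pair.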
   2622 }
   2623 
   2624 static void CreateFPFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
   2625   // If the graph is debuggable, all callee-saved floating-point registers are blocked by
   2626   // the code generator. Furthermore, the register allocator creates fixed live intervals
   2627   // for all caller-saved registers because we are doing a function call. As a result, if
   2628   // the input and output locations are unallocated, the register allocator runs out of
   2629   // registers and fails; however, a debuggable graph is not the common case.
   2630   if (invoke->GetBlock()->GetGraph()->IsDebuggable()) {
   2631     return;
   2632   }
   2633 
   2634   DCHECK_EQ(invoke->GetNumberOfArguments(), 2U);
   2635   DCHECK_EQ(invoke->InputAt(0)->GetType(), DataType::Type::kFloat64);
   2636   DCHECK_EQ(invoke->InputAt(1)->GetType(), DataType::Type::kFloat64);
   2637   DCHECK_EQ(invoke->GetType(), DataType::Type::kFloat64);
   2638 
   2639   LocationSummary* const locations =
   2640       new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
   2641   const InvokeRuntimeCallingConventionARMVIXL calling_convention;
   2642 
   2643   locations->SetInAt(0, Location::RequiresFpuRegister());
   2644   locations->SetInAt(1, Location::RequiresFpuRegister());
   2645   locations->SetOut(Location::RequiresFpuRegister());
   2646   // Native code uses the soft float ABI.
   2647   locations->AddTemp(LocationFrom(calling_convention.GetRegisterAt(0)));
   2648   locations->AddTemp(LocationFrom(calling_convention.GetRegisterAt(1)));
   2649   locations->AddTemp(LocationFrom(calling_convention.GetRegisterAt(2)));
   2650   locations->AddTemp(LocationFrom(calling_convention.GetRegisterAt(3)));
   2651 }
   2652 
   2653 static void GenFPToFPCall(HInvoke* invoke,
   2654                           ArmVIXLAssembler* assembler,
   2655                           CodeGeneratorARMVIXL* codegen,
   2656                           QuickEntrypointEnum entry) {
   2657   LocationSummary* const locations = invoke->GetLocations();
   2658 
   2659   DCHECK_EQ(invoke->GetNumberOfArguments(), 1U);
   2660   DCHECK(locations->WillCall() && locations->Intrinsified());
   2661 
   2662   // Native code uses the soft float ABI.
   2663   __ Vmov(RegisterFrom(locations->GetTemp(0)),
   2664           RegisterFrom(locations->GetTemp(1)),
   2665           InputDRegisterAt(invoke, 0));
   2666   codegen->InvokeRuntime(entry, invoke, invoke->GetDexPc());
   2667   __ Vmov(OutputDRegister(invoke),
   2668           RegisterFrom(locations->GetTemp(0)),
   2669           RegisterFrom(locations->GetTemp(1)));
   2670 }
   2671 
   2672 static void GenFPFPToFPCall(HInvoke* invoke,
   2673                             ArmVIXLAssembler* assembler,
   2674                             CodeGeneratorARMVIXL* codegen,
   2675                             QuickEntrypointEnum entry) {
   2676   LocationSummary* const locations = invoke->GetLocations();
   2677 
   2678   DCHECK_EQ(invoke->GetNumberOfArguments(), 2U);
   2679   DCHECK(locations->WillCall() && locations->Intrinsified());
   2680 
   2681   // Native code uses the soft float ABI.
   2682   __ Vmov(RegisterFrom(locations->GetTemp(0)),
   2683           RegisterFrom(locations->GetTemp(1)),
   2684           InputDRegisterAt(invoke, 0));
   2685   __ Vmov(RegisterFrom(locations->GetTemp(2)),
   2686           RegisterFrom(locations->GetTemp(3)),
   2687           InputDRegisterAt(invoke, 1));
   2688   codegen->InvokeRuntime(entry, invoke, invoke->GetDexPc());
   2689   __ Vmov(OutputDRegister(invoke),
   2690           RegisterFrom(locations->GetTemp(0)),
   2691           RegisterFrom(locations->GetTemp(1)));
   2692 }
   2693 
   2694 void IntrinsicLocationsBuilderARMVIXL::VisitMathCos(HInvoke* invoke) {
   2695   CreateFPToFPCallLocations(allocator_, invoke);
   2696 }
   2697 
   2698 void IntrinsicCodeGeneratorARMVIXL::VisitMathCos(HInvoke* invoke) {
   2699   GenFPToFPCall(invoke, GetAssembler(), codegen_, kQuickCos);
   2700 }
   2701 
   2702 void IntrinsicLocationsBuilderARMVIXL::VisitMathSin(HInvoke* invoke) {
   2703   CreateFPToFPCallLocations(allocator_, invoke);
   2704 }
   2705 
   2706 void IntrinsicCodeGeneratorARMVIXL::VisitMathSin(HInvoke* invoke) {
   2707   GenFPToFPCall(invoke, GetAssembler(), codegen_, kQuickSin);
   2708 }
   2709 
   2710 void IntrinsicLocationsBuilderARMVIXL::VisitMathAcos(HInvoke* invoke) {
   2711   CreateFPToFPCallLocations(allocator_, invoke);
   2712 }
   2713 
   2714 void IntrinsicCodeGeneratorARMVIXL::VisitMathAcos(HInvoke* invoke) {
   2715   GenFPToFPCall(invoke, GetAssembler(), codegen_, kQuickAcos);
   2716 }
   2717 
   2718 void IntrinsicLocationsBuilderARMVIXL::VisitMathAsin(HInvoke* invoke) {
   2719   CreateFPToFPCallLocations(allocator_, invoke);
   2720 }
   2721 
   2722 void IntrinsicCodeGeneratorARMVIXL::VisitMathAsin(HInvoke* invoke) {
   2723   GenFPToFPCall(invoke, GetAssembler(), codegen_, kQuickAsin);
   2724 }
   2725 
   2726 void IntrinsicLocationsBuilderARMVIXL::VisitMathAtan(HInvoke* invoke) {
   2727   CreateFPToFPCallLocations(allocator_, invoke);
   2728 }
   2729 
   2730 void IntrinsicCodeGeneratorARMVIXL::VisitMathAtan(HInvoke* invoke) {
   2731   GenFPToFPCall(invoke, GetAssembler(), codegen_, kQuickAtan);
   2732 }
   2733 
   2734 void IntrinsicLocationsBuilderARMVIXL::VisitMathCbrt(HInvoke* invoke) {
   2735   CreateFPToFPCallLocations(allocator_, invoke);
   2736 }
   2737 
   2738 void IntrinsicCodeGeneratorARMVIXL::VisitMathCbrt(HInvoke* invoke) {
   2739   GenFPToFPCall(invoke, GetAssembler(), codegen_, kQuickCbrt);
   2740 }
   2741 
   2742 void IntrinsicLocationsBuilderARMVIXL::VisitMathCosh(HInvoke* invoke) {
   2743   CreateFPToFPCallLocations(allocator_, invoke);
   2744 }
   2745 
   2746 void IntrinsicCodeGeneratorARMVIXL::VisitMathCosh(HInvoke* invoke) {
   2747   GenFPToFPCall(invoke, GetAssembler(), codegen_, kQuickCosh);
   2748 }
   2749 
   2750 void IntrinsicLocationsBuilderARMVIXL::VisitMathExp(HInvoke* invoke) {
   2751   CreateFPToFPCallLocations(allocator_, invoke);
   2752 }
   2753 
   2754 void IntrinsicCodeGeneratorARMVIXL::VisitMathExp(HInvoke* invoke) {
   2755   GenFPToFPCall(invoke, GetAssembler(), codegen_, kQuickExp);
   2756 }
   2757 
   2758 void IntrinsicLocationsBuilderARMVIXL::VisitMathExpm1(HInvoke* invoke) {
   2759   CreateFPToFPCallLocations(allocator_, invoke);
   2760 }
   2761 
   2762 void IntrinsicCodeGeneratorARMVIXL::VisitMathExpm1(HInvoke* invoke) {
   2763   GenFPToFPCall(invoke, GetAssembler(), codegen_, kQuickExpm1);
   2764 }
   2765 
   2766 void IntrinsicLocationsBuilderARMVIXL::VisitMathLog(HInvoke* invoke) {
   2767   CreateFPToFPCallLocations(allocator_, invoke);
   2768 }
   2769 
   2770 void IntrinsicCodeGeneratorARMVIXL::VisitMathLog(HInvoke* invoke) {
   2771   GenFPToFPCall(invoke, GetAssembler(), codegen_, kQuickLog);
   2772 }
   2773 
   2774 void IntrinsicLocationsBuilderARMVIXL::VisitMathLog10(HInvoke* invoke) {
   2775   CreateFPToFPCallLocations(allocator_, invoke);
   2776 }
   2777 
   2778 void IntrinsicCodeGeneratorARMVIXL::VisitMathLog10(HInvoke* invoke) {
   2779   GenFPToFPCall(invoke, GetAssembler(), codegen_, kQuickLog10);
   2780 }
   2781 
   2782 void IntrinsicLocationsBuilderARMVIXL::VisitMathSinh(HInvoke* invoke) {
   2783   CreateFPToFPCallLocations(allocator_, invoke);
   2784 }
   2785 
   2786 void IntrinsicCodeGeneratorARMVIXL::VisitMathSinh(HInvoke* invoke) {
   2787   GenFPToFPCall(invoke, GetAssembler(), codegen_, kQuickSinh);
   2788 }
   2789 
   2790 void IntrinsicLocationsBuilderARMVIXL::VisitMathTan(HInvoke* invoke) {
   2791   CreateFPToFPCallLocations(allocator_, invoke);
   2792 }
   2793 
   2794 void IntrinsicCodeGeneratorARMVIXL::VisitMathTan(HInvoke* invoke) {
   2795   GenFPToFPCall(invoke, GetAssembler(), codegen_, kQuickTan);
   2796 }
   2797 
   2798 void IntrinsicLocationsBuilderARMVIXL::VisitMathTanh(HInvoke* invoke) {
   2799   CreateFPToFPCallLocations(allocator_, invoke);
   2800 }
   2801 
   2802 void IntrinsicCodeGeneratorARMVIXL::VisitMathTanh(HInvoke* invoke) {
   2803   GenFPToFPCall(invoke, GetAssembler(), codegen_, kQuickTanh);
   2804 }
   2805 
   2806 void IntrinsicLocationsBuilderARMVIXL::VisitMathAtan2(HInvoke* invoke) {
   2807   CreateFPFPToFPCallLocations(allocator_, invoke);
   2808 }
   2809 
   2810 void IntrinsicCodeGeneratorARMVIXL::VisitMathAtan2(HInvoke* invoke) {
   2811   GenFPFPToFPCall(invoke, GetAssembler(), codegen_, kQuickAtan2);
   2812 }
   2813 
   2814 void IntrinsicLocationsBuilderARMVIXL::VisitMathPow(HInvoke* invoke) {
   2815   CreateFPFPToFPCallLocations(allocator_, invoke);
   2816 }
   2817 
   2818 void IntrinsicCodeGeneratorARMVIXL::VisitMathPow(HInvoke* invoke) {
   2819   GenFPFPToFPCall(invoke, GetAssembler(), codegen_, kQuickPow);
   2820 }
   2821 
   2822 void IntrinsicLocationsBuilderARMVIXL::VisitMathHypot(HInvoke* invoke) {
   2823   CreateFPFPToFPCallLocations(allocator_, invoke);
   2824 }
   2825 
   2826 void IntrinsicCodeGeneratorARMVIXL::VisitMathHypot(HInvoke* invoke) {
   2827   GenFPFPToFPCall(invoke, GetAssembler(), codegen_, kQuickHypot);
   2828 }
   2829 
   2830 void IntrinsicLocationsBuilderARMVIXL::VisitMathNextAfter(HInvoke* invoke) {
   2831   CreateFPFPToFPCallLocations(allocator_, invoke);
   2832 }
   2833 
   2834 void IntrinsicCodeGeneratorARMVIXL::VisitMathNextAfter(HInvoke* invoke) {
   2835   GenFPFPToFPCall(invoke, GetAssembler(), codegen_, kQuickNextAfter);
   2836 }
   2837 
   2838 void IntrinsicLocationsBuilderARMVIXL::VisitIntegerReverse(HInvoke* invoke) {
   2839   CreateIntToIntLocations(allocator_, invoke);
   2840 }
   2841 
   2842 void IntrinsicCodeGeneratorARMVIXL::VisitIntegerReverse(HInvoke* invoke) {
   2843   ArmVIXLAssembler* assembler = GetAssembler();
   2844   __ Rbit(OutputRegister(invoke), InputRegisterAt(invoke, 0));
   2845 }
   2846 
   2847 void IntrinsicLocationsBuilderARMVIXL::VisitLongReverse(HInvoke* invoke) {
   2848   CreateLongToLongLocationsWithOverlap(allocator_, invoke);
   2849 }
   2850 
   2851 void IntrinsicCodeGeneratorARMVIXL::VisitLongReverse(HInvoke* invoke) {
   2852   ArmVIXLAssembler* assembler = GetAssembler();
   2853   LocationSummary* locations = invoke->GetLocations();
   2854 
   2855   vixl32::Register in_reg_lo  = LowRegisterFrom(locations->InAt(0));
   2856   vixl32::Register in_reg_hi  = HighRegisterFrom(locations->InAt(0));
   2857   vixl32::Register out_reg_lo = LowRegisterFrom(locations->Out());
   2858   vixl32::Register out_reg_hi = HighRegisterFrom(locations->Out());
   2859 
   2860   __ Rbit(out_reg_lo, in_reg_hi);
   2861   __ Rbit(out_reg_hi, in_reg_lo);
   2862 }
   2863 
   2864 void IntrinsicLocationsBuilderARMVIXL::VisitIntegerReverseBytes(HInvoke* invoke) {
   2865   CreateIntToIntLocations(allocator_, invoke);
   2866 }
   2867 
   2868 void IntrinsicCodeGeneratorARMVIXL::VisitIntegerReverseBytes(HInvoke* invoke) {
   2869   ArmVIXLAssembler* assembler = GetAssembler();
   2870   __ Rev(OutputRegister(invoke), InputRegisterAt(invoke, 0));
   2871 }
   2872 
   2873 void IntrinsicLocationsBuilderARMVIXL::VisitLongReverseBytes(HInvoke* invoke) {
   2874   CreateLongToLongLocationsWithOverlap(allocator_, invoke);
   2875 }
   2876 
   2877 void IntrinsicCodeGeneratorARMVIXL::VisitLongReverseBytes(HInvoke* invoke) {
   2878   ArmVIXLAssembler* assembler = GetAssembler();
   2879   LocationSummary* locations = invoke->GetLocations();
   2880 
   2881   vixl32::Register in_reg_lo  = LowRegisterFrom(locations->InAt(0));
   2882   vixl32::Register in_reg_hi  = HighRegisterFrom(locations->InAt(0));
   2883   vixl32::Register out_reg_lo = LowRegisterFrom(locations->Out());
   2884   vixl32::Register out_reg_hi = HighRegisterFrom(locations->Out());
   2885 
   2886   __ Rev(out_reg_lo, in_reg_hi);
   2887   __ Rev(out_reg_hi, in_reg_lo);
   2888 }
   2889 
   2890 void IntrinsicLocationsBuilderARMVIXL::VisitShortReverseBytes(HInvoke* invoke) {
   2891   CreateIntToIntLocations(allocator_, invoke);
   2892 }
   2893 
   2894 void IntrinsicCodeGeneratorARMVIXL::VisitShortReverseBytes(HInvoke* invoke) {
   2895   ArmVIXLAssembler* assembler = GetAssembler();
   2896   __ Revsh(OutputRegister(invoke), InputRegisterAt(invoke, 0));
   2897 }
   2898 
   2899 static void GenBitCount(HInvoke* instr, DataType::Type type, ArmVIXLAssembler* assembler) {
   2900   DCHECK(DataType::IsIntOrLongType(type)) << type;
   2901   DCHECK_EQ(instr->GetType(), DataType::Type::kInt32);
   2902   DCHECK_EQ(DataType::Kind(instr->InputAt(0)->GetType()), type);
   2903 
   2904   bool is_long = type == DataType::Type::kInt64;
   2905   LocationSummary* locations = instr->GetLocations();
   2906   Location in = locations->InAt(0);
   2907   vixl32::Register src_0 = is_long ? LowRegisterFrom(in) : RegisterFrom(in);
   2908   vixl32::Register src_1 = is_long ? HighRegisterFrom(in) : src_0;
   2909   vixl32::SRegister tmp_s = LowSRegisterFrom(locations->GetTemp(0));
   2910   vixl32::DRegister tmp_d = DRegisterFrom(locations->GetTemp(0));
   2911   vixl32::Register  out_r = OutputRegister(instr);
   2912 
   2913   // Move data from core register(s) to temp D-reg for bit count calculation, then move back.
   2914   // According to Cortex A57 and A72 optimization guides, compared to transferring to full D-reg,
   2915   // transferring data from core reg to upper or lower half of vfp D-reg requires extra latency.
   2916   // That's why for integer bit count, we use 'vmov d0, r0, r0' instead of 'vmov d0[0], r0'.
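          // Rough scalar equivalent of the NEON sequence below (illustration only):
          //   out_r = popcount(src_0) + (is_long ? popcount(src_1) : 0);
          // Vcnt yields per-byte counts and each Vpaddl pairwise-adds adjacent lanes, halving
          // the number of lanes until the desired accumulated count remains in the low lane.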
   2917   __ Vmov(tmp_d, src_1, src_0);     // Temp DReg |--src_1|--src_0|
   2918   __ Vcnt(Untyped8, tmp_d, tmp_d);  // Temp DReg |c|c|c|c|c|c|c|c|
   2919   __ Vpaddl(U8, tmp_d, tmp_d);      // Temp DReg |--c|--c|--c|--c|
   2920   __ Vpaddl(U16, tmp_d, tmp_d);     // Temp DReg |------c|------c|
   2921   if (is_long) {
   2922     __ Vpaddl(U32, tmp_d, tmp_d);   // Temp DReg |--------------c|
   2923   }
   2924   __ Vmov(out_r, tmp_s);
   2925 }
   2926 
   2927 void IntrinsicLocationsBuilderARMVIXL::VisitIntegerBitCount(HInvoke* invoke) {
   2928   CreateIntToIntLocations(allocator_, invoke);
   2929   invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
   2930 }
   2931 
   2932 void IntrinsicCodeGeneratorARMVIXL::VisitIntegerBitCount(HInvoke* invoke) {
   2933   GenBitCount(invoke, DataType::Type::kInt32, GetAssembler());
   2934 }
   2935 
   2936 void IntrinsicLocationsBuilderARMVIXL::VisitLongBitCount(HInvoke* invoke) {
   2937   VisitIntegerBitCount(invoke);
   2938 }
   2939 
   2940 void IntrinsicCodeGeneratorARMVIXL::VisitLongBitCount(HInvoke* invoke) {
   2941   GenBitCount(invoke, DataType::Type::kInt64, GetAssembler());
   2942 }
   2943 
   2944 static void GenHighestOneBit(HInvoke* invoke,
   2945                              DataType::Type type,
   2946                              CodeGeneratorARMVIXL* codegen) {
   2947   DCHECK(DataType::IsIntOrLongType(type));
   2948 
   2949   ArmVIXLAssembler* assembler = codegen->GetAssembler();
   2950   UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
   2951   const vixl32::Register temp = temps.Acquire();
   2952 
   2953   if (type == DataType::Type::kInt64) {
   2954     LocationSummary* locations = invoke->GetLocations();
   2955     Location in = locations->InAt(0);
   2956     Location out = locations->Out();
   2957 
   2958     vixl32::Register in_reg_lo = LowRegisterFrom(in);
   2959     vixl32::Register in_reg_hi = HighRegisterFrom(in);
   2960     vixl32::Register out_reg_lo = LowRegisterFrom(out);
   2961     vixl32::Register out_reg_hi = HighRegisterFrom(out);
   2962 
   2963     __ Mov(temp, 0x80000000);  // Modified immediate.
   2964     __ Clz(out_reg_lo, in_reg_lo);
   2965     __ Clz(out_reg_hi, in_reg_hi);
   2966     __ Lsr(out_reg_lo, temp, out_reg_lo);
   2967     __ Lsrs(out_reg_hi, temp, out_reg_hi);
   2968 
   2969     // Discard result for lowest 32 bits if highest 32 bits are not zero.
   2970     // Since IT blocks longer than a 16-bit instruction are deprecated by ARMv8,
   2971     // we check that the output is in a low register, so that a 16-bit MOV
   2972     // encoding can be used. If output is in a high register, then we generate
   2973     // 4 more bytes of code to avoid a branch.
   2974     Operand mov_src(0);
   2975     if (!out_reg_lo.IsLow()) {
   2976       __ Mov(LeaveFlags, temp, 0);
   2977       mov_src = Operand(temp);
   2978     }
   2979     ExactAssemblyScope it_scope(codegen->GetVIXLAssembler(),
   2980                                   2 * vixl32::k16BitT32InstructionSizeInBytes,
   2981                                   CodeBufferCheckScope::kExactSize);
   2982     __ it(ne);
   2983     __ mov(ne, out_reg_lo, mov_src);
   2984   } else {
   2985     vixl32::Register out = OutputRegister(invoke);
   2986     vixl32::Register in = InputRegisterAt(invoke, 0);
   2987 
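            // Illustrative pseudo-C of the trick used here (and, per 32-bit half, above):
            //   highestOneBit(x) == (x == 0) ? 0 : (0x80000000u >> clz(x));
            // a register-specified LSR yields 0 for a shift of clz(0) == 32, so the zero input
            // needs no special casing.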
   2988     __ Mov(temp, 0x80000000);  // Modified immediate.
   2989     __ Clz(out, in);
   2990     __ Lsr(out, temp, out);
   2991   }
   2992 }
   2993 
   2994 void IntrinsicLocationsBuilderARMVIXL::VisitIntegerHighestOneBit(HInvoke* invoke) {
   2995   CreateIntToIntLocations(allocator_, invoke);
   2996 }
   2997 
   2998 void IntrinsicCodeGeneratorARMVIXL::VisitIntegerHighestOneBit(HInvoke* invoke) {
   2999   GenHighestOneBit(invoke, DataType::Type::kInt32, codegen_);
   3000 }
   3001 
   3002 void IntrinsicLocationsBuilderARMVIXL::VisitLongHighestOneBit(HInvoke* invoke) {
   3003   CreateLongToLongLocationsWithOverlap(allocator_, invoke);
   3004 }
   3005 
   3006 void IntrinsicCodeGeneratorARMVIXL::VisitLongHighestOneBit(HInvoke* invoke) {
   3007   GenHighestOneBit(invoke, DataType::Type::kInt64, codegen_);
   3008 }
   3009 
   3010 static void GenLowestOneBit(HInvoke* invoke,
   3011                             DataType::Type type,
   3012                             CodeGeneratorARMVIXL* codegen) {
   3013   DCHECK(DataType::IsIntOrLongType(type));
   3014 
   3015   ArmVIXLAssembler* assembler = codegen->GetAssembler();
   3016   UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
   3017   const vixl32::Register temp = temps.Acquire();
   3018 
   3019   if (type == DataType::Type::kInt64) {
   3020     LocationSummary* locations = invoke->GetLocations();
   3021     Location in = locations->InAt(0);
   3022     Location out = locations->Out();
   3023 
   3024     vixl32::Register in_reg_lo = LowRegisterFrom(in);
   3025     vixl32::Register in_reg_hi = HighRegisterFrom(in);
   3026     vixl32::Register out_reg_lo = LowRegisterFrom(out);
   3027     vixl32::Register out_reg_hi = HighRegisterFrom(out);
   3028 
   3029     __ Rsb(out_reg_hi, in_reg_hi, 0);
   3030     __ Rsb(out_reg_lo, in_reg_lo, 0);
   3031     __ And(out_reg_hi, out_reg_hi, in_reg_hi);
   3032     // The result of this operation is 0 iff in_reg_lo is 0
   3033     __ Ands(out_reg_lo, out_reg_lo, in_reg_lo);
   3034 
   3035     // Discard result for highest 32 bits if lowest 32 bits are not zero.
   3036     // Since IT blocks longer than a 16-bit instruction are deprecated by ARMv8,
   3037     // we check that the output is in a low register, so that a 16-bit MOV
   3038     // encoding can be used. If output is in a high register, then we generate
   3039     // 4 more bytes of code to avoid a branch.
   3040     Operand mov_src(0);
   3041     if (!out_reg_lo.IsLow()) {
   3042       __ Mov(LeaveFlags, temp, 0);
   3043       mov_src = Operand(temp);
   3044     }
   3045     ExactAssemblyScope it_scope(codegen->GetVIXLAssembler(),
   3046                                   2 * vixl32::k16BitT32InstructionSizeInBytes,
   3047                                   CodeBufferCheckScope::kExactSize);
   3048     __ it(ne);
   3049     __ mov(ne, out_reg_hi, mov_src);
   3050   } else {
   3051     vixl32::Register out = OutputRegister(invoke);
   3052     vixl32::Register in = InputRegisterAt(invoke, 0);
   3053 
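            // Illustrative pseudo-C: lowestOneBit(x) == x & (-x). The 64-bit path above applies
            // the same identity to each half and clears the high result (via the IT block) when
            // the low half is non-zero.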
   3054     __ Rsb(temp, in, 0);
   3055     __ And(out, temp, in);
   3056   }
   3057 }
   3058 
   3059 void IntrinsicLocationsBuilderARMVIXL::VisitIntegerLowestOneBit(HInvoke* invoke) {
   3060   CreateIntToIntLocations(allocator_, invoke);
   3061 }
   3062 
   3063 void IntrinsicCodeGeneratorARMVIXL::VisitIntegerLowestOneBit(HInvoke* invoke) {
   3064   GenLowestOneBit(invoke, DataType::Type::kInt32, codegen_);
   3065 }
   3066 
   3067 void IntrinsicLocationsBuilderARMVIXL::VisitLongLowestOneBit(HInvoke* invoke) {
   3068   CreateLongToLongLocationsWithOverlap(allocator_, invoke);
   3069 }
   3070 
   3071 void IntrinsicCodeGeneratorARMVIXL::VisitLongLowestOneBit(HInvoke* invoke) {
   3072   GenLowestOneBit(invoke, DataType::Type::kInt64, codegen_);
   3073 }
   3074 
   3075 void IntrinsicLocationsBuilderARMVIXL::VisitStringGetCharsNoCheck(HInvoke* invoke) {
   3076   LocationSummary* locations =
   3077       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   3078   locations->SetInAt(0, Location::RequiresRegister());
   3079   locations->SetInAt(1, Location::RequiresRegister());
   3080   locations->SetInAt(2, Location::RequiresRegister());
   3081   locations->SetInAt(3, Location::RequiresRegister());
   3082   locations->SetInAt(4, Location::RequiresRegister());
   3083 
   3084   // Temporary registers to store lengths of strings and for calculations.
   3085   locations->AddTemp(Location::RequiresRegister());
   3086   locations->AddTemp(Location::RequiresRegister());
   3087   locations->AddTemp(Location::RequiresRegister());
   3088 }
   3089 
   3090 void IntrinsicCodeGeneratorARMVIXL::VisitStringGetCharsNoCheck(HInvoke* invoke) {
   3091   ArmVIXLAssembler* assembler = GetAssembler();
   3092   LocationSummary* locations = invoke->GetLocations();
   3093 
   3094   // Check assumption that sizeof(Char) is 2 (used in scaling below).
   3095   const size_t char_size = DataType::Size(DataType::Type::kUint16);
   3096   DCHECK_EQ(char_size, 2u);
   3097 
   3098   // Location of data in char array buffer.
   3099   const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value();
   3100 
   3101   // Location of char array data in string.
   3102   const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
   3103 
   3104   // void getCharsNoCheck(int srcBegin, int srcEnd, char[] dst, int dstBegin);
   3105   // Since getChars() calls getCharsNoCheck(), we use registers rather than constants.
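          // Illustrative outline of the copy emitted below (pseudo-code):
          //   num_chr = srcEnd - srcBegin;  if (num_chr == 0) return;
          //   while (num_chr >= 4) copy four 16-bit chars via two 32-bit loads/stores;
          //   while (num_chr > 0)  copy one 16-bit char;
          //   for a compressed (8-bit) source string, instead widen one byte per char copied.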
   3106   vixl32::Register srcObj = InputRegisterAt(invoke, 0);
   3107   vixl32::Register srcBegin = InputRegisterAt(invoke, 1);
   3108   vixl32::Register srcEnd = InputRegisterAt(invoke, 2);
   3109   vixl32::Register dstObj = InputRegisterAt(invoke, 3);
   3110   vixl32::Register dstBegin = InputRegisterAt(invoke, 4);
   3111 
   3112   vixl32::Register num_chr = RegisterFrom(locations->GetTemp(0));
   3113   vixl32::Register src_ptr = RegisterFrom(locations->GetTemp(1));
   3114   vixl32::Register dst_ptr = RegisterFrom(locations->GetTemp(2));
   3115 
   3116   vixl32::Label done, compressed_string_loop;
   3117   vixl32::Label* final_label = codegen_->GetFinalLabel(invoke, &done);
   3118   // dst to be copied.
   3119   __ Add(dst_ptr, dstObj, data_offset);
   3120   __ Add(dst_ptr, dst_ptr, Operand(dstBegin, vixl32::LSL, 1));
   3121 
   3122   __ Subs(num_chr, srcEnd, srcBegin);
   3123   // Early out for valid zero-length retrievals.
   3124   __ B(eq, final_label, /* far_target */ false);
   3125 
   3126   // src range to copy.
   3127   __ Add(src_ptr, srcObj, value_offset);
   3128 
   3129   UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
   3130   vixl32::Register temp;
   3131   vixl32::Label compressed_string_preloop;
   3132   if (mirror::kUseStringCompression) {
   3133     // Location of count in string.
   3134     const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
   3135     temp = temps.Acquire();
   3136     // String's length.
   3137     __ Ldr(temp, MemOperand(srcObj, count_offset));
   3138     __ Tst(temp, 1);
   3139     temps.Release(temp);
   3140     __ B(eq, &compressed_string_preloop, /* far_target */ false);
   3141   }
   3142   __ Add(src_ptr, src_ptr, Operand(srcBegin, vixl32::LSL, 1));
   3143 
   3144   // Do the copy.
   3145   vixl32::Label loop, remainder;
   3146 
   3147   temp = temps.Acquire();
   3148   // Save repairing the value of num_chr on the < 4 character path.
   3149   __ Subs(temp, num_chr, 4);
   3150   __ B(lt, &remainder, /* far_target */ false);
   3151 
   3152   // Keep the result of the earlier subs, we are going to fetch at least 4 characters.
   3153   __ Mov(num_chr, temp);
   3154 
   3155   // Main loop, used for longer fetches, loads and stores 4x16-bit characters at a time.
   3156   // (LDRD/STRD fault on unaligned addresses and it's not worth inlining extra code
   3157   // to rectify these everywhere this intrinsic applies.)
   3158   __ Bind(&loop);
   3159   __ Ldr(temp, MemOperand(src_ptr, char_size * 2));
   3160   __ Subs(num_chr, num_chr, 4);
   3161   __ Str(temp, MemOperand(dst_ptr, char_size * 2));
   3162   __ Ldr(temp, MemOperand(src_ptr, char_size * 4, PostIndex));
   3163   __ Str(temp, MemOperand(dst_ptr, char_size * 4, PostIndex));
   3164   temps.Release(temp);
   3165   __ B(ge, &loop, /* far_target */ false);
   3166 
   3167   __ Adds(num_chr, num_chr, 4);
   3168   __ B(eq, final_label, /* far_target */ false);
   3169 
   3170   // Main loop for < 4 character case and remainder handling. Loads and stores one
   3171   // 16-bit Java character at a time.
   3172   __ Bind(&remainder);
   3173   temp = temps.Acquire();
   3174   __ Ldrh(temp, MemOperand(src_ptr, char_size, PostIndex));
   3175   __ Subs(num_chr, num_chr, 1);
   3176   __ Strh(temp, MemOperand(dst_ptr, char_size, PostIndex));
   3177   temps.Release(temp);
   3178   __ B(gt, &remainder, /* far_target */ false);
   3179 
   3180   if (mirror::kUseStringCompression) {
   3181     __ B(final_label);
   3182 
   3183     const size_t c_char_size = DataType::Size(DataType::Type::kInt8);
   3184     DCHECK_EQ(c_char_size, 1u);
   3185     // Copy loop for compressed src, copying 1 character (8-bit) to (16-bit) at a time.
   3186     __ Bind(&compressed_string_preloop);
   3187     __ Add(src_ptr, src_ptr, srcBegin);
   3188     __ Bind(&compressed_string_loop);
   3189     temp = temps.Acquire();
   3190     __ Ldrb(temp, MemOperand(src_ptr, c_char_size, PostIndex));
   3191     __ Strh(temp, MemOperand(dst_ptr, char_size, PostIndex));
   3192     temps.Release(temp);
   3193     __ Subs(num_chr, num_chr, 1);
   3194     __ B(gt, &compressed_string_loop, /* far_target */ false);
   3195   }
   3196 
   3197   if (done.IsReferenced()) {
   3198     __ Bind(&done);
   3199   }
   3200 }
   3201 
   3202 void IntrinsicLocationsBuilderARMVIXL::VisitFloatIsInfinite(HInvoke* invoke) {
   3203   CreateFPToIntLocations(allocator_, invoke);
   3204 }
   3205 
   3206 void IntrinsicCodeGeneratorARMVIXL::VisitFloatIsInfinite(HInvoke* invoke) {
   3207   ArmVIXLAssembler* const assembler = GetAssembler();
   3208   const vixl32::Register out = OutputRegister(invoke);
   3209   // Shifting left by 1 bit makes the value encodable as an immediate operand;
   3210   // we don't care about the sign bit anyway.
   3211   constexpr uint32_t infinity = kPositiveInfinityFloat << 1U;
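          // Illustrative pseudo-C of the check below:
          //   isInfinite(f)  <=>  ((bits(f) << 1) ^ (kPositiveInfinityFloat << 1)) == 0,
          // i.e. all bits except the sign bit match the infinity pattern.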
   3212 
   3213   __ Vmov(out, InputSRegisterAt(invoke, 0));
   3214   // We don't care about the sign bit, so shift left.
   3215   __ Lsl(out, out, 1);
   3216   __ Eor(out, out, infinity);
   3217   codegen_->GenerateConditionWithZero(kCondEQ, out, out);
   3218 }
   3219 
   3220 void IntrinsicLocationsBuilderARMVIXL::VisitDoubleIsInfinite(HInvoke* invoke) {
   3221   CreateFPToIntLocations(allocator_, invoke);
   3222 }
   3223 
   3224 void IntrinsicCodeGeneratorARMVIXL::VisitDoubleIsInfinite(HInvoke* invoke) {
   3225   ArmVIXLAssembler* const assembler = GetAssembler();
   3226   const vixl32::Register out = OutputRegister(invoke);
   3227   UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
   3228   const vixl32::Register temp = temps.Acquire();
   3229   // The highest 32 bits of double precision positive infinity separated into
   3230   // two constants encodable as immediate operands.
   3231   constexpr uint32_t infinity_high  = 0x7f000000U;
   3232   constexpr uint32_t infinity_high2 = 0x00f00000U;
   3233 
   3234   static_assert((infinity_high | infinity_high2) ==
   3235                     static_cast<uint32_t>(kPositiveInfinityDouble >> 32U),
   3236                 "The constants do not add up to the high 32 bits of double "
   3237                 "precision positive infinity.");
   3238   __ Vmov(temp, out, InputDRegisterAt(invoke, 0));
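          // Illustrative pseudo-C, with hi/lo denoting the two 32-bit halves of bits(d):
          //   isInfinite(d)  <=>  (lo | ((hi ^ (infinity_high | infinity_high2)) << 1)) == 0,
          // where the shift by one drops the sign bit of the XOR-ed high word.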
   3239   __ Eor(out, out, infinity_high);
   3240   __ Eor(out, out, infinity_high2);
   3241   // We don't care about the sign bit, so shift left.
   3242   __ Orr(out, temp, Operand(out, vixl32::LSL, 1));
   3243   codegen_->GenerateConditionWithZero(kCondEQ, out, out);
   3244 }
   3245 
   3246 void IntrinsicLocationsBuilderARMVIXL::VisitMathCeil(HInvoke* invoke) {
   3247   if (features_.HasARMv8AInstructions()) {
   3248     CreateFPToFPLocations(allocator_, invoke);
   3249   }
   3250 }
   3251 
   3252 void IntrinsicCodeGeneratorARMVIXL::VisitMathCeil(HInvoke* invoke) {
   3253   ArmVIXLAssembler* assembler = GetAssembler();
   3254   DCHECK(codegen_->GetInstructionSetFeatures().HasARMv8AInstructions());
   3255   __ Vrintp(F64, F64, OutputDRegister(invoke), InputDRegisterAt(invoke, 0));
   3256 }
   3257 
   3258 void IntrinsicLocationsBuilderARMVIXL::VisitMathFloor(HInvoke* invoke) {
   3259   if (features_.HasARMv8AInstructions()) {
   3260     CreateFPToFPLocations(allocator_, invoke);
   3261   }
   3262 }
   3263 
   3264 void IntrinsicCodeGeneratorARMVIXL::VisitMathFloor(HInvoke* invoke) {
   3265   ArmVIXLAssembler* assembler = GetAssembler();
   3266   DCHECK(codegen_->GetInstructionSetFeatures().HasARMv8AInstructions());
   3267   __ Vrintm(F64, F64, OutputDRegister(invoke), InputDRegisterAt(invoke, 0));
   3268 }
   3269 
   3270 void IntrinsicLocationsBuilderARMVIXL::VisitIntegerValueOf(HInvoke* invoke) {
   3271   InvokeRuntimeCallingConventionARMVIXL calling_convention;
   3272   IntrinsicVisitor::ComputeIntegerValueOfLocations(
   3273       invoke,
   3274       codegen_,
   3275       LocationFrom(r0),
   3276       LocationFrom(calling_convention.GetRegisterAt(0)));
   3277 }
   3278 
   3279 void IntrinsicCodeGeneratorARMVIXL::VisitIntegerValueOf(HInvoke* invoke) {
   3280   IntrinsicVisitor::IntegerValueOfInfo info = IntrinsicVisitor::ComputeIntegerValueOfInfo();
   3281   LocationSummary* locations = invoke->GetLocations();
   3282   ArmVIXLAssembler* const assembler = GetAssembler();
   3283 
   3284   vixl32::Register out = RegisterFrom(locations->Out());
   3285   UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
   3286   vixl32::Register temp = temps.Acquire();
   3287   InvokeRuntimeCallingConventionARMVIXL calling_convention;
   3288   vixl32::Register argument = calling_convention.GetRegisterAt(0);
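          // Illustrative outline of Integer.valueOf(value) as emitted below (pseudo-Java):
          //   if (info.low <= value && value <= info.high) return info.cache[value - info.low];
          //   Integer boxed = allocate Integer;  boxed.value = value;  return boxed;
          // with the cache array and the Integer class both taken from the boot image.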
   3289   if (invoke->InputAt(0)->IsConstant()) {
   3290     int32_t value = invoke->InputAt(0)->AsIntConstant()->GetValue();
   3291     if (value >= info.low && value <= info.high) {
   3292       // Just embed the j.l.Integer in the code.
   3293       ScopedObjectAccess soa(Thread::Current());
   3294       mirror::Object* boxed = info.cache->Get(value + (-info.low));
   3295       DCHECK(boxed != nullptr && Runtime::Current()->GetHeap()->ObjectIsInBootImageSpace(boxed));
   3296       uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(boxed));
   3297       __ Ldr(out, codegen_->DeduplicateBootImageAddressLiteral(address));
   3298     } else {
   3299       // Allocate and initialize a new j.l.Integer.
   3300       // TODO: If we JIT, we could allocate the j.l.Integer now, and store it in the
   3301       // JIT object table.
   3302       uint32_t address =
   3303           dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer));
   3304       __ Ldr(argument, codegen_->DeduplicateBootImageAddressLiteral(address));
   3305       codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc());
   3306       CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>();
   3307       __ Mov(temp, value);
   3308       assembler->StoreToOffset(kStoreWord, temp, out, info.value_offset);
   3309       // `value` is a final field :-( Ideally, we'd merge this memory barrier with the allocation
   3310       // one.
   3311       codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore);
   3312     }
   3313   } else {
   3314     vixl32::Register in = RegisterFrom(locations->InAt(0));
   3315     // Check bounds of our cache.
   3316     __ Add(out, in, -info.low);
   3317     __ Cmp(out, info.high - info.low + 1);
   3318     vixl32::Label allocate, done;
   3319     __ B(hs, &allocate, /* is_far_target */ false);
   3320     // If the value is within the bounds, load the j.l.Integer directly from the array.
   3321     uint32_t data_offset = mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value();
   3322     uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.cache));
   3323     __ Ldr(temp, codegen_->DeduplicateBootImageAddressLiteral(data_offset + address));
   3324     codegen_->LoadFromShiftedRegOffset(DataType::Type::kReference, locations->Out(), temp, out);
   3325     assembler->MaybeUnpoisonHeapReference(out);
   3326     __ B(&done);
   3327     __ Bind(&allocate);
   3328     // Otherwise allocate and initialize a new j.l.Integer.
   3329     address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer));
   3330     __ Ldr(argument, codegen_->DeduplicateBootImageAddressLiteral(address));
   3331     codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc());
   3332     CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>();
   3333     assembler->StoreToOffset(kStoreWord, in, out, info.value_offset);
   3334     // `value` is a final field :-( Ideally, we'd merge this memory barrier with the allocation
   3335     // one.
   3336     codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore);
   3337     __ Bind(&done);
   3338   }
   3339 }
   3340 
   3341 void IntrinsicLocationsBuilderARMVIXL::VisitThreadInterrupted(HInvoke* invoke) {
   3342   LocationSummary* locations =
   3343       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   3344   locations->SetOut(Location::RequiresRegister());
   3345 }
   3346 
   3347 void IntrinsicCodeGeneratorARMVIXL::VisitThreadInterrupted(HInvoke* invoke) {
   3348   ArmVIXLAssembler* assembler = GetAssembler();
   3349   vixl32::Register out = RegisterFrom(invoke->GetLocations()->Out());
   3350   int32_t offset = Thread::InterruptedOffset<kArmPointerSize>().Int32Value();
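          // Illustrative outline (pseudo-code):
          //   out = self->interrupted;
          //   if (out != 0) { full barrier; self->interrupted = 0; full barrier; }
          // so the flag is cleared exactly when a pending interrupt is reported.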
   3351   __ Ldr(out, MemOperand(tr, offset));
   3352   UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
   3353   vixl32::Register temp = temps.Acquire();
   3354   vixl32::Label done;
   3355   vixl32::Label* const final_label = codegen_->GetFinalLabel(invoke, &done);
   3356   __ CompareAndBranchIfZero(out, final_label, /* far_target */ false);
   3357   __ Dmb(vixl32::ISH);
   3358   __ Mov(temp, 0);
   3359   assembler->StoreToOffset(kStoreWord, temp, tr, offset);
   3360   __ Dmb(vixl32::ISH);
   3361   if (done.IsReferenced()) {
   3362     __ Bind(&done);
   3363   }
   3364 }
   3365 
   3366 void IntrinsicLocationsBuilderARMVIXL::VisitReachabilityFence(HInvoke* invoke) {
   3367   LocationSummary* locations =
   3368       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   3369   locations->SetInAt(0, Location::Any());
   3370 }
   3371 
   3372 void IntrinsicCodeGeneratorARMVIXL::VisitReachabilityFence(HInvoke* invoke ATTRIBUTE_UNUSED) { }
   3373 
   3374 UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathRoundDouble)   // Could be done by changing rounding mode, maybe?
   3375 UNIMPLEMENTED_INTRINSIC(ARMVIXL, UnsafeCASLong)     // High register pressure.
   3376 UNIMPLEMENTED_INTRINSIC(ARMVIXL, SystemArrayCopyChar)
   3377 UNIMPLEMENTED_INTRINSIC(ARMVIXL, ReferenceGetReferent)
   3378 
   3379 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOf);
   3380 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOfAfter);
   3381 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringBufferAppend);
   3382 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringBufferLength);
   3383 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringBufferToString);
   3384 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringBuilderAppend);
   3385 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringBuilderLength);
   3386 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringBuilderToString);
   3387 
   3388 // 1.8.
   3389 UNIMPLEMENTED_INTRINSIC(ARMVIXL, UnsafeGetAndAddInt)
   3390 UNIMPLEMENTED_INTRINSIC(ARMVIXL, UnsafeGetAndAddLong)
   3391 UNIMPLEMENTED_INTRINSIC(ARMVIXL, UnsafeGetAndSetInt)
   3392 UNIMPLEMENTED_INTRINSIC(ARMVIXL, UnsafeGetAndSetLong)
   3393 UNIMPLEMENTED_INTRINSIC(ARMVIXL, UnsafeGetAndSetObject)
   3394 
   3395 UNREACHABLE_INTRINSICS(ARMVIXL)
   3396 
   3397 #undef __
   3398 
   3399 }  // namespace arm
   3400 }  // namespace art
   3401