      1 /*
      2  * Copyright (C) 2015 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "intrinsics_x86_64.h"
     18 
     19 #include <limits>
     20 
     21 #include "arch/x86_64/instruction_set_features_x86_64.h"
     22 #include "art_method.h"
     23 #include "base/bit_utils.h"
     24 #include "code_generator_x86_64.h"
     25 #include "entrypoints/quick/quick_entrypoints.h"
     26 #include "heap_poisoning.h"
     27 #include "intrinsics.h"
     28 #include "intrinsics_utils.h"
     29 #include "lock_word.h"
     30 #include "mirror/array-inl.h"
     31 #include "mirror/object_array-inl.h"
     32 #include "mirror/reference.h"
     33 #include "mirror/string.h"
     34 #include "scoped_thread_state_change-inl.h"
     35 #include "thread-current-inl.h"
     36 #include "utils/x86_64/assembler_x86_64.h"
     37 #include "utils/x86_64/constants_x86_64.h"
     38 
     39 namespace art {
     40 
     41 namespace x86_64 {
     42 
     43 IntrinsicLocationsBuilderX86_64::IntrinsicLocationsBuilderX86_64(CodeGeneratorX86_64* codegen)
     44   : allocator_(codegen->GetGraph()->GetAllocator()), codegen_(codegen) {
     45 }
     46 
     47 X86_64Assembler* IntrinsicCodeGeneratorX86_64::GetAssembler() {
     48   return down_cast<X86_64Assembler*>(codegen_->GetAssembler());
     49 }
     50 
     51 ArenaAllocator* IntrinsicCodeGeneratorX86_64::GetAllocator() {
     52   return codegen_->GetGraph()->GetAllocator();
     53 }
     54 
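        // Note: Dispatch() builds an intrinsified LocationSummary only for invokes this backend
        // recognizes; a missing or non-intrinsified summary means the invoke stays a regular call.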
     55 bool IntrinsicLocationsBuilderX86_64::TryDispatch(HInvoke* invoke) {
     56   Dispatch(invoke);
     57   LocationSummary* res = invoke->GetLocations();
     58   if (res == nullptr) {
     59     return false;
     60   }
     61   return res->Intrinsified();
     62 }
     63 
     64 static void MoveArguments(HInvoke* invoke, CodeGeneratorX86_64* codegen) {
     65   InvokeDexCallingConventionVisitorX86_64 calling_convention_visitor;
     66   IntrinsicVisitor::MoveArguments(invoke, codegen, &calling_convention_visitor);
     67 }
     68 
     69 using IntrinsicSlowPathX86_64 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorX86_64>;
     70 
     71 // NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy.
     72 #define __ down_cast<X86_64Assembler*>(codegen->GetAssembler())->  // NOLINT
     73 
     74 // Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers.
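        // The slow path copies the remaining references one at a time, routing each loaded
        // reference through the read barrier mark entry point before (re)poisoning and storing it.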
     75 class ReadBarrierSystemArrayCopySlowPathX86_64 : public SlowPathCode {
     76  public:
     77   explicit ReadBarrierSystemArrayCopySlowPathX86_64(HInstruction* instruction)
     78       : SlowPathCode(instruction) {
     79     DCHECK(kEmitCompilerReadBarrier);
     80     DCHECK(kUseBakerReadBarrier);
     81   }
     82 
     83   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     84     CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
     85     LocationSummary* locations = instruction_->GetLocations();
     86     DCHECK(locations->CanCall());
     87     DCHECK(instruction_->IsInvokeStaticOrDirect())
     88         << "Unexpected instruction in read barrier arraycopy slow path: "
     89         << instruction_->DebugName();
     90     DCHECK(instruction_->GetLocations()->Intrinsified());
     91     DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy);
     92 
     93     int32_t element_size = DataType::Size(DataType::Type::kReference);
     94 
     95     CpuRegister src_curr_addr = locations->GetTemp(0).AsRegister<CpuRegister>();
     96     CpuRegister dst_curr_addr = locations->GetTemp(1).AsRegister<CpuRegister>();
     97     CpuRegister src_stop_addr = locations->GetTemp(2).AsRegister<CpuRegister>();
     98 
     99     __ Bind(GetEntryLabel());
    100     NearLabel loop;
    101     __ Bind(&loop);
    102     __ movl(CpuRegister(TMP), Address(src_curr_addr, 0));
    103     __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
    104     // TODO: Inline the mark bit check before calling the runtime?
    105     // TMP = ReadBarrier::Mark(TMP);
    106     // No need to save live registers; it's taken care of by the
    107     // entrypoint. Also, there is no need to update the stack mask,
    108     // as this runtime call will not trigger a garbage collection.
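            // Note: the mark entry point is selected per register; passing TMP means the reference
            // to mark is expected, and returned, in TMP itself (ReadBarrierMarkRegX convention).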
    109     int32_t entry_point_offset = Thread::ReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(TMP);
    110     // This runtime call does not require a stack map.
    111     x86_64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
    112     __ MaybePoisonHeapReference(CpuRegister(TMP));
    113     __ movl(Address(dst_curr_addr, 0), CpuRegister(TMP));
    114     __ addl(src_curr_addr, Immediate(element_size));
    115     __ addl(dst_curr_addr, Immediate(element_size));
    116     __ cmpl(src_curr_addr, src_stop_addr);
    117     __ j(kNotEqual, &loop);
    118     __ jmp(GetExitLabel());
    119   }
    120 
    121   const char* GetDescription() const OVERRIDE { return "ReadBarrierSystemArrayCopySlowPathX86_64"; }
    122 
    123  private:
    124   DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathX86_64);
    125 };
    126 
    127 #undef __
    128 
    129 #define __ assembler->
    130 
    131 static void CreateFPToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
    132   LocationSummary* locations =
    133       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    134   locations->SetInAt(0, Location::RequiresFpuRegister());
    135   locations->SetOut(Location::RequiresRegister());
    136 }
    137 
    138 static void CreateIntToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) {
    139   LocationSummary* locations =
    140       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    141   locations->SetInAt(0, Location::RequiresRegister());
    142   locations->SetOut(Location::RequiresFpuRegister());
    143 }
    144 
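        // Note: floatToRawIntBits/doubleToRawLongBits and their inverses are raw bit transfers, so
        // a single movd/movq between an XMM register and a GPR implements them exactly.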
    145 static void MoveFPToInt(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
    146   Location input = locations->InAt(0);
    147   Location output = locations->Out();
    148   __ movd(output.AsRegister<CpuRegister>(), input.AsFpuRegister<XmmRegister>(), is64bit);
    149 }
    150 
    151 static void MoveIntToFP(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
    152   Location input = locations->InAt(0);
    153   Location output = locations->Out();
    154   __ movd(output.AsFpuRegister<XmmRegister>(), input.AsRegister<CpuRegister>(), is64bit);
    155 }
    156 
    157 void IntrinsicLocationsBuilderX86_64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
    158   CreateFPToIntLocations(allocator_, invoke);
    159 }
    160 void IntrinsicLocationsBuilderX86_64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
    161   CreateIntToFPLocations(allocator_, invoke);
    162 }
    163 
    164 void IntrinsicCodeGeneratorX86_64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
    165   MoveFPToInt(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
    166 }
    167 void IntrinsicCodeGeneratorX86_64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
    168   MoveIntToFP(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
    169 }
    170 
    171 void IntrinsicLocationsBuilderX86_64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
    172   CreateFPToIntLocations(allocator_, invoke);
    173 }
    174 void IntrinsicLocationsBuilderX86_64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
    175   CreateIntToFPLocations(allocator_, invoke);
    176 }
    177 
    178 void IntrinsicCodeGeneratorX86_64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
    179   MoveFPToInt(invoke->GetLocations(), /* is64bit */ false, GetAssembler());
    180 }
    181 void IntrinsicCodeGeneratorX86_64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
    182   MoveIntToFP(invoke->GetLocations(), /* is64bit */ false, GetAssembler());
    183 }
    184 
    185 static void CreateIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
    186   LocationSummary* locations =
    187       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    188   locations->SetInAt(0, Location::RequiresRegister());
    189   locations->SetOut(Location::SameAsFirstInput());
    190 }
    191 
    192 static void GenReverseBytes(LocationSummary* locations,
    193                             DataType::Type size,
    194                             X86_64Assembler* assembler) {
    195   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
    196 
    197   switch (size) {
    198     case DataType::Type::kInt16:
    199       // TODO: Can be done with an xchg of 8-bit sub-registers. This is straight from Quick.
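              // bswapl reverses all four bytes, leaving the two significant bytes in the upper
              // half; the arithmetic shift brings them back down with sign extension, as required
              // for a Java short.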
    200       __ bswapl(out);
    201       __ sarl(out, Immediate(16));
    202       break;
    203     case DataType::Type::kInt32:
    204       __ bswapl(out);
    205       break;
    206     case DataType::Type::kInt64:
    207       __ bswapq(out);
    208       break;
    209     default:
    210       LOG(FATAL) << "Unexpected size for reverse-bytes: " << size;
    211       UNREACHABLE();
    212   }
    213 }
    214 
    215 void IntrinsicLocationsBuilderX86_64::VisitIntegerReverseBytes(HInvoke* invoke) {
    216   CreateIntToIntLocations(allocator_, invoke);
    217 }
    218 
    219 void IntrinsicCodeGeneratorX86_64::VisitIntegerReverseBytes(HInvoke* invoke) {
    220   GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt32, GetAssembler());
    221 }
    222 
    223 void IntrinsicLocationsBuilderX86_64::VisitLongReverseBytes(HInvoke* invoke) {
    224   CreateIntToIntLocations(allocator_, invoke);
    225 }
    226 
    227 void IntrinsicCodeGeneratorX86_64::VisitLongReverseBytes(HInvoke* invoke) {
    228   GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt64, GetAssembler());
    229 }
    230 
    231 void IntrinsicLocationsBuilderX86_64::VisitShortReverseBytes(HInvoke* invoke) {
    232   CreateIntToIntLocations(allocator_, invoke);
    233 }
    234 
    235 void IntrinsicCodeGeneratorX86_64::VisitShortReverseBytes(HInvoke* invoke) {
    236   GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt16, GetAssembler());
    237 }
    238 
    239 
    240 // TODO: Consider Quick's way of doing Double abs through integer operations, as the immediate we
    241 //       need is 64b.
    242 
    243 static void CreateFloatToFloatPlusTemps(ArenaAllocator* allocator, HInvoke* invoke) {
    244   // TODO: Enable memory operations when the assembler supports them.
    245   LocationSummary* locations =
    246       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    247   locations->SetInAt(0, Location::RequiresFpuRegister());
    248   locations->SetOut(Location::SameAsFirstInput());
    249   locations->AddTemp(Location::RequiresFpuRegister());  // FP reg to hold mask.
    250 }
    251 
    252 static void MathAbsFP(LocationSummary* locations,
    253                       bool is64bit,
    254                       X86_64Assembler* assembler,
    255                       CodeGeneratorX86_64* codegen) {
    256   Location output = locations->Out();
    257 
    258   DCHECK(output.IsFpuRegister());
    259   XmmRegister xmm_temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
    260 
    261   // TODO: Can mask directly with constant area using pand if we can guarantee
    262   // that the literal is aligned on a 16 byte boundary.  This will avoid a
    263   // temporary.
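          // Clearing just the sign bit is abs() for any IEEE-754 value, including -0.0 and NaN.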
    264   if (is64bit) {
    265     __ movsd(xmm_temp, codegen->LiteralInt64Address(INT64_C(0x7FFFFFFFFFFFFFFF)));
    266     __ andpd(output.AsFpuRegister<XmmRegister>(), xmm_temp);
    267   } else {
    268     __ movss(xmm_temp, codegen->LiteralInt32Address(INT32_C(0x7FFFFFFF)));
    269     __ andps(output.AsFpuRegister<XmmRegister>(), xmm_temp);
    270   }
    271 }
    272 
    273 void IntrinsicLocationsBuilderX86_64::VisitMathAbsDouble(HInvoke* invoke) {
    274   CreateFloatToFloatPlusTemps(allocator_, invoke);
    275 }
    276 
    277 void IntrinsicCodeGeneratorX86_64::VisitMathAbsDouble(HInvoke* invoke) {
    278   MathAbsFP(invoke->GetLocations(), /* is64bit */ true, GetAssembler(), codegen_);
    279 }
    280 
    281 void IntrinsicLocationsBuilderX86_64::VisitMathAbsFloat(HInvoke* invoke) {
    282   CreateFloatToFloatPlusTemps(allocator_, invoke);
    283 }
    284 
    285 void IntrinsicCodeGeneratorX86_64::VisitMathAbsFloat(HInvoke* invoke) {
    286   MathAbsFP(invoke->GetLocations(), /* is64bit */ false, GetAssembler(), codegen_);
    287 }
    288 
    289 static void CreateIntToIntPlusTemp(ArenaAllocator* allocator, HInvoke* invoke) {
    290   LocationSummary* locations =
    291       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    292   locations->SetInAt(0, Location::RequiresRegister());
    293   locations->SetOut(Location::SameAsFirstInput());
    294   locations->AddTemp(Location::RequiresRegister());
    295 }
    296 
    297 static void GenAbsInteger(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
    298   Location output = locations->Out();
    299   CpuRegister out = output.AsRegister<CpuRegister>();
    300   CpuRegister mask = locations->GetTemp(0).AsRegister<CpuRegister>();
    301 
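          // Branchless abs: mask is value >> 31 (or 63), i.e. all ones for negative inputs and zero
          // otherwise, so (value + mask) ^ mask negates negative values and leaves the rest
          // unchanged. MIN_VALUE maps to itself, matching Math.abs.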
    302   if (is64bit) {
    303     // Create mask.
    304     __ movq(mask, out);
    305     __ sarq(mask, Immediate(63));
    306     // Add mask.
    307     __ addq(out, mask);
    308     __ xorq(out, mask);
    309   } else {
    310     // Create mask.
    311     __ movl(mask, out);
    312     __ sarl(mask, Immediate(31));
    313     // Add mask.
    314     __ addl(out, mask);
    315     __ xorl(out, mask);
    316   }
    317 }
    318 
    319 void IntrinsicLocationsBuilderX86_64::VisitMathAbsInt(HInvoke* invoke) {
    320   CreateIntToIntPlusTemp(allocator_, invoke);
    321 }
    322 
    323 void IntrinsicCodeGeneratorX86_64::VisitMathAbsInt(HInvoke* invoke) {
    324   GenAbsInteger(invoke->GetLocations(), /* is64bit */ false, GetAssembler());
    325 }
    326 
    327 void IntrinsicLocationsBuilderX86_64::VisitMathAbsLong(HInvoke* invoke) {
    328   CreateIntToIntPlusTemp(allocator_, invoke);
    329 }
    330 
    331 void IntrinsicCodeGeneratorX86_64::VisitMathAbsLong(HInvoke* invoke) {
    332   GenAbsInteger(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
    333 }
    334 
    335 static void GenMinMaxFP(LocationSummary* locations,
    336                         bool is_min,
    337                         bool is_double,
    338                         X86_64Assembler* assembler,
    339                         CodeGeneratorX86_64* codegen) {
    340   Location op1_loc = locations->InAt(0);
    341   Location op2_loc = locations->InAt(1);
    342   Location out_loc = locations->Out();
    343   XmmRegister out = out_loc.AsFpuRegister<XmmRegister>();
    344 
    345   // Shortcut for same input locations.
    346   if (op1_loc.Equals(op2_loc)) {
    347     DCHECK(out_loc.Equals(op1_loc));
    348     return;
    349   }
    350 
    351   //  (out := op1)
    352   //  out <=? op2
    353   //  if NaN jmp NaN_label
    354   //  if out is min jmp done
    355   //  if op2 is min jmp op2_label
    356   //  handle -0/+0
    357   //  jmp done
    358   // NaN_label:
    359   //  out := NaN
    360   // op2_label:
    361   //  out := op2
    362   // done:
    363   //
    364   // This removes one jmp, but needs to copy one input (op1) to out.
    365   //
    366   // TODO: This is straight from Quick. Make NaN an out-of-line slowpath?
    367 
    368   XmmRegister op2 = op2_loc.AsFpuRegister<XmmRegister>();
    369 
    370   NearLabel nan, done, op2_label;
    371   if (is_double) {
    372     __ ucomisd(out, op2);
    373   } else {
    374     __ ucomiss(out, op2);
    375   }
    376 
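          // ucomisd/ucomiss report an unordered result (some operand is NaN) by setting PF, so the
          // parity branch diverts NaN inputs to the NaN block below.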
    377   __ j(Condition::kParityEven, &nan);
    378 
    379   __ j(is_min ? Condition::kAbove : Condition::kBelow, &op2_label);
    380   __ j(is_min ? Condition::kBelow : Condition::kAbove, &done);
    381 
    382   // Handle 0.0/-0.0.
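          // The inputs compared equal here, so they can differ only in the sign bit (+0.0 vs -0.0):
          // OR propagates a negative sign (min(+0.0, -0.0) is -0.0), while AND keeps it only if
          // both are negative zero (max(+0.0, -0.0) is +0.0).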
    383   if (is_min) {
    384     if (is_double) {
    385       __ orpd(out, op2);
    386     } else {
    387       __ orps(out, op2);
    388     }
    389   } else {
    390     if (is_double) {
    391       __ andpd(out, op2);
    392     } else {
    393       __ andps(out, op2);
    394     }
    395   }
    396   __ jmp(&done);
    397 
    398   // NaN handling.
    399   __ Bind(&nan);
    400   if (is_double) {
    401     __ movsd(out, codegen->LiteralInt64Address(INT64_C(0x7FF8000000000000)));
    402   } else {
    403     __ movss(out, codegen->LiteralInt32Address(INT32_C(0x7FC00000)));
    404   }
    405   __ jmp(&done);
    406 
    407   // out := op2;
    408   __ Bind(&op2_label);
    409   if (is_double) {
    410     __ movsd(out, op2);
    411   } else {
    412     __ movss(out, op2);
    413   }
    414 
    415   // Done.
    416   __ Bind(&done);
    417 }
    418 
    419 static void CreateFPFPToFP(ArenaAllocator* allocator, HInvoke* invoke) {
    420   LocationSummary* locations =
    421       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    422   locations->SetInAt(0, Location::RequiresFpuRegister());
    423   locations->SetInAt(1, Location::RequiresFpuRegister());
    424   // The following is sub-optimal, but it is all we can do for now. It would also be fine to
    425   // accept the second input as the output (we could simply swap the inputs).
    426   locations->SetOut(Location::SameAsFirstInput());
    427 }
    428 
    429 void IntrinsicLocationsBuilderX86_64::VisitMathMinDoubleDouble(HInvoke* invoke) {
    430   CreateFPFPToFP(allocator_, invoke);
    431 }
    432 
    433 void IntrinsicCodeGeneratorX86_64::VisitMathMinDoubleDouble(HInvoke* invoke) {
    434   GenMinMaxFP(
    435       invoke->GetLocations(), /* is_min */ true, /* is_double */ true, GetAssembler(), codegen_);
    436 }
    437 
    438 void IntrinsicLocationsBuilderX86_64::VisitMathMinFloatFloat(HInvoke* invoke) {
    439   CreateFPFPToFP(allocator_, invoke);
    440 }
    441 
    442 void IntrinsicCodeGeneratorX86_64::VisitMathMinFloatFloat(HInvoke* invoke) {
    443   GenMinMaxFP(
    444       invoke->GetLocations(), /* is_min */ true, /* is_double */ false, GetAssembler(), codegen_);
    445 }
    446 
    447 void IntrinsicLocationsBuilderX86_64::VisitMathMaxDoubleDouble(HInvoke* invoke) {
    448   CreateFPFPToFP(allocator_, invoke);
    449 }
    450 
    451 void IntrinsicCodeGeneratorX86_64::VisitMathMaxDoubleDouble(HInvoke* invoke) {
    452   GenMinMaxFP(
    453       invoke->GetLocations(), /* is_min */ false, /* is_double */ true, GetAssembler(), codegen_);
    454 }
    455 
    456 void IntrinsicLocationsBuilderX86_64::VisitMathMaxFloatFloat(HInvoke* invoke) {
    457   CreateFPFPToFP(allocator_, invoke);
    458 }
    459 
    460 void IntrinsicCodeGeneratorX86_64::VisitMathMaxFloatFloat(HInvoke* invoke) {
    461   GenMinMaxFP(
    462       invoke->GetLocations(), /* is_min */ false, /* is_double */ false, GetAssembler(), codegen_);
    463 }
    464 
    465 static void GenMinMax(LocationSummary* locations, bool is_min, bool is_long,
    466                       X86_64Assembler* assembler) {
    467   Location op1_loc = locations->InAt(0);
    468   Location op2_loc = locations->InAt(1);
    469 
    470   // Shortcut for same input locations.
    471   if (op1_loc.Equals(op2_loc)) {
    472     // Can return immediately, as op1_loc == out_loc.
    473     // Note: if we ever support separate registers, e.g., output into memory, we need to check for
    474     //       a copy here.
    475     DCHECK(locations->Out().Equals(op1_loc));
    476     return;
    477   }
    478 
    479   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
    480   CpuRegister op2 = op2_loc.AsRegister<CpuRegister>();
    481 
    482   //  (out := op1)
    483   //  out <=? op2
    484   //  if out is min jmp done
    485   //  out := op2
    486   // done:
    487 
    488   if (is_long) {
    489     __ cmpq(out, op2);
    490   } else {
    491     __ cmpl(out, op2);
    492   }
    493 
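          // out already holds op1; conditionally overwrite it with op2 when op1 is not the desired
          // extremum (op1 > op2 for min, op1 < op2 for max).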
    494   __ cmov(is_min ? Condition::kGreater : Condition::kLess, out, op2, is_long);
    495 }
    496 
    497 static void CreateIntIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
    498   LocationSummary* locations =
    499       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    500   locations->SetInAt(0, Location::RequiresRegister());
    501   locations->SetInAt(1, Location::RequiresRegister());
    502   locations->SetOut(Location::SameAsFirstInput());
    503 }
    504 
    505 void IntrinsicLocationsBuilderX86_64::VisitMathMinIntInt(HInvoke* invoke) {
    506   CreateIntIntToIntLocations(allocator_, invoke);
    507 }
    508 
    509 void IntrinsicCodeGeneratorX86_64::VisitMathMinIntInt(HInvoke* invoke) {
    510   GenMinMax(invoke->GetLocations(), /* is_min */ true, /* is_long */ false, GetAssembler());
    511 }
    512 
    513 void IntrinsicLocationsBuilderX86_64::VisitMathMinLongLong(HInvoke* invoke) {
    514   CreateIntIntToIntLocations(allocator_, invoke);
    515 }
    516 
    517 void IntrinsicCodeGeneratorX86_64::VisitMathMinLongLong(HInvoke* invoke) {
    518   GenMinMax(invoke->GetLocations(), /* is_min */ true, /* is_long */ true, GetAssembler());
    519 }
    520 
    521 void IntrinsicLocationsBuilderX86_64::VisitMathMaxIntInt(HInvoke* invoke) {
    522   CreateIntIntToIntLocations(allocator_, invoke);
    523 }
    524 
    525 void IntrinsicCodeGeneratorX86_64::VisitMathMaxIntInt(HInvoke* invoke) {
    526   GenMinMax(invoke->GetLocations(), /* is_min */ false, /* is_long */ false, GetAssembler());
    527 }
    528 
    529 void IntrinsicLocationsBuilderX86_64::VisitMathMaxLongLong(HInvoke* invoke) {
    530   CreateIntIntToIntLocations(allocator_, invoke);
    531 }
    532 
    533 void IntrinsicCodeGeneratorX86_64::VisitMathMaxLongLong(HInvoke* invoke) {
    534   GenMinMax(invoke->GetLocations(), /* is_min */ false, /* is_long */ true, GetAssembler());
    535 }
    536 
    537 static void CreateFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) {
    538   LocationSummary* locations =
    539       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    540   locations->SetInAt(0, Location::RequiresFpuRegister());
    541   locations->SetOut(Location::RequiresFpuRegister());
    542 }
    543 
    544 void IntrinsicLocationsBuilderX86_64::VisitMathSqrt(HInvoke* invoke) {
    545   CreateFPToFPLocations(allocator_, invoke);
    546 }
    547 
    548 void IntrinsicCodeGeneratorX86_64::VisitMathSqrt(HInvoke* invoke) {
    549   LocationSummary* locations = invoke->GetLocations();
    550   XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
    551   XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
    552 
    553   GetAssembler()->sqrtsd(out, in);
    554 }
    555 
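        // Note: used when an intrinsic's locations fall back to a call (e.g. no SSE4.1 support);
        // this shuffles the arguments into the managed calling convention and invokes the original
        // method out of line.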
    556 static void InvokeOutOfLineIntrinsic(CodeGeneratorX86_64* codegen, HInvoke* invoke) {
    557   MoveArguments(invoke, codegen);
    558 
    559   DCHECK(invoke->IsInvokeStaticOrDirect());
    560   codegen->GenerateStaticOrDirectCall(
    561       invoke->AsInvokeStaticOrDirect(), Location::RegisterLocation(RDI));
    562 
    563   // Copy the result back to the expected output.
    564   Location out = invoke->GetLocations()->Out();
    565   if (out.IsValid()) {
    566     DCHECK(out.IsRegister());
    567     codegen->MoveFromReturnRegister(out, invoke->GetType());
    568   }
    569 }
    570 
    571 static void CreateSSE41FPToFPLocations(ArenaAllocator* allocator,
    572                                        HInvoke* invoke,
    573                                        CodeGeneratorX86_64* codegen) {
    574   // Do we have instruction support?
    575   if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
    576     CreateFPToFPLocations(allocator, invoke);
    577     return;
    578   }
    579 
    580   // We have to fall back to a call to the intrinsic.
    581   LocationSummary* locations =
    582       new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly);
    583   InvokeRuntimeCallingConvention calling_convention;
    584   locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
    585   locations->SetOut(Location::FpuRegisterLocation(XMM0));
    586   // Needs to be RDI for the invoke.
    587   locations->AddTemp(Location::RegisterLocation(RDI));
    588 }
    589 
    590 static void GenSSE41FPToFPIntrinsic(CodeGeneratorX86_64* codegen,
    591                                    HInvoke* invoke,
    592                                    X86_64Assembler* assembler,
    593                                    int round_mode) {
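          // round_mode is the SSE4.1 rounding immediate: 0 = round to nearest even (rint),
          // 1 = round toward negative infinity (floor), 2 = round toward positive infinity (ceil).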
    594   LocationSummary* locations = invoke->GetLocations();
    595   if (locations->WillCall()) {
    596     InvokeOutOfLineIntrinsic(codegen, invoke);
    597   } else {
    598     XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
    599     XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
    600     __ roundsd(out, in, Immediate(round_mode));
    601   }
    602 }
    603 
    604 void IntrinsicLocationsBuilderX86_64::VisitMathCeil(HInvoke* invoke) {
    605   CreateSSE41FPToFPLocations(allocator_, invoke, codegen_);
    606 }
    607 
    608 void IntrinsicCodeGeneratorX86_64::VisitMathCeil(HInvoke* invoke) {
    609   GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 2);
    610 }
    611 
    612 void IntrinsicLocationsBuilderX86_64::VisitMathFloor(HInvoke* invoke) {
    613   CreateSSE41FPToFPLocations(allocator_, invoke, codegen_);
    614 }
    615 
    616 void IntrinsicCodeGeneratorX86_64::VisitMathFloor(HInvoke* invoke) {
    617   GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 1);
    618 }
    619 
    620 void IntrinsicLocationsBuilderX86_64::VisitMathRint(HInvoke* invoke) {
    621   CreateSSE41FPToFPLocations(allocator_, invoke, codegen_);
    622 }
    623 
    624 void IntrinsicCodeGeneratorX86_64::VisitMathRint(HInvoke* invoke) {
    625   GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 0);
    626 }
    627 
    628 static void CreateSSE41FPToIntLocations(ArenaAllocator* allocator,
    629                                         HInvoke* invoke,
    630                                         CodeGeneratorX86_64* codegen) {
    631   // Do we have instruction support?
    632   if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
    633     LocationSummary* locations =
    634         new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    635     locations->SetInAt(0, Location::RequiresFpuRegister());
    636     locations->SetOut(Location::RequiresRegister());
    637     locations->AddTemp(Location::RequiresFpuRegister());
    638     locations->AddTemp(Location::RequiresFpuRegister());
    639     return;
    640   }
    641 
    642   // We have to fall back to a call to the intrinsic.
    643   LocationSummary* locations =
    644       new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly);
    645   InvokeRuntimeCallingConvention calling_convention;
    646   locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
    647   locations->SetOut(Location::RegisterLocation(RAX));
    648   // Needs to be RDI for the invoke.
    649   locations->AddTemp(Location::RegisterLocation(RDI));
    650 }
    651 
    652 void IntrinsicLocationsBuilderX86_64::VisitMathRoundFloat(HInvoke* invoke) {
    653   CreateSSE41FPToIntLocations(allocator_, invoke, codegen_);
    654 }
    655 
    656 void IntrinsicCodeGeneratorX86_64::VisitMathRoundFloat(HInvoke* invoke) {
    657   LocationSummary* locations = invoke->GetLocations();
    658   if (locations->WillCall()) {
    659     InvokeOutOfLineIntrinsic(codegen_, invoke);
    660     return;
    661   }
    662 
    663   XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
    664   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
    665   XmmRegister t1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
    666   XmmRegister t2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
    667   NearLabel skip_incr, done;
    668   X86_64Assembler* assembler = GetAssembler();
    669 
    670   // Since no direct x86 rounding instruction matches the required semantics,
    671   // this intrinsic is implemented as follows:
    672   //  result = floor(in);
    673   //  if (in - result >= 0.5f)
    674   //    result = result + 1.0f;
    675   __ movss(t2, in);
    676   __ roundss(t1, in, Immediate(1));
    677   __ subss(t2, t1);
    678   __ comiss(t2, codegen_->LiteralFloatAddress(0.5f));
    679   __ j(kBelow, &skip_incr);
    680   __ addss(t1, codegen_->LiteralFloatAddress(1.0f));
    681   __ Bind(&skip_incr);
    682 
    683   // Final conversion to an integer. Unfortunately this also does not have a
    684   // direct x86 instruction, since NaN should map to 0 and large positive
    685   // values need to be clipped to the extreme value.
    686   codegen_->Load32BitValue(out, kPrimIntMax);
    687   __ cvtsi2ss(t2, out);
    688   __ comiss(t1, t2);
    689   __ j(kAboveEqual, &done);  // clipped to max (already in out), does not jump on unordered
    690   __ movl(out, Immediate(0));  // does not change flags
    691   __ j(kUnordered, &done);  // NaN mapped to 0 (just moved in out)
    692   __ cvttss2si(out, t1);
    693   __ Bind(&done);
    694 }
    695 
    696 void IntrinsicLocationsBuilderX86_64::VisitMathRoundDouble(HInvoke* invoke) {
    697   CreateSSE41FPToIntLocations(allocator_, invoke, codegen_);
    698 }
    699 
    700 void IntrinsicCodeGeneratorX86_64::VisitMathRoundDouble(HInvoke* invoke) {
    701   LocationSummary* locations = invoke->GetLocations();
    702   if (locations->WillCall()) {
    703     InvokeOutOfLineIntrinsic(codegen_, invoke);
    704     return;
    705   }
    706 
    707   XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
    708   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
    709   XmmRegister t1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
    710   XmmRegister t2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
    711   NearLabel skip_incr, done;
    712   X86_64Assembler* assembler = GetAssembler();
    713 
    714   // Since no direct x86 rounding instruction matches the required semantics,
    715   // this intrinsic is implemented as follows:
    716   //  result = floor(in);
    717   //  if (in - result >= 0.5)
    718   //    result = result + 1.0;
    719   __ movsd(t2, in);
    720   __ roundsd(t1, in, Immediate(1));
    721   __ subsd(t2, t1);
    722   __ comisd(t2, codegen_->LiteralDoubleAddress(0.5));
    723   __ j(kBelow, &skip_incr);
    724   __ addsd(t1, codegen_->LiteralDoubleAddress(1.0));
    725   __ Bind(&skip_incr);
    726 
    727   // Final conversion to an integer. Unfortunately this also does not have a
    728   // direct x86 instruction, since NaN should map to 0 and large positive
    729   // values need to be clipped to the extreme value.
    730   codegen_->Load64BitValue(out, kPrimLongMax);
    731   __ cvtsi2sd(t2, out, /* is64bit */ true);
    732   __ comisd(t1, t2);
    733   __ j(kAboveEqual, &done);  // clipped to max (already in out), does not jump on unordered
    734   __ movl(out, Immediate(0));  // does not change flags, implicit zero extension to 64-bit
    735   __ j(kUnordered, &done);  // NaN mapped to 0 (just moved in out)
    736   __ cvttsd2si(out, t1, /* is64bit */ true);
    737   __ Bind(&done);
    738 }
    739 
    740 static void CreateFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
    741   LocationSummary* locations =
    742       new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
    743   InvokeRuntimeCallingConvention calling_convention;
    744   locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
    745   locations->SetOut(Location::FpuRegisterLocation(XMM0));
    746 
    747   // We have to ensure that the native code doesn't clobber the XMM registers which are
    748   // non-volatile for ART, but volatile for Native calls.  This will ensure that they are
    749   // saved in the prologue and properly restored.
    750   for (FloatRegister fp_reg : non_volatile_xmm_regs) {
    751     locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
    752   }
    753 }
    754 
    755 static void GenFPToFPCall(HInvoke* invoke, CodeGeneratorX86_64* codegen,
    756                           QuickEntrypointEnum entry) {
    757   LocationSummary* locations = invoke->GetLocations();
    758   DCHECK(locations->WillCall());
    759   DCHECK(invoke->IsInvokeStaticOrDirect());
    760 
    761   codegen->InvokeRuntime(entry, invoke, invoke->GetDexPc());
    762 }
    763 
    764 void IntrinsicLocationsBuilderX86_64::VisitMathCos(HInvoke* invoke) {
    765   CreateFPToFPCallLocations(allocator_, invoke);
    766 }
    767 
    768 void IntrinsicCodeGeneratorX86_64::VisitMathCos(HInvoke* invoke) {
    769   GenFPToFPCall(invoke, codegen_, kQuickCos);
    770 }
    771 
    772 void IntrinsicLocationsBuilderX86_64::VisitMathSin(HInvoke* invoke) {
    773   CreateFPToFPCallLocations(allocator_, invoke);
    774 }
    775 
    776 void IntrinsicCodeGeneratorX86_64::VisitMathSin(HInvoke* invoke) {
    777   GenFPToFPCall(invoke, codegen_, kQuickSin);
    778 }
    779 
    780 void IntrinsicLocationsBuilderX86_64::VisitMathAcos(HInvoke* invoke) {
    781   CreateFPToFPCallLocations(allocator_, invoke);
    782 }
    783 
    784 void IntrinsicCodeGeneratorX86_64::VisitMathAcos(HInvoke* invoke) {
    785   GenFPToFPCall(invoke, codegen_, kQuickAcos);
    786 }
    787 
    788 void IntrinsicLocationsBuilderX86_64::VisitMathAsin(HInvoke* invoke) {
    789   CreateFPToFPCallLocations(allocator_, invoke);
    790 }
    791 
    792 void IntrinsicCodeGeneratorX86_64::VisitMathAsin(HInvoke* invoke) {
    793   GenFPToFPCall(invoke, codegen_, kQuickAsin);
    794 }
    795 
    796 void IntrinsicLocationsBuilderX86_64::VisitMathAtan(HInvoke* invoke) {
    797   CreateFPToFPCallLocations(allocator_, invoke);
    798 }
    799 
    800 void IntrinsicCodeGeneratorX86_64::VisitMathAtan(HInvoke* invoke) {
    801   GenFPToFPCall(invoke, codegen_, kQuickAtan);
    802 }
    803 
    804 void IntrinsicLocationsBuilderX86_64::VisitMathCbrt(HInvoke* invoke) {
    805   CreateFPToFPCallLocations(allocator_, invoke);
    806 }
    807 
    808 void IntrinsicCodeGeneratorX86_64::VisitMathCbrt(HInvoke* invoke) {
    809   GenFPToFPCall(invoke, codegen_, kQuickCbrt);
    810 }
    811 
    812 void IntrinsicLocationsBuilderX86_64::VisitMathCosh(HInvoke* invoke) {
    813   CreateFPToFPCallLocations(allocator_, invoke);
    814 }
    815 
    816 void IntrinsicCodeGeneratorX86_64::VisitMathCosh(HInvoke* invoke) {
    817   GenFPToFPCall(invoke, codegen_, kQuickCosh);
    818 }
    819 
    820 void IntrinsicLocationsBuilderX86_64::VisitMathExp(HInvoke* invoke) {
    821   CreateFPToFPCallLocations(allocator_, invoke);
    822 }
    823 
    824 void IntrinsicCodeGeneratorX86_64::VisitMathExp(HInvoke* invoke) {
    825   GenFPToFPCall(invoke, codegen_, kQuickExp);
    826 }
    827 
    828 void IntrinsicLocationsBuilderX86_64::VisitMathExpm1(HInvoke* invoke) {
    829   CreateFPToFPCallLocations(allocator_, invoke);
    830 }
    831 
    832 void IntrinsicCodeGeneratorX86_64::VisitMathExpm1(HInvoke* invoke) {
    833   GenFPToFPCall(invoke, codegen_, kQuickExpm1);
    834 }
    835 
    836 void IntrinsicLocationsBuilderX86_64::VisitMathLog(HInvoke* invoke) {
    837   CreateFPToFPCallLocations(allocator_, invoke);
    838 }
    839 
    840 void IntrinsicCodeGeneratorX86_64::VisitMathLog(HInvoke* invoke) {
    841   GenFPToFPCall(invoke, codegen_, kQuickLog);
    842 }
    843 
    844 void IntrinsicLocationsBuilderX86_64::VisitMathLog10(HInvoke* invoke) {
    845   CreateFPToFPCallLocations(allocator_, invoke);
    846 }
    847 
    848 void IntrinsicCodeGeneratorX86_64::VisitMathLog10(HInvoke* invoke) {
    849   GenFPToFPCall(invoke, codegen_, kQuickLog10);
    850 }
    851 
    852 void IntrinsicLocationsBuilderX86_64::VisitMathSinh(HInvoke* invoke) {
    853   CreateFPToFPCallLocations(allocator_, invoke);
    854 }
    855 
    856 void IntrinsicCodeGeneratorX86_64::VisitMathSinh(HInvoke* invoke) {
    857   GenFPToFPCall(invoke, codegen_, kQuickSinh);
    858 }
    859 
    860 void IntrinsicLocationsBuilderX86_64::VisitMathTan(HInvoke* invoke) {
    861   CreateFPToFPCallLocations(allocator_, invoke);
    862 }
    863 
    864 void IntrinsicCodeGeneratorX86_64::VisitMathTan(HInvoke* invoke) {
    865   GenFPToFPCall(invoke, codegen_, kQuickTan);
    866 }
    867 
    868 void IntrinsicLocationsBuilderX86_64::VisitMathTanh(HInvoke* invoke) {
    869   CreateFPToFPCallLocations(allocator_, invoke);
    870 }
    871 
    872 void IntrinsicCodeGeneratorX86_64::VisitMathTanh(HInvoke* invoke) {
    873   GenFPToFPCall(invoke, codegen_, kQuickTanh);
    874 }
    875 
    876 static void CreateFPFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
    877   LocationSummary* locations =
    878       new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
    879   InvokeRuntimeCallingConvention calling_convention;
    880   locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
    881   locations->SetInAt(1, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(1)));
    882   locations->SetOut(Location::FpuRegisterLocation(XMM0));
    883 
    884   // We have to ensure that the native code doesn't clobber the XMM registers which are
    885   // non-volatile for ART, but volatile for Native calls.  This will ensure that they are
    886   // saved in the prologue and properly restored.
    887   for (FloatRegister fp_reg : non_volatile_xmm_regs) {
    888     locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
    889   }
    890 }
    891 
    892 void IntrinsicLocationsBuilderX86_64::VisitMathAtan2(HInvoke* invoke) {
    893   CreateFPFPToFPCallLocations(allocator_, invoke);
    894 }
    895 
    896 void IntrinsicCodeGeneratorX86_64::VisitMathAtan2(HInvoke* invoke) {
    897   GenFPToFPCall(invoke, codegen_, kQuickAtan2);
    898 }
    899 
    900 void IntrinsicLocationsBuilderX86_64::VisitMathPow(HInvoke* invoke) {
    901   CreateFPFPToFPCallLocations(allocator_, invoke);
    902 }
    903 
    904 void IntrinsicCodeGeneratorX86_64::VisitMathPow(HInvoke* invoke) {
    905   GenFPToFPCall(invoke, codegen_, kQuickPow);
    906 }
    907 
    908 void IntrinsicLocationsBuilderX86_64::VisitMathHypot(HInvoke* invoke) {
    909   CreateFPFPToFPCallLocations(allocator_, invoke);
    910 }
    911 
    912 void IntrinsicCodeGeneratorX86_64::VisitMathHypot(HInvoke* invoke) {
    913   GenFPToFPCall(invoke, codegen_, kQuickHypot);
    914 }
    915 
    916 void IntrinsicLocationsBuilderX86_64::VisitMathNextAfter(HInvoke* invoke) {
    917   CreateFPFPToFPCallLocations(allocator_, invoke);
    918 }
    919 
    920 void IntrinsicCodeGeneratorX86_64::VisitMathNextAfter(HInvoke* invoke) {
    921   GenFPToFPCall(invoke, codegen_, kQuickNextAfter);
    922 }
    923 
    924 void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) {
    925   // Check for known failures that will force us to bail out to the runtime; in that case,
    926   // just generate the runtime call directly.
    927   HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstant();
    928   HIntConstant* dest_pos = invoke->InputAt(3)->AsIntConstant();
    929 
    930   // The positions must be non-negative.
    931   if ((src_pos != nullptr && src_pos->GetValue() < 0) ||
    932       (dest_pos != nullptr && dest_pos->GetValue() < 0)) {
    933     // We will have to fail anyway.
    934     return;
    935   }
    936 
    937   // The length must be >= 0.
    938   HIntConstant* length = invoke->InputAt(4)->AsIntConstant();
    939   if (length != nullptr) {
    940     int32_t len = length->GetValue();
    941     if (len < 0) {
    942       // Just call as normal.
    943       return;
    944     }
    945   }
    946 
    947   LocationSummary* locations =
    948       new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
    949   // arraycopy(Object src, int src_pos, Object dest, int dest_pos, int length).
    950   locations->SetInAt(0, Location::RequiresRegister());
    951   locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1)));
    952   locations->SetInAt(2, Location::RequiresRegister());
    953   locations->SetInAt(3, Location::RegisterOrConstant(invoke->InputAt(3)));
    954   locations->SetInAt(4, Location::RegisterOrConstant(invoke->InputAt(4)));
    955 
    956   // And we need some temporaries.  We will use REP MOVSW, so we need fixed registers.
    957   locations->AddTemp(Location::RegisterLocation(RSI));
    958   locations->AddTemp(Location::RegisterLocation(RDI));
    959   locations->AddTemp(Location::RegisterLocation(RCX));
    960 }
    961 
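        // Emits the range checks for one side of an arraycopy: pos >= 0, pos <= length(input) and
        // length(input) - pos >= length, jumping to `slow_path` otherwise (checks that are
        // statically known to hold are omitted). When `length_is_input_length` is set, the copied
        // length equals length(input), so only pos == 0 can pass.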
    962 static void CheckPosition(X86_64Assembler* assembler,
    963                           Location pos,
    964                           CpuRegister input,
    965                           Location length,
    966                           SlowPathCode* slow_path,
    967                           CpuRegister temp,
    968                           bool length_is_input_length = false) {
    969   // Offset of the length field within the Array object.
    970   const uint32_t length_offset = mirror::Array::LengthOffset().Uint32Value();
    971 
    972   if (pos.IsConstant()) {
    973     int32_t pos_const = pos.GetConstant()->AsIntConstant()->GetValue();
    974     if (pos_const == 0) {
    975       if (!length_is_input_length) {
    976         // Check that length(input) >= length.
    977         if (length.IsConstant()) {
    978           __ cmpl(Address(input, length_offset),
    979                   Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
    980         } else {
    981           __ cmpl(Address(input, length_offset), length.AsRegister<CpuRegister>());
    982         }
    983         __ j(kLess, slow_path->GetEntryLabel());
    984       }
    985     } else {
    986       // Check that length(input) >= pos.
    987       __ movl(temp, Address(input, length_offset));
    988       __ subl(temp, Immediate(pos_const));
    989       __ j(kLess, slow_path->GetEntryLabel());
    990 
    991       // Check that (length(input) - pos) >= length.
    992       if (length.IsConstant()) {
    993         __ cmpl(temp, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
    994       } else {
    995         __ cmpl(temp, length.AsRegister<CpuRegister>());
    996       }
    997       __ j(kLess, slow_path->GetEntryLabel());
    998     }
    999   } else if (length_is_input_length) {
   1000     // The only way the copy can succeed is if pos is zero.
   1001     CpuRegister pos_reg = pos.AsRegister<CpuRegister>();
   1002     __ testl(pos_reg, pos_reg);
   1003     __ j(kNotEqual, slow_path->GetEntryLabel());
   1004   } else {
   1005     // Check that pos >= 0.
   1006     CpuRegister pos_reg = pos.AsRegister<CpuRegister>();
   1007     __ testl(pos_reg, pos_reg);
   1008     __ j(kLess, slow_path->GetEntryLabel());
   1009 
   1010     // Check that pos <= length(input).
   1011     __ cmpl(Address(input, length_offset), pos_reg);
   1012     __ j(kLess, slow_path->GetEntryLabel());
   1013 
   1014     // Check that (length(input) - pos) >= length.
   1015     __ movl(temp, Address(input, length_offset));
   1016     __ subl(temp, pos_reg);
   1017     if (length.IsConstant()) {
   1018       __ cmpl(temp, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
   1019     } else {
   1020       __ cmpl(temp, length.AsRegister<CpuRegister>());
   1021     }
   1022     __ j(kLess, slow_path->GetEntryLabel());
   1023   }
   1024 }
   1025 
   1026 void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) {
   1027   X86_64Assembler* assembler = GetAssembler();
   1028   LocationSummary* locations = invoke->GetLocations();
   1029 
   1030   CpuRegister src = locations->InAt(0).AsRegister<CpuRegister>();
   1031   Location src_pos = locations->InAt(1);
   1032   CpuRegister dest = locations->InAt(2).AsRegister<CpuRegister>();
   1033   Location dest_pos = locations->InAt(3);
   1034   Location length = locations->InAt(4);
   1035 
   1036   // Temporaries that we need for MOVSW.
   1037   CpuRegister src_base = locations->GetTemp(0).AsRegister<CpuRegister>();
   1038   DCHECK_EQ(src_base.AsRegister(), RSI);
   1039   CpuRegister dest_base = locations->GetTemp(1).AsRegister<CpuRegister>();
   1040   DCHECK_EQ(dest_base.AsRegister(), RDI);
   1041   CpuRegister count = locations->GetTemp(2).AsRegister<CpuRegister>();
   1042   DCHECK_EQ(count.AsRegister(), RCX);
   1043 
   1044   SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
   1045   codegen_->AddSlowPath(slow_path);
   1046 
   1047   // Bail out if the source and destination are the same.
   1048   __ cmpl(src, dest);
   1049   __ j(kEqual, slow_path->GetEntryLabel());
   1050 
   1051   // Bail out if the source is null.
   1052   __ testl(src, src);
   1053   __ j(kEqual, slow_path->GetEntryLabel());
   1054 
   1055   // Bail out if the destination is null.
   1056   __ testl(dest, dest);
   1057   __ j(kEqual, slow_path->GetEntryLabel());
   1058 
   1059   // If the length is negative, bail out.
   1060   // We have already checked in the LocationsBuilder for the constant case.
   1061   if (!length.IsConstant()) {
   1062     __ testl(length.AsRegister<CpuRegister>(), length.AsRegister<CpuRegister>());
   1063     __ j(kLess, slow_path->GetEntryLabel());
   1064   }
   1065 
   1066   // Validity checks: source. Use src_base as a temporary register.
   1067   CheckPosition(assembler, src_pos, src, length, slow_path, src_base);
   1068 
   1069   // Validity checks: dest. Use src_base as a temporary register.
   1070   CheckPosition(assembler, dest_pos, dest, length, slow_path, src_base);
   1071 
   1072   // We need the count in RCX.
   1073   if (length.IsConstant()) {
   1074     __ movl(count, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
   1075   } else {
   1076     __ movl(count, length.AsRegister<CpuRegister>());
   1077   }
   1078 
   1079   // Okay, everything checks out.  Finally time to do the copy.
   1080   // Check assumption that sizeof(Char) is 2 (used in scaling below).
   1081   const size_t char_size = DataType::Size(DataType::Type::kUint16);
   1082   DCHECK_EQ(char_size, 2u);
   1083 
   1084   const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value();
   1085 
   1086   if (src_pos.IsConstant()) {
   1087     int32_t src_pos_const = src_pos.GetConstant()->AsIntConstant()->GetValue();
   1088     __ leal(src_base, Address(src, char_size * src_pos_const + data_offset));
   1089   } else {
   1090     __ leal(src_base, Address(src, src_pos.AsRegister<CpuRegister>(),
   1091                               ScaleFactor::TIMES_2, data_offset));
   1092   }
   1093   if (dest_pos.IsConstant()) {
   1094     int32_t dest_pos_const = dest_pos.GetConstant()->AsIntConstant()->GetValue();
   1095     __ leal(dest_base, Address(dest, char_size * dest_pos_const + data_offset));
   1096   } else {
   1097     __ leal(dest_base, Address(dest, dest_pos.AsRegister<CpuRegister>(),
   1098                                ScaleFactor::TIMES_2, data_offset));
   1099   }
   1100 
   1101   // Do the move.
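          // rep movsw copies RCX 16-bit units from [RSI] to [RDI], advancing both pointers forward.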
   1102   __ rep_movsw();
   1103 
   1104   __ Bind(slow_path->GetExitLabel());
   1105 }
   1106 
   1107 
   1108 void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
   1109   // The only read barrier implementation supporting the
   1110   // SystemArrayCopy intrinsic is the Baker-style read barriers.
   1111   if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
   1112     return;
   1113   }
   1114 
   1115   CodeGenerator::CreateSystemArrayCopyLocationSummary(invoke);
   1116 }
   1117 
   1118 // Compute base source address, base destination address, and end
   1119 // source address for the System.arraycopy intrinsic in `src_base`,
   1120 // `dst_base` and `src_end` respectively.
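        // `src_end` is the exclusive end of the source region (src_base plus the copy length, in
        // elements); it serves as the termination bound for the copy loop and the read barrier
        // slow path.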
   1121 static void GenSystemArrayCopyAddresses(X86_64Assembler* assembler,
   1122                                         DataType::Type type,
   1123                                         const CpuRegister& src,
   1124                                         const Location& src_pos,
   1125                                         const CpuRegister& dst,
   1126                                         const Location& dst_pos,
   1127                                         const Location& copy_length,
   1128                                         const CpuRegister& src_base,
   1129                                         const CpuRegister& dst_base,
   1130                                         const CpuRegister& src_end) {
   1131   // This routine is only used by the SystemArrayCopy intrinsic.
   1132   DCHECK_EQ(type, DataType::Type::kReference);
   1133   const int32_t element_size = DataType::Size(type);
   1134   const ScaleFactor scale_factor = static_cast<ScaleFactor>(DataType::SizeShift(type));
   1135   const uint32_t data_offset = mirror::Array::DataOffset(element_size).Uint32Value();
   1136 
   1137   if (src_pos.IsConstant()) {
   1138     int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
   1139     __ leal(src_base, Address(src, element_size * constant + data_offset));
   1140   } else {
   1141     __ leal(src_base, Address(src, src_pos.AsRegister<CpuRegister>(), scale_factor, data_offset));
   1142   }
   1143 
   1144   if (dst_pos.IsConstant()) {
   1145     int32_t constant = dst_pos.GetConstant()->AsIntConstant()->GetValue();
   1146     __ leal(dst_base, Address(dst, element_size * constant + data_offset));
   1147   } else {
   1148     __ leal(dst_base, Address(dst, dst_pos.AsRegister<CpuRegister>(), scale_factor, data_offset));
   1149   }
   1150 
   1151   if (copy_length.IsConstant()) {
   1152     int32_t constant = copy_length.GetConstant()->AsIntConstant()->GetValue();
   1153     __ leal(src_end, Address(src_base, element_size * constant));
   1154   } else {
   1155     __ leal(src_end, Address(src_base, copy_length.AsRegister<CpuRegister>(), scale_factor, 0));
   1156   }
   1157 }
   1158 
   1159 void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
   1160   // The only read barrier implementation supporting the
   1161   // SystemArrayCopy intrinsic is the Baker-style read barriers.
   1162   DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
   1163 
   1164   X86_64Assembler* assembler = GetAssembler();
   1165   LocationSummary* locations = invoke->GetLocations();
   1166 
   1167   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   1168   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
   1169   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
   1170   uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
   1171   uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
   1172 
   1173   CpuRegister src = locations->InAt(0).AsRegister<CpuRegister>();
   1174   Location src_pos = locations->InAt(1);
   1175   CpuRegister dest = locations->InAt(2).AsRegister<CpuRegister>();
   1176   Location dest_pos = locations->InAt(3);
   1177   Location length = locations->InAt(4);
   1178   Location temp1_loc = locations->GetTemp(0);
   1179   CpuRegister temp1 = temp1_loc.AsRegister<CpuRegister>();
   1180   Location temp2_loc = locations->GetTemp(1);
   1181   CpuRegister temp2 = temp2_loc.AsRegister<CpuRegister>();
   1182   Location temp3_loc = locations->GetTemp(2);
   1183   CpuRegister temp3 = temp3_loc.AsRegister<CpuRegister>();
   1184   Location TMP_loc = Location::RegisterLocation(TMP);
   1185 
   1186   SlowPathCode* intrinsic_slow_path =
   1187       new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
   1188   codegen_->AddSlowPath(intrinsic_slow_path);
   1189 
   1190   NearLabel conditions_on_positions_validated;
   1191   SystemArrayCopyOptimizations optimizations(invoke);
   1192 
   1193   // If source and destination are the same, we go to the slow path when a plain forward
   1194   // copy would overwrite source elements before they are read (i.e. dest_pos > src_pos).
   1195   if (src_pos.IsConstant()) {
   1196     int32_t src_pos_constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
   1197     if (dest_pos.IsConstant()) {
   1198       int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
   1199       if (optimizations.GetDestinationIsSource()) {
   1200         // Checked when building locations.
   1201         DCHECK_GE(src_pos_constant, dest_pos_constant);
   1202       } else if (src_pos_constant < dest_pos_constant) {
   1203         __ cmpl(src, dest);
   1204         __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   1205       }
   1206     } else {
   1207       if (!optimizations.GetDestinationIsSource()) {
   1208         __ cmpl(src, dest);
   1209         __ j(kNotEqual, &conditions_on_positions_validated);
   1210       }
   1211       __ cmpl(dest_pos.AsRegister<CpuRegister>(), Immediate(src_pos_constant));
   1212       __ j(kGreater, intrinsic_slow_path->GetEntryLabel());
   1213     }
   1214   } else {
   1215     if (!optimizations.GetDestinationIsSource()) {
   1216       __ cmpl(src, dest);
   1217       __ j(kNotEqual, &conditions_on_positions_validated);
   1218     }
   1219     if (dest_pos.IsConstant()) {
   1220       int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
   1221       __ cmpl(src_pos.AsRegister<CpuRegister>(), Immediate(dest_pos_constant));
   1222       __ j(kLess, intrinsic_slow_path->GetEntryLabel());
   1223     } else {
   1224       __ cmpl(src_pos.AsRegister<CpuRegister>(), dest_pos.AsRegister<CpuRegister>());
   1225       __ j(kLess, intrinsic_slow_path->GetEntryLabel());
   1226     }
   1227   }
   1228 
   1229   __ Bind(&conditions_on_positions_validated);
   1230 
   1231   if (!optimizations.GetSourceIsNotNull()) {
   1232     // Bail out if the source is null.
   1233     __ testl(src, src);
   1234     __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   1235   }
   1236 
   1237   if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) {
   1238     // Bail out if the destination is null.
   1239     __ testl(dest, dest);
   1240     __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   1241   }
   1242 
   1243   // If the length is negative, bail out.
   1244   // We have already checked in the LocationsBuilder for the constant case.
   1245   if (!length.IsConstant() &&
   1246       !optimizations.GetCountIsSourceLength() &&
   1247       !optimizations.GetCountIsDestinationLength()) {
   1248     __ testl(length.AsRegister<CpuRegister>(), length.AsRegister<CpuRegister>());
   1249     __ j(kLess, intrinsic_slow_path->GetEntryLabel());
   1250   }
   1251 
   1252   // Validity checks: source.
   1253   CheckPosition(assembler,
   1254                 src_pos,
   1255                 src,
   1256                 length,
   1257                 intrinsic_slow_path,
   1258                 temp1,
   1259                 optimizations.GetCountIsSourceLength());
   1260 
   1261   // Validity checks: dest.
   1262   CheckPosition(assembler,
   1263                 dest_pos,
   1264                 dest,
   1265                 length,
   1266                 intrinsic_slow_path,
   1267                 temp1,
   1268                 optimizations.GetCountIsDestinationLength());
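  // Conceptually (sketch only; see the CheckPosition() helper earlier in this file for the
  // exact code emitted), each call above guards the equivalent of:
  //   if (pos < 0 || length > array.length - pos) goto slow path;
  // with individual checks elided where the compiler has already proven them.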
   1269 
   1270   if (!optimizations.GetDoesNotNeedTypeCheck()) {
   1271     // Check whether all elements of the source array are assignable to the component
   1272     // type of the destination array. We do two checks: the classes are the same,
   1273     // or the destination is Object[]. If none of these checks succeed, we go to the
   1274     // slow path.
   1275 
   1276     bool did_unpoison = false;
   1277     if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
   1278       // /* HeapReference<Class> */ temp1 = dest->klass_
   1279       codegen_->GenerateFieldLoadWithBakerReadBarrier(
   1280           invoke, temp1_loc, dest, class_offset, /* needs_null_check */ false);
   1281       // Register `temp1` is not trashed by the read barrier emitted
   1282       // by GenerateFieldLoadWithBakerReadBarrier below, as that
   1283       // method produces a call to a ReadBarrierMarkRegX entry point,
   1284       // which saves all potentially live registers, including
    1285       // temporaries such as `temp1`.
   1286       // /* HeapReference<Class> */ temp2 = src->klass_
   1287       codegen_->GenerateFieldLoadWithBakerReadBarrier(
   1288           invoke, temp2_loc, src, class_offset, /* needs_null_check */ false);
   1289       // If heap poisoning is enabled, `temp1` and `temp2` have been
    1290       // unpoisoned by the previous calls to
   1291       // GenerateFieldLoadWithBakerReadBarrier.
   1292     } else {
   1293       // /* HeapReference<Class> */ temp1 = dest->klass_
   1294       __ movl(temp1, Address(dest, class_offset));
   1295       // /* HeapReference<Class> */ temp2 = src->klass_
   1296       __ movl(temp2, Address(src, class_offset));
   1297       if (!optimizations.GetDestinationIsNonPrimitiveArray() ||
   1298           !optimizations.GetSourceIsNonPrimitiveArray()) {
   1299         // One or two of the references need to be unpoisoned. Unpoison them
   1300         // both to make the identity check valid.
   1301         __ MaybeUnpoisonHeapReference(temp1);
   1302         __ MaybeUnpoisonHeapReference(temp2);
   1303         did_unpoison = true;
   1304       }
   1305     }
   1306 
   1307     if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
    1308       // Bail out if the destination is not a non-primitive array.
   1309       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
   1310         // /* HeapReference<Class> */ TMP = temp1->component_type_
   1311         codegen_->GenerateFieldLoadWithBakerReadBarrier(
   1312             invoke, TMP_loc, temp1, component_offset, /* needs_null_check */ false);
   1313         __ testl(CpuRegister(TMP), CpuRegister(TMP));
   1314         __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   1315         // If heap poisoning is enabled, `TMP` has been unpoisoned by
    1316         // the previous call to GenerateFieldLoadWithBakerReadBarrier.
   1317       } else {
   1318         // /* HeapReference<Class> */ TMP = temp1->component_type_
   1319         __ movl(CpuRegister(TMP), Address(temp1, component_offset));
   1320         __ testl(CpuRegister(TMP), CpuRegister(TMP));
   1321         __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   1322         __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
   1323       }
   1324       __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
   1325       __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
   1326     }
   1327 
   1328     if (!optimizations.GetSourceIsNonPrimitiveArray()) {
    1329       // Bail out if the source is not a non-primitive array.
   1330       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
   1331         // For the same reason given earlier, `temp1` is not trashed by the
   1332         // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below.
   1333         // /* HeapReference<Class> */ TMP = temp2->component_type_
   1334         codegen_->GenerateFieldLoadWithBakerReadBarrier(
   1335             invoke, TMP_loc, temp2, component_offset, /* needs_null_check */ false);
   1336         __ testl(CpuRegister(TMP), CpuRegister(TMP));
   1337         __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   1338         // If heap poisoning is enabled, `TMP` has been unpoisoned by
    1339         // the previous call to GenerateFieldLoadWithBakerReadBarrier.
   1340       } else {
   1341         // /* HeapReference<Class> */ TMP = temp2->component_type_
   1342         __ movl(CpuRegister(TMP), Address(temp2, component_offset));
   1343         __ testl(CpuRegister(TMP), CpuRegister(TMP));
   1344         __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   1345         __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
   1346       }
   1347       __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
   1348       __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
   1349     }
   1350 
   1351     __ cmpl(temp1, temp2);
   1352 
   1353     if (optimizations.GetDestinationIsTypedObjectArray()) {
   1354       NearLabel do_copy;
   1355       __ j(kEqual, &do_copy);
   1356       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
   1357         // /* HeapReference<Class> */ temp1 = temp1->component_type_
   1358         codegen_->GenerateFieldLoadWithBakerReadBarrier(
   1359             invoke, temp1_loc, temp1, component_offset, /* needs_null_check */ false);
   1360         // We do not need to emit a read barrier for the following
   1361         // heap reference load, as `temp1` is only used in a
   1362         // comparison with null below, and this reference is not
   1363         // kept afterwards.
   1364         __ cmpl(Address(temp1, super_offset), Immediate(0));
   1365       } else {
   1366         if (!did_unpoison) {
   1367           __ MaybeUnpoisonHeapReference(temp1);
   1368         }
   1369         // /* HeapReference<Class> */ temp1 = temp1->component_type_
   1370         __ movl(temp1, Address(temp1, component_offset));
   1371         __ MaybeUnpoisonHeapReference(temp1);
   1372         // No need to unpoison the following heap reference load, as
   1373         // we're comparing against null.
   1374         __ cmpl(Address(temp1, super_offset), Immediate(0));
   1375       }
   1376       __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
   1377       __ Bind(&do_copy);
   1378     } else {
   1379       __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
   1380     }
   1381   } else if (!optimizations.GetSourceIsNonPrimitiveArray()) {
   1382     DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
    1383     // Bail out if the source is not a non-primitive array.
   1384     if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
   1385       // /* HeapReference<Class> */ temp1 = src->klass_
   1386       codegen_->GenerateFieldLoadWithBakerReadBarrier(
   1387           invoke, temp1_loc, src, class_offset, /* needs_null_check */ false);
   1388       // /* HeapReference<Class> */ TMP = temp1->component_type_
   1389       codegen_->GenerateFieldLoadWithBakerReadBarrier(
   1390           invoke, TMP_loc, temp1, component_offset, /* needs_null_check */ false);
   1391       __ testl(CpuRegister(TMP), CpuRegister(TMP));
   1392       __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   1393     } else {
   1394       // /* HeapReference<Class> */ temp1 = src->klass_
   1395       __ movl(temp1, Address(src, class_offset));
   1396       __ MaybeUnpoisonHeapReference(temp1);
   1397       // /* HeapReference<Class> */ TMP = temp1->component_type_
   1398       __ movl(CpuRegister(TMP), Address(temp1, component_offset));
   1399       // No need to unpoison `TMP` now, as we're comparing against null.
   1400       __ testl(CpuRegister(TMP), CpuRegister(TMP));
   1401       __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   1402       __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
   1403     }
   1404     __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
   1405     __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
   1406   }
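  // In Java-level terms (sketch only, not generated code): when a full type check is
  // needed, the copy loop below is reached only if dest.getClass() == src.getClass(), or
  // if dest was proven to be a typed object array whose component type is java.lang.Object
  // (an Object[] accepts any reference). When no full type check is needed, the code above
  // merely verifies that the source is a non-primitive (reference) array.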
   1407 
   1408   const DataType::Type type = DataType::Type::kReference;
   1409   const int32_t element_size = DataType::Size(type);
   1410 
   1411   // Compute base source address, base destination address, and end
   1412   // source address in `temp1`, `temp2` and `temp3` respectively.
   1413   GenSystemArrayCopyAddresses(
   1414       GetAssembler(), type, src, src_pos, dest, dest_pos, length, temp1, temp2, temp3);
   1415 
   1416   if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
   1417     // SystemArrayCopy implementation for Baker read barriers (see
   1418     // also CodeGeneratorX86_64::GenerateReferenceLoadWithBakerReadBarrier):
   1419     //
   1420     //   if (src_ptr != end_ptr) {
    1421     //     uint32_t rb_state = LockWord(src->monitor_).ReadBarrierState();
   1422     //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
   1423     //     bool is_gray = (rb_state == ReadBarrier::GrayState());
   1424     //     if (is_gray) {
   1425     //       // Slow-path copy.
   1426     //       do {
   1427     //         *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++)));
   1428     //       } while (src_ptr != end_ptr)
   1429     //     } else {
   1430     //       // Fast-path copy.
   1431     //       do {
   1432     //         *dest_ptr++ = *src_ptr++;
   1433     //       } while (src_ptr != end_ptr)
   1434     //     }
   1435     //   }
   1436 
   1437     NearLabel loop, done;
   1438 
   1439     // Don't enter copy loop if `length == 0`.
   1440     __ cmpl(temp1, temp3);
   1441     __ j(kEqual, &done);
   1442 
   1443     // Given the numeric representation, it's enough to check the low bit of the rb_state.
   1444     static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
   1445     static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
   1446     constexpr uint32_t gray_byte_position = LockWord::kReadBarrierStateShift / kBitsPerByte;
   1447     constexpr uint32_t gray_bit_position = LockWord::kReadBarrierStateShift % kBitsPerByte;
   1448     constexpr int32_t test_value = static_cast<int8_t>(1 << gray_bit_position);
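    // Worked example (illustrative values only): if LockWord::kReadBarrierStateShift were 28,
    // then gray_byte_position == 3, gray_bit_position == 4 and test_value == 0x10, so the
    // testb below would test bit 4 of byte 3 of the monitor word.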
   1449 
   1450     // if (rb_state == ReadBarrier::GrayState())
   1451     //   goto slow_path;
   1452     // At this point, just do the "if" and make sure that flags are preserved until the branch.
   1453     __ testb(Address(src, monitor_offset + gray_byte_position), Immediate(test_value));
   1454 
   1455     // Load fence to prevent load-load reordering.
   1456     // Note that this is a no-op, thanks to the x86-64 memory model.
   1457     codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny);
   1458 
   1459     // Slow path used to copy array when `src` is gray.
   1460     SlowPathCode* read_barrier_slow_path =
   1461         new (codegen_->GetScopedAllocator()) ReadBarrierSystemArrayCopySlowPathX86_64(invoke);
   1462     codegen_->AddSlowPath(read_barrier_slow_path);
   1463 
    1464     // We have done the "if" of the gray bit check above; now branch based on the flags.
   1465     __ j(kNotZero, read_barrier_slow_path->GetEntryLabel());
   1466 
   1467     // Fast-path copy.
   1468     // Iterate over the arrays and do a raw copy of the objects. We don't need to
   1469     // poison/unpoison.
   1470     __ Bind(&loop);
   1471     __ movl(CpuRegister(TMP), Address(temp1, 0));
   1472     __ movl(Address(temp2, 0), CpuRegister(TMP));
   1473     __ addl(temp1, Immediate(element_size));
   1474     __ addl(temp2, Immediate(element_size));
   1475     __ cmpl(temp1, temp3);
   1476     __ j(kNotEqual, &loop);
   1477 
   1478     __ Bind(read_barrier_slow_path->GetExitLabel());
   1479     __ Bind(&done);
   1480   } else {
   1481     // Non read barrier code.
   1482 
   1483     // Iterate over the arrays and do a raw copy of the objects. We don't need to
   1484     // poison/unpoison.
   1485     NearLabel loop, done;
   1486     __ cmpl(temp1, temp3);
   1487     __ j(kEqual, &done);
   1488     __ Bind(&loop);
   1489     __ movl(CpuRegister(TMP), Address(temp1, 0));
   1490     __ movl(Address(temp2, 0), CpuRegister(TMP));
   1491     __ addl(temp1, Immediate(element_size));
   1492     __ addl(temp2, Immediate(element_size));
   1493     __ cmpl(temp1, temp3);
   1494     __ j(kNotEqual, &loop);
   1495     __ Bind(&done);
   1496   }
   1497 
   1498   // We only need one card marking on the destination array.
   1499   codegen_->MarkGCCard(temp1, temp2, dest, CpuRegister(kNoRegister), /* value_can_be_null */ false);
   1500 
   1501   __ Bind(intrinsic_slow_path->GetExitLabel());
   1502 }
   1503 
   1504 void IntrinsicLocationsBuilderX86_64::VisitStringCompareTo(HInvoke* invoke) {
   1505   LocationSummary* locations = new (allocator_) LocationSummary(
   1506       invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
   1507   InvokeRuntimeCallingConvention calling_convention;
   1508   locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
   1509   locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   1510   locations->SetOut(Location::RegisterLocation(RAX));
   1511 }
   1512 
   1513 void IntrinsicCodeGeneratorX86_64::VisitStringCompareTo(HInvoke* invoke) {
   1514   X86_64Assembler* assembler = GetAssembler();
   1515   LocationSummary* locations = invoke->GetLocations();
   1516 
   1517   // Note that the null check must have been done earlier.
   1518   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
   1519 
   1520   CpuRegister argument = locations->InAt(1).AsRegister<CpuRegister>();
   1521   __ testl(argument, argument);
   1522   SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
   1523   codegen_->AddSlowPath(slow_path);
   1524   __ j(kEqual, slow_path->GetEntryLabel());
   1525 
   1526   codegen_->InvokeRuntime(kQuickStringCompareTo, invoke, invoke->GetDexPc(), slow_path);
   1527   __ Bind(slow_path->GetExitLabel());
   1528 }
   1529 
   1530 void IntrinsicLocationsBuilderX86_64::VisitStringEquals(HInvoke* invoke) {
   1531   if (kEmitCompilerReadBarrier &&
   1532       !StringEqualsOptimizations(invoke).GetArgumentIsString() &&
   1533       !StringEqualsOptimizations(invoke).GetNoReadBarrierForStringClass()) {
   1534     // No support for this odd case (String class is moveable, not in the boot image).
   1535     return;
   1536   }
   1537 
   1538   LocationSummary* locations =
   1539       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   1540   locations->SetInAt(0, Location::RequiresRegister());
   1541   locations->SetInAt(1, Location::RequiresRegister());
   1542 
    1543   // Request temporary registers; RCX and RDI are needed for the repe_cmpsq instruction.
   1544   locations->AddTemp(Location::RegisterLocation(RCX));
   1545   locations->AddTemp(Location::RegisterLocation(RDI));
   1546 
    1547   // Set the output; RSI is needed for the repe_cmpsq instruction anyway.
   1548   locations->SetOut(Location::RegisterLocation(RSI), Location::kOutputOverlap);
   1549 }
   1550 
   1551 void IntrinsicCodeGeneratorX86_64::VisitStringEquals(HInvoke* invoke) {
   1552   X86_64Assembler* assembler = GetAssembler();
   1553   LocationSummary* locations = invoke->GetLocations();
   1554 
   1555   CpuRegister str = locations->InAt(0).AsRegister<CpuRegister>();
   1556   CpuRegister arg = locations->InAt(1).AsRegister<CpuRegister>();
   1557   CpuRegister rcx = locations->GetTemp(0).AsRegister<CpuRegister>();
   1558   CpuRegister rdi = locations->GetTemp(1).AsRegister<CpuRegister>();
   1559   CpuRegister rsi = locations->Out().AsRegister<CpuRegister>();
   1560 
   1561   NearLabel end, return_true, return_false;
   1562 
   1563   // Get offsets of count, value, and class fields within a string object.
   1564   const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
   1565   const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
   1566   const uint32_t class_offset = mirror::Object::ClassOffset().Uint32Value();
   1567 
   1568   // Note that the null check must have been done earlier.
   1569   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
   1570 
   1571   StringEqualsOptimizations optimizations(invoke);
   1572   if (!optimizations.GetArgumentNotNull()) {
   1573     // Check if input is null, return false if it is.
   1574     __ testl(arg, arg);
   1575     __ j(kEqual, &return_false);
   1576   }
   1577 
   1578   if (!optimizations.GetArgumentIsString()) {
   1579     // Instanceof check for the argument by comparing class fields.
   1580     // All string objects must have the same type since String cannot be subclassed.
   1581     // Receiver must be a string object, so its class field is equal to all strings' class fields.
    1582     // If the argument is a string object, its class field must be equal to the receiver's class field.
   1583     __ movl(rcx, Address(str, class_offset));
   1584     __ cmpl(rcx, Address(arg, class_offset));
   1585     __ j(kNotEqual, &return_false);
   1586   }
   1587 
   1588   // Reference equality check, return true if same reference.
   1589   __ cmpl(str, arg);
   1590   __ j(kEqual, &return_true);
   1591 
   1592   // Load length and compression flag of receiver string.
   1593   __ movl(rcx, Address(str, count_offset));
    1594   // Check if lengths and compression flags are equal, return false if they're not.
    1595   // Two identical strings will always have the same compression style since
    1596   // compression style is decided on allocation.
   1597   __ cmpl(rcx, Address(arg, count_offset));
   1598   __ j(kNotEqual, &return_false);
   1599   // Return true if both strings are empty. Even with string compression `count == 0` means empty.
   1600   static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
   1601                 "Expecting 0=compressed, 1=uncompressed");
   1602   __ jrcxz(&return_true);
   1603 
   1604   if (mirror::kUseStringCompression) {
   1605     NearLabel string_uncompressed;
    1606     // Extract the length and differentiate between both-compressed and both-uncompressed.
    1607     // The mixed-compression case was already rejected above.
   1608     __ shrl(rcx, Immediate(1));
   1609     __ j(kCarrySet, &string_uncompressed);
   1610     // Divide string length by 2, rounding up, and continue as if uncompressed.
   1611     // Merge clearing the compression flag with +1 for rounding.
   1612     __ addl(rcx, Immediate(1));
   1613     __ shrl(rcx, Immediate(1));
   1614     __ Bind(&string_uncompressed);
   1615   }
   1616   // Load starting addresses of string values into RSI/RDI as required for repe_cmpsq instruction.
   1617   __ leal(rsi, Address(str, value_offset));
   1618   __ leal(rdi, Address(arg, value_offset));
   1619 
   1620   // Divide string length by 4 and adjust for lengths not divisible by 4.
   1621   __ addl(rcx, Immediate(3));
   1622   __ shrl(rcx, Immediate(2));
   1623 
   1624   // Assertions that must hold in order to compare strings 4 characters (uncompressed)
   1625   // or 8 characters (compressed) at a time.
   1626   DCHECK_ALIGNED(value_offset, 8);
   1627   static_assert(IsAligned<8>(kObjectAlignment), "String is not zero padded");
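  // Worked example (illustrative only): for two uncompressed strings of length 5, RCX is
  // (5 + 3) >> 2 == 2, i.e. two 8-byte comparisons covering the 5 chars plus zero padding;
  // for two compressed strings of length 5, RCX was adjusted to (5 + 1) >> 1 == 3 above and
  // now becomes (3 + 3) >> 2 == 1, i.e. a single 8-byte comparison.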
   1628 
    1629   // Loop comparing the strings 8 bytes at a time, starting at the beginning of the string.
   1630   __ repe_cmpsq();
   1631   // If strings are not equal, zero flag will be cleared.
   1632   __ j(kNotEqual, &return_false);
   1633 
   1634   // Return true and exit the function.
   1635   // If loop does not result in returning false, we return true.
   1636   __ Bind(&return_true);
   1637   __ movl(rsi, Immediate(1));
   1638   __ jmp(&end);
   1639 
   1640   // Return false and exit the function.
   1641   __ Bind(&return_false);
   1642   __ xorl(rsi, rsi);
   1643   __ Bind(&end);
   1644 }
   1645 
   1646 static void CreateStringIndexOfLocations(HInvoke* invoke,
   1647                                          ArenaAllocator* allocator,
   1648                                          bool start_at_zero) {
   1649   LocationSummary* locations = new (allocator) LocationSummary(invoke,
   1650                                                                LocationSummary::kCallOnSlowPath,
   1651                                                                kIntrinsified);
    1652   // The data needs to be in RDI for scasw, so request that the string be there anyway.
   1653   locations->SetInAt(0, Location::RegisterLocation(RDI));
    1654   // If we look for a constant char, we still have to copy it into RAX, so just request the
    1655   // allocator to do that anyway. We can still do the constant check by checking the parameter
    1656   // of the instruction explicitly.
   1657   // Note: This works as we don't clobber RAX anywhere.
   1658   locations->SetInAt(1, Location::RegisterLocation(RAX));
   1659   if (!start_at_zero) {
   1660     locations->SetInAt(2, Location::RequiresRegister());          // The starting index.
   1661   }
    1662   // As we clobber RDI during execution anyway, also use it as the output.
   1663   locations->SetOut(Location::SameAsFirstInput());
   1664 
   1665   // repne scasw uses RCX as the counter.
   1666   locations->AddTemp(Location::RegisterLocation(RCX));
   1667   // Need another temporary to be able to compute the result.
   1668   locations->AddTemp(Location::RequiresRegister());
   1669 }
   1670 
   1671 static void GenerateStringIndexOf(HInvoke* invoke,
   1672                                   X86_64Assembler* assembler,
   1673                                   CodeGeneratorX86_64* codegen,
   1674                                   bool start_at_zero) {
   1675   LocationSummary* locations = invoke->GetLocations();
   1676 
   1677   // Note that the null check must have been done earlier.
   1678   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
   1679 
   1680   CpuRegister string_obj = locations->InAt(0).AsRegister<CpuRegister>();
   1681   CpuRegister search_value = locations->InAt(1).AsRegister<CpuRegister>();
   1682   CpuRegister counter = locations->GetTemp(0).AsRegister<CpuRegister>();
   1683   CpuRegister string_length = locations->GetTemp(1).AsRegister<CpuRegister>();
   1684   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
   1685 
   1686   // Check our assumptions for registers.
   1687   DCHECK_EQ(string_obj.AsRegister(), RDI);
   1688   DCHECK_EQ(search_value.AsRegister(), RAX);
   1689   DCHECK_EQ(counter.AsRegister(), RCX);
   1690   DCHECK_EQ(out.AsRegister(), RDI);
   1691 
    1692   // Check for code points > 0xFFFF: emit a slow-path check when the value is not known
    1693   // statically, dispatch directly for a large constant, or omit it for a small constant or a char.
   1694   SlowPathCode* slow_path = nullptr;
   1695   HInstruction* code_point = invoke->InputAt(1);
   1696   if (code_point->IsIntConstant()) {
   1697     if (static_cast<uint32_t>(code_point->AsIntConstant()->GetValue()) >
   1698     std::numeric_limits<uint16_t>::max()) {
   1699       // Always needs the slow-path. We could directly dispatch to it, but this case should be
   1700       // rare, so for simplicity just put the full slow-path down and branch unconditionally.
   1701       slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
   1702       codegen->AddSlowPath(slow_path);
   1703       __ jmp(slow_path->GetEntryLabel());
   1704       __ Bind(slow_path->GetExitLabel());
   1705       return;
   1706     }
   1707   } else if (code_point->GetType() != DataType::Type::kUint16) {
   1708     __ cmpl(search_value, Immediate(std::numeric_limits<uint16_t>::max()));
   1709     slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
   1710     codegen->AddSlowPath(slow_path);
   1711     __ j(kAbove, slow_path->GetEntryLabel());
   1712   }
   1713 
   1714   // From here down, we know that we are looking for a char that fits in
   1715   // 16 bits (uncompressed) or 8 bits (compressed).
   1716   // Location of reference to data array within the String object.
   1717   int32_t value_offset = mirror::String::ValueOffset().Int32Value();
   1718   // Location of count within the String object.
   1719   int32_t count_offset = mirror::String::CountOffset().Int32Value();
   1720 
   1721   // Load the count field of the string containing the length and compression flag.
   1722   __ movl(string_length, Address(string_obj, count_offset));
   1723 
   1724   // Do a zero-length check. Even with string compression `count == 0` means empty.
   1725   // TODO: Support jecxz.
   1726   NearLabel not_found_label;
   1727   __ testl(string_length, string_length);
   1728   __ j(kEqual, &not_found_label);
   1729 
   1730   if (mirror::kUseStringCompression) {
   1731     // Use TMP to keep string_length_flagged.
   1732     __ movl(CpuRegister(TMP), string_length);
    1733     // Shift out the low bit, which is used as the compression flag.
   1734     __ shrl(string_length, Immediate(1));
   1735   }
   1736 
   1737   if (start_at_zero) {
   1738     // Number of chars to scan is the same as the string length.
   1739     __ movl(counter, string_length);
   1740     // Move to the start of the string.
   1741     __ addq(string_obj, Immediate(value_offset));
   1742   } else {
   1743     CpuRegister start_index = locations->InAt(2).AsRegister<CpuRegister>();
   1744 
   1745     // Do a start_index check.
   1746     __ cmpl(start_index, string_length);
   1747     __ j(kGreaterEqual, &not_found_label);
   1748 
    1749     // Ensure we have a start index >= 0.
   1750     __ xorl(counter, counter);
   1751     __ cmpl(start_index, Immediate(0));
   1752     __ cmov(kGreater, counter, start_index, /* is64bit */ false);  // 32-bit copy is enough.
   1753 
   1754     if (mirror::kUseStringCompression) {
   1755       NearLabel modify_counter, offset_uncompressed_label;
   1756       __ testl(CpuRegister(TMP), Immediate(1));
   1757       __ j(kNotZero, &offset_uncompressed_label);
   1758       __ leaq(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_1, value_offset));
   1759       __ jmp(&modify_counter);
   1760       // Move to the start of the string: string_obj + value_offset + 2 * start_index.
   1761       __ Bind(&offset_uncompressed_label);
   1762       __ leaq(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_2, value_offset));
   1763       __ Bind(&modify_counter);
   1764     } else {
   1765       __ leaq(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_2, value_offset));
   1766     }
    1767     // Now update RCX, the work counter: it will be string.length - start_index.
   1768     __ negq(counter);  // Needs to be 64-bit negation, as the address computation is 64-bit.
   1769     __ leaq(counter, Address(string_length, counter, ScaleFactor::TIMES_1, 0));
   1770   }
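  // Worked example (illustrative only): for string_length == 10 and start_index == 3 on an
  // uncompressed string, the address above is string_obj + value_offset + 2 * 3 and the
  // counter becomes 10 - 3 == 7, i.e. seven chars are scanned starting at index 3.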
   1771 
   1772   if (mirror::kUseStringCompression) {
   1773     NearLabel uncompressed_string_comparison;
   1774     NearLabel comparison_done;
   1775     __ testl(CpuRegister(TMP), Immediate(1));
   1776     __ j(kNotZero, &uncompressed_string_comparison);
   1777     // Check if RAX (search_value) is ASCII.
   1778     __ cmpl(search_value, Immediate(127));
   1779     __ j(kGreater, &not_found_label);
    1780     // Compare byte by byte.
   1781     __ repne_scasb();
   1782     __ jmp(&comparison_done);
   1783     // Everything is set up for repne scasw:
   1784     //   * Comparison address in RDI.
   1785     //   * Counter in ECX.
   1786     __ Bind(&uncompressed_string_comparison);
   1787     __ repne_scasw();
   1788     __ Bind(&comparison_done);
   1789   } else {
   1790     __ repne_scasw();
   1791   }
   1792   // Did we find a match?
   1793   __ j(kNotEqual, &not_found_label);
   1794 
   1795   // Yes, we matched.  Compute the index of the result.
   1796   __ subl(string_length, counter);
   1797   __ leal(out, Address(string_length, -1));
   1798 
   1799   NearLabel done;
   1800   __ jmp(&done);
   1801 
   1802   // Failed to match; return -1.
   1803   __ Bind(&not_found_label);
   1804   __ movl(out, Immediate(-1));
   1805 
   1806   // And join up at the end.
   1807   __ Bind(&done);
   1808   if (slow_path != nullptr) {
   1809     __ Bind(slow_path->GetExitLabel());
   1810   }
   1811 }
   1812 
   1813 void IntrinsicLocationsBuilderX86_64::VisitStringIndexOf(HInvoke* invoke) {
   1814   CreateStringIndexOfLocations(invoke, allocator_, /* start_at_zero */ true);
   1815 }
   1816 
   1817 void IntrinsicCodeGeneratorX86_64::VisitStringIndexOf(HInvoke* invoke) {
   1818   GenerateStringIndexOf(invoke, GetAssembler(), codegen_, /* start_at_zero */ true);
   1819 }
   1820 
   1821 void IntrinsicLocationsBuilderX86_64::VisitStringIndexOfAfter(HInvoke* invoke) {
   1822   CreateStringIndexOfLocations(invoke, allocator_, /* start_at_zero */ false);
   1823 }
   1824 
   1825 void IntrinsicCodeGeneratorX86_64::VisitStringIndexOfAfter(HInvoke* invoke) {
   1826   GenerateStringIndexOf(invoke, GetAssembler(), codegen_, /* start_at_zero */ false);
   1827 }
   1828 
   1829 void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromBytes(HInvoke* invoke) {
   1830   LocationSummary* locations = new (allocator_) LocationSummary(
   1831       invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
   1832   InvokeRuntimeCallingConvention calling_convention;
   1833   locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
   1834   locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   1835   locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
   1836   locations->SetInAt(3, Location::RegisterLocation(calling_convention.GetRegisterAt(3)));
   1837   locations->SetOut(Location::RegisterLocation(RAX));
   1838 }
   1839 
   1840 void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromBytes(HInvoke* invoke) {
   1841   X86_64Assembler* assembler = GetAssembler();
   1842   LocationSummary* locations = invoke->GetLocations();
   1843 
   1844   CpuRegister byte_array = locations->InAt(0).AsRegister<CpuRegister>();
   1845   __ testl(byte_array, byte_array);
   1846   SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
   1847   codegen_->AddSlowPath(slow_path);
   1848   __ j(kEqual, slow_path->GetEntryLabel());
   1849 
   1850   codegen_->InvokeRuntime(kQuickAllocStringFromBytes, invoke, invoke->GetDexPc());
   1851   CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>();
   1852   __ Bind(slow_path->GetExitLabel());
   1853 }
   1854 
   1855 void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromChars(HInvoke* invoke) {
   1856   LocationSummary* locations =
   1857       new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
   1858   InvokeRuntimeCallingConvention calling_convention;
   1859   locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
   1860   locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   1861   locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
   1862   locations->SetOut(Location::RegisterLocation(RAX));
   1863 }
   1864 
   1865 void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromChars(HInvoke* invoke) {
   1866   // No need to emit code checking whether `locations->InAt(2)` is a null
   1867   // pointer, as callers of the native method
   1868   //
   1869   //   java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data)
   1870   //
   1871   // all include a null check on `data` before calling that method.
   1872   codegen_->InvokeRuntime(kQuickAllocStringFromChars, invoke, invoke->GetDexPc());
   1873   CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>();
   1874 }
   1875 
   1876 void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromString(HInvoke* invoke) {
   1877   LocationSummary* locations = new (allocator_) LocationSummary(
   1878       invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
   1879   InvokeRuntimeCallingConvention calling_convention;
   1880   locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
   1881   locations->SetOut(Location::RegisterLocation(RAX));
   1882 }
   1883 
   1884 void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromString(HInvoke* invoke) {
   1885   X86_64Assembler* assembler = GetAssembler();
   1886   LocationSummary* locations = invoke->GetLocations();
   1887 
   1888   CpuRegister string_to_copy = locations->InAt(0).AsRegister<CpuRegister>();
   1889   __ testl(string_to_copy, string_to_copy);
   1890   SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
   1891   codegen_->AddSlowPath(slow_path);
   1892   __ j(kEqual, slow_path->GetEntryLabel());
   1893 
   1894   codegen_->InvokeRuntime(kQuickAllocStringFromString, invoke, invoke->GetDexPc());
   1895   CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>();
   1896   __ Bind(slow_path->GetExitLabel());
   1897 }
   1898 
   1899 void IntrinsicLocationsBuilderX86_64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
   1900   // public void getChars(int srcBegin, int srcEnd, char[] dst, int dstBegin);
   1901   LocationSummary* locations =
   1902       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   1903   locations->SetInAt(0, Location::RequiresRegister());
   1904   locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1)));
   1905   locations->SetInAt(2, Location::RequiresRegister());
   1906   locations->SetInAt(3, Location::RequiresRegister());
   1907   locations->SetInAt(4, Location::RequiresRegister());
   1908 
   1909   // And we need some temporaries.  We will use REP MOVSW, so we need fixed registers.
   1910   locations->AddTemp(Location::RegisterLocation(RSI));
   1911   locations->AddTemp(Location::RegisterLocation(RDI));
   1912   locations->AddTemp(Location::RegisterLocation(RCX));
   1913 }
   1914 
   1915 void IntrinsicCodeGeneratorX86_64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
   1916   X86_64Assembler* assembler = GetAssembler();
   1917   LocationSummary* locations = invoke->GetLocations();
   1918 
   1919   size_t char_component_size = DataType::Size(DataType::Type::kUint16);
   1920   // Location of data in char array buffer.
   1921   const uint32_t data_offset = mirror::Array::DataOffset(char_component_size).Uint32Value();
   1922   // Location of char array data in string.
   1923   const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
   1924 
   1925   // public void getChars(int srcBegin, int srcEnd, char[] dst, int dstBegin);
   1926   CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
   1927   Location srcBegin = locations->InAt(1);
   1928   int srcBegin_value =
   1929     srcBegin.IsConstant() ? srcBegin.GetConstant()->AsIntConstant()->GetValue() : 0;
   1930   CpuRegister srcEnd = locations->InAt(2).AsRegister<CpuRegister>();
   1931   CpuRegister dst = locations->InAt(3).AsRegister<CpuRegister>();
   1932   CpuRegister dstBegin = locations->InAt(4).AsRegister<CpuRegister>();
   1933 
   1934   // Check assumption that sizeof(Char) is 2 (used in scaling below).
   1935   const size_t char_size = DataType::Size(DataType::Type::kUint16);
   1936   DCHECK_EQ(char_size, 2u);
   1937 
   1938   NearLabel done;
   1939   // Compute the number of chars (words) to move.
   1940   __ movl(CpuRegister(RCX), srcEnd);
   1941   if (srcBegin.IsConstant()) {
   1942     __ subl(CpuRegister(RCX), Immediate(srcBegin_value));
   1943   } else {
   1944     DCHECK(srcBegin.IsRegister());
   1945     __ subl(CpuRegister(RCX), srcBegin.AsRegister<CpuRegister>());
   1946   }
   1947   if (mirror::kUseStringCompression) {
   1948     NearLabel copy_uncompressed, copy_loop;
   1949     const size_t c_char_size = DataType::Size(DataType::Type::kInt8);
   1950     DCHECK_EQ(c_char_size, 1u);
   1951     // Location of count in string.
   1952     const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
   1953 
   1954     __ testl(Address(obj, count_offset), Immediate(1));
   1955     static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
   1956                   "Expecting 0=compressed, 1=uncompressed");
   1957     __ j(kNotZero, &copy_uncompressed);
   1958     // Compute the address of the source string by adding the number of chars from
   1959     // the source beginning to the value offset of a string.
   1960     __ leaq(CpuRegister(RSI),
   1961             CodeGeneratorX86_64::ArrayAddress(obj, srcBegin, TIMES_1, value_offset));
    1962     // Compute the address of the destination buffer for the byte-to-char copy loop below.
   1963     __ leaq(CpuRegister(RDI), Address(dst, dstBegin, ScaleFactor::TIMES_2, data_offset));
   1964 
   1965     __ Bind(&copy_loop);
   1966     __ jrcxz(&done);
   1967     // Use TMP as temporary (convert byte from RSI to word).
    1968     // TODO: Select RAX as the temporary and use LODSB/STOSW.
   1969     __ movzxb(CpuRegister(TMP), Address(CpuRegister(RSI), 0));
   1970     __ movw(Address(CpuRegister(RDI), 0), CpuRegister(TMP));
   1971     __ leaq(CpuRegister(RDI), Address(CpuRegister(RDI), char_size));
   1972     __ leaq(CpuRegister(RSI), Address(CpuRegister(RSI), c_char_size));
   1973     // TODO: Add support for LOOP to X86_64Assembler.
   1974     __ subl(CpuRegister(RCX), Immediate(1));
   1975     __ jmp(&copy_loop);
   1976 
   1977     __ Bind(&copy_uncompressed);
   1978   }
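  // Note: the compressed-copy loop above widens each Latin-1 byte to a UTF-16 code unit,
  // e.g. the byte 0x41 ('A') is stored as the 16-bit value 0x0041 in the destination array.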
   1979 
   1980   __ leaq(CpuRegister(RSI),
   1981           CodeGeneratorX86_64::ArrayAddress(obj, srcBegin, TIMES_2, value_offset));
   1982   // Compute the address of the destination buffer.
   1983   __ leaq(CpuRegister(RDI), Address(dst, dstBegin, ScaleFactor::TIMES_2, data_offset));
   1984   // Do the move.
   1985   __ rep_movsw();
   1986 
   1987   __ Bind(&done);
   1988 }
   1989 
   1990 static void GenPeek(LocationSummary* locations, DataType::Type size, X86_64Assembler* assembler) {
   1991   CpuRegister address = locations->InAt(0).AsRegister<CpuRegister>();
   1992   CpuRegister out = locations->Out().AsRegister<CpuRegister>();  // == address, here for clarity.
   1993   // x86 allows unaligned access. We do not have to check the input or use specific instructions
   1994   // to avoid a SIGBUS.
   1995   switch (size) {
   1996     case DataType::Type::kInt8:
   1997       __ movsxb(out, Address(address, 0));
   1998       break;
   1999     case DataType::Type::kInt16:
   2000       __ movsxw(out, Address(address, 0));
   2001       break;
   2002     case DataType::Type::kInt32:
   2003       __ movl(out, Address(address, 0));
   2004       break;
   2005     case DataType::Type::kInt64:
   2006       __ movq(out, Address(address, 0));
   2007       break;
   2008     default:
   2009       LOG(FATAL) << "Type not recognized for peek: " << size;
   2010       UNREACHABLE();
   2011   }
   2012 }
   2013 
   2014 void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekByte(HInvoke* invoke) {
   2015   CreateIntToIntLocations(allocator_, invoke);
   2016 }
   2017 
   2018 void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekByte(HInvoke* invoke) {
   2019   GenPeek(invoke->GetLocations(), DataType::Type::kInt8, GetAssembler());
   2020 }
   2021 
   2022 void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekIntNative(HInvoke* invoke) {
   2023   CreateIntToIntLocations(allocator_, invoke);
   2024 }
   2025 
   2026 void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekIntNative(HInvoke* invoke) {
   2027   GenPeek(invoke->GetLocations(), DataType::Type::kInt32, GetAssembler());
   2028 }
   2029 
   2030 void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekLongNative(HInvoke* invoke) {
   2031   CreateIntToIntLocations(allocator_, invoke);
   2032 }
   2033 
   2034 void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekLongNative(HInvoke* invoke) {
   2035   GenPeek(invoke->GetLocations(), DataType::Type::kInt64, GetAssembler());
   2036 }
   2037 
   2038 void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekShortNative(HInvoke* invoke) {
   2039   CreateIntToIntLocations(allocator_, invoke);
   2040 }
   2041 
   2042 void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekShortNative(HInvoke* invoke) {
   2043   GenPeek(invoke->GetLocations(), DataType::Type::kInt16, GetAssembler());
   2044 }
   2045 
   2046 static void CreateIntIntToVoidLocations(ArenaAllocator* allocator, HInvoke* invoke) {
   2047   LocationSummary* locations =
   2048       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   2049   locations->SetInAt(0, Location::RequiresRegister());
   2050   locations->SetInAt(1, Location::RegisterOrInt32Constant(invoke->InputAt(1)));
   2051 }
   2052 
   2053 static void GenPoke(LocationSummary* locations, DataType::Type size, X86_64Assembler* assembler) {
   2054   CpuRegister address = locations->InAt(0).AsRegister<CpuRegister>();
   2055   Location value = locations->InAt(1);
   2056   // x86 allows unaligned access. We do not have to check the input or use specific instructions
   2057   // to avoid a SIGBUS.
   2058   switch (size) {
   2059     case DataType::Type::kInt8:
   2060       if (value.IsConstant()) {
   2061         __ movb(Address(address, 0),
   2062                 Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant())));
   2063       } else {
   2064         __ movb(Address(address, 0), value.AsRegister<CpuRegister>());
   2065       }
   2066       break;
   2067     case DataType::Type::kInt16:
   2068       if (value.IsConstant()) {
   2069         __ movw(Address(address, 0),
   2070                 Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant())));
   2071       } else {
   2072         __ movw(Address(address, 0), value.AsRegister<CpuRegister>());
   2073       }
   2074       break;
   2075     case DataType::Type::kInt32:
   2076       if (value.IsConstant()) {
   2077         __ movl(Address(address, 0),
   2078                 Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant())));
   2079       } else {
   2080         __ movl(Address(address, 0), value.AsRegister<CpuRegister>());
   2081       }
   2082       break;
   2083     case DataType::Type::kInt64:
   2084       if (value.IsConstant()) {
   2085         int64_t v = value.GetConstant()->AsLongConstant()->GetValue();
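        // movq with an immediate only accepts a sign-extended 32-bit value; the locations
        // builder used Location::RegisterOrInt32Constant, so a constant here is known to fit.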
   2086         DCHECK(IsInt<32>(v));
   2087         int32_t v_32 = v;
   2088         __ movq(Address(address, 0), Immediate(v_32));
   2089       } else {
   2090         __ movq(Address(address, 0), value.AsRegister<CpuRegister>());
   2091       }
   2092       break;
   2093     default:
   2094       LOG(FATAL) << "Type not recognized for poke: " << size;
   2095       UNREACHABLE();
   2096   }
   2097 }
   2098 
   2099 void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeByte(HInvoke* invoke) {
   2100   CreateIntIntToVoidLocations(allocator_, invoke);
   2101 }
   2102 
   2103 void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeByte(HInvoke* invoke) {
   2104   GenPoke(invoke->GetLocations(), DataType::Type::kInt8, GetAssembler());
   2105 }
   2106 
   2107 void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeIntNative(HInvoke* invoke) {
   2108   CreateIntIntToVoidLocations(allocator_, invoke);
   2109 }
   2110 
   2111 void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeIntNative(HInvoke* invoke) {
   2112   GenPoke(invoke->GetLocations(), DataType::Type::kInt32, GetAssembler());
   2113 }
   2114 
   2115 void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeLongNative(HInvoke* invoke) {
   2116   CreateIntIntToVoidLocations(allocator_, invoke);
   2117 }
   2118 
   2119 void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeLongNative(HInvoke* invoke) {
   2120   GenPoke(invoke->GetLocations(), DataType::Type::kInt64, GetAssembler());
   2121 }
   2122 
   2123 void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeShortNative(HInvoke* invoke) {
   2124   CreateIntIntToVoidLocations(allocator_, invoke);
   2125 }
   2126 
   2127 void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeShortNative(HInvoke* invoke) {
   2128   GenPoke(invoke->GetLocations(), DataType::Type::kInt16, GetAssembler());
   2129 }
   2130 
   2131 void IntrinsicLocationsBuilderX86_64::VisitThreadCurrentThread(HInvoke* invoke) {
   2132   LocationSummary* locations =
   2133       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   2134   locations->SetOut(Location::RequiresRegister());
   2135 }
   2136 
   2137 void IntrinsicCodeGeneratorX86_64::VisitThreadCurrentThread(HInvoke* invoke) {
   2138   CpuRegister out = invoke->GetLocations()->Out().AsRegister<CpuRegister>();
   2139   GetAssembler()->gs()->movl(out, Address::Absolute(Thread::PeerOffset<kX86_64PointerSize>(),
   2140                                                     /* no_rip */ true));
   2141 }
   2142 
   2143 static void GenUnsafeGet(HInvoke* invoke,
   2144                          DataType::Type type,
   2145                          bool is_volatile ATTRIBUTE_UNUSED,
   2146                          CodeGeneratorX86_64* codegen) {
   2147   X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
   2148   LocationSummary* locations = invoke->GetLocations();
   2149   Location base_loc = locations->InAt(1);
   2150   CpuRegister base = base_loc.AsRegister<CpuRegister>();
   2151   Location offset_loc = locations->InAt(2);
   2152   CpuRegister offset = offset_loc.AsRegister<CpuRegister>();
   2153   Location output_loc = locations->Out();
   2154   CpuRegister output = output_loc.AsRegister<CpuRegister>();
   2155 
   2156   switch (type) {
   2157     case DataType::Type::kInt32:
   2158       __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
   2159       break;
   2160 
   2161     case DataType::Type::kReference: {
   2162       if (kEmitCompilerReadBarrier) {
   2163         if (kUseBakerReadBarrier) {
   2164           Address src(base, offset, ScaleFactor::TIMES_1, 0);
   2165           codegen->GenerateReferenceLoadWithBakerReadBarrier(
   2166               invoke, output_loc, base, src, /* needs_null_check */ false);
   2167         } else {
   2168           __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
   2169           codegen->GenerateReadBarrierSlow(
   2170               invoke, output_loc, output_loc, base_loc, 0U, offset_loc);
   2171         }
   2172       } else {
   2173         __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
   2174         __ MaybeUnpoisonHeapReference(output);
   2175       }
   2176       break;
   2177     }
   2178 
   2179     case DataType::Type::kInt64:
   2180       __ movq(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
   2181       break;
   2182 
   2183     default:
   2184       LOG(FATAL) << "Unsupported op size " << type;
   2185       UNREACHABLE();
   2186   }
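  // For reference: the effective address above is simply base + offset (TIMES_1 scale, no
  // displacement), matching the Java-level Unsafe.get{Int,Long,Object} calls. The
  // `is_volatile` flag is unused because plain x86-64 loads already provide the acquire
  // semantics a volatile read needs, so no extra fence is required here.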
   2187 }
   2188 
   2189 static void CreateIntIntIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
   2190   bool can_call = kEmitCompilerReadBarrier &&
   2191       (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject ||
   2192        invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile);
   2193   LocationSummary* locations =
   2194       new (allocator) LocationSummary(invoke,
   2195                                       can_call
   2196                                           ? LocationSummary::kCallOnSlowPath
   2197                                           : LocationSummary::kNoCall,
   2198                                       kIntrinsified);
   2199   if (can_call && kUseBakerReadBarrier) {
   2200     locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
   2201   }
   2202   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   2203   locations->SetInAt(1, Location::RequiresRegister());
   2204   locations->SetInAt(2, Location::RequiresRegister());
   2205   locations->SetOut(Location::RequiresRegister(),
   2206                     (can_call ? Location::kOutputOverlap : Location::kNoOutputOverlap));
   2207 }
   2208 
   2209 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGet(HInvoke* invoke) {
   2210   CreateIntIntIntToIntLocations(allocator_, invoke);
   2211 }
   2212 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetVolatile(HInvoke* invoke) {
   2213   CreateIntIntIntToIntLocations(allocator_, invoke);
   2214 }
   2215 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetLong(HInvoke* invoke) {
   2216   CreateIntIntIntToIntLocations(allocator_, invoke);
   2217 }
   2218 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
   2219   CreateIntIntIntToIntLocations(allocator_, invoke);
   2220 }
   2221 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObject(HInvoke* invoke) {
   2222   CreateIntIntIntToIntLocations(allocator_, invoke);
   2223 }
   2224 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
   2225   CreateIntIntIntToIntLocations(allocator_, invoke);
   2226 }
   2227 
   2228 
   2229 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGet(HInvoke* invoke) {
   2230   GenUnsafeGet(invoke, DataType::Type::kInt32, /* is_volatile */ false, codegen_);
   2231 }
   2232 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetVolatile(HInvoke* invoke) {
   2233   GenUnsafeGet(invoke, DataType::Type::kInt32, /* is_volatile */ true, codegen_);
   2234 }
   2235 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetLong(HInvoke* invoke) {
   2236   GenUnsafeGet(invoke, DataType::Type::kInt64, /* is_volatile */ false, codegen_);
   2237 }
   2238 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
   2239   GenUnsafeGet(invoke, DataType::Type::kInt64, /* is_volatile */ true, codegen_);
   2240 }
   2241 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetObject(HInvoke* invoke) {
   2242   GenUnsafeGet(invoke, DataType::Type::kReference, /* is_volatile */ false, codegen_);
   2243 }
   2244 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
   2245   GenUnsafeGet(invoke, DataType::Type::kReference, /* is_volatile */ true, codegen_);
   2246 }
   2247 
   2248 
   2249 static void CreateIntIntIntIntToVoidPlusTempsLocations(ArenaAllocator* allocator,
   2250                                                        DataType::Type type,
   2251                                                        HInvoke* invoke) {
   2252   LocationSummary* locations =
   2253       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   2254   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   2255   locations->SetInAt(1, Location::RequiresRegister());
   2256   locations->SetInAt(2, Location::RequiresRegister());
   2257   locations->SetInAt(3, Location::RequiresRegister());
   2258   if (type == DataType::Type::kReference) {
   2259     // Need temp registers for card-marking.
   2260     locations->AddTemp(Location::RequiresRegister());  // Possibly used for reference poisoning too.
   2261     locations->AddTemp(Location::RequiresRegister());
   2262   }
   2263 }
   2264 
   2265 void IntrinsicLocationsBuilderX86_64::VisitUnsafePut(HInvoke* invoke) {
   2266   CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt32, invoke);
   2267 }
   2268 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutOrdered(HInvoke* invoke) {
   2269   CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt32, invoke);
   2270 }
   2271 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutVolatile(HInvoke* invoke) {
   2272   CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt32, invoke);
   2273 }
   2274 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObject(HInvoke* invoke) {
   2275   CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kReference, invoke);
   2276 }
   2277 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
   2278   CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kReference, invoke);
   2279 }
   2280 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
   2281   CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kReference, invoke);
   2282 }
   2283 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLong(HInvoke* invoke) {
   2284   CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt64, invoke);
   2285 }
   2286 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
   2287   CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt64, invoke);
   2288 }
   2289 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
   2290   CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt64, invoke);
   2291 }
   2292 
    2293 // Ordered writes need no special handling: they require an AnyStore barrier, which the x86-64
    2294 // memory model already provides.
   2295 static void GenUnsafePut(LocationSummary* locations, DataType::Type type, bool is_volatile,
   2296                          CodeGeneratorX86_64* codegen) {
   2297   X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
   2298   CpuRegister base = locations->InAt(1).AsRegister<CpuRegister>();
   2299   CpuRegister offset = locations->InAt(2).AsRegister<CpuRegister>();
   2300   CpuRegister value = locations->InAt(3).AsRegister<CpuRegister>();
   2301 
   2302   if (type == DataType::Type::kInt64) {
   2303     __ movq(Address(base, offset, ScaleFactor::TIMES_1, 0), value);
   2304   } else if (kPoisonHeapReferences && type == DataType::Type::kReference) {
   2305     CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();
   2306     __ movl(temp, value);
   2307     __ PoisonHeapReference(temp);
   2308     __ movl(Address(base, offset, ScaleFactor::TIMES_1, 0), temp);
   2309   } else {
   2310     __ movl(Address(base, offset, ScaleFactor::TIMES_1, 0), value);
   2311   }
   2312 
   2313   if (is_volatile) {
   2314     codegen->MemoryFence();
   2315   }
   2316 
   2317   if (type == DataType::Type::kReference) {
   2318     bool value_can_be_null = true;  // TODO: Worth finding out this information?
   2319     codegen->MarkGCCard(locations->GetTemp(0).AsRegister<CpuRegister>(),
   2320                         locations->GetTemp(1).AsRegister<CpuRegister>(),
   2321                         base,
   2322                         value,
   2323                         value_can_be_null);
   2324   }
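  // The MarkGCCard() call above is the GC write barrier: after storing a reference into
  // `base`, the card covering that object is dirtied so the garbage collector rescans it.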
   2325 }
   2326 
   2327 void IntrinsicCodeGeneratorX86_64::VisitUnsafePut(HInvoke* invoke) {
   2328   GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt32, /* is_volatile */ false, codegen_);
   2329 }
   2330 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutOrdered(HInvoke* invoke) {
   2331   GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt32, /* is_volatile */ false, codegen_);
   2332 }
   2333 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutVolatile(HInvoke* invoke) {
   2334   GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt32, /* is_volatile */ true, codegen_);
   2335 }
   2336 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObject(HInvoke* invoke) {
   2337   GenUnsafePut(
   2338       invoke->GetLocations(), DataType::Type::kReference, /* is_volatile */ false, codegen_);
   2339 }
   2340 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
   2341   GenUnsafePut(
   2342       invoke->GetLocations(), DataType::Type::kReference, /* is_volatile */ false, codegen_);
   2343 }
   2344 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
   2345   GenUnsafePut(
   2346       invoke->GetLocations(), DataType::Type::kReference, /* is_volatile */ true, codegen_);
   2347 }
   2348 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLong(HInvoke* invoke) {
   2349   GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt64, /* is_volatile */ false, codegen_);
   2350 }
   2351 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
   2352   GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt64, /* is_volatile */ false, codegen_);
   2353 }
   2354 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
   2355   GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt64, /* is_volatile */ true, codegen_);
   2356 }
   2357 
   2358 static void CreateIntIntIntIntIntToInt(ArenaAllocator* allocator,
   2359                                        DataType::Type type,
   2360                                        HInvoke* invoke) {
   2361   bool can_call = kEmitCompilerReadBarrier &&
   2362       kUseBakerReadBarrier &&
   2363       (invoke->GetIntrinsic() == Intrinsics::kUnsafeCASObject);
   2364   LocationSummary* locations =
   2365       new (allocator) LocationSummary(invoke,
   2366                                       can_call
   2367                                           ? LocationSummary::kCallOnSlowPath
   2368                                           : LocationSummary::kNoCall,
   2369                                       kIntrinsified);
   2370   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   2371   locations->SetInAt(1, Location::RequiresRegister());
   2372   locations->SetInAt(2, Location::RequiresRegister());
   2373   // The expected value must be in EAX/RAX, as required by the CMPXCHG instruction.
   2374   locations->SetInAt(3, Location::RegisterLocation(RAX));
   2375   locations->SetInAt(4, Location::RequiresRegister());
   2376 
   2377   locations->SetOut(Location::RequiresRegister());
   2378   if (type == DataType::Type::kReference) {
   2379     // Need temporary registers for card-marking, and possibly for
   2380     // (Baker) read barrier.
   2381     locations->AddTemp(Location::RequiresRegister());  // Possibly used for reference poisoning too.
   2382     locations->AddTemp(Location::RequiresRegister());
   2383   }
   2384 }
   2385 
   2386 void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASInt(HInvoke* invoke) {
   2387   CreateIntIntIntIntIntToInt(allocator_, DataType::Type::kInt32, invoke);
   2388 }
   2389 
   2390 void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASLong(HInvoke* invoke) {
   2391   CreateIntIntIntIntIntToInt(allocator_, DataType::Type::kInt64, invoke);
   2392 }
   2393 
   2394 void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASObject(HInvoke* invoke) {
   2395   // The only read barrier implementation supporting the
   2396   // UnsafeCASObject intrinsic is the Baker-style read barriers.
   2397   if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
   2398     return;
   2399   }
   2400 
   2401   CreateIntIntIntIntIntToInt(allocator_, DataType::Type::kReference, invoke);
   2402 }
   2403 
   2404 static void GenCAS(DataType::Type type, HInvoke* invoke, CodeGeneratorX86_64* codegen) {
   2405   X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
   2406   LocationSummary* locations = invoke->GetLocations();
   2407 
   2408   CpuRegister base = locations->InAt(1).AsRegister<CpuRegister>();
   2409   CpuRegister offset = locations->InAt(2).AsRegister<CpuRegister>();
   2410   CpuRegister expected = locations->InAt(3).AsRegister<CpuRegister>();
   2411   // Ensure `expected` is in RAX (required by the CMPXCHG instruction).
   2412   DCHECK_EQ(expected.AsRegister(), RAX);
   2413   CpuRegister value = locations->InAt(4).AsRegister<CpuRegister>();
   2414   Location out_loc = locations->Out();
   2415   CpuRegister out = out_loc.AsRegister<CpuRegister>();
   2416 
   2417   if (type == DataType::Type::kReference) {
   2418     // The only read barrier implementation supporting the
   2419     // UnsafeCASObject intrinsic is the Baker-style read barriers.
   2420     DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
   2421 
   2422     CpuRegister temp1 = locations->GetTemp(0).AsRegister<CpuRegister>();
   2423     CpuRegister temp2 = locations->GetTemp(1).AsRegister<CpuRegister>();
   2424 
   2425     // Mark card for object assuming new value is stored.
   2426     bool value_can_be_null = true;  // TODO: Worth finding out this information?
   2427     codegen->MarkGCCard(temp1, temp2, base, value, value_can_be_null);
   2428 
   2429     // The address of the field within the holding object.
   2430     Address field_addr(base, offset, ScaleFactor::TIMES_1, 0);
   2431 
   2432     if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
   2433       // Make sure the reference stored in the field is a to-space reference
   2434       // before attempting the CAS, or the CAS could fail incorrectly.
   2435       codegen->GenerateReferenceLoadWithBakerReadBarrier(
   2436           invoke,
   2437           out_loc,  // Unused as output; only serves as a "temporary" within the read barrier.
   2438           base,
   2439           field_addr,
   2440           /* needs_null_check */ false,
   2441           /* always_update_field */ true,
   2442           &temp1,
   2443           &temp2);
   2444     }
   2445 
   2446     bool base_equals_value = (base.AsRegister() == value.AsRegister());
   2447     Register value_reg = value.AsRegister();
   2448     if (kPoisonHeapReferences) {
   2449       if (base_equals_value) {
   2450         // If `base` and `value` are the same register location, move
   2451         // `value_reg` to a temporary register.  This way, poisoning
   2452         // `value_reg` won't invalidate `base`.
   2453         value_reg = temp1.AsRegister();
   2454         __ movl(CpuRegister(value_reg), base);
   2455       }
   2456 
   2457       // Check that the register allocator did not assign the location
   2458       // of `expected` (RAX) to `value` nor to `base`, so that heap
   2459       // poisoning (when enabled) works as intended below.
   2460       // - If `value` were equal to `expected`, both references would
   2461       //   be poisoned twice, meaning they would not be poisoned at
   2462       //   all, as heap poisoning uses address negation.
   2463       // - If `base` were equal to `expected`, poisoning `expected`
   2464       //   would invalidate `base`.
   2465       DCHECK_NE(value_reg, expected.AsRegister());
   2466       DCHECK_NE(base.AsRegister(), expected.AsRegister());
   2467 
   2468       __ PoisonHeapReference(expected);
   2469       __ PoisonHeapReference(CpuRegister(value_reg));
   2470     }
   2471 
   2472     __ LockCmpxchgl(field_addr, CpuRegister(value_reg));
   2473 
   2474     // LOCK CMPXCHG has full barrier semantics, and we don't need
   2475     // scheduling barriers at this time.
   2476 
   2477     // Convert ZF into the Boolean result.
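            // setcc only writes the low byte of `out`; zero-extend it to get a clean 0/1 result.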
   2478     __ setcc(kZero, out);
   2479     __ movzxb(out, out);
   2480 
   2481     // If heap poisoning is enabled, we need to unpoison the values
   2482     // that were poisoned earlier.
   2483     if (kPoisonHeapReferences) {
   2484       if (base_equals_value) {
   2485         // `value_reg` has been moved to a temporary register, no need
   2486         // to unpoison it.
   2487       } else {
   2488         // Ensure `value` is different from `out`, so that unpoisoning
   2489         // the former does not invalidate the latter.
   2490         DCHECK_NE(value_reg, out.AsRegister());
   2491         __ UnpoisonHeapReference(CpuRegister(value_reg));
   2492       }
   2493       // Ensure `expected` is different from `out`, so that unpoisoning
   2494       // the former does not invalidate the latter.
   2495       DCHECK_NE(expected.AsRegister(), out.AsRegister());
   2496       __ UnpoisonHeapReference(expected);
   2497     }
   2498   } else {
   2499     if (type == DataType::Type::kInt32) {
   2500       __ LockCmpxchgl(Address(base, offset, TIMES_1, 0), value);
   2501     } else if (type == DataType::Type::kInt64) {
   2502       __ LockCmpxchgq(Address(base, offset, TIMES_1, 0), value);
   2503     } else {
   2504       LOG(FATAL) << "Unexpected CAS type " << type;
   2505     }
   2506 
   2507     // LOCK CMPXCHG has full barrier semantics, and we don't need
   2508     // scheduling barriers at this time.
   2509 
   2510     // Convert ZF into the Boolean result.
   2511     __ setcc(kZero, out);
   2512     __ movzxb(out, out);
   2513   }
   2514 }
   2515 
   2516 void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASInt(HInvoke* invoke) {
   2517   GenCAS(DataType::Type::kInt32, invoke, codegen_);
   2518 }
   2519 
   2520 void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASLong(HInvoke* invoke) {
   2521   GenCAS(DataType::Type::kInt64, invoke, codegen_);
   2522 }
   2523 
   2524 void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASObject(HInvoke* invoke) {
   2525   // The only read barrier implementation supporting the
   2526   // UnsafeCASObject intrinsic is the Baker-style read barriers.
   2527   DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
   2528 
   2529   GenCAS(DataType::Type::kReference, invoke, codegen_);
   2530 }
   2531 
   2532 void IntrinsicLocationsBuilderX86_64::VisitIntegerReverse(HInvoke* invoke) {
   2533   LocationSummary* locations =
   2534       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   2535   locations->SetInAt(0, Location::RequiresRegister());
   2536   locations->SetOut(Location::SameAsFirstInput());
   2537   locations->AddTemp(Location::RequiresRegister());
   2538 }
   2539 
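        // Computes reg = ((reg >> shift) & mask) | ((reg & mask) << shift), using temp as scratch.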
   2540 static void SwapBits(CpuRegister reg, CpuRegister temp, int32_t shift, int32_t mask,
   2541                      X86_64Assembler* assembler) {
   2542   Immediate imm_shift(shift);
   2543   Immediate imm_mask(mask);
   2544   __ movl(temp, reg);
   2545   __ shrl(reg, imm_shift);
   2546   __ andl(temp, imm_mask);
   2547   __ andl(reg, imm_mask);
   2548   __ shll(temp, imm_shift);
   2549   __ orl(reg, temp);
   2550 }
   2551 
   2552 void IntrinsicCodeGeneratorX86_64::VisitIntegerReverse(HInvoke* invoke) {
   2553   X86_64Assembler* assembler = GetAssembler();
   2554   LocationSummary* locations = invoke->GetLocations();
   2555 
   2556   CpuRegister reg = locations->InAt(0).AsRegister<CpuRegister>();
   2557   CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();
   2558 
   2559   /*
   2560    * Reverse the byte order with a single bswap instruction first, then use 3 rounds of
   2561    * bit swapping to reverse the bits in a number x. Using bswap saves instructions
   2562    * compared to the generic luni implementation, which needs 5 rounds of bit swapping.
   2563    * x = bswap x
   2564    * x = (x & 0x55555555) << 1 | (x >> 1) & 0x55555555;
   2565    * x = (x & 0x33333333) << 2 | (x >> 2) & 0x33333333;
   2566    * x = (x & 0x0F0F0F0F) << 4 | (x >> 4) & 0x0F0F0F0F;
   2567    */
   2568   __ bswapl(reg);
   2569   SwapBits(reg, temp, 1, 0x55555555, assembler);
   2570   SwapBits(reg, temp, 2, 0x33333333, assembler);
   2571   SwapBits(reg, temp, 4, 0x0f0f0f0f, assembler);
   2572 }
   2573 
   2574 void IntrinsicLocationsBuilderX86_64::VisitLongReverse(HInvoke* invoke) {
   2575   LocationSummary* locations =
   2576       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   2577   locations->SetInAt(0, Location::RequiresRegister());
   2578   locations->SetOut(Location::SameAsFirstInput());
   2579   locations->AddTemp(Location::RequiresRegister());
   2580   locations->AddTemp(Location::RequiresRegister());
   2581 }
   2582 
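        // 64-bit variant of SwapBits. The 64-bit mask does not fit in an instruction immediate,
        // so it is materialized in temp_mask first.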
   2583 static void SwapBits64(CpuRegister reg, CpuRegister temp, CpuRegister temp_mask,
   2584                        int32_t shift, int64_t mask, X86_64Assembler* assembler) {
   2585   Immediate imm_shift(shift);
   2586   __ movq(temp_mask, Immediate(mask));
   2587   __ movq(temp, reg);
   2588   __ shrq(reg, imm_shift);
   2589   __ andq(temp, temp_mask);
   2590   __ andq(reg, temp_mask);
   2591   __ shlq(temp, imm_shift);
   2592   __ orq(reg, temp);
   2593 }
   2594 
   2595 void IntrinsicCodeGeneratorX86_64::VisitLongReverse(HInvoke* invoke) {
   2596   X86_64Assembler* assembler = GetAssembler();
   2597   LocationSummary* locations = invoke->GetLocations();
   2598 
   2599   CpuRegister reg = locations->InAt(0).AsRegister<CpuRegister>();
   2600   CpuRegister temp1 = locations->GetTemp(0).AsRegister<CpuRegister>();
   2601   CpuRegister temp2 = locations->GetTemp(1).AsRegister<CpuRegister>();
   2602 
   2603   /*
   2604    * Reverse the byte order with a single bswap instruction first, then use 3 rounds of
   2605    * bit swapping to reverse the bits in a long number x. Using bswap saves instructions
   2606    * compared to the generic luni implementation, which needs 5 rounds of bit swapping.
   2607    * x = bswap x
   2608    * x = (x & 0x5555555555555555) << 1 | (x >> 1) & 0x5555555555555555;
   2609    * x = (x & 0x3333333333333333) << 2 | (x >> 2) & 0x3333333333333333;
   2610    * x = (x & 0x0F0F0F0F0F0F0F0F) << 4 | (x >> 4) & 0x0F0F0F0F0F0F0F0F;
   2611    */
   2612   __ bswapq(reg);
   2613   SwapBits64(reg, temp1, temp2, 1, INT64_C(0x5555555555555555), assembler);
   2614   SwapBits64(reg, temp1, temp2, 2, INT64_C(0x3333333333333333), assembler);
   2615   SwapBits64(reg, temp1, temp2, 4, INT64_C(0x0f0f0f0f0f0f0f0f), assembler);
   2616 }
   2617 
   2618 static void CreateBitCountLocations(
   2619     ArenaAllocator* allocator, CodeGeneratorX86_64* codegen, HInvoke* invoke) {
   2620   if (!codegen->GetInstructionSetFeatures().HasPopCnt()) {
   2621     // Do nothing if there is no popcnt support. This results in generating
   2622     // a call for the intrinsic rather than direct code.
   2623     return;
   2624   }
   2625   LocationSummary* locations =
   2626       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   2627   locations->SetInAt(0, Location::Any());
   2628   locations->SetOut(Location::RequiresRegister());
   2629 }
   2630 
   2631 static void GenBitCount(X86_64Assembler* assembler,
   2632                         CodeGeneratorX86_64* codegen,
   2633                         HInvoke* invoke,
   2634                         bool is_long) {
   2635   LocationSummary* locations = invoke->GetLocations();
   2636   Location src = locations->InAt(0);
   2637   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
   2638 
   2639   if (invoke->InputAt(0)->IsConstant()) {
   2640     // Evaluate this at compile time.
   2641     int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
   2642     int32_t result = is_long
   2643         ? POPCOUNT(static_cast<uint64_t>(value))
   2644         : POPCOUNT(static_cast<uint32_t>(value));
   2645     codegen->Load32BitValue(out, result);
   2646     return;
   2647   }
   2648 
   2649   if (src.IsRegister()) {
   2650     if (is_long) {
   2651       __ popcntq(out, src.AsRegister<CpuRegister>());
   2652     } else {
   2653       __ popcntl(out, src.AsRegister<CpuRegister>());
   2654     }
   2655   } else if (is_long) {
   2656     DCHECK(src.IsDoubleStackSlot());
   2657     __ popcntq(out, Address(CpuRegister(RSP), src.GetStackIndex()));
   2658   } else {
   2659     DCHECK(src.IsStackSlot());
   2660     __ popcntl(out, Address(CpuRegister(RSP), src.GetStackIndex()));
   2661   }
   2662 }
   2663 
   2664 void IntrinsicLocationsBuilderX86_64::VisitIntegerBitCount(HInvoke* invoke) {
   2665   CreateBitCountLocations(allocator_, codegen_, invoke);
   2666 }
   2667 
   2668 void IntrinsicCodeGeneratorX86_64::VisitIntegerBitCount(HInvoke* invoke) {
   2669   GenBitCount(GetAssembler(), codegen_, invoke, /* is_long */ false);
   2670 }
   2671 
   2672 void IntrinsicLocationsBuilderX86_64::VisitLongBitCount(HInvoke* invoke) {
   2673   CreateBitCountLocations(allocator_, codegen_, invoke);
   2674 }
   2675 
   2676 void IntrinsicCodeGeneratorX86_64::VisitLongBitCount(HInvoke* invoke) {
   2677   GenBitCount(GetAssembler(), codegen_, invoke, /* is_long */ true);
   2678 }
   2679 
   2680 static void CreateOneBitLocations(ArenaAllocator* allocator, HInvoke* invoke, bool is_high) {
   2681   LocationSummary* locations =
   2682       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   2683   locations->SetInAt(0, Location::Any());
   2684   locations->SetOut(Location::RequiresRegister());
   2685   locations->AddTemp(is_high ? Location::RegisterLocation(RCX)  // needs CL
   2686                              : Location::RequiresRegister());  // any will do
   2687 }
   2688 
   2689 static void GenOneBit(X86_64Assembler* assembler,
   2690                       CodeGeneratorX86_64* codegen,
   2691                       HInvoke* invoke,
   2692                       bool is_high, bool is_long) {
   2693   LocationSummary* locations = invoke->GetLocations();
   2694   Location src = locations->InAt(0);
   2695   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
   2696 
   2697   if (invoke->InputAt(0)->IsConstant()) {
   2698     // Evaluate this at compile time.
   2699     int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
   2700     if (value == 0) {
   2701       __ xorl(out, out);  // Clears upper bits too.
   2702       return;
   2703     }
   2704     // Nonzero value.
   2705     if (is_high) {
   2706       value = is_long ? 63 - CLZ(static_cast<uint64_t>(value))
   2707                       : 31 - CLZ(static_cast<uint32_t>(value));
   2708     } else {
   2709       value = is_long ? CTZ(static_cast<uint64_t>(value))
   2710                       : CTZ(static_cast<uint32_t>(value));
   2711     }
   2712     if (is_long) {
   2713       codegen->Load64BitValue(out, 1ULL << value);
   2714     } else {
   2715       codegen->Load32BitValue(out, 1 << value);
   2716     }
   2717     return;
   2718   }
   2719 
   2720   // Handle the non-constant cases.
   2721   CpuRegister tmp = locations->GetTemp(0).AsRegister<CpuRegister>();
   2722   if (is_high) {
   2723     // Use architectural support: basically 1 << bsr.
   2724     if (src.IsRegister()) {
   2725       if (is_long) {
   2726         __ bsrq(tmp, src.AsRegister<CpuRegister>());
   2727       } else {
   2728         __ bsrl(tmp, src.AsRegister<CpuRegister>());
   2729       }
   2730     } else if (is_long) {
   2731       DCHECK(src.IsDoubleStackSlot());
   2732       __ bsrq(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
   2733     } else {
   2734       DCHECK(src.IsStackSlot());
   2735       __ bsrl(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
   2736     }
   2737     // BSR sets ZF if the input was zero.
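            // highestOneBit(0) must return 0, so the zero input is handled on a separate path below.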
   2738     NearLabel is_zero, done;
   2739     __ j(kEqual, &is_zero);
   2740     __ movl(out, Immediate(1));  // Clears upper bits too.
   2741     if (is_long) {
   2742       __ shlq(out, tmp);
   2743     } else {
   2744       __ shll(out, tmp);
   2745     }
   2746     __ jmp(&done);
   2747     __ Bind(&is_zero);
   2748     __ xorl(out, out);  // Clears upper bits too.
   2749     __ Bind(&done);
   2750   } else {
   2751     // Copy input into temporary.
   2752     if (src.IsRegister()) {
   2753       if (is_long) {
   2754         __ movq(tmp, src.AsRegister<CpuRegister>());
   2755       } else {
   2756         __ movl(tmp, src.AsRegister<CpuRegister>());
   2757       }
   2758     } else if (is_long) {
   2759       DCHECK(src.IsDoubleStackSlot());
   2760       __ movq(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
   2761     } else {
   2762       DCHECK(src.IsStackSlot());
   2763       __ movl(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
   2764     }
   2765     // Do the bit twiddling: basically tmp & -tmp;
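            // In two's complement, -tmp keeps the lowest set bit of tmp and flips all higher bits,
            // so the AND isolates that bit (and yields 0 for a zero input).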
   2766     if (is_long) {
   2767       __ movq(out, tmp);
   2768       __ negq(tmp);
   2769       __ andq(out, tmp);
   2770     } else {
   2771       __ movl(out, tmp);
   2772       __ negl(tmp);
   2773       __ andl(out, tmp);
   2774     }
   2775   }
   2776 }
   2777 
   2778 void IntrinsicLocationsBuilderX86_64::VisitIntegerHighestOneBit(HInvoke* invoke) {
   2779   CreateOneBitLocations(allocator_, invoke, /* is_high */ true);
   2780 }
   2781 
   2782 void IntrinsicCodeGeneratorX86_64::VisitIntegerHighestOneBit(HInvoke* invoke) {
   2783   GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ true, /* is_long */ false);
   2784 }
   2785 
   2786 void IntrinsicLocationsBuilderX86_64::VisitLongHighestOneBit(HInvoke* invoke) {
   2787   CreateOneBitLocations(allocator_, invoke, /* is_high */ true);
   2788 }
   2789 
   2790 void IntrinsicCodeGeneratorX86_64::VisitLongHighestOneBit(HInvoke* invoke) {
   2791   GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ true, /* is_long */ true);
   2792 }
   2793 
   2794 void IntrinsicLocationsBuilderX86_64::VisitIntegerLowestOneBit(HInvoke* invoke) {
   2795   CreateOneBitLocations(allocator_, invoke, /* is_high */ false);
   2796 }
   2797 
   2798 void IntrinsicCodeGeneratorX86_64::VisitIntegerLowestOneBit(HInvoke* invoke) {
   2799   GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ false, /* is_long */ false);
   2800 }
   2801 
   2802 void IntrinsicLocationsBuilderX86_64::VisitLongLowestOneBit(HInvoke* invoke) {
   2803   CreateOneBitLocations(allocator_, invoke, /* is_high */ false);
   2804 }
   2805 
   2806 void IntrinsicCodeGeneratorX86_64::VisitLongLowestOneBit(HInvoke* invoke) {
   2807   GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ false, /* is_long */ true);
   2808 }
   2809 
   2810 static void CreateLeadingZeroLocations(ArenaAllocator* allocator, HInvoke* invoke) {
   2811   LocationSummary* locations =
   2812       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   2813   locations->SetInAt(0, Location::Any());
   2814   locations->SetOut(Location::RequiresRegister());
   2815 }
   2816 
   2817 static void GenLeadingZeros(X86_64Assembler* assembler,
   2818                             CodeGeneratorX86_64* codegen,
   2819                             HInvoke* invoke, bool is_long) {
   2820   LocationSummary* locations = invoke->GetLocations();
   2821   Location src = locations->InAt(0);
   2822   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
   2823 
   2824   int zero_value_result = is_long ? 64 : 32;
   2825   if (invoke->InputAt(0)->IsConstant()) {
   2826     // Evaluate this at compile time.
   2827     int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
   2828     if (value == 0) {
   2829       value = zero_value_result;
   2830     } else {
   2831       value = is_long ? CLZ(static_cast<uint64_t>(value)) : CLZ(static_cast<uint32_t>(value));
   2832     }
   2833     codegen->Load32BitValue(out, value);
   2834     return;
   2835   }
   2836 
   2837   // Handle the non-constant cases.
   2838   if (src.IsRegister()) {
   2839     if (is_long) {
   2840       __ bsrq(out, src.AsRegister<CpuRegister>());
   2841     } else {
   2842       __ bsrl(out, src.AsRegister<CpuRegister>());
   2843     }
   2844   } else if (is_long) {
   2845     DCHECK(src.IsDoubleStackSlot());
   2846     __ bsrq(out, Address(CpuRegister(RSP), src.GetStackIndex()));
   2847   } else {
   2848     DCHECK(src.IsStackSlot());
   2849     __ bsrl(out, Address(CpuRegister(RSP), src.GetStackIndex()));
   2850   }
   2851 
   2852   // BSR sets ZF if the input was zero, and the output is undefined.
   2853   NearLabel is_zero, done;
   2854   __ j(kEqual, &is_zero);
   2855 
   2856   // Correct the result from BSR to get the CLZ result.
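          // CLZ(x) == (width - 1) - BSR(x); since width - 1 is all ones (31 or 63), the
          // subtraction reduces to an XOR.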
   2857   __ xorl(out, Immediate(zero_value_result - 1));
   2858   __ jmp(&done);
   2859 
   2860   // Fix the zero case with the expected result.
   2861   __ Bind(&is_zero);
   2862   __ movl(out, Immediate(zero_value_result));
   2863 
   2864   __ Bind(&done);
   2865 }
   2866 
   2867 void IntrinsicLocationsBuilderX86_64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
   2868   CreateLeadingZeroLocations(allocator_, invoke);
   2869 }
   2870 
   2871 void IntrinsicCodeGeneratorX86_64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
   2872   GenLeadingZeros(GetAssembler(), codegen_, invoke, /* is_long */ false);
   2873 }
   2874 
   2875 void IntrinsicLocationsBuilderX86_64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
   2876   CreateLeadingZeroLocations(allocator_, invoke);
   2877 }
   2878 
   2879 void IntrinsicCodeGeneratorX86_64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
   2880   GenLeadingZeros(GetAssembler(), codegen_, invoke, /* is_long */ true);
   2881 }
   2882 
   2883 static void CreateTrailingZeroLocations(ArenaAllocator* allocator, HInvoke* invoke) {
   2884   LocationSummary* locations =
   2885       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   2886   locations->SetInAt(0, Location::Any());
   2887   locations->SetOut(Location::RequiresRegister());
   2888 }
   2889 
   2890 static void GenTrailingZeros(X86_64Assembler* assembler,
   2891                              CodeGeneratorX86_64* codegen,
   2892                              HInvoke* invoke, bool is_long) {
   2893   LocationSummary* locations = invoke->GetLocations();
   2894   Location src = locations->InAt(0);
   2895   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
   2896 
   2897   int zero_value_result = is_long ? 64 : 32;
   2898   if (invoke->InputAt(0)->IsConstant()) {
   2899     // Evaluate this at compile time.
   2900     int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
   2901     if (value == 0) {
   2902       value = zero_value_result;
   2903     } else {
   2904       value = is_long ? CTZ(static_cast<uint64_t>(value)) : CTZ(static_cast<uint32_t>(value));
   2905     }
   2906     codegen->Load32BitValue(out, value);
   2907     return;
   2908   }
   2909 
   2910   // Handle the non-constant cases.
   2911   if (src.IsRegister()) {
   2912     if (is_long) {
   2913       __ bsfq(out, src.AsRegister<CpuRegister>());
   2914     } else {
   2915       __ bsfl(out, src.AsRegister<CpuRegister>());
   2916     }
   2917   } else if (is_long) {
   2918     DCHECK(src.IsDoubleStackSlot());
   2919     __ bsfq(out, Address(CpuRegister(RSP), src.GetStackIndex()));
   2920   } else {
   2921     DCHECK(src.IsStackSlot());
   2922     __ bsfl(out, Address(CpuRegister(RSP), src.GetStackIndex()));
   2923   }
   2924 
   2925   // BSF sets ZF if the input was zero, and the output is undefined.
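          // numberOfTrailingZeros(0) is defined to return the full bit width (32 or 64), so
          // overwrite the undefined BSF result in that case.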
   2926   NearLabel done;
   2927   __ j(kNotEqual, &done);
   2928 
   2929   // Fix the zero case with the expected result.
   2930   __ movl(out, Immediate(zero_value_result));
   2931 
   2932   __ Bind(&done);
   2933 }
   2934 
   2935 void IntrinsicLocationsBuilderX86_64::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
   2936   CreateTrailingZeroLocations(allocator_, invoke);
   2937 }
   2938 
   2939 void IntrinsicCodeGeneratorX86_64::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
   2940   GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long */ false);
   2941 }
   2942 
   2943 void IntrinsicLocationsBuilderX86_64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
   2944   CreateTrailingZeroLocations(allocator_, invoke);
   2945 }
   2946 
   2947 void IntrinsicCodeGeneratorX86_64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
   2948   GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long */ true);
   2949 }
   2950 
   2951 void IntrinsicLocationsBuilderX86_64::VisitIntegerValueOf(HInvoke* invoke) {
   2952   InvokeRuntimeCallingConvention calling_convention;
   2953   IntrinsicVisitor::ComputeIntegerValueOfLocations(
   2954       invoke,
   2955       codegen_,
   2956       Location::RegisterLocation(RAX),
   2957       Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
   2958 }
   2959 
   2960 void IntrinsicCodeGeneratorX86_64::VisitIntegerValueOf(HInvoke* invoke) {
   2961   IntrinsicVisitor::IntegerValueOfInfo info = IntrinsicVisitor::ComputeIntegerValueOfInfo();
   2962   LocationSummary* locations = invoke->GetLocations();
   2963   X86_64Assembler* assembler = GetAssembler();
   2964 
   2965   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
   2966   InvokeRuntimeCallingConvention calling_convention;
   2967   if (invoke->InputAt(0)->IsConstant()) {
   2968     int32_t value = invoke->InputAt(0)->AsIntConstant()->GetValue();
   2969     if (value >= info.low && value <= info.high) {
   2970       // Just embed the j.l.Integer in the code.
   2971       ScopedObjectAccess soa(Thread::Current());
   2972       mirror::Object* boxed = info.cache->Get(value + (-info.low));
   2973       DCHECK(boxed != nullptr && Runtime::Current()->GetHeap()->ObjectIsInBootImageSpace(boxed));
   2974       uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(boxed));
   2975       __ movl(out, Immediate(static_cast<int32_t>(address)));
   2976     } else {
   2977       // Allocate and initialize a new j.l.Integer.
   2978       // TODO: If we JIT, we could allocate the j.l.Integer now, and store it in the
   2979       // JIT object table.
   2980       CpuRegister argument = CpuRegister(calling_convention.GetRegisterAt(0));
   2981       uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer));
   2982       __ movl(argument, Immediate(static_cast<int32_t>(address)));
   2983       codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc());
   2984       CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>();
   2985       __ movl(Address(out, info.value_offset), Immediate(value));
   2986     }
   2987   } else {
   2988     CpuRegister in = locations->InAt(0).AsRegister<CpuRegister>();
   2989     // Check bounds of our cache.
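            // out = in - info.low; a single unsigned comparison against the cache length then
            // covers both bounds, as values below info.low wrap around to large unsigned numbers.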
   2990     __ leal(out, Address(in, -info.low));
   2991     __ cmpl(out, Immediate(info.high - info.low + 1));
   2992     NearLabel allocate, done;
   2993     __ j(kAboveEqual, &allocate);
   2994     // If the value is within the bounds, load the j.l.Integer directly from the array.
   2995     uint32_t data_offset = mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value();
   2996     uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.cache));
   2997     if (data_offset + address <= std::numeric_limits<int32_t>::max()) {
   2998       __ movl(out, Address(out, TIMES_4, data_offset + address));
   2999     } else {
   3000       CpuRegister temp = CpuRegister(calling_convention.GetRegisterAt(0));
   3001       __ movl(temp, Immediate(static_cast<int32_t>(data_offset + address)));
   3002       __ movl(out, Address(temp, out, TIMES_4, 0));
   3003     }
   3004     __ MaybeUnpoisonHeapReference(out);
   3005     __ jmp(&done);
   3006     __ Bind(&allocate);
   3007     // Otherwise allocate and initialize a new j.l.Integer.
   3008     CpuRegister argument = CpuRegister(calling_convention.GetRegisterAt(0));
   3009     address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer));
   3010     __ movl(argument, Immediate(static_cast<int32_t>(address)));
   3011     codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc());
   3012     CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>();
   3013     __ movl(Address(out, info.value_offset), in);
   3014     __ Bind(&done);
   3015   }
   3016 }
   3017 
   3018 void IntrinsicLocationsBuilderX86_64::VisitThreadInterrupted(HInvoke* invoke) {
   3019   LocationSummary* locations =
   3020       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   3021   locations->SetOut(Location::RequiresRegister());
   3022 }
   3023 
   3024 void IntrinsicCodeGeneratorX86_64::VisitThreadInterrupted(HInvoke* invoke) {
   3025   X86_64Assembler* assembler = GetAssembler();
   3026   CpuRegister out = invoke->GetLocations()->Out().AsRegister<CpuRegister>();
   3027   Address address = Address::Absolute(
   3028       Thread::InterruptedOffset<kX86_64PointerSize>().Int32Value(), /* no_rip */ true);
   3029   NearLabel done;
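          // Read the thread-local interrupted flag through the GS segment; if it is set, clear it
          // and issue a full memory fence.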
   3030   __ gs()->movl(out, address);
   3031   __ testl(out, out);
   3032   __ j(kEqual, &done);
   3033   __ gs()->movl(address, Immediate(0));
   3034   codegen_->MemoryFence();
   3035   __ Bind(&done);
   3036 }
   3037 
   3038 void IntrinsicLocationsBuilderX86_64::VisitReachabilityFence(HInvoke* invoke) {
   3039   LocationSummary* locations =
   3040       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   3041   locations->SetInAt(0, Location::Any());
   3042 }
   3043 
   3044 void IntrinsicCodeGeneratorX86_64::VisitReachabilityFence(HInvoke* invoke ATTRIBUTE_UNUSED) { }
   3045 
   3046 UNIMPLEMENTED_INTRINSIC(X86_64, ReferenceGetReferent)
   3047 UNIMPLEMENTED_INTRINSIC(X86_64, FloatIsInfinite)
   3048 UNIMPLEMENTED_INTRINSIC(X86_64, DoubleIsInfinite)
   3049 
   3050 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOf)
   3051 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOfAfter)
   3052 UNIMPLEMENTED_INTRINSIC(X86_64, StringBufferAppend)
   3053 UNIMPLEMENTED_INTRINSIC(X86_64, StringBufferLength)
   3054 UNIMPLEMENTED_INTRINSIC(X86_64, StringBufferToString)
   3055 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppend)
   3056 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderLength)
   3057 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderToString)
   3058 
   3059 // 1.8.
   3060 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndAddInt)
   3061 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndAddLong)
   3062 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndSetInt)
   3063 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndSetLong)
   3064 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndSetObject)
   3065 
   3066 UNREACHABLE_INTRINSICS(X86_64)
   3067 
   3068 #undef __
   3069 
   3070 }  // namespace x86_64
   3071 }  // namespace art
   3072