/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "intrinsics_x86_64.h"

#include <limits>

#include "arch/x86_64/instruction_set_features_x86_64.h"
#include "art_method-inl.h"
#include "base/bit_utils.h"
#include "code_generator_x86_64.h"
#include "entrypoints/quick/quick_entrypoints.h"
#include "intrinsics.h"
#include "intrinsics_utils.h"
#include "mirror/array-inl.h"
#include "mirror/string.h"
#include "thread.h"
#include "utils/x86_64/assembler_x86_64.h"
#include "utils/x86_64/constants_x86_64.h"

namespace art {

namespace x86_64 {

IntrinsicLocationsBuilderX86_64::IntrinsicLocationsBuilderX86_64(CodeGeneratorX86_64* codegen)
  : arena_(codegen->GetGraph()->GetArena()), codegen_(codegen) {
}


X86_64Assembler* IntrinsicCodeGeneratorX86_64::GetAssembler() {
  return down_cast<X86_64Assembler*>(codegen_->GetAssembler());
}

ArenaAllocator* IntrinsicCodeGeneratorX86_64::GetAllocator() {
  return codegen_->GetGraph()->GetArena();
}

bool IntrinsicLocationsBuilderX86_64::TryDispatch(HInvoke* invoke) {
  Dispatch(invoke);
  LocationSummary* res = invoke->GetLocations();
  if (res == nullptr) {
    return false;
  }
  if (kEmitCompilerReadBarrier && res->CanCall()) {
    // Generating an intrinsic for this HInvoke may produce an
    // IntrinsicSlowPathX86_64 slow path.  Currently this approach
    // does not work when using read barriers, as the emitted
    // calling sequence will make use of another slow path
    // (ReadBarrierForRootSlowPathX86_64 for HInvokeStaticOrDirect,
    // ReadBarrierSlowPathX86_64 for HInvokeVirtual).  So we bail
    // out in this case.
    //
    // TODO: Find a way to have intrinsics work with read barriers.
    invoke->SetLocations(nullptr);
    return false;
  }
  return res->Intrinsified();
}

static void MoveArguments(HInvoke* invoke, CodeGeneratorX86_64* codegen) {
  InvokeDexCallingConventionVisitorX86_64 calling_convention_visitor;
  IntrinsicVisitor::MoveArguments(invoke, codegen, &calling_convention_visitor);
}

using IntrinsicSlowPathX86_64 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorX86_64>;

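// Shorthand for the code-generation helpers below; each of them declares or
// receives a local |assembler|.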
#define __ assembler->

static void CreateFPToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetOut(Location::RequiresRegister());
}

static void CreateIntToFPLocations(ArenaAllocator* arena, HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetOut(Location::RequiresFpuRegister());
}

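// The moves below transfer raw bit patterns between a general-purpose and an
// XMM register, which is exactly the contract of Float.floatToRawIntBits and
// Double.doubleToRawLongBits; is64bit selects the 64-bit encoding.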
static void MoveFPToInt(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
  Location input = locations->InAt(0);
  Location output = locations->Out();
  __ movd(output.AsRegister<CpuRegister>(), input.AsFpuRegister<XmmRegister>(), is64bit);
}

static void MoveIntToFP(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
  Location input = locations->InAt(0);
  Location output = locations->Out();
  __ movd(output.AsFpuRegister<XmmRegister>(), input.AsRegister<CpuRegister>(), is64bit);
}

void IntrinsicLocationsBuilderX86_64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
  CreateFPToIntLocations(arena_, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
  CreateIntToFPLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
  MoveFPToInt(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
}
void IntrinsicCodeGeneratorX86_64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
  MoveIntToFP(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
  CreateFPToIntLocations(arena_, invoke);
}
void IntrinsicLocationsBuilderX86_64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
  CreateIntToFPLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
  MoveFPToInt(invoke->GetLocations(), /* is64bit */ false, GetAssembler());
}
void IntrinsicCodeGeneratorX86_64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
  MoveIntToFP(invoke->GetLocations(), /* is64bit */ false, GetAssembler());
}

static void CreateIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetOut(Location::SameAsFirstInput());
}

static void GenReverseBytes(LocationSummary* locations,
                            Primitive::Type size,
                            X86_64Assembler* assembler) {
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  switch (size) {
    case Primitive::kPrimShort:
      // TODO: Can be done with an xchg of 8b registers. This is straight from Quick.
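      // bswapl reverses all four bytes, leaving the byte-swapped short in the
      // upper half of the register; the arithmetic shift then moves it back
      // down with the sign extension a short result requires.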
      __ bswapl(out);
      __ sarl(out, Immediate(16));
      break;
    case Primitive::kPrimInt:
      __ bswapl(out);
      break;
    case Primitive::kPrimLong:
      __ bswapq(out);
      break;
    default:
      LOG(FATAL) << "Unexpected size for reverse-bytes: " << size;
      UNREACHABLE();
  }
}

void IntrinsicLocationsBuilderX86_64::VisitIntegerReverseBytes(HInvoke* invoke) {
  CreateIntToIntLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitIntegerReverseBytes(HInvoke* invoke) {
  GenReverseBytes(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitLongReverseBytes(HInvoke* invoke) {
  CreateIntToIntLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitLongReverseBytes(HInvoke* invoke) {
  GenReverseBytes(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitShortReverseBytes(HInvoke* invoke) {
  CreateIntToIntLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitShortReverseBytes(HInvoke* invoke) {
  GenReverseBytes(invoke->GetLocations(), Primitive::kPrimShort, GetAssembler());
}


// TODO: Consider Quick's way of doing Double abs through integer operations, as the immediate we
//       need is 64b.

static void CreateFloatToFloatPlusTemps(ArenaAllocator* arena, HInvoke* invoke) {
  // TODO: Enable memory operations when the assembler supports them.
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetOut(Location::SameAsFirstInput());
  locations->AddTemp(Location::RequiresFpuRegister());  // FP reg to hold mask.
}

static void MathAbsFP(LocationSummary* locations,
                      bool is64bit,
                      X86_64Assembler* assembler,
                      CodeGeneratorX86_64* codegen) {
  Location output = locations->Out();

  DCHECK(output.IsFpuRegister());
  XmmRegister xmm_temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();

  // TODO: Can mask directly with constant area using pand if we can guarantee
  // that the literal is aligned on a 16 byte boundary.  This will avoid a
  // temporary.
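  // The masks below are all ones except for the sign bit, so the AND clears
  // the sign and leaves the magnitude (including any NaN payload) unchanged.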
  if (is64bit) {
    __ movsd(xmm_temp, codegen->LiteralInt64Address(INT64_C(0x7FFFFFFFFFFFFFFF)));
    __ andpd(output.AsFpuRegister<XmmRegister>(), xmm_temp);
  } else {
    __ movss(xmm_temp, codegen->LiteralInt32Address(INT32_C(0x7FFFFFFF)));
    __ andps(output.AsFpuRegister<XmmRegister>(), xmm_temp);
  }
}

void IntrinsicLocationsBuilderX86_64::VisitMathAbsDouble(HInvoke* invoke) {
  CreateFloatToFloatPlusTemps(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAbsDouble(HInvoke* invoke) {
  MathAbsFP(invoke->GetLocations(), /* is64bit */ true, GetAssembler(), codegen_);
}

void IntrinsicLocationsBuilderX86_64::VisitMathAbsFloat(HInvoke* invoke) {
  CreateFloatToFloatPlusTemps(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAbsFloat(HInvoke* invoke) {
  MathAbsFP(invoke->GetLocations(), /* is64bit */ false, GetAssembler(), codegen_);
}

static void CreateIntToIntPlusTemp(ArenaAllocator* arena, HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetOut(Location::SameAsFirstInput());
  locations->AddTemp(Location::RequiresRegister());
}

static void GenAbsInteger(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
  Location output = locations->Out();
  CpuRegister out = output.AsRegister<CpuRegister>();
  CpuRegister mask = locations->GetTemp(0).AsRegister<CpuRegister>();

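  // Branchless abs: the arithmetic shift fills |mask| with the sign bit (zero
  // for non-negative input, all ones for negative), so (x + mask) ^ mask
  // yields x or -x, respectively. The most negative value wraps back to
  // itself, matching Java's Math.abs semantics.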
  if (is64bit) {
    // Create mask.
    __ movq(mask, out);
    __ sarq(mask, Immediate(63));
    // Add mask.
    __ addq(out, mask);
    __ xorq(out, mask);
  } else {
    // Create mask.
    __ movl(mask, out);
    __ sarl(mask, Immediate(31));
    // Add mask.
    __ addl(out, mask);
    __ xorl(out, mask);
  }
}

void IntrinsicLocationsBuilderX86_64::VisitMathAbsInt(HInvoke* invoke) {
  CreateIntToIntPlusTemp(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAbsInt(HInvoke* invoke) {
  GenAbsInteger(invoke->GetLocations(), /* is64bit */ false, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitMathAbsLong(HInvoke* invoke) {
  CreateIntToIntPlusTemp(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAbsLong(HInvoke* invoke) {
  GenAbsInteger(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
}

static void GenMinMaxFP(LocationSummary* locations,
                        bool is_min,
                        bool is_double,
                        X86_64Assembler* assembler,
                        CodeGeneratorX86_64* codegen) {
  Location op1_loc = locations->InAt(0);
  Location op2_loc = locations->InAt(1);
  Location out_loc = locations->Out();
  XmmRegister out = out_loc.AsFpuRegister<XmmRegister>();

  // Shortcut for same input locations.
  if (op1_loc.Equals(op2_loc)) {
    DCHECK(out_loc.Equals(op1_loc));
    return;
  }

  //  (out := op1)
  //  out <=? op2
  //  if NaN jmp NaN_label
  //  if out is min jmp done
  //  if op2 is min jmp op2_label
  //  handle -0/+0
  //  jmp done
  // NaN_label:
  //  out := NaN
  // op2_label:
  //  out := op2
  // done:
  //
  // This removes one jmp, but needs to copy one input (op1) to out.
  //
  // TODO: This is straight from Quick. Make NaN an out-of-line slowpath?

  XmmRegister op2 = op2_loc.AsFpuRegister<XmmRegister>();

  NearLabel nan, done, op2_label;
  if (is_double) {
    __ ucomisd(out, op2);
  } else {
    __ ucomiss(out, op2);
  }

  __ j(Condition::kParityEven, &nan);

  __ j(is_min ? Condition::kAbove : Condition::kBelow, &op2_label);
  __ j(is_min ? Condition::kBelow : Condition::kAbove, &done);

  // Handle 0.0/-0.0.
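  // We only get here when the operands compared equal, and for ucomis* that
  // includes +0.0 == -0.0. ORing the bit patterns makes min produce -0.0
  // (sign bit set), while ANDing makes max produce +0.0.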
  if (is_min) {
    if (is_double) {
      __ orpd(out, op2);
    } else {
      __ orps(out, op2);
    }
  } else {
    if (is_double) {
      __ andpd(out, op2);
    } else {
      __ andps(out, op2);
    }
  }
  __ jmp(&done);

  // NaN handling.
  __ Bind(&nan);
  if (is_double) {
    __ movsd(out, codegen->LiteralInt64Address(INT64_C(0x7FF8000000000000)));
  } else {
    __ movss(out, codegen->LiteralInt32Address(INT32_C(0x7FC00000)));
  }
  __ jmp(&done);

  // out := op2;
  __ Bind(&op2_label);
  if (is_double) {
    __ movsd(out, op2);
  } else {
    __ movss(out, op2);
  }

  // Done.
  __ Bind(&done);
}

static void CreateFPFPToFP(ArenaAllocator* arena, HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetInAt(1, Location::RequiresFpuRegister());
  // The following is sub-optimal, but all we can do for now. It would be fine to also accept
  // the second input to be the output (we can simply swap inputs).
  locations->SetOut(Location::SameAsFirstInput());
}

void IntrinsicLocationsBuilderX86_64::VisitMathMinDoubleDouble(HInvoke* invoke) {
  CreateFPFPToFP(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathMinDoubleDouble(HInvoke* invoke) {
  GenMinMaxFP(
      invoke->GetLocations(), /* is_min */ true, /* is_double */ true, GetAssembler(), codegen_);
}

void IntrinsicLocationsBuilderX86_64::VisitMathMinFloatFloat(HInvoke* invoke) {
  CreateFPFPToFP(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathMinFloatFloat(HInvoke* invoke) {
  GenMinMaxFP(
      invoke->GetLocations(), /* is_min */ true, /* is_double */ false, GetAssembler(), codegen_);
}

void IntrinsicLocationsBuilderX86_64::VisitMathMaxDoubleDouble(HInvoke* invoke) {
  CreateFPFPToFP(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathMaxDoubleDouble(HInvoke* invoke) {
  GenMinMaxFP(
      invoke->GetLocations(), /* is_min */ false, /* is_double */ true, GetAssembler(), codegen_);
}

void IntrinsicLocationsBuilderX86_64::VisitMathMaxFloatFloat(HInvoke* invoke) {
  CreateFPFPToFP(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathMaxFloatFloat(HInvoke* invoke) {
  GenMinMaxFP(
      invoke->GetLocations(), /* is_min */ false, /* is_double */ false, GetAssembler(), codegen_);
}

static void GenMinMax(LocationSummary* locations, bool is_min, bool is_long,
                      X86_64Assembler* assembler) {
  Location op1_loc = locations->InAt(0);
  Location op2_loc = locations->InAt(1);

  // Shortcut for same input locations.
  if (op1_loc.Equals(op2_loc)) {
    // Can return immediately, as op1_loc == out_loc.
    // Note: if we ever support separate registers, e.g., output into memory, we need to check for
    //       a copy here.
    DCHECK(locations->Out().Equals(op1_loc));
    return;
  }

  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
  CpuRegister op2 = op2_loc.AsRegister<CpuRegister>();

  //  (out := op1)
  //  out <=? op2
  //  if out is min jmp done
  //  out := op2
  // done:

  if (is_long) {
    __ cmpq(out, op2);
  } else {
    __ cmpl(out, op2);
  }

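  // Keep op1 (already in out) when it is the min/max; otherwise conditionally
  // move op2 into out, avoiding a branch.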
  __ cmov(is_min ? Condition::kGreater : Condition::kLess, out, op2, is_long);
}

static void CreateIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RequiresRegister());
  locations->SetOut(Location::SameAsFirstInput());
}

void IntrinsicLocationsBuilderX86_64::VisitMathMinIntInt(HInvoke* invoke) {
  CreateIntIntToIntLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathMinIntInt(HInvoke* invoke) {
  GenMinMax(invoke->GetLocations(), /* is_min */ true, /* is_long */ false, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitMathMinLongLong(HInvoke* invoke) {
  CreateIntIntToIntLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathMinLongLong(HInvoke* invoke) {
  GenMinMax(invoke->GetLocations(), /* is_min */ true, /* is_long */ true, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitMathMaxIntInt(HInvoke* invoke) {
  CreateIntIntToIntLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathMaxIntInt(HInvoke* invoke) {
  GenMinMax(invoke->GetLocations(), /* is_min */ false, /* is_long */ false, GetAssembler());
}

void IntrinsicLocationsBuilderX86_64::VisitMathMaxLongLong(HInvoke* invoke) {
  CreateIntIntToIntLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathMaxLongLong(HInvoke* invoke) {
  GenMinMax(invoke->GetLocations(), /* is_min */ false, /* is_long */ true, GetAssembler());
}

static void CreateFPToFPLocations(ArenaAllocator* arena, HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetOut(Location::RequiresFpuRegister());
}

void IntrinsicLocationsBuilderX86_64::VisitMathSqrt(HInvoke* invoke) {
  CreateFPToFPLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathSqrt(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
  XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();

  GetAssembler()->sqrtsd(out, in);
}

static void InvokeOutOfLineIntrinsic(CodeGeneratorX86_64* codegen, HInvoke* invoke) {
  MoveArguments(invoke, codegen);

  DCHECK(invoke->IsInvokeStaticOrDirect());
  codegen->GenerateStaticOrDirectCall(
      invoke->AsInvokeStaticOrDirect(), Location::RegisterLocation(RDI));
  codegen->RecordPcInfo(invoke, invoke->GetDexPc());

  // Copy the result back to the expected output.
  Location out = invoke->GetLocations()->Out();
  if (out.IsValid()) {
    DCHECK(out.IsRegister());
    codegen->MoveFromReturnRegister(out, invoke->GetType());
  }
}

static void CreateSSE41FPToFPLocations(ArenaAllocator* arena,
                                       HInvoke* invoke,
                                       CodeGeneratorX86_64* codegen) {
  // Do we have instruction support?
  if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
    CreateFPToFPLocations(arena, invoke);
    return;
  }

  // We have to fall back to a call to the intrinsic.
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kCall);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
  locations->SetOut(Location::FpuRegisterLocation(XMM0));
  // Needs to be RDI for the invoke.
  locations->AddTemp(Location::RegisterLocation(RDI));
}

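// round_mode is the ROUNDSD immediate: 0 rounds to nearest (even) for rint,
// 1 rounds toward negative infinity for floor, and 2 rounds toward positive
// infinity for ceil.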
static void GenSSE41FPToFPIntrinsic(CodeGeneratorX86_64* codegen,
                                    HInvoke* invoke,
                                    X86_64Assembler* assembler,
                                    int round_mode) {
  LocationSummary* locations = invoke->GetLocations();
  if (locations->WillCall()) {
    InvokeOutOfLineIntrinsic(codegen, invoke);
  } else {
    XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
    XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
    __ roundsd(out, in, Immediate(round_mode));
  }
}

void IntrinsicLocationsBuilderX86_64::VisitMathCeil(HInvoke* invoke) {
  CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
}

void IntrinsicCodeGeneratorX86_64::VisitMathCeil(HInvoke* invoke) {
  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 2);
}

void IntrinsicLocationsBuilderX86_64::VisitMathFloor(HInvoke* invoke) {
  CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
}

void IntrinsicCodeGeneratorX86_64::VisitMathFloor(HInvoke* invoke) {
  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 1);
}

void IntrinsicLocationsBuilderX86_64::VisitMathRint(HInvoke* invoke) {
  CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
}

void IntrinsicCodeGeneratorX86_64::VisitMathRint(HInvoke* invoke) {
  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 0);
}

static void CreateSSE41FPToIntLocations(ArenaAllocator* arena,
                                        HInvoke* invoke,
                                        CodeGeneratorX86_64* codegen) {
  // Do we have instruction support?
  if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
    LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                              LocationSummary::kNoCall,
                                                              kIntrinsified);
    locations->SetInAt(0, Location::RequiresFpuRegister());
    locations->SetOut(Location::RequiresRegister());
    locations->AddTemp(Location::RequiresFpuRegister());
    return;
  }

  // We have to fall back to a call to the intrinsic.
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kCall);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
  locations->SetOut(Location::RegisterLocation(RAX));
  // Needs to be RDI for the invoke.
  locations->AddTemp(Location::RegisterLocation(RDI));
}

void IntrinsicLocationsBuilderX86_64::VisitMathRoundFloat(HInvoke* invoke) {
  // See intrinsics.h.
  if (kRoundIsPlusPointFive) {
    CreateSSE41FPToIntLocations(arena_, invoke, codegen_);
  }
}

void IntrinsicCodeGeneratorX86_64::VisitMathRoundFloat(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  if (locations->WillCall()) {
    InvokeOutOfLineIntrinsic(codegen_, invoke);
    return;
  }

  // Implement RoundFloat as t1 = floor(input + 0.5f);  convert to int.
  XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
  XmmRegister inPlusPointFive = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
  NearLabel done, nan;
  X86_64Assembler* assembler = GetAssembler();

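  // Java semantics: Math.round(NaN) is 0, and values at or above
  // Integer.MAX_VALUE clamp to Integer.MAX_VALUE; the comparisons below
  // implement both cases.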
  // Load 0.5 into inPlusPointFive.
  __ movss(inPlusPointFive, codegen_->LiteralFloatAddress(0.5f));

  // Add in the input.
  __ addss(inPlusPointFive, in);

  // And floor to an integer (roundss with imm 1 rounds toward negative infinity).
  __ roundss(inPlusPointFive, inPlusPointFive, Immediate(1));

  // Load maxInt into out.
  codegen_->Load64BitValue(out, kPrimIntMax);

  // if inPlusPointFive >= maxInt goto done
  __ comiss(inPlusPointFive, codegen_->LiteralFloatAddress(static_cast<float>(kPrimIntMax)));
  __ j(kAboveEqual, &done);

  // if input == NaN goto nan
  __ j(kUnordered, &nan);

  // output = float-to-int-truncate(input)
  __ cvttss2si(out, inPlusPointFive);
  __ jmp(&done);
  __ Bind(&nan);

  //  output = 0
  __ xorl(out, out);
  __ Bind(&done);
}

void IntrinsicLocationsBuilderX86_64::VisitMathRoundDouble(HInvoke* invoke) {
  // See intrinsics.h.
  if (kRoundIsPlusPointFive) {
    CreateSSE41FPToIntLocations(arena_, invoke, codegen_);
  }
}

void IntrinsicCodeGeneratorX86_64::VisitMathRoundDouble(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  if (locations->WillCall()) {
    InvokeOutOfLineIntrinsic(codegen_, invoke);
    return;
  }

  // Implement RoundDouble as t1 = floor(input + 0.5);  convert to long.
  XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
  XmmRegister inPlusPointFive = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
  NearLabel done, nan;
  X86_64Assembler* assembler = GetAssembler();

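  // Java semantics: Math.round(NaN) is 0, and values at or above
  // Long.MAX_VALUE clamp to Long.MAX_VALUE; the comparisons below implement
  // both cases.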
  // Load 0.5 into inPlusPointFive.
  __ movsd(inPlusPointFive, codegen_->LiteralDoubleAddress(0.5));

  // Add in the input.
  __ addsd(inPlusPointFive, in);

  // And floor to an integer (roundsd with imm 1 rounds toward negative infinity).
  __ roundsd(inPlusPointFive, inPlusPointFive, Immediate(1));

  // Load maxLong into out.
  codegen_->Load64BitValue(out, kPrimLongMax);

  // if inPlusPointFive >= maxLong goto done
  __ comisd(inPlusPointFive, codegen_->LiteralDoubleAddress(static_cast<double>(kPrimLongMax)));
  __ j(kAboveEqual, &done);

  // if input == NaN goto nan
  __ j(kUnordered, &nan);

  // output = double-to-long-truncate(input)
  __ cvttsd2si(out, inPlusPointFive, /* is64bit */ true);
  __ jmp(&done);
  __ Bind(&nan);

  //  output = 0
  __ xorl(out, out);
  __ Bind(&done);
}

static void CreateFPToFPCallLocations(ArenaAllocator* arena,
                                      HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kCall,
                                                           kIntrinsified);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
  locations->SetOut(Location::FpuRegisterLocation(XMM0));

  // We have to ensure that the native code doesn't clobber the XMM registers which are
  // non-volatile for ART, but volatile for Native calls.  This will ensure that they are
  // saved in the prologue and properly restored.
  for (auto fp_reg : non_volatile_xmm_regs) {
    locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
  }
}

static void GenFPToFPCall(HInvoke* invoke, CodeGeneratorX86_64* codegen,
                          QuickEntrypointEnum entry) {
  LocationSummary* locations = invoke->GetLocations();
  DCHECK(locations->WillCall());
  DCHECK(invoke->IsInvokeStaticOrDirect());
  X86_64Assembler* assembler = codegen->GetAssembler();

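  // Call the quick entrypoint through the Thread object, which x86-64
  // addresses via the GS segment register.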
  __ gs()->call(Address::Absolute(GetThreadOffset<kX86_64WordSize>(entry), true));
  codegen->RecordPcInfo(invoke, invoke->GetDexPc());
}

void IntrinsicLocationsBuilderX86_64::VisitMathCos(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathCos(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickCos);
}

void IntrinsicLocationsBuilderX86_64::VisitMathSin(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathSin(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickSin);
}

void IntrinsicLocationsBuilderX86_64::VisitMathAcos(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAcos(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickAcos);
}

void IntrinsicLocationsBuilderX86_64::VisitMathAsin(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAsin(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickAsin);
}

void IntrinsicLocationsBuilderX86_64::VisitMathAtan(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAtan(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickAtan);
}

void IntrinsicLocationsBuilderX86_64::VisitMathCbrt(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathCbrt(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickCbrt);
}

void IntrinsicLocationsBuilderX86_64::VisitMathCosh(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathCosh(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickCosh);
}

void IntrinsicLocationsBuilderX86_64::VisitMathExp(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathExp(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickExp);
}

void IntrinsicLocationsBuilderX86_64::VisitMathExpm1(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathExpm1(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickExpm1);
}

void IntrinsicLocationsBuilderX86_64::VisitMathLog(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathLog(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickLog);
}

void IntrinsicLocationsBuilderX86_64::VisitMathLog10(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathLog10(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickLog10);
}

void IntrinsicLocationsBuilderX86_64::VisitMathSinh(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathSinh(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickSinh);
}

void IntrinsicLocationsBuilderX86_64::VisitMathTan(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathTan(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickTan);
}

void IntrinsicLocationsBuilderX86_64::VisitMathTanh(HInvoke* invoke) {
  CreateFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathTanh(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickTanh);
}

static void CreateFPFPToFPCallLocations(ArenaAllocator* arena,
                                        HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kCall,
                                                           kIntrinsified);
  InvokeRuntimeCallingConvention calling_convention;
  locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
  locations->SetInAt(1, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(1)));
  locations->SetOut(Location::FpuRegisterLocation(XMM0));

  // We have to ensure that the native code doesn't clobber the XMM registers which are
  // non-volatile for ART, but volatile for Native calls.  This will ensure that they are
  // saved in the prologue and properly restored.
  for (auto fp_reg : non_volatile_xmm_regs) {
    locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
  }
}

void IntrinsicLocationsBuilderX86_64::VisitMathAtan2(HInvoke* invoke) {
  CreateFPFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathAtan2(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickAtan2);
}

void IntrinsicLocationsBuilderX86_64::VisitMathHypot(HInvoke* invoke) {
  CreateFPFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathHypot(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickHypot);
}

void IntrinsicLocationsBuilderX86_64::VisitMathNextAfter(HInvoke* invoke) {
  CreateFPFPToFPCallLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitMathNextAfter(HInvoke* invoke) {
  GenFPToFPCall(invoke, codegen_, kQuickNextAfter);
}

void IntrinsicLocationsBuilderX86_64::VisitStringCharAt(HInvoke* invoke) {
  // The inputs plus one temp.
  LocationSummary* locations = new (arena_) LocationSummary(invoke,
                                                            LocationSummary::kCallOnSlowPath,
                                                            kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RequiresRegister());
  locations->SetOut(Location::SameAsFirstInput());
  locations->AddTemp(Location::RequiresRegister());
}

void IntrinsicCodeGeneratorX86_64::VisitStringCharAt(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();

  // Location of reference to data array.
  const int32_t value_offset = mirror::String::ValueOffset().Int32Value();
  // Location of count.
  const int32_t count_offset = mirror::String::CountOffset().Int32Value();

  CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
  CpuRegister idx = locations->InAt(1).AsRegister<CpuRegister>();
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  // TODO: Maybe we can support range check elimination. Overall, though, I think it's not worth
  //       the cost.
  // TODO: For simplicity, the index parameter is requested in a register, so different from Quick
  //       we will not optimize the code for constants (which would save a register).

  SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
  codegen_->AddSlowPath(slow_path);

  X86_64Assembler* assembler = GetAssembler();

  __ cmpl(idx, Address(obj, count_offset));
  codegen_->MaybeRecordImplicitNullCheck(invoke);
  __ j(kAboveEqual, slow_path->GetEntryLabel());

  // out = out[idx]: |out| still holds the string reference, since the output
  // register is the same as the first input. Each char is 2 bytes, loaded
  // zero-extended.
  __ movzxw(out, Address(out, idx, ScaleFactor::TIMES_2, value_offset));

  __ Bind(slow_path->GetExitLabel());
}

void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) {
  // Check to see if we have known failures that will cause us to have to bail out
  // to the runtime, and just generate the runtime call directly.
  HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstant();
  HIntConstant* dest_pos = invoke->InputAt(3)->AsIntConstant();

  // The positions must be non-negative.
  if ((src_pos != nullptr && src_pos->GetValue() < 0) ||
      (dest_pos != nullptr && dest_pos->GetValue() < 0)) {
    // We will have to fail anyways.
    return;
  }

  // The length must be >= 0.
  HIntConstant* length = invoke->InputAt(4)->AsIntConstant();
  if (length != nullptr) {
    int32_t len = length->GetValue();
    if (len < 0) {
      // Just call as normal.
      return;
    }
  }

  LocationSummary* locations = new (arena_) LocationSummary(invoke,
                                                            LocationSummary::kCallOnSlowPath,
                                                            kIntrinsified);
  // arraycopy(Object src, int src_pos, Object dest, int dest_pos, int length).
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1)));
  locations->SetInAt(2, Location::RequiresRegister());
  locations->SetInAt(3, Location::RegisterOrConstant(invoke->InputAt(3)));
  locations->SetInAt(4, Location::RegisterOrConstant(invoke->InputAt(4)));

  // And we need some temporaries.  We will use REP MOVSW, so we need fixed registers.
  locations->AddTemp(Location::RegisterLocation(RSI));
  locations->AddTemp(Location::RegisterLocation(RDI));
  locations->AddTemp(Location::RegisterLocation(RCX));
}

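// Emit the range checks System.arraycopy needs for one array: pos must be
// non-negative, pos <= length(input), and length(input) - pos >= length,
// jumping to the slow path on failure. If length_is_input_length, the copy
// length is known to equal the array's own length, so only pos == 0 can
// succeed.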
static void CheckPosition(X86_64Assembler* assembler,
                          Location pos,
                          CpuRegister input,
                          Location length,
                          SlowPathCode* slow_path,
                          CpuRegister input_len,
                          CpuRegister temp,
                          bool length_is_input_length = false) {
  // Where is the length in the Array?
  const uint32_t length_offset = mirror::Array::LengthOffset().Uint32Value();

  if (pos.IsConstant()) {
    int32_t pos_const = pos.GetConstant()->AsIntConstant()->GetValue();
    if (pos_const == 0) {
      if (!length_is_input_length) {
        // Check that length(input) >= length.
        if (length.IsConstant()) {
          __ cmpl(Address(input, length_offset),
                  Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
        } else {
          __ cmpl(Address(input, length_offset), length.AsRegister<CpuRegister>());
        }
        __ j(kLess, slow_path->GetEntryLabel());
      }
    } else {
      // Check that length(input) >= pos.
      __ movl(input_len, Address(input, length_offset));
      __ cmpl(input_len, Immediate(pos_const));
      __ j(kLess, slow_path->GetEntryLabel());

      // Check that (length(input) - pos) >= length.
      __ leal(temp, Address(input_len, -pos_const));
      if (length.IsConstant()) {
        __ cmpl(temp, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
      } else {
        __ cmpl(temp, length.AsRegister<CpuRegister>());
      }
      __ j(kLess, slow_path->GetEntryLabel());
    }
  } else if (length_is_input_length) {
    // The only way the copy can succeed is if pos is zero.
    CpuRegister pos_reg = pos.AsRegister<CpuRegister>();
    __ testl(pos_reg, pos_reg);
    __ j(kNotEqual, slow_path->GetEntryLabel());
  } else {
    // Check that pos >= 0.
    CpuRegister pos_reg = pos.AsRegister<CpuRegister>();
    __ testl(pos_reg, pos_reg);
    __ j(kLess, slow_path->GetEntryLabel());

    // Check that pos <= length(input).
    __ cmpl(Address(input, length_offset), pos_reg);
    __ j(kLess, slow_path->GetEntryLabel());

    // Check that (length(input) - pos) >= length.
    __ movl(temp, Address(input, length_offset));
    __ subl(temp, pos_reg);
    if (length.IsConstant()) {
      __ cmpl(temp, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
    } else {
      __ cmpl(temp, length.AsRegister<CpuRegister>());
    }
    __ j(kLess, slow_path->GetEntryLabel());
  }
}

void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  CpuRegister src = locations->InAt(0).AsRegister<CpuRegister>();
  Location src_pos = locations->InAt(1);
  CpuRegister dest = locations->InAt(2).AsRegister<CpuRegister>();
  Location dest_pos = locations->InAt(3);
  Location length = locations->InAt(4);

  // Temporaries that we need for MOVSW.
  CpuRegister src_base = locations->GetTemp(0).AsRegister<CpuRegister>();
  DCHECK_EQ(src_base.AsRegister(), RSI);
  CpuRegister dest_base = locations->GetTemp(1).AsRegister<CpuRegister>();
  DCHECK_EQ(dest_base.AsRegister(), RDI);
  CpuRegister count = locations->GetTemp(2).AsRegister<CpuRegister>();
  DCHECK_EQ(count.AsRegister(), RCX);

  SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
  codegen_->AddSlowPath(slow_path);

  // Bail out if the source and destination are the same.
  __ cmpl(src, dest);
  __ j(kEqual, slow_path->GetEntryLabel());

  // Bail out if the source is null.
  __ testl(src, src);
  __ j(kEqual, slow_path->GetEntryLabel());

  // Bail out if the destination is null.
  __ testl(dest, dest);
  __ j(kEqual, slow_path->GetEntryLabel());

  // If the length is negative, bail out.
  // We have already checked in the LocationsBuilder for the constant case.
  if (!length.IsConstant()) {
    __ testl(length.AsRegister<CpuRegister>(), length.AsRegister<CpuRegister>());
    __ j(kLess, slow_path->GetEntryLabel());
  }

  // Validity checks: source.
  CheckPosition(assembler, src_pos, src, length, slow_path, src_base, dest_base);

  // Validity checks: dest.
  CheckPosition(assembler, dest_pos, dest, length, slow_path, src_base, dest_base);

  // We need the count in RCX.
  if (length.IsConstant()) {
    __ movl(count, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
  } else {
    __ movl(count, length.AsRegister<CpuRegister>());
  }

  // Okay, everything checks out.  Finally time to do the copy.
  // Check assumption that sizeof(Char) is 2 (used in scaling below).
  const size_t char_size = Primitive::ComponentSize(Primitive::kPrimChar);
  DCHECK_EQ(char_size, 2u);

  const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value();

  if (src_pos.IsConstant()) {
    int32_t src_pos_const = src_pos.GetConstant()->AsIntConstant()->GetValue();
    __ leal(src_base, Address(src, char_size * src_pos_const + data_offset));
  } else {
    __ leal(src_base, Address(src, src_pos.AsRegister<CpuRegister>(),
                              ScaleFactor::TIMES_2, data_offset));
  }
  if (dest_pos.IsConstant()) {
    int32_t dest_pos_const = dest_pos.GetConstant()->AsIntConstant()->GetValue();
    __ leal(dest_base, Address(dest, char_size * dest_pos_const + data_offset));
  } else {
    __ leal(dest_base, Address(dest, dest_pos.AsRegister<CpuRegister>(),
                               ScaleFactor::TIMES_2, data_offset));
  }

  // Do the copy: REP MOVSW moves RCX 16-bit chars from [RSI] to [RDI],
  // advancing all three registers.
  __ rep_movsw();

  __ Bind(slow_path->GetExitLabel());
}


void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
  CodeGenerator::CreateSystemArrayCopyLocationSummary(invoke);
}

// TODO: Implement read barriers in the SystemArrayCopy intrinsic.
// Note that this code path is not used (yet) because we do not
// intrinsify methods that can go into the IntrinsicSlowPathX86_64
// slow path.
void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
  X86_64Assembler* assembler = GetAssembler();
  LocationSummary* locations = invoke->GetLocations();

  uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
  uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
  uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
  uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();

  CpuRegister src = locations->InAt(0).AsRegister<CpuRegister>();
  Location src_pos = locations->InAt(1);
  CpuRegister dest = locations->InAt(2).AsRegister<CpuRegister>();
  Location dest_pos = locations->InAt(3);
  Location length = locations->InAt(4);
  CpuRegister temp1 = locations->GetTemp(0).AsRegister<CpuRegister>();
  CpuRegister temp2 = locations->GetTemp(1).AsRegister<CpuRegister>();
  CpuRegister temp3 = locations->GetTemp(2).AsRegister<CpuRegister>();

  SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
  codegen_->AddSlowPath(slow_path);

  NearLabel conditions_on_positions_validated;
  SystemArrayCopyOptimizations optimizations(invoke);

  // If source and destination are the same, we go to slow path if we need to do
  // forward copying.
  if (src_pos.IsConstant()) {
    int32_t src_pos_constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
    if (dest_pos.IsConstant()) {
      int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
      if (optimizations.GetDestinationIsSource()) {
        // Checked when building locations.
        DCHECK_GE(src_pos_constant, dest_pos_constant);
      } else if (src_pos_constant < dest_pos_constant) {
        __ cmpl(src, dest);
        __ j(kEqual, slow_path->GetEntryLabel());
      }
    } else {
      if (!optimizations.GetDestinationIsSource()) {
        __ cmpl(src, dest);
        __ j(kNotEqual, &conditions_on_positions_validated);
      }
      __ cmpl(dest_pos.AsRegister<CpuRegister>(), Immediate(src_pos_constant));
      __ j(kGreater, slow_path->GetEntryLabel());
    }
  } else {
    if (!optimizations.GetDestinationIsSource()) {
      __ cmpl(src, dest);
      __ j(kNotEqual, &conditions_on_positions_validated);
    }
    if (dest_pos.IsConstant()) {
      int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
      __ cmpl(src_pos.AsRegister<CpuRegister>(), Immediate(dest_pos_constant));
      __ j(kLess, slow_path->GetEntryLabel());
    } else {
      __ cmpl(src_pos.AsRegister<CpuRegister>(), dest_pos.AsRegister<CpuRegister>());
      __ j(kLess, slow_path->GetEntryLabel());
    }
  }

  __ Bind(&conditions_on_positions_validated);

  if (!optimizations.GetSourceIsNotNull()) {
    // Bail out if the source is null.
    __ testl(src, src);
    __ j(kEqual, slow_path->GetEntryLabel());
  }

  if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) {
    // Bail out if the destination is null.
    __ testl(dest, dest);
    __ j(kEqual, slow_path->GetEntryLabel());
  }

  // If the length is negative, bail out.
  // We have already checked in the LocationsBuilder for the constant case.
  if (!length.IsConstant() &&
      !optimizations.GetCountIsSourceLength() &&
      !optimizations.GetCountIsDestinationLength()) {
    __ testl(length.AsRegister<CpuRegister>(), length.AsRegister<CpuRegister>());
    __ j(kLess, slow_path->GetEntryLabel());
  }

  // Validity checks: source.
  CheckPosition(assembler,
                src_pos,
                src,
                length,
                slow_path,
                temp1,
                temp2,
                optimizations.GetCountIsSourceLength());

  // Validity checks: dest.
  CheckPosition(assembler,
                dest_pos,
                dest,
                length,
                slow_path,
                temp1,
                temp2,
                optimizations.GetCountIsDestinationLength());

  if (!optimizations.GetDoesNotNeedTypeCheck()) {
    // Check whether all elements of the source array are assignable to the component
    // type of the destination array. We do two checks: the classes are the same,
    // or the destination is Object[]. If none of these checks succeed, we go to the
    // slow path.
    __ movl(temp1, Address(dest, class_offset));
    __ movl(temp2, Address(src, class_offset));
    bool did_unpoison = false;
    if (!optimizations.GetDestinationIsNonPrimitiveArray() ||
        !optimizations.GetSourceIsNonPrimitiveArray()) {
      // One or two of the references need to be unpoisoned. Unpoison them
      // both to make the identity check valid.
      __ MaybeUnpoisonHeapReference(temp1);
      __ MaybeUnpoisonHeapReference(temp2);
      did_unpoison = true;
    }

    if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
      // Bail out if the destination is not a non-primitive array.
      // /* HeapReference<Class> */ TMP = temp1->component_type_
      __ movl(CpuRegister(TMP), Address(temp1, component_offset));
      __ testl(CpuRegister(TMP), CpuRegister(TMP));
      __ j(kEqual, slow_path->GetEntryLabel());
      __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
      __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
      __ j(kNotEqual, slow_path->GetEntryLabel());
    }

    if (!optimizations.GetSourceIsNonPrimitiveArray()) {
      // Bail out if the source is not a non-primitive array.
   1265       // /* HeapReference<Class> */ TMP = temp2->component_type_
   1266       __ movl(CpuRegister(TMP), Address(temp2, component_offset));
   1267       __ testl(CpuRegister(TMP), CpuRegister(TMP));
   1268       __ j(kEqual, slow_path->GetEntryLabel());
   1269       __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
   1270       __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
   1271       __ j(kNotEqual, slow_path->GetEntryLabel());
   1272     }
   1273 
   1274     __ cmpl(temp1, temp2);
   1275 
   1276     if (optimizations.GetDestinationIsTypedObjectArray()) {
   1277       NearLabel do_copy;
   1278       __ j(kEqual, &do_copy);
   1279       if (!did_unpoison) {
   1280         __ MaybeUnpoisonHeapReference(temp1);
   1281       }
   1282       // /* HeapReference<Class> */ temp1 = temp1->component_type_
   1283       __ movl(temp1, Address(temp1, component_offset));
   1284       __ MaybeUnpoisonHeapReference(temp1);
   1285       // /* HeapReference<Class> */ temp1 = temp1->super_class_
   1286       __ movl(temp1, Address(temp1, super_offset));
    1287       // No need to unpoison the result, as we're comparing against null.
   1288       __ testl(temp1, temp1);
   1289       __ j(kNotEqual, slow_path->GetEntryLabel());
   1290       __ Bind(&do_copy);
   1291     } else {
   1292       __ j(kNotEqual, slow_path->GetEntryLabel());
   1293     }
   1294   } else if (!optimizations.GetSourceIsNonPrimitiveArray()) {
   1295     DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
    1296     // Bail out if the source is not a non-primitive array.
   1297     // /* HeapReference<Class> */ temp1 = src->klass_
   1298     __ movl(temp1, Address(src, class_offset));
   1299     __ MaybeUnpoisonHeapReference(temp1);
   1300     // /* HeapReference<Class> */ TMP = temp1->component_type_
   1301     __ movl(CpuRegister(TMP), Address(temp1, component_offset));
   1302     __ testl(CpuRegister(TMP), CpuRegister(TMP));
   1303     __ j(kEqual, slow_path->GetEntryLabel());
   1304     __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
   1305     __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
   1306     __ j(kNotEqual, slow_path->GetEntryLabel());
   1307   }
   1308 
   1309   // Compute base source address, base destination address, and end source address.
   1310 
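           // Note: ART uses 32-bit (compressed) heap references even on x86-64, which is
           // why the references are copied as int32_t-sized elements below.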
   1311   uint32_t element_size = sizeof(int32_t);
   1312   uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value();
   1313   if (src_pos.IsConstant()) {
   1314     int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
   1315     __ leal(temp1, Address(src, element_size * constant + offset));
   1316   } else {
   1317     __ leal(temp1, Address(src, src_pos.AsRegister<CpuRegister>(), ScaleFactor::TIMES_4, offset));
   1318   }
   1319 
   1320   if (dest_pos.IsConstant()) {
   1321     int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
   1322     __ leal(temp2, Address(dest, element_size * constant + offset));
   1323   } else {
   1324     __ leal(temp2, Address(dest, dest_pos.AsRegister<CpuRegister>(), ScaleFactor::TIMES_4, offset));
   1325   }
   1326 
   1327   if (length.IsConstant()) {
   1328     int32_t constant = length.GetConstant()->AsIntConstant()->GetValue();
   1329     __ leal(temp3, Address(temp1, element_size * constant));
   1330   } else {
   1331     __ leal(temp3, Address(temp1, length.AsRegister<CpuRegister>(), ScaleFactor::TIMES_4, 0));
   1332   }
   1333 
    1334   // Iterate over the arrays and do a raw copy of the objects. We don't need to
    1335   // poison/unpoison, nor to emit any read barrier, as the next uses of the
    1336   // destination array will do it.
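           // Roughly, in C-like pseudocode (temp1 = source cursor, temp2 = destination
           // cursor, temp3 = source end):
           //   while (temp1 != temp3) { *temp2++ = *temp1++; }  // 32-bit reference words.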
   1337   NearLabel loop, done;
   1338   __ cmpl(temp1, temp3);
   1339   __ j(kEqual, &done);
   1340   __ Bind(&loop);
   1341   __ movl(CpuRegister(TMP), Address(temp1, 0));
   1342   __ movl(Address(temp2, 0), CpuRegister(TMP));
   1343   __ addl(temp1, Immediate(element_size));
   1344   __ addl(temp2, Immediate(element_size));
   1345   __ cmpl(temp1, temp3);
   1346   __ j(kNotEqual, &loop);
   1347   __ Bind(&done);
   1348 
   1349   // We only need one card marking on the destination array.
   1350   codegen_->MarkGCCard(temp1,
   1351                        temp2,
   1352                        dest,
   1353                        CpuRegister(kNoRegister),
   1354                        /* value_can_be_null */ false);
   1355 
   1356   __ Bind(slow_path->GetExitLabel());
   1357 }
   1358 
   1359 void IntrinsicLocationsBuilderX86_64::VisitStringCompareTo(HInvoke* invoke) {
   1360   LocationSummary* locations = new (arena_) LocationSummary(invoke,
   1361                                                             LocationSummary::kCall,
   1362                                                             kIntrinsified);
   1363   InvokeRuntimeCallingConvention calling_convention;
   1364   locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
   1365   locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   1366   locations->SetOut(Location::RegisterLocation(RAX));
   1367 }
   1368 
   1369 void IntrinsicCodeGeneratorX86_64::VisitStringCompareTo(HInvoke* invoke) {
   1370   X86_64Assembler* assembler = GetAssembler();
   1371   LocationSummary* locations = invoke->GetLocations();
   1372 
   1373   // Note that the null check must have been done earlier.
   1374   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
   1375 
   1376   CpuRegister argument = locations->InAt(1).AsRegister<CpuRegister>();
   1377   __ testl(argument, argument);
   1378   SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
   1379   codegen_->AddSlowPath(slow_path);
   1380   __ j(kEqual, slow_path->GetEntryLabel());
   1381 
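           // ART keeps the current Thread* reachable through the GS segment register on
           // x86-64, so runtime entrypoints are called gs-relative at a fixed offset.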
   1382   __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pStringCompareTo),
   1383                                   /* no_rip */ true));
   1384   __ Bind(slow_path->GetExitLabel());
   1385 }
   1386 
   1387 void IntrinsicLocationsBuilderX86_64::VisitStringEquals(HInvoke* invoke) {
   1388   LocationSummary* locations = new (arena_) LocationSummary(invoke,
   1389                                                             LocationSummary::kNoCall,
   1390                                                             kIntrinsified);
   1391   locations->SetInAt(0, Location::RequiresRegister());
   1392   locations->SetInAt(1, Location::RequiresRegister());
   1393 
    1394   // Request temporary registers: RCX and RDI are needed for the repe_cmpsq instruction.
   1395   locations->AddTemp(Location::RegisterLocation(RCX));
   1396   locations->AddTemp(Location::RegisterLocation(RDI));
   1397 
    1398   // Set the output register: RSI is needed for the repe_cmpsq instruction anyway.
   1399   locations->SetOut(Location::RegisterLocation(RSI), Location::kOutputOverlap);
   1400 }
   1401 
   1402 void IntrinsicCodeGeneratorX86_64::VisitStringEquals(HInvoke* invoke) {
   1403   X86_64Assembler* assembler = GetAssembler();
   1404   LocationSummary* locations = invoke->GetLocations();
   1405 
   1406   CpuRegister str = locations->InAt(0).AsRegister<CpuRegister>();
   1407   CpuRegister arg = locations->InAt(1).AsRegister<CpuRegister>();
   1408   CpuRegister rcx = locations->GetTemp(0).AsRegister<CpuRegister>();
   1409   CpuRegister rdi = locations->GetTemp(1).AsRegister<CpuRegister>();
   1410   CpuRegister rsi = locations->Out().AsRegister<CpuRegister>();
   1411 
   1412   NearLabel end, return_true, return_false;
   1413 
   1414   // Get offsets of count, value, and class fields within a string object.
   1415   const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
   1416   const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
   1417   const uint32_t class_offset = mirror::Object::ClassOffset().Uint32Value();
   1418 
   1419   // Note that the null check must have been done earlier.
   1420   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
   1421 
    1422   // Check if the argument is null; return false if it is.
   1423   __ testl(arg, arg);
   1424   __ j(kEqual, &return_false);
   1425 
   1426   // Instanceof check for the argument by comparing class fields.
   1427   // All string objects must have the same type since String cannot be subclassed.
   1428   // Receiver must be a string object, so its class field is equal to all strings' class fields.
   1429   // If the argument is a string object, its class field must be equal to receiver's class field.
   1430   __ movl(rcx, Address(str, class_offset));
   1431   __ cmpl(rcx, Address(arg, class_offset));
   1432   __ j(kNotEqual, &return_false);
   1433 
   1434   // Reference equality check, return true if same reference.
   1435   __ cmpl(str, arg);
   1436   __ j(kEqual, &return_true);
   1437 
   1438   // Load length of receiver string.
   1439   __ movl(rcx, Address(str, count_offset));
    1440   // Check if the lengths are equal; return false if they're not.
   1441   __ cmpl(rcx, Address(arg, count_offset));
   1442   __ j(kNotEqual, &return_false);
   1443   // Return true if both strings are empty.
   1444   __ jrcxz(&return_true);
   1445 
    1446   // Load starting addresses of string values into RSI/RDI as required by the repe_cmpsq instruction.
   1447   __ leal(rsi, Address(str, value_offset));
   1448   __ leal(rdi, Address(arg, value_offset));
   1449 
    1450   // Round the string length up to a multiple of 4, then divide by 4 to get the
    1451   // number of quadwords to compare (four chars per quadword).
   1451   __ addl(rcx, Immediate(3));
   1452   __ shrl(rcx, Immediate(2));
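           // E.g. for a 5-char string this gives (5 + 3) >> 2 = 2 quadwords to compare; the
           // chars read past the logical end are zero padding in both objects (see the
           // assertions below), so they cannot cause a spurious mismatch.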
   1453 
   1454   // Assertions that must hold in order to compare strings 4 characters at a time.
   1455   DCHECK_ALIGNED(value_offset, 8);
   1456   static_assert(IsAligned<8>(kObjectAlignment), "String is not zero padded");
   1457 
   1458   // Loop to compare strings four characters at a time starting at the beginning of the string.
   1459   __ repe_cmpsq();
   1460   // If strings are not equal, zero flag will be cleared.
   1461   __ j(kNotEqual, &return_false);
   1462 
    1463   // Return true and exit the function.
    1464   // If the loop did not branch to return_false, the strings are equal.
   1465   __ Bind(&return_true);
   1466   __ movl(rsi, Immediate(1));
   1467   __ jmp(&end);
   1468 
   1469   // Return false and exit the function.
   1470   __ Bind(&return_false);
   1471   __ xorl(rsi, rsi);
   1472   __ Bind(&end);
   1473 }
   1474 
   1475 static void CreateStringIndexOfLocations(HInvoke* invoke,
   1476                                          ArenaAllocator* allocator,
   1477                                          bool start_at_zero) {
   1478   LocationSummary* locations = new (allocator) LocationSummary(invoke,
   1479                                                                LocationSummary::kCallOnSlowPath,
   1480                                                                kIntrinsified);
    1481   // The data needs to be in RDI for scasw, so request that the string be in that register anyway.
   1482   locations->SetInAt(0, Location::RegisterLocation(RDI));
    1483   // Even when we look for a constant char, we still have to copy it into RAX, so just
    1484   // request the allocator to do that anyway. We can still handle the constant case by
    1485   // checking the instruction's parameter explicitly.
    1486   // Note: This works because we don't clobber RAX anywhere.
   1487   locations->SetInAt(1, Location::RegisterLocation(RAX));
   1488   if (!start_at_zero) {
   1489     locations->SetInAt(2, Location::RequiresRegister());          // The starting index.
   1490   }
    1491   // As we clobber RDI during execution anyway, also use it as the output.
   1492   locations->SetOut(Location::SameAsFirstInput());
   1493 
   1494   // repne scasw uses RCX as the counter.
   1495   locations->AddTemp(Location::RegisterLocation(RCX));
   1496   // Need another temporary to be able to compute the result.
   1497   locations->AddTemp(Location::RequiresRegister());
   1498 }
   1499 
   1500 static void GenerateStringIndexOf(HInvoke* invoke,
   1501                                   X86_64Assembler* assembler,
   1502                                   CodeGeneratorX86_64* codegen,
   1503                                   ArenaAllocator* allocator,
   1504                                   bool start_at_zero) {
   1505   LocationSummary* locations = invoke->GetLocations();
   1506 
   1507   // Note that the null check must have been done earlier.
   1508   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
   1509 
   1510   CpuRegister string_obj = locations->InAt(0).AsRegister<CpuRegister>();
   1511   CpuRegister search_value = locations->InAt(1).AsRegister<CpuRegister>();
   1512   CpuRegister counter = locations->GetTemp(0).AsRegister<CpuRegister>();
   1513   CpuRegister string_length = locations->GetTemp(1).AsRegister<CpuRegister>();
   1514   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
   1515 
   1516   // Check our assumptions for registers.
   1517   DCHECK_EQ(string_obj.AsRegister(), RDI);
   1518   DCHECK_EQ(search_value.AsRegister(), RAX);
   1519   DCHECK_EQ(counter.AsRegister(), RCX);
   1520   DCHECK_EQ(out.AsRegister(), RDI);
   1521 
    1522   // Check for code points > 0xFFFF. Either emit a slow-path check when we don't know
    1523   // statically, or dispatch directly if we have a constant.
   1524   SlowPathCode* slow_path = nullptr;
   1525   if (invoke->InputAt(1)->IsIntConstant()) {
    1526     if (static_cast<uint32_t>(invoke->InputAt(1)->AsIntConstant()->GetValue()) >
    1527             std::numeric_limits<uint16_t>::max()) {
   1528       // Always needs the slow-path. We could directly dispatch to it, but this case should be
   1529       // rare, so for simplicity just put the full slow-path down and branch unconditionally.
   1530       slow_path = new (allocator) IntrinsicSlowPathX86_64(invoke);
   1531       codegen->AddSlowPath(slow_path);
   1532       __ jmp(slow_path->GetEntryLabel());
   1533       __ Bind(slow_path->GetExitLabel());
   1534       return;
   1535     }
   1536   } else {
   1537     __ cmpl(search_value, Immediate(std::numeric_limits<uint16_t>::max()));
   1538     slow_path = new (allocator) IntrinsicSlowPathX86_64(invoke);
   1539     codegen->AddSlowPath(slow_path);
   1540     __ j(kAbove, slow_path->GetEntryLabel());
   1541   }
   1542 
   1543   // From here down, we know that we are looking for a char that fits in 16 bits.
   1544   // Location of reference to data array within the String object.
   1545   int32_t value_offset = mirror::String::ValueOffset().Int32Value();
   1546   // Location of count within the String object.
   1547   int32_t count_offset = mirror::String::CountOffset().Int32Value();
   1548 
   1549   // Load string length, i.e., the count field of the string.
   1550   __ movl(string_length, Address(string_obj, count_offset));
   1551 
   1552   // Do a length check.
   1553   // TODO: Support jecxz.
   1554   NearLabel not_found_label;
   1555   __ testl(string_length, string_length);
   1556   __ j(kEqual, &not_found_label);
   1557 
   1558   if (start_at_zero) {
   1559     // Number of chars to scan is the same as the string length.
   1560     __ movl(counter, string_length);
   1561 
   1562     // Move to the start of the string.
   1563     __ addq(string_obj, Immediate(value_offset));
   1564   } else {
   1565     CpuRegister start_index = locations->InAt(2).AsRegister<CpuRegister>();
   1566 
   1567     // Do a start_index check.
   1568     __ cmpl(start_index, string_length);
   1569     __ j(kGreaterEqual, &not_found_label);
   1570 
    1571     // Ensure we have a start index >= 0.
   1572     __ xorl(counter, counter);
   1573     __ cmpl(start_index, Immediate(0));
   1574     __ cmov(kGreater, counter, start_index, /* is64bit */ false);  // 32-bit copy is enough.
   1575 
   1576     // Move to the start of the string: string_obj + value_offset + 2 * start_index.
   1577     __ leaq(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_2, value_offset));
   1578 
    1579     // Now update RCX, the work counter: it will be string.length - start_index.
   1580     __ negq(counter);  // Needs to be 64-bit negation, as the address computation is 64-bit.
   1581     __ leaq(counter, Address(string_length, counter, ScaleFactor::TIMES_1, 0));
   1582   }
   1583 
   1584   // Everything is set up for repne scasw:
   1585   //   * Comparison address in RDI.
   1586   //   * Counter in ECX.
   1587   __ repne_scasw();
   1588 
   1589   // Did we find a match?
   1590   __ j(kNotEqual, &not_found_label);
   1591 
   1592   // Yes, we matched.  Compute the index of the result.
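           // `counter` now holds the number of chars left to scan; string_length - counter
           // is the 1-based position of the match within the string (this also holds when
           // scanning started at start_index), so subtracting 1 gives the index.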
   1593   __ subl(string_length, counter);
   1594   __ leal(out, Address(string_length, -1));
   1595 
   1596   NearLabel done;
   1597   __ jmp(&done);
   1598 
   1599   // Failed to match; return -1.
   1600   __ Bind(&not_found_label);
   1601   __ movl(out, Immediate(-1));
   1602 
   1603   // And join up at the end.
   1604   __ Bind(&done);
   1605   if (slow_path != nullptr) {
   1606     __ Bind(slow_path->GetExitLabel());
   1607   }
   1608 }
   1609 
   1610 void IntrinsicLocationsBuilderX86_64::VisitStringIndexOf(HInvoke* invoke) {
   1611   CreateStringIndexOfLocations(invoke, arena_, /* start_at_zero */ true);
   1612 }
   1613 
   1614 void IntrinsicCodeGeneratorX86_64::VisitStringIndexOf(HInvoke* invoke) {
   1615   GenerateStringIndexOf(invoke, GetAssembler(), codegen_, GetAllocator(), /* start_at_zero */ true);
   1616 }
   1617 
   1618 void IntrinsicLocationsBuilderX86_64::VisitStringIndexOfAfter(HInvoke* invoke) {
   1619   CreateStringIndexOfLocations(invoke, arena_, /* start_at_zero */ false);
   1620 }
   1621 
   1622 void IntrinsicCodeGeneratorX86_64::VisitStringIndexOfAfter(HInvoke* invoke) {
   1623   GenerateStringIndexOf(
   1624       invoke, GetAssembler(), codegen_, GetAllocator(), /* start_at_zero */ false);
   1625 }
   1626 
   1627 void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromBytes(HInvoke* invoke) {
   1628   LocationSummary* locations = new (arena_) LocationSummary(invoke,
   1629                                                             LocationSummary::kCall,
   1630                                                             kIntrinsified);
   1631   InvokeRuntimeCallingConvention calling_convention;
   1632   locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
   1633   locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   1634   locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
   1635   locations->SetInAt(3, Location::RegisterLocation(calling_convention.GetRegisterAt(3)));
   1636   locations->SetOut(Location::RegisterLocation(RAX));
   1637 }
   1638 
   1639 void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromBytes(HInvoke* invoke) {
   1640   X86_64Assembler* assembler = GetAssembler();
   1641   LocationSummary* locations = invoke->GetLocations();
   1642 
   1643   CpuRegister byte_array = locations->InAt(0).AsRegister<CpuRegister>();
   1644   __ testl(byte_array, byte_array);
   1645   SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
   1646   codegen_->AddSlowPath(slow_path);
   1647   __ j(kEqual, slow_path->GetEntryLabel());
   1648 
   1649   __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pAllocStringFromBytes),
   1650                                   /* no_rip */ true));
   1651   CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>();
   1652   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
   1653   __ Bind(slow_path->GetExitLabel());
   1654 }
   1655 
   1656 void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromChars(HInvoke* invoke) {
   1657   LocationSummary* locations = new (arena_) LocationSummary(invoke,
   1658                                                             LocationSummary::kCall,
   1659                                                             kIntrinsified);
   1660   InvokeRuntimeCallingConvention calling_convention;
   1661   locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
   1662   locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   1663   locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
   1664   locations->SetOut(Location::RegisterLocation(RAX));
   1665 }
   1666 
   1667 void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromChars(HInvoke* invoke) {
   1668   X86_64Assembler* assembler = GetAssembler();
   1669 
   1670   // No need to emit code checking whether `locations->InAt(2)` is a null
   1671   // pointer, as callers of the native method
   1672   //
   1673   //   java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data)
   1674   //
   1675   // all include a null check on `data` before calling that method.
   1676   __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pAllocStringFromChars),
   1677                                   /* no_rip */ true));
   1678   CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>();
   1679   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
   1680 }
   1681 
   1682 void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromString(HInvoke* invoke) {
   1683   LocationSummary* locations = new (arena_) LocationSummary(invoke,
   1684                                                             LocationSummary::kCall,
   1685                                                             kIntrinsified);
   1686   InvokeRuntimeCallingConvention calling_convention;
   1687   locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
   1688   locations->SetOut(Location::RegisterLocation(RAX));
   1689 }
   1690 
   1691 void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromString(HInvoke* invoke) {
   1692   X86_64Assembler* assembler = GetAssembler();
   1693   LocationSummary* locations = invoke->GetLocations();
   1694 
   1695   CpuRegister string_to_copy = locations->InAt(0).AsRegister<CpuRegister>();
   1696   __ testl(string_to_copy, string_to_copy);
   1697   SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
   1698   codegen_->AddSlowPath(slow_path);
   1699   __ j(kEqual, slow_path->GetEntryLabel());
   1700 
   1701   __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pAllocStringFromString),
   1702                                   /* no_rip */ true));
   1703   CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>();
   1704   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
   1705   __ Bind(slow_path->GetExitLabel());
   1706 }
   1707 
   1708 void IntrinsicLocationsBuilderX86_64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
   1709   // public void getChars(int srcBegin, int srcEnd, char[] dst, int dstBegin);
   1710   LocationSummary* locations = new (arena_) LocationSummary(invoke,
   1711                                                             LocationSummary::kNoCall,
   1712                                                             kIntrinsified);
   1713   locations->SetInAt(0, Location::RequiresRegister());
   1714   locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1)));
   1715   locations->SetInAt(2, Location::RequiresRegister());
   1716   locations->SetInAt(3, Location::RequiresRegister());
   1717   locations->SetInAt(4, Location::RequiresRegister());
   1718 
   1719   // And we need some temporaries.  We will use REP MOVSW, so we need fixed registers.
   1720   locations->AddTemp(Location::RegisterLocation(RSI));
   1721   locations->AddTemp(Location::RegisterLocation(RDI));
   1722   locations->AddTemp(Location::RegisterLocation(RCX));
   1723 }
   1724 
   1725 void IntrinsicCodeGeneratorX86_64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
   1726   X86_64Assembler* assembler = GetAssembler();
   1727   LocationSummary* locations = invoke->GetLocations();
   1728 
   1729   size_t char_component_size = Primitive::ComponentSize(Primitive::kPrimChar);
   1730   // Location of data in char array buffer.
   1731   const uint32_t data_offset = mirror::Array::DataOffset(char_component_size).Uint32Value();
   1732   // Location of char array data in string.
   1733   const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
   1734 
   1735   // public void getChars(int srcBegin, int srcEnd, char[] dst, int dstBegin);
   1736   CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
   1737   Location srcBegin = locations->InAt(1);
   1738   int srcBegin_value =
   1739     srcBegin.IsConstant() ? srcBegin.GetConstant()->AsIntConstant()->GetValue() : 0;
   1740   CpuRegister srcEnd = locations->InAt(2).AsRegister<CpuRegister>();
   1741   CpuRegister dst = locations->InAt(3).AsRegister<CpuRegister>();
   1742   CpuRegister dstBegin = locations->InAt(4).AsRegister<CpuRegister>();
   1743 
   1744   // Check assumption that sizeof(Char) is 2 (used in scaling below).
   1745   const size_t char_size = Primitive::ComponentSize(Primitive::kPrimChar);
   1746   DCHECK_EQ(char_size, 2u);
   1747 
   1748   // Compute the address of the destination buffer.
   1749   __ leaq(CpuRegister(RDI), Address(dst, dstBegin, ScaleFactor::TIMES_2, data_offset));
   1750 
   1751   // Compute the address of the source string.
   1752   if (srcBegin.IsConstant()) {
   1753     // Compute the address of the source string by adding the number of chars from
   1754     // the source beginning to the value offset of a string.
   1755     __ leaq(CpuRegister(RSI), Address(obj, srcBegin_value * char_size + value_offset));
   1756   } else {
   1757     __ leaq(CpuRegister(RSI), Address(obj, srcBegin.AsRegister<CpuRegister>(),
   1758                                       ScaleFactor::TIMES_2, value_offset));
   1759   }
   1760 
   1761   // Compute the number of chars (words) to move.
   1762   __ movl(CpuRegister(RCX), srcEnd);
   1763   if (srcBegin.IsConstant()) {
   1764     if (srcBegin_value != 0) {
   1765       __ subl(CpuRegister(RCX), Immediate(srcBegin_value));
   1766     }
   1767   } else {
   1768     DCHECK(srcBegin.IsRegister());
   1769     __ subl(CpuRegister(RCX), srcBegin.AsRegister<CpuRegister>());
   1770   }
   1771 
   1772   // Do the move.
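           // rep movsw copies RCX 16-bit words from [RSI] to [RDI], advancing both
           // pointers as it goes.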
   1773   __ rep_movsw();
   1774 }
   1775 
   1776 static void GenPeek(LocationSummary* locations, Primitive::Type size, X86_64Assembler* assembler) {
   1777   CpuRegister address = locations->InAt(0).AsRegister<CpuRegister>();
   1778   CpuRegister out = locations->Out().AsRegister<CpuRegister>();  // == address, here for clarity.
   1779   // x86 allows unaligned access. We do not have to check the input or use specific instructions
   1780   // to avoid a SIGBUS.
   1781   switch (size) {
   1782     case Primitive::kPrimByte:
   1783       __ movsxb(out, Address(address, 0));
   1784       break;
   1785     case Primitive::kPrimShort:
   1786       __ movsxw(out, Address(address, 0));
   1787       break;
   1788     case Primitive::kPrimInt:
   1789       __ movl(out, Address(address, 0));
   1790       break;
   1791     case Primitive::kPrimLong:
   1792       __ movq(out, Address(address, 0));
   1793       break;
   1794     default:
   1795       LOG(FATAL) << "Type not recognized for peek: " << size;
   1796       UNREACHABLE();
   1797   }
   1798 }
   1799 
   1800 void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekByte(HInvoke* invoke) {
   1801   CreateIntToIntLocations(arena_, invoke);
   1802 }
   1803 
   1804 void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekByte(HInvoke* invoke) {
   1805   GenPeek(invoke->GetLocations(), Primitive::kPrimByte, GetAssembler());
   1806 }
   1807 
   1808 void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekIntNative(HInvoke* invoke) {
   1809   CreateIntToIntLocations(arena_, invoke);
   1810 }
   1811 
   1812 void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekIntNative(HInvoke* invoke) {
   1813   GenPeek(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler());
   1814 }
   1815 
   1816 void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekLongNative(HInvoke* invoke) {
   1817   CreateIntToIntLocations(arena_, invoke);
   1818 }
   1819 
   1820 void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekLongNative(HInvoke* invoke) {
   1821   GenPeek(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler());
   1822 }
   1823 
   1824 void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekShortNative(HInvoke* invoke) {
   1825   CreateIntToIntLocations(arena_, invoke);
   1826 }
   1827 
   1828 void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekShortNative(HInvoke* invoke) {
   1829   GenPeek(invoke->GetLocations(), Primitive::kPrimShort, GetAssembler());
   1830 }
   1831 
   1832 static void CreateIntIntToVoidLocations(ArenaAllocator* arena, HInvoke* invoke) {
   1833   LocationSummary* locations = new (arena) LocationSummary(invoke,
   1834                                                            LocationSummary::kNoCall,
   1835                                                            kIntrinsified);
   1836   locations->SetInAt(0, Location::RequiresRegister());
   1837   locations->SetInAt(1, Location::RegisterOrInt32Constant(invoke->InputAt(1)));
   1838 }
   1839 
   1840 static void GenPoke(LocationSummary* locations, Primitive::Type size, X86_64Assembler* assembler) {
   1841   CpuRegister address = locations->InAt(0).AsRegister<CpuRegister>();
   1842   Location value = locations->InAt(1);
   1843   // x86 allows unaligned access. We do not have to check the input or use specific instructions
   1844   // to avoid a SIGBUS.
   1845   switch (size) {
   1846     case Primitive::kPrimByte:
   1847       if (value.IsConstant()) {
   1848         __ movb(Address(address, 0),
   1849                 Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant())));
   1850       } else {
   1851         __ movb(Address(address, 0), value.AsRegister<CpuRegister>());
   1852       }
   1853       break;
   1854     case Primitive::kPrimShort:
   1855       if (value.IsConstant()) {
   1856         __ movw(Address(address, 0),
   1857                 Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant())));
   1858       } else {
   1859         __ movw(Address(address, 0), value.AsRegister<CpuRegister>());
   1860       }
   1861       break;
   1862     case Primitive::kPrimInt:
   1863       if (value.IsConstant()) {
   1864         __ movl(Address(address, 0),
   1865                 Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant())));
   1866       } else {
   1867         __ movl(Address(address, 0), value.AsRegister<CpuRegister>());
   1868       }
   1869       break;
   1870     case Primitive::kPrimLong:
   1871       if (value.IsConstant()) {
   1872         int64_t v = value.GetConstant()->AsLongConstant()->GetValue();
   1873         DCHECK(IsInt<32>(v));
   1874         int32_t v_32 = v;
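                 // movq sign-extends its 32-bit immediate to 64 bits, so only
                 // int32-representable constants may reach this path (the locations
                 // builder only allows int32 constants here).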
   1875         __ movq(Address(address, 0), Immediate(v_32));
   1876       } else {
   1877         __ movq(Address(address, 0), value.AsRegister<CpuRegister>());
   1878       }
   1879       break;
   1880     default:
   1881       LOG(FATAL) << "Type not recognized for poke: " << size;
   1882       UNREACHABLE();
   1883   }
   1884 }
   1885 
   1886 void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeByte(HInvoke* invoke) {
   1887   CreateIntIntToVoidLocations(arena_, invoke);
   1888 }
   1889 
   1890 void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeByte(HInvoke* invoke) {
   1891   GenPoke(invoke->GetLocations(), Primitive::kPrimByte, GetAssembler());
   1892 }
   1893 
   1894 void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeIntNative(HInvoke* invoke) {
   1895   CreateIntIntToVoidLocations(arena_, invoke);
   1896 }
   1897 
   1898 void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeIntNative(HInvoke* invoke) {
   1899   GenPoke(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler());
   1900 }
   1901 
   1902 void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeLongNative(HInvoke* invoke) {
   1903   CreateIntIntToVoidLocations(arena_, invoke);
   1904 }
   1905 
   1906 void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeLongNative(HInvoke* invoke) {
   1907   GenPoke(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler());
   1908 }
   1909 
   1910 void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeShortNative(HInvoke* invoke) {
   1911   CreateIntIntToVoidLocations(arena_, invoke);
   1912 }
   1913 
   1914 void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeShortNative(HInvoke* invoke) {
   1915   GenPoke(invoke->GetLocations(), Primitive::kPrimShort, GetAssembler());
   1916 }
   1917 
   1918 void IntrinsicLocationsBuilderX86_64::VisitThreadCurrentThread(HInvoke* invoke) {
   1919   LocationSummary* locations = new (arena_) LocationSummary(invoke,
   1920                                                             LocationSummary::kNoCall,
   1921                                                             kIntrinsified);
   1922   locations->SetOut(Location::RequiresRegister());
   1923 }
   1924 
   1925 void IntrinsicCodeGeneratorX86_64::VisitThreadCurrentThread(HInvoke* invoke) {
   1926   CpuRegister out = invoke->GetLocations()->Out().AsRegister<CpuRegister>();
   1927   GetAssembler()->gs()->movl(out, Address::Absolute(Thread::PeerOffset<kX86_64WordSize>(),
   1928                                                     /* no_rip */ true));
   1929 }
   1930 
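         // Plain and volatile loads share the code below: on x86-64 an ordinary load
         // already provides the acquire semantics a volatile read needs, so `is_volatile`
         // requires no extra instructions here.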
   1931 static void GenUnsafeGet(HInvoke* invoke,
   1932                          Primitive::Type type,
   1933                          bool is_volatile ATTRIBUTE_UNUSED,
   1934                          CodeGeneratorX86_64* codegen) {
   1935   X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
   1936   LocationSummary* locations = invoke->GetLocations();
   1937   Location base_loc = locations->InAt(1);
   1938   CpuRegister base = base_loc.AsRegister<CpuRegister>();
   1939   Location offset_loc = locations->InAt(2);
   1940   CpuRegister offset = offset_loc.AsRegister<CpuRegister>();
   1941   Location output_loc = locations->Out();
   1942   CpuRegister output = output_loc.AsRegister<CpuRegister>();
   1943 
   1944   switch (type) {
   1945     case Primitive::kPrimInt:
   1946       __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
   1947       break;
   1948 
   1949     case Primitive::kPrimNot: {
   1950       if (kEmitCompilerReadBarrier) {
   1951         if (kUseBakerReadBarrier) {
   1952           Location temp = locations->GetTemp(0);
   1953           codegen->GenerateArrayLoadWithBakerReadBarrier(
   1954               invoke, output_loc, base, 0U, offset_loc, temp, /* needs_null_check */ false);
   1955         } else {
   1956           __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
   1957           codegen->GenerateReadBarrierSlow(
   1958               invoke, output_loc, output_loc, base_loc, 0U, offset_loc);
   1959         }
   1960       } else {
   1961         __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
   1962         __ MaybeUnpoisonHeapReference(output);
   1963       }
   1964       break;
   1965     }
   1966 
   1967     case Primitive::kPrimLong:
   1968       __ movq(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
   1969       break;
   1970 
   1971     default:
   1972       LOG(FATAL) << "Unsupported op size " << type;
   1973       UNREACHABLE();
   1974   }
   1975 }
   1976 
   1977 static void CreateIntIntIntToIntLocations(ArenaAllocator* arena,
   1978                                           HInvoke* invoke,
   1979                                           Primitive::Type type) {
   1980   bool can_call = kEmitCompilerReadBarrier &&
   1981       (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject ||
   1982        invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile);
   1983   LocationSummary* locations = new (arena) LocationSummary(invoke,
   1984                                                            can_call ?
   1985                                                                LocationSummary::kCallOnSlowPath :
   1986                                                                LocationSummary::kNoCall,
   1987                                                            kIntrinsified);
   1988   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   1989   locations->SetInAt(1, Location::RequiresRegister());
   1990   locations->SetInAt(2, Location::RequiresRegister());
   1991   locations->SetOut(Location::RequiresRegister());
   1992   if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
   1993     // We need a temporary register for the read barrier marking slow
   1994     // path in InstructionCodeGeneratorX86_64::GenerateArrayLoadWithBakerReadBarrier.
   1995     locations->AddTemp(Location::RequiresRegister());
   1996   }
   1997 }
   1998 
   1999 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGet(HInvoke* invoke) {
   2000   CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimInt);
   2001 }
   2002 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetVolatile(HInvoke* invoke) {
   2003   CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimInt);
   2004 }
   2005 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetLong(HInvoke* invoke) {
   2006   CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimLong);
   2007 }
   2008 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
   2009   CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimLong);
   2010 }
   2011 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObject(HInvoke* invoke) {
   2012   CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimNot);
   2013 }
   2014 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
   2015   CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimNot);
   2016 }
   2017 
   2018 
   2019 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGet(HInvoke* invoke) {
   2020   GenUnsafeGet(invoke, Primitive::kPrimInt, /* is_volatile */ false, codegen_);
   2021 }
   2022 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetVolatile(HInvoke* invoke) {
   2023   GenUnsafeGet(invoke, Primitive::kPrimInt, /* is_volatile */ true, codegen_);
   2024 }
   2025 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetLong(HInvoke* invoke) {
   2026   GenUnsafeGet(invoke, Primitive::kPrimLong, /* is_volatile */ false, codegen_);
   2027 }
   2028 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
   2029   GenUnsafeGet(invoke, Primitive::kPrimLong, /* is_volatile */ true, codegen_);
   2030 }
   2031 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetObject(HInvoke* invoke) {
   2032   GenUnsafeGet(invoke, Primitive::kPrimNot, /* is_volatile */ false, codegen_);
   2033 }
   2034 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
   2035   GenUnsafeGet(invoke, Primitive::kPrimNot, /* is_volatile */ true, codegen_);
   2036 }
   2037 
   2038 
   2039 static void CreateIntIntIntIntToVoidPlusTempsLocations(ArenaAllocator* arena,
   2040                                                        Primitive::Type type,
   2041                                                        HInvoke* invoke) {
   2042   LocationSummary* locations = new (arena) LocationSummary(invoke,
   2043                                                            LocationSummary::kNoCall,
   2044                                                            kIntrinsified);
   2045   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   2046   locations->SetInAt(1, Location::RequiresRegister());
   2047   locations->SetInAt(2, Location::RequiresRegister());
   2048   locations->SetInAt(3, Location::RequiresRegister());
   2049   if (type == Primitive::kPrimNot) {
   2050     // Need temp registers for card-marking.
   2051     locations->AddTemp(Location::RequiresRegister());  // Possibly used for reference poisoning too.
   2052     locations->AddTemp(Location::RequiresRegister());
   2053   }
   2054 }
   2055 
   2056 void IntrinsicLocationsBuilderX86_64::VisitUnsafePut(HInvoke* invoke) {
   2057   CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimInt, invoke);
   2058 }
   2059 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutOrdered(HInvoke* invoke) {
   2060   CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimInt, invoke);
   2061 }
   2062 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutVolatile(HInvoke* invoke) {
   2063   CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimInt, invoke);
   2064 }
   2065 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObject(HInvoke* invoke) {
   2066   CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimNot, invoke);
   2067 }
   2068 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
   2069   CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimNot, invoke);
   2070 }
   2071 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
   2072   CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimNot, invoke);
   2073 }
   2074 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLong(HInvoke* invoke) {
   2075   CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimLong, invoke);
   2076 }
   2077 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
   2078   CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimLong, invoke);
   2079 }
   2080 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
   2081   CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimLong, invoke);
   2082 }
   2083 
    2084 // Ordered writes need no special handling here: they require an AnyStore barrier,
    2085 // which the x86 memory model already provides.
   2086 static void GenUnsafePut(LocationSummary* locations, Primitive::Type type, bool is_volatile,
   2087                          CodeGeneratorX86_64* codegen) {
   2088   X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
   2089   CpuRegister base = locations->InAt(1).AsRegister<CpuRegister>();
   2090   CpuRegister offset = locations->InAt(2).AsRegister<CpuRegister>();
   2091   CpuRegister value = locations->InAt(3).AsRegister<CpuRegister>();
   2092 
   2093   if (type == Primitive::kPrimLong) {
   2094     __ movq(Address(base, offset, ScaleFactor::TIMES_1, 0), value);
   2095   } else if (kPoisonHeapReferences && type == Primitive::kPrimNot) {
   2096     CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();
   2097     __ movl(temp, value);
   2098     __ PoisonHeapReference(temp);
   2099     __ movl(Address(base, offset, ScaleFactor::TIMES_1, 0), temp);
   2100   } else {
   2101     __ movl(Address(base, offset, ScaleFactor::TIMES_1, 0), value);
   2102   }
   2103 
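           // A volatile store additionally needs a StoreLoad barrier after the write;
           // MemoryFence() provides one (an mfence, or a locked no-op add, depending on
           // which the CPU prefers).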
   2104   if (is_volatile) {
   2105     codegen->MemoryFence();
   2106   }
   2107 
   2108   if (type == Primitive::kPrimNot) {
   2109     bool value_can_be_null = true;  // TODO: Worth finding out this information?
   2110     codegen->MarkGCCard(locations->GetTemp(0).AsRegister<CpuRegister>(),
   2111                         locations->GetTemp(1).AsRegister<CpuRegister>(),
   2112                         base,
   2113                         value,
   2114                         value_can_be_null);
   2115   }
   2116 }
   2117 
   2118 void IntrinsicCodeGeneratorX86_64::VisitUnsafePut(HInvoke* invoke) {
   2119   GenUnsafePut(invoke->GetLocations(), Primitive::kPrimInt, /* is_volatile */ false, codegen_);
   2120 }
   2121 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutOrdered(HInvoke* invoke) {
   2122   GenUnsafePut(invoke->GetLocations(), Primitive::kPrimInt, /* is_volatile */ false, codegen_);
   2123 }
   2124 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutVolatile(HInvoke* invoke) {
   2125   GenUnsafePut(invoke->GetLocations(), Primitive::kPrimInt, /* is_volatile */ true, codegen_);
   2126 }
   2127 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObject(HInvoke* invoke) {
   2128   GenUnsafePut(invoke->GetLocations(), Primitive::kPrimNot, /* is_volatile */ false, codegen_);
   2129 }
   2130 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
   2131   GenUnsafePut(invoke->GetLocations(), Primitive::kPrimNot, /* is_volatile */ false, codegen_);
   2132 }
   2133 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
   2134   GenUnsafePut(invoke->GetLocations(), Primitive::kPrimNot, /* is_volatile */ true, codegen_);
   2135 }
   2136 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLong(HInvoke* invoke) {
   2137   GenUnsafePut(invoke->GetLocations(), Primitive::kPrimLong, /* is_volatile */ false, codegen_);
   2138 }
   2139 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
   2140   GenUnsafePut(invoke->GetLocations(), Primitive::kPrimLong, /* is_volatile */ false, codegen_);
   2141 }
   2142 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
   2143   GenUnsafePut(invoke->GetLocations(), Primitive::kPrimLong, /* is_volatile */ true, codegen_);
   2144 }
   2145 
   2146 static void CreateIntIntIntIntIntToInt(ArenaAllocator* arena, Primitive::Type type,
   2147                                        HInvoke* invoke) {
   2148   LocationSummary* locations = new (arena) LocationSummary(invoke,
   2149                                                            LocationSummary::kNoCall,
   2150                                                            kIntrinsified);
   2151   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   2152   locations->SetInAt(1, Location::RequiresRegister());
   2153   locations->SetInAt(2, Location::RequiresRegister());
    2154   // The expected value must be in EAX/RAX.
   2155   locations->SetInAt(3, Location::RegisterLocation(RAX));
   2156   locations->SetInAt(4, Location::RequiresRegister());
   2157 
   2158   locations->SetOut(Location::RequiresRegister());
   2159   if (type == Primitive::kPrimNot) {
   2160     // Need temp registers for card-marking.
   2161     locations->AddTemp(Location::RequiresRegister());  // Possibly used for reference poisoning too.
   2162     locations->AddTemp(Location::RequiresRegister());
   2163   }
   2164 }
   2165 
   2166 void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASInt(HInvoke* invoke) {
   2167   CreateIntIntIntIntIntToInt(arena_, Primitive::kPrimInt, invoke);
   2168 }
   2169 
   2170 void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASLong(HInvoke* invoke) {
   2171   CreateIntIntIntIntIntToInt(arena_, Primitive::kPrimLong, invoke);
   2172 }
   2173 
   2174 void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASObject(HInvoke* invoke) {
   2175   // The UnsafeCASObject intrinsic is missing a read barrier, and
   2176   // therefore sometimes does not work as expected (b/25883050).
   2177   // Turn it off temporarily as a quick fix, until the read barrier is
   2178   // implemented.
   2179   //
   2180   // TODO(rpl): Implement a read barrier in GenCAS below and re-enable
   2181   // this intrinsic.
   2182   if (kEmitCompilerReadBarrier) {
   2183     return;
   2184   }
   2185 
   2186   CreateIntIntIntIntIntToInt(arena_, Primitive::kPrimNot, invoke);
   2187 }
   2188 
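         // Emits a compare-and-swap: atomically store `value` at base + offset if the field
         // still equals `expected`, setting the output to whether the swap succeeded.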
   2189 static void GenCAS(Primitive::Type type, HInvoke* invoke, CodeGeneratorX86_64* codegen) {
   2190   X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
   2191   LocationSummary* locations = invoke->GetLocations();
   2192 
   2193   CpuRegister base = locations->InAt(1).AsRegister<CpuRegister>();
   2194   CpuRegister offset = locations->InAt(2).AsRegister<CpuRegister>();
   2195   CpuRegister expected = locations->InAt(3).AsRegister<CpuRegister>();
   2196   // Ensure `expected` is in RAX (required by the CMPXCHG instruction).
   2197   DCHECK_EQ(expected.AsRegister(), RAX);
   2198   CpuRegister value = locations->InAt(4).AsRegister<CpuRegister>();
   2199   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
   2200 
   2201   if (type == Primitive::kPrimNot) {
   2202     // Mark card for object assuming new value is stored.
   2203     bool value_can_be_null = true;  // TODO: Worth finding out this information?
   2204     codegen->MarkGCCard(locations->GetTemp(0).AsRegister<CpuRegister>(),
   2205                         locations->GetTemp(1).AsRegister<CpuRegister>(),
   2206                         base,
   2207                         value,
   2208                         value_can_be_null);
   2209 
   2210     bool base_equals_value = (base.AsRegister() == value.AsRegister());
   2211     Register value_reg = value.AsRegister();
   2212     if (kPoisonHeapReferences) {
   2213       if (base_equals_value) {
   2214         // If `base` and `value` are the same register location, move
   2215         // `value_reg` to a temporary register.  This way, poisoning
   2216         // `value_reg` won't invalidate `base`.
   2217         value_reg = locations->GetTemp(0).AsRegister<CpuRegister>().AsRegister();
   2218         __ movl(CpuRegister(value_reg), base);
   2219       }
   2220 
   2221       // Check that the register allocator did not assign the location
   2222       // of `expected` (RAX) to `value` nor to `base`, so that heap
   2223       // poisoning (when enabled) works as intended below.
   2224       // - If `value` were equal to `expected`, both references would
   2225       //   be poisoned twice, meaning they would not be poisoned at
   2226       //   all, as heap poisoning uses address negation.
   2227       // - If `base` were equal to `expected`, poisoning `expected`
   2228       //   would invalidate `base`.
   2229       DCHECK_NE(value_reg, expected.AsRegister());
   2230       DCHECK_NE(base.AsRegister(), expected.AsRegister());
   2231 
   2232       __ PoisonHeapReference(expected);
   2233       __ PoisonHeapReference(CpuRegister(value_reg));
   2234     }
   2235 
   2236     // TODO: Add a read barrier for the reference stored in the object
   2237     // before attempting the CAS, similar to the one in the
   2238     // art::Unsafe_compareAndSwapObject JNI implementation.
   2239     //
   2240     // Note that this code is not (yet) used when read barriers are
   2241     // enabled (see IntrinsicLocationsBuilderX86_64::VisitUnsafeCASObject).
   2242     DCHECK(!kEmitCompilerReadBarrier);
   2243     __ LockCmpxchgl(Address(base, offset, TIMES_1, 0), CpuRegister(value_reg));
   2244 
   2245     // LOCK CMPXCHG has full barrier semantics, and we don't need
   2246     // scheduling barriers at this time.
   2247 
   2248     // Convert ZF into the boolean result.
   2249     __ setcc(kZero, out);
   2250     __ movzxb(out, out);
   2251 
   2252     // If heap poisoning is enabled, we need to unpoison the values
   2253     // that were poisoned earlier.
   2254     if (kPoisonHeapReferences) {
   2255       if (base_equals_value) {
   2256         // `value_reg` has been moved to a temporary register, no need
   2257         // to unpoison it.
   2258       } else {
   2259         // Ensure `value` is different from `out`, so that unpoisoning
   2260         // the former does not invalidate the latter.
   2261         DCHECK_NE(value_reg, out.AsRegister());
   2262         __ UnpoisonHeapReference(CpuRegister(value_reg));
   2263       }
   2264       // Ensure `expected` is different from `out`, so that unpoisoning
   2265       // the former does not invalidate the latter.
   2266       DCHECK_NE(expected.AsRegister(), out.AsRegister());
   2267       __ UnpoisonHeapReference(expected);
   2268     }
   2269   } else {
   2270     if (type == Primitive::kPrimInt) {
   2271       __ LockCmpxchgl(Address(base, offset, TIMES_1, 0), value);
   2272     } else if (type == Primitive::kPrimLong) {
   2273       __ LockCmpxchgq(Address(base, offset, TIMES_1, 0), value);
   2274     } else {
   2275       LOG(FATAL) << "Unexpected CAS type " << type;
   2276     }
   2277 
   2278     // LOCK CMPXCHG has full barrier semantics, and we don't need
   2279     // scheduling barriers at this time.
   2280 
   2281     // Convert ZF into the boolean result.
   2282     __ setcc(kZero, out);
   2283     __ movzxb(out, out);
   2284   }
   2285 }
   2286 
   2287 void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASInt(HInvoke* invoke) {
   2288   GenCAS(Primitive::kPrimInt, invoke, codegen_);
   2289 }
   2290 
   2291 void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASLong(HInvoke* invoke) {
   2292   GenCAS(Primitive::kPrimLong, invoke, codegen_);
   2293 }
   2294 
   2295 void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASObject(HInvoke* invoke) {
   2296   GenCAS(Primitive::kPrimNot, invoke, codegen_);
   2297 }
   2298 
   2299 void IntrinsicLocationsBuilderX86_64::VisitIntegerReverse(HInvoke* invoke) {
   2300   LocationSummary* locations = new (arena_) LocationSummary(invoke,
    2301                                                             LocationSummary::kNoCall,
    2302                                                             kIntrinsified);
   2303   locations->SetInAt(0, Location::RequiresRegister());
   2304   locations->SetOut(Location::SameAsFirstInput());
   2305   locations->AddTemp(Location::RequiresRegister());
   2306 }
   2307 
   2308 static void SwapBits(CpuRegister reg, CpuRegister temp, int32_t shift, int32_t mask,
   2309                      X86_64Assembler* assembler) {
   2310   Immediate imm_shift(shift);
   2311   Immediate imm_mask(mask);
   2312   __ movl(temp, reg);
   2313   __ shrl(reg, imm_shift);
   2314   __ andl(temp, imm_mask);
   2315   __ andl(reg, imm_mask);
   2316   __ shll(temp, imm_shift);
   2317   __ orl(reg, temp);
   2318 }
   2319 
   2320 void IntrinsicCodeGeneratorX86_64::VisitIntegerReverse(HInvoke* invoke) {
   2321   X86_64Assembler* assembler = GetAssembler();
   2322   LocationSummary* locations = invoke->GetLocations();
   2323 
   2324   CpuRegister reg = locations->InAt(0).AsRegister<CpuRegister>();
   2325   CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();
   2326 
   2327   /*
    2328    * Use one bswap instruction to reverse the byte order first, then use 3 rounds of
    2329    * bit swapping to reverse the bits in a number x. Using bswap saves instructions
    2330    * compared to the generic luni implementation, which needs 5 rounds of bit swapping.
   2331    * x = bswap x
   2332    * x = (x & 0x55555555) << 1 | (x >> 1) & 0x55555555;
   2333    * x = (x & 0x33333333) << 2 | (x >> 2) & 0x33333333;
   2334    * x = (x & 0x0F0F0F0F) << 4 | (x >> 4) & 0x0F0F0F0F;
   2335    */
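           // For example, reversing 0x80000000: bswapl yields 0x00000080, and the three
           // rounds below then produce 0x40, 0x10, and finally 0x00000001.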
   2336   __ bswapl(reg);
   2337   SwapBits(reg, temp, 1, 0x55555555, assembler);
   2338   SwapBits(reg, temp, 2, 0x33333333, assembler);
   2339   SwapBits(reg, temp, 4, 0x0f0f0f0f, assembler);
   2340 }
   2341 
   2342 void IntrinsicLocationsBuilderX86_64::VisitLongReverse(HInvoke* invoke) {
   2343   LocationSummary* locations = new (arena_) LocationSummary(invoke,
    2344                                                             LocationSummary::kNoCall,
    2345                                                             kIntrinsified);
   2346   locations->SetInAt(0, Location::RequiresRegister());
   2347   locations->SetOut(Location::SameAsFirstInput());
   2348   locations->AddTemp(Location::RequiresRegister());
   2349   locations->AddTemp(Location::RequiresRegister());
   2350 }
   2351 
   2352 static void SwapBits64(CpuRegister reg, CpuRegister temp, CpuRegister temp_mask,
   2353                        int32_t shift, int64_t mask, X86_64Assembler* assembler) {
   2354   Immediate imm_shift(shift);
   2355   __ movq(temp_mask, Immediate(mask));
   2356   __ movq(temp, reg);
   2357   __ shrq(reg, imm_shift);
   2358   __ andq(temp, temp_mask);
   2359   __ andq(reg, temp_mask);
   2360   __ shlq(temp, imm_shift);
   2361   __ orq(reg, temp);
   2362 }
   2363 
   2364 void IntrinsicCodeGeneratorX86_64::VisitLongReverse(HInvoke* invoke) {
   2365   X86_64Assembler* assembler = GetAssembler();
   2366   LocationSummary* locations = invoke->GetLocations();
   2367 
   2368   CpuRegister reg = locations->InAt(0).AsRegister<CpuRegister>();
   2369   CpuRegister temp1 = locations->GetTemp(0).AsRegister<CpuRegister>();
   2370   CpuRegister temp2 = locations->GetTemp(1).AsRegister<CpuRegister>();
   2371 
   2372   /*
   2373    * Use one bswap instruction to reverse byte order first and then use 3 rounds of
   2374    * swapping bits to reverse bits in a long number x. Using bswap to save instructions
   2375    * compared to generic luni implementation which has 5 rounds of swapping bits.
   2376    * x = bswap x
   2377    * x = (x & 0x5555555555555555) << 1 | (x >> 1) & 0x5555555555555555;
   2378    * x = (x & 0x3333333333333333) << 2 | (x >> 2) & 0x3333333333333333;
   2379    * x = (x & 0x0F0F0F0F0F0F0F0F) << 4 | (x >> 4) & 0x0F0F0F0F0F0F0F0F;
   2380    */
   2381   __ bswapq(reg);
   2382   SwapBits64(reg, temp1, temp2, 1, INT64_C(0x5555555555555555), assembler);
   2383   SwapBits64(reg, temp1, temp2, 2, INT64_C(0x3333333333333333), assembler);
   2384   SwapBits64(reg, temp1, temp2, 4, INT64_C(0x0f0f0f0f0f0f0f0f), assembler);
   2385 }
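
// Illustrative sketch, same caveats as above: a hypothetical 64-bit model of
// bswapq + three SwapBits64 rounds, reusing BSwap32Model from the sketch above.
static constexpr uint64_t SwapBits64Model(uint64_t x, int shift, uint64_t mask) {
  return ((x & mask) << shift) | ((x >> shift) & mask);
}
static constexpr uint64_t BSwap64Model(uint64_t x) {
  // What bswapq does: swap the two halves, byte-reversing each.
  return (static_cast<uint64_t>(BSwap32Model(static_cast<uint32_t>(x))) << 32) |
      BSwap32Model(static_cast<uint32_t>(x >> 32));
}
static constexpr uint64_t ModelLongReverse(uint64_t x) {
  return SwapBits64Model(SwapBits64Model(SwapBits64Model(
      BSwap64Model(x), 1, UINT64_C(0x5555555555555555)),
      2, UINT64_C(0x3333333333333333)), 4, UINT64_C(0x0f0f0f0f0f0f0f0f));
}
static_assert(ModelLongReverse(1u) == UINT64_C(0x8000000000000000), "Long.reverse(1)");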

static void CreateBitCountLocations(
    ArenaAllocator* arena, CodeGeneratorX86_64* codegen, HInvoke* invoke) {
  if (!codegen->GetInstructionSetFeatures().HasPopCnt()) {
    // Do nothing if there is no popcnt support: the invoke is then compiled
    // as a regular call rather than as inline code.
    return;
  }
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::Any());
  locations->SetOut(Location::RequiresRegister());
}

static void GenBitCount(X86_64Assembler* assembler,
                        CodeGeneratorX86_64* codegen,
                        HInvoke* invoke,
                        bool is_long) {
  LocationSummary* locations = invoke->GetLocations();
  Location src = locations->InAt(0);
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  if (invoke->InputAt(0)->IsConstant()) {
    // Evaluate this at compile time.
    int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
    int32_t result = is_long
        ? POPCOUNT(static_cast<uint64_t>(value))
        : POPCOUNT(static_cast<uint32_t>(value));
    codegen->Load32BitValue(out, result);
    return;
  }

  if (src.IsRegister()) {
    if (is_long) {
      __ popcntq(out, src.AsRegister<CpuRegister>());
    } else {
      __ popcntl(out, src.AsRegister<CpuRegister>());
    }
  } else if (is_long) {
    DCHECK(src.IsDoubleStackSlot());
    __ popcntq(out, Address(CpuRegister(RSP), src.GetStackIndex()));
  } else {
    DCHECK(src.IsStackSlot());
    __ popcntl(out, Address(CpuRegister(RSP), src.GetStackIndex()));
  }
}
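
// Illustrative sketch, not used by the code generator: a hypothetical constexpr
// model of what popcntl/popcntq (and the POPCOUNT constant folding above) compute.
static constexpr int ModelBitCount(uint64_t x) {
  // Count set bits one at a time, lowest first.
  return (x == 0) ? 0 : static_cast<int>(x & 1) + ModelBitCount(x >> 1);
}
static_assert(ModelBitCount(UINT64_C(0xff00ff00ff00ff00)) == 32, "Long.bitCount");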

void IntrinsicLocationsBuilderX86_64::VisitIntegerBitCount(HInvoke* invoke) {
  CreateBitCountLocations(arena_, codegen_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitIntegerBitCount(HInvoke* invoke) {
  GenBitCount(GetAssembler(), codegen_, invoke, /* is_long */ false);
}

void IntrinsicLocationsBuilderX86_64::VisitLongBitCount(HInvoke* invoke) {
  CreateBitCountLocations(arena_, codegen_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitLongBitCount(HInvoke* invoke) {
  GenBitCount(GetAssembler(), codegen_, invoke, /* is_long */ true);
}

static void CreateOneBitLocations(ArenaAllocator* arena, HInvoke* invoke, bool is_high) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::Any());
  locations->SetOut(Location::RequiresRegister());
  locations->AddTemp(is_high ? Location::RegisterLocation(RCX)  // needs CL (shift count)
                             : Location::RequiresRegister());   // any register will do
}

static void GenOneBit(X86_64Assembler* assembler,
                      CodeGeneratorX86_64* codegen,
                      HInvoke* invoke,
                      bool is_high, bool is_long) {
  LocationSummary* locations = invoke->GetLocations();
  Location src = locations->InAt(0);
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  if (invoke->InputAt(0)->IsConstant()) {
    // Evaluate this at compile time.
    int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
    if (value == 0) {
      __ xorl(out, out);  // Clears upper bits too.
      return;
    }
    // Nonzero value.
    if (is_high) {
      value = is_long ? 63 - CLZ(static_cast<uint64_t>(value))
                      : 31 - CLZ(static_cast<uint32_t>(value));
    } else {
      value = is_long ? CTZ(static_cast<uint64_t>(value))
                      : CTZ(static_cast<uint32_t>(value));
    }
    if (is_long) {
      codegen->Load64BitValue(out, 1L << value);
    } else {
      codegen->Load32BitValue(out, 1 << value);
    }
    return;
  }

  // Handle the non-constant cases.
  CpuRegister tmp = locations->GetTemp(0).AsRegister<CpuRegister>();
  if (is_high) {
    // Use architectural support: basically 1 << bsr.
    if (src.IsRegister()) {
      if (is_long) {
        __ bsrq(tmp, src.AsRegister<CpuRegister>());
      } else {
        __ bsrl(tmp, src.AsRegister<CpuRegister>());
      }
    } else if (is_long) {
      DCHECK(src.IsDoubleStackSlot());
      __ bsrq(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
    } else {
      DCHECK(src.IsStackSlot());
      __ bsrl(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
    }
    // BSR sets ZF if the input was zero.
    NearLabel is_zero, done;
    __ j(kEqual, &is_zero);
    __ movl(out, Immediate(1));  // Clears upper bits too.
    if (is_long) {
      __ shlq(out, tmp);
    } else {
      __ shll(out, tmp);
    }
    __ jmp(&done);
    __ Bind(&is_zero);
    __ xorl(out, out);  // Clears upper bits too.
    __ Bind(&done);
  } else {
    // Copy the input into a temporary.
    if (src.IsRegister()) {
      if (is_long) {
        __ movq(tmp, src.AsRegister<CpuRegister>());
      } else {
        __ movl(tmp, src.AsRegister<CpuRegister>());
      }
    } else if (is_long) {
      DCHECK(src.IsDoubleStackSlot());
      __ movq(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
    } else {
      DCHECK(src.IsStackSlot());
      __ movl(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
    }
    // Do the bit twiddling: basically out = tmp & -tmp.
    if (is_long) {
      __ movq(out, tmp);
      __ negq(tmp);
      __ andq(out, tmp);
    } else {
      __ movl(out, tmp);
      __ negl(tmp);
      __ andl(out, tmp);
    }
  }
}
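
// Illustrative sketch, not used by the code generator: hypothetical constexpr
// models of the two non-constant paths above. highestOneBit is 1 << BSR(x)
// with a zero guard; lowestOneBit is the classic x & -x, which works because
// negation flips every bit above the lowest set bit but keeps that bit itself.
static constexpr int BsrModel(uint64_t x) {
  // Index of the highest set bit; like BSR, only meaningful for nonzero x.
  return (x <= 1) ? 0 : 1 + BsrModel(x >> 1);
}
static constexpr uint32_t ModelHighestOneBit(uint32_t x) {
  return (x == 0) ? 0u : (UINT32_C(1) << BsrModel(x));
}
static constexpr uint32_t ModelLowestOneBit(uint32_t x) {
  return x & (0u - x);  // x & -x, written unsigned to avoid signed negation.
}
static_assert(ModelHighestOneBit(12u) == 8u, "Integer.highestOneBit(12)");
static_assert(ModelLowestOneBit(12u) == 4u, "Integer.lowestOneBit(12)");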

void IntrinsicLocationsBuilderX86_64::VisitIntegerHighestOneBit(HInvoke* invoke) {
  CreateOneBitLocations(arena_, invoke, /* is_high */ true);
}

void IntrinsicCodeGeneratorX86_64::VisitIntegerHighestOneBit(HInvoke* invoke) {
  GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ true, /* is_long */ false);
}

void IntrinsicLocationsBuilderX86_64::VisitLongHighestOneBit(HInvoke* invoke) {
  CreateOneBitLocations(arena_, invoke, /* is_high */ true);
}

void IntrinsicCodeGeneratorX86_64::VisitLongHighestOneBit(HInvoke* invoke) {
  GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ true, /* is_long */ true);
}

void IntrinsicLocationsBuilderX86_64::VisitIntegerLowestOneBit(HInvoke* invoke) {
  CreateOneBitLocations(arena_, invoke, /* is_high */ false);
}

void IntrinsicCodeGeneratorX86_64::VisitIntegerLowestOneBit(HInvoke* invoke) {
  GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ false, /* is_long */ false);
}

void IntrinsicLocationsBuilderX86_64::VisitLongLowestOneBit(HInvoke* invoke) {
  CreateOneBitLocations(arena_, invoke, /* is_high */ false);
}

void IntrinsicCodeGeneratorX86_64::VisitLongLowestOneBit(HInvoke* invoke) {
  GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ false, /* is_long */ true);
}

static void CreateLeadingZeroLocations(ArenaAllocator* arena, HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::Any());
  locations->SetOut(Location::RequiresRegister());
}

static void GenLeadingZeros(X86_64Assembler* assembler,
                            CodeGeneratorX86_64* codegen,
                            HInvoke* invoke, bool is_long) {
  LocationSummary* locations = invoke->GetLocations();
  Location src = locations->InAt(0);
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  int zero_value_result = is_long ? 64 : 32;
  if (invoke->InputAt(0)->IsConstant()) {
    // Evaluate this at compile time.
    int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
    if (value == 0) {
      value = zero_value_result;
    } else {
      value = is_long ? CLZ(static_cast<uint64_t>(value)) : CLZ(static_cast<uint32_t>(value));
    }
    codegen->Load32BitValue(out, value);
    return;
  }

  // Handle the non-constant cases.
  if (src.IsRegister()) {
    if (is_long) {
      __ bsrq(out, src.AsRegister<CpuRegister>());
    } else {
      __ bsrl(out, src.AsRegister<CpuRegister>());
    }
  } else if (is_long) {
    DCHECK(src.IsDoubleStackSlot());
    __ bsrq(out, Address(CpuRegister(RSP), src.GetStackIndex()));
  } else {
    DCHECK(src.IsStackSlot());
    __ bsrl(out, Address(CpuRegister(RSP), src.GetStackIndex()));
  }

  // BSR sets ZF if the input was zero; in that case the output is undefined.
  NearLabel is_zero, done;
  __ j(kEqual, &is_zero);

  // Correct the BSR result to get the CLZ result: CLZ = (bit width - 1) - BSR,
  // which equals (bit width - 1) XOR BSR since BSR is at most (bit width - 1).
  __ xorl(out, Immediate(zero_value_result - 1));
  __ jmp(&done);

  // Fix the zero case with the expected result.
  __ Bind(&is_zero);
  __ movl(out, Immediate(zero_value_result));

  __ Bind(&done);
}
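
// Illustrative sketch, reusing the hypothetical BsrModel above: for nonzero x,
// CLZ = (bit width - 1) - BSR(x), and since BSR(x) never exceeds (bit width - 1)
// the subtraction can be done as the XOR emitted above.
static constexpr int ModelLeadingZeros32(uint32_t x) {
  return (x == 0) ? 32 : (BsrModel(x) ^ 31);
}
static_assert(ModelLeadingZeros32(1u) == 31, "Integer.numberOfLeadingZeros(1)");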

void IntrinsicLocationsBuilderX86_64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
  CreateLeadingZeroLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
  GenLeadingZeros(GetAssembler(), codegen_, invoke, /* is_long */ false);
}

void IntrinsicLocationsBuilderX86_64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
  CreateLeadingZeroLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
  GenLeadingZeros(GetAssembler(), codegen_, invoke, /* is_long */ true);
}

static void CreateTrailingZeroLocations(ArenaAllocator* arena, HInvoke* invoke) {
  LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                           LocationSummary::kNoCall,
                                                           kIntrinsified);
  locations->SetInAt(0, Location::Any());
  locations->SetOut(Location::RequiresRegister());
}

static void GenTrailingZeros(X86_64Assembler* assembler,
                             CodeGeneratorX86_64* codegen,
                             HInvoke* invoke, bool is_long) {
  LocationSummary* locations = invoke->GetLocations();
  Location src = locations->InAt(0);
  CpuRegister out = locations->Out().AsRegister<CpuRegister>();

  int zero_value_result = is_long ? 64 : 32;
  if (invoke->InputAt(0)->IsConstant()) {
    // Evaluate this at compile time.
    int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
    if (value == 0) {
      value = zero_value_result;
    } else {
      value = is_long ? CTZ(static_cast<uint64_t>(value)) : CTZ(static_cast<uint32_t>(value));
    }
    codegen->Load32BitValue(out, value);
    return;
  }

  // Handle the non-constant cases.
  if (src.IsRegister()) {
    if (is_long) {
      __ bsfq(out, src.AsRegister<CpuRegister>());
    } else {
      __ bsfl(out, src.AsRegister<CpuRegister>());
    }
  } else if (is_long) {
    DCHECK(src.IsDoubleStackSlot());
    __ bsfq(out, Address(CpuRegister(RSP), src.GetStackIndex()));
  } else {
    DCHECK(src.IsStackSlot());
    __ bsfl(out, Address(CpuRegister(RSP), src.GetStackIndex()));
  }

  // BSF sets ZF if the input was zero; in that case the output is undefined.
  NearLabel done;
  __ j(kNotEqual, &done);

  // Fix the zero case with the expected result.
  __ movl(out, Immediate(zero_value_result));

  __ Bind(&done);
}
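
// Illustrative sketch, not used by the code generator: unlike BSR, BSF needs
// no correction for nonzero input, since the index of the lowest set bit is
// already the count of trailing zeros; only the zero case needs fixing up.
static constexpr int BsfModel(uint64_t x) {
  // Index of the lowest set bit; like BSF, only meaningful for nonzero x.
  return ((x & 1) != 0) ? 0 : 1 + BsfModel(x >> 1);
}
static constexpr int ModelTrailingZeros32(uint32_t x) {
  return (x == 0) ? 32 : BsfModel(x);
}
static_assert(ModelTrailingZeros32(8u) == 3, "Integer.numberOfTrailingZeros(8)");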

void IntrinsicLocationsBuilderX86_64::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
  CreateTrailingZeroLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
  GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long */ false);
}

void IntrinsicLocationsBuilderX86_64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
  CreateTrailingZeroLocations(arena_, invoke);
}

void IntrinsicCodeGeneratorX86_64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
  GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long */ true);
}

UNIMPLEMENTED_INTRINSIC(X86_64, ReferenceGetReferent)
UNIMPLEMENTED_INTRINSIC(X86_64, FloatIsInfinite)
UNIMPLEMENTED_INTRINSIC(X86_64, DoubleIsInfinite)

// Java 1.8.
UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndAddInt)
UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndAddLong)
UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndSetInt)
UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndSetLong)
UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndSetObject)

UNREACHABLE_INTRINSICS(X86_64)

#undef __

}  // namespace x86_64
}  // namespace art