      1 /*
      2  * Copyright (C) 2015 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "intrinsics_x86.h"
     18 
     19 #include <limits>
     20 
     21 #include "arch/x86/instruction_set_features_x86.h"
     22 #include "art_method.h"
     23 #include "base/bit_utils.h"
     24 #include "code_generator_x86.h"
     25 #include "entrypoints/quick/quick_entrypoints.h"
     26 #include "heap_poisoning.h"
     27 #include "intrinsics.h"
     28 #include "intrinsics_utils.h"
     29 #include "lock_word.h"
     30 #include "mirror/array-inl.h"
     31 #include "mirror/object_array-inl.h"
     32 #include "mirror/reference.h"
     33 #include "mirror/string.h"
     34 #include "scoped_thread_state_change-inl.h"
     35 #include "thread-current-inl.h"
     36 #include "utils/x86/assembler_x86.h"
     37 #include "utils/x86/constants_x86.h"
     38 
     39 namespace art {
     40 
     41 namespace x86 {
     42 
     43 IntrinsicLocationsBuilderX86::IntrinsicLocationsBuilderX86(CodeGeneratorX86* codegen)
     44   : allocator_(codegen->GetGraph()->GetAllocator()),
     45     codegen_(codegen) {
     46 }
     47 
     48 
     49 X86Assembler* IntrinsicCodeGeneratorX86::GetAssembler() {
     50   return down_cast<X86Assembler*>(codegen_->GetAssembler());
     51 }
     52 
     53 ArenaAllocator* IntrinsicCodeGeneratorX86::GetAllocator() {
     54   return codegen_->GetGraph()->GetAllocator();
     55 }
     56 
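// Tries to build an intrinsic LocationSummary for `invoke`. Returns true if the
// invoke was recognized and marked as intrinsified, in which case code generation
// will later emit the inline expansion instead of a regular call.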
     57 bool IntrinsicLocationsBuilderX86::TryDispatch(HInvoke* invoke) {
     58   Dispatch(invoke);
     59   LocationSummary* res = invoke->GetLocations();
     60   if (res == nullptr) {
     61     return false;
     62   }
     63   return res->Intrinsified();
     64 }
     65 
     66 static void MoveArguments(HInvoke* invoke, CodeGeneratorX86* codegen) {
     67   InvokeDexCallingConventionVisitorX86 calling_convention_visitor;
     68   IntrinsicVisitor::MoveArguments(invoke, codegen, &calling_convention_visitor);
     69 }
     70 
     71 using IntrinsicSlowPathX86 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorX86>;
     72 
     73 // NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy.
     74 #define __ down_cast<X86Assembler*>(codegen->GetAssembler())->  // NOLINT
     75 
     76 // Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers.
     77 class ReadBarrierSystemArrayCopySlowPathX86 : public SlowPathCode {
     78  public:
     79   explicit ReadBarrierSystemArrayCopySlowPathX86(HInstruction* instruction)
     80       : SlowPathCode(instruction) {
     81     DCHECK(kEmitCompilerReadBarrier);
     82     DCHECK(kUseBakerReadBarrier);
     83   }
     84 
     85   void EmitNativeCode(CodeGenerator* codegen) override {
     86     CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen);
     87     LocationSummary* locations = instruction_->GetLocations();
     88     DCHECK(locations->CanCall());
     89     DCHECK(instruction_->IsInvokeStaticOrDirect())
     90         << "Unexpected instruction in read barrier arraycopy slow path: "
     91         << instruction_->DebugName();
     92     DCHECK(instruction_->GetLocations()->Intrinsified());
     93     DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy);
     94 
     95     int32_t element_size = DataType::Size(DataType::Type::kReference);
     96     uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value();
     97 
     98     Register src = locations->InAt(0).AsRegister<Register>();
     99     Location src_pos = locations->InAt(1);
    100     Register dest = locations->InAt(2).AsRegister<Register>();
    101     Location dest_pos = locations->InAt(3);
    102     Location length = locations->InAt(4);
    103     Location temp1_loc = locations->GetTemp(0);
    104     Register temp1 = temp1_loc.AsRegister<Register>();
    105     Register temp2 = locations->GetTemp(1).AsRegister<Register>();
    106     Register temp3 = locations->GetTemp(2).AsRegister<Register>();
    107 
    108     __ Bind(GetEntryLabel());
    109     // In this code path, registers `temp1`, `temp2`, and `temp3` are
    110     // not used for the base source address, the base destination
    111     // address, and the end source address, as they are in the other
    112     // SystemArrayCopy intrinsic code paths. Instead they are used,
    113     // respectively, for:
    114     // - the loop index (`i`);
    115     // - the source index (`src_index`) and the loaded (source)
    116     //   reference (`value`); and
    117     // - the destination index (`dest_index`).
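    // In essence, the loop emitted below is roughly:
    //   for (int32_t i = 0; i != length; ++i) {
    //     ref value = src_array[i + src_pos];
    //     value = ReadBarrier::Mark(value);
    //     dest_array[i + dest_pos] = value;
    //   }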
    118 
    119     // i = 0
    120     __ xorl(temp1, temp1);
    121     NearLabel loop;
    122     __ Bind(&loop);
    123     // value = src_array[i + src_pos]
    124     if (src_pos.IsConstant()) {
    125       int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
    126       int32_t adjusted_offset = offset + constant * element_size;
    127       __ movl(temp2, Address(src, temp1, ScaleFactor::TIMES_4, adjusted_offset));
    128     } else {
    129       __ leal(temp2, Address(src_pos.AsRegister<Register>(), temp1, ScaleFactor::TIMES_1, 0));
    130       __ movl(temp2, Address(src, temp2, ScaleFactor::TIMES_4, offset));
    131     }
    132     __ MaybeUnpoisonHeapReference(temp2);
    133     // TODO: Inline the mark bit check before calling the runtime?
    134     // value = ReadBarrier::Mark(value)
    135     // No need to save live registers; it's taken care of by the
    136     // entrypoint. Also, there is no need to update the stack mask,
    137     // as this runtime call will not trigger a garbage collection.
    138     // (See ReadBarrierMarkSlowPathX86::EmitNativeCode for more
    139     // explanations.)
    140     DCHECK_NE(temp2, ESP);
    141     DCHECK(0 <= temp2 && temp2 < kNumberOfCpuRegisters) << temp2;
    142     int32_t entry_point_offset = Thread::ReadBarrierMarkEntryPointsOffset<kX86PointerSize>(temp2);
    143     // This runtime call does not require a stack map.
    144     x86_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
    145     __ MaybePoisonHeapReference(temp2);
    146     // dest_array[i + dest_pos] = value
    147     if (dest_pos.IsConstant()) {
    148       int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
    149       int32_t adjusted_offset = offset + constant * element_size;
    150       __ movl(Address(dest, temp1, ScaleFactor::TIMES_4, adjusted_offset), temp2);
    151     } else {
    152       __ leal(temp3, Address(dest_pos.AsRegister<Register>(), temp1, ScaleFactor::TIMES_1, 0));
    153       __ movl(Address(dest, temp3, ScaleFactor::TIMES_4, offset), temp2);
    154     }
    155     // ++i
    156     __ addl(temp1, Immediate(1));
    157     // if (i != length) goto loop
    158     x86_codegen->GenerateIntCompare(temp1_loc, length);
    159     __ j(kNotEqual, &loop);
    160     __ jmp(GetExitLabel());
    161   }
    162 
    163   const char* GetDescription() const override { return "ReadBarrierSystemArrayCopySlowPathX86"; }
    164 
    165  private:
    166   DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathX86);
    167 };
    168 
    169 #undef __
    170 
    171 #define __ assembler->
    172 
    173 static void CreateFPToIntLocations(ArenaAllocator* allocator, HInvoke* invoke, bool is64bit) {
    174   LocationSummary* locations =
    175       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    176   locations->SetInAt(0, Location::RequiresFpuRegister());
    177   locations->SetOut(Location::RequiresRegister());
    178   if (is64bit) {
    179     locations->AddTemp(Location::RequiresFpuRegister());
    180   }
    181 }
    182 
    183 static void CreateIntToFPLocations(ArenaAllocator* allocator, HInvoke* invoke, bool is64bit) {
    184   LocationSummary* locations =
    185       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    186   locations->SetInAt(0, Location::RequiresRegister());
    187   locations->SetOut(Location::RequiresFpuRegister());
    188   if (is64bit) {
    189     locations->AddTemp(Location::RequiresFpuRegister());
    190     locations->AddTemp(Location::RequiresFpuRegister());
    191   }
    192 }
    193 
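// MoveFPToInt and MoveIntToFP below implement Float.floatToRawIntBits /
// Float.intBitsToFloat and their Double counterparts as raw bit moves between
// XMM and core registers. On x86-32 a 64-bit value lives in a core register
// pair, so the double variants shuffle the two 32-bit halves through an XMM
// temporary (movd for the low word, psrlq/punpckldq for the high word).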
    194 static void MoveFPToInt(LocationSummary* locations, bool is64bit, X86Assembler* assembler) {
    195   Location input = locations->InAt(0);
    196   Location output = locations->Out();
    197   if (is64bit) {
    198     // Need to use the temporary.
    199     XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
    200     __ movsd(temp, input.AsFpuRegister<XmmRegister>());
    201     __ movd(output.AsRegisterPairLow<Register>(), temp);
    202     __ psrlq(temp, Immediate(32));
    203     __ movd(output.AsRegisterPairHigh<Register>(), temp);
    204   } else {
    205     __ movd(output.AsRegister<Register>(), input.AsFpuRegister<XmmRegister>());
    206   }
    207 }
    208 
    209 static void MoveIntToFP(LocationSummary* locations, bool is64bit, X86Assembler* assembler) {
    210   Location input = locations->InAt(0);
    211   Location output = locations->Out();
    212   if (is64bit) {
    213     // Need to use the temporary.
    214     XmmRegister temp1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
    215     XmmRegister temp2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
    216     __ movd(temp1, input.AsRegisterPairLow<Register>());
    217     __ movd(temp2, input.AsRegisterPairHigh<Register>());
    218     __ punpckldq(temp1, temp2);
    219     __ movsd(output.AsFpuRegister<XmmRegister>(), temp1);
    220   } else {
    221     __ movd(output.AsFpuRegister<XmmRegister>(), input.AsRegister<Register>());
    222   }
    223 }
    224 
    225 void IntrinsicLocationsBuilderX86::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
    226   CreateFPToIntLocations(allocator_, invoke, /* is64bit= */ true);
    227 }
    228 void IntrinsicLocationsBuilderX86::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
    229   CreateIntToFPLocations(allocator_, invoke, /* is64bit= */ true);
    230 }
    231 
    232 void IntrinsicCodeGeneratorX86::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
    233   MoveFPToInt(invoke->GetLocations(), /* is64bit= */ true, GetAssembler());
    234 }
    235 void IntrinsicCodeGeneratorX86::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
    236   MoveIntToFP(invoke->GetLocations(), /* is64bit= */ true, GetAssembler());
    237 }
    238 
    239 void IntrinsicLocationsBuilderX86::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
    240   CreateFPToIntLocations(allocator_, invoke, /* is64bit= */ false);
    241 }
    242 void IntrinsicLocationsBuilderX86::VisitFloatIntBitsToFloat(HInvoke* invoke) {
    243   CreateIntToFPLocations(allocator_, invoke, /* is64bit= */ false);
    244 }
    245 
    246 void IntrinsicCodeGeneratorX86::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
    247   MoveFPToInt(invoke->GetLocations(), /* is64bit= */ false, GetAssembler());
    248 }
    249 void IntrinsicCodeGeneratorX86::VisitFloatIntBitsToFloat(HInvoke* invoke) {
    250   MoveIntToFP(invoke->GetLocations(), /* is64bit= */ false, GetAssembler());
    251 }
    252 
    253 static void CreateIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
    254   LocationSummary* locations =
    255       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    256   locations->SetInAt(0, Location::RequiresRegister());
    257   locations->SetOut(Location::SameAsFirstInput());
    258 }
    259 
    260 static void CreateLongToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
    261   LocationSummary* locations =
    262       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    263   locations->SetInAt(0, Location::RequiresRegister());
    264   locations->SetOut(Location::RequiresRegister());
    265 }
    266 
    267 static void CreateLongToLongLocations(ArenaAllocator* allocator, HInvoke* invoke) {
    268   LocationSummary* locations =
    269       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    270   locations->SetInAt(0, Location::RequiresRegister());
    271   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
    272 }
    273 
    274 static void GenReverseBytes(LocationSummary* locations,
    275                             DataType::Type size,
    276                             X86Assembler* assembler) {
    277   Register out = locations->Out().AsRegister<Register>();
    278 
    279   switch (size) {
    280     case DataType::Type::kInt16:
    281       // TODO: Can be done with an xchg of 8b registers. This is straight from Quick.
    282       __ bswapl(out);
    283       __ sarl(out, Immediate(16));
    284       break;
    285     case DataType::Type::kInt32:
    286       __ bswapl(out);
    287       break;
    288     default:
    289       LOG(FATAL) << "Unexpected size for reverse-bytes: " << size;
    290       UNREACHABLE();
    291   }
    292 }
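// For the kInt16 case above, e.g. Short.reverseBytes((short) 0x1234): bswapl
// turns 0x00001234 into 0x34120000 and the arithmetic shift right by 16 yields
// 0x00003412, i.e. the byte-swapped value sign-extended to 32 bits.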
    293 
    294 void IntrinsicLocationsBuilderX86::VisitIntegerReverseBytes(HInvoke* invoke) {
    295   CreateIntToIntLocations(allocator_, invoke);
    296 }
    297 
    298 void IntrinsicCodeGeneratorX86::VisitIntegerReverseBytes(HInvoke* invoke) {
    299   GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt32, GetAssembler());
    300 }
    301 
    302 void IntrinsicLocationsBuilderX86::VisitLongReverseBytes(HInvoke* invoke) {
    303   CreateLongToLongLocations(allocator_, invoke);
    304 }
    305 
    306 void IntrinsicCodeGeneratorX86::VisitLongReverseBytes(HInvoke* invoke) {
    307   LocationSummary* locations = invoke->GetLocations();
    308   Location input = locations->InAt(0);
    309   Register input_lo = input.AsRegisterPairLow<Register>();
    310   Register input_hi = input.AsRegisterPairHigh<Register>();
    311   Location output = locations->Out();
    312   Register output_lo = output.AsRegisterPairLow<Register>();
    313   Register output_hi = output.AsRegisterPairHigh<Register>();
    314 
    315   X86Assembler* assembler = GetAssembler();
    316   // Assign the inputs to the outputs, mixing low/high.
    317   __ movl(output_lo, input_hi);
    318   __ movl(output_hi, input_lo);
    319   __ bswapl(output_lo);
    320   __ bswapl(output_hi);
    321 }
    322 
    323 void IntrinsicLocationsBuilderX86::VisitShortReverseBytes(HInvoke* invoke) {
    324   CreateIntToIntLocations(allocator_, invoke);
    325 }
    326 
    327 void IntrinsicCodeGeneratorX86::VisitShortReverseBytes(HInvoke* invoke) {
    328   GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt16, GetAssembler());
    329 }
    330 
    331 static void CreateFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) {
    332   LocationSummary* locations =
    333       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    334   locations->SetInAt(0, Location::RequiresFpuRegister());
    335   locations->SetOut(Location::RequiresFpuRegister());
    336 }
    337 
    338 void IntrinsicLocationsBuilderX86::VisitMathSqrt(HInvoke* invoke) {
    339   CreateFPToFPLocations(allocator_, invoke);
    340 }
    341 
    342 void IntrinsicCodeGeneratorX86::VisitMathSqrt(HInvoke* invoke) {
    343   LocationSummary* locations = invoke->GetLocations();
    344   XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
    345   XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
    346 
    347   GetAssembler()->sqrtsd(out, in);
    348 }
    349 
    350 static void InvokeOutOfLineIntrinsic(CodeGeneratorX86* codegen, HInvoke* invoke) {
    351   MoveArguments(invoke, codegen);
    352 
    353   DCHECK(invoke->IsInvokeStaticOrDirect());
    354   codegen->GenerateStaticOrDirectCall(invoke->AsInvokeStaticOrDirect(),
    355                                       Location::RegisterLocation(EAX));
    356 
    357   // Copy the result back to the expected output.
    358   Location out = invoke->GetLocations()->Out();
    359   if (out.IsValid()) {
    360     DCHECK(out.IsRegister());
    361     codegen->MoveFromReturnRegister(out, invoke->GetType());
    362   }
    363 }
    364 
    365 static void CreateSSE41FPToFPLocations(ArenaAllocator* allocator,
    366                                        HInvoke* invoke,
    367                                        CodeGeneratorX86* codegen) {
    368   // Do we have instruction support?
    369   if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
    370     CreateFPToFPLocations(allocator, invoke);
    371     return;
    372   }
    373 
    374   // We have to fall back to a call to the intrinsic.
    375   LocationSummary* locations =
    376       new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly);
    377   InvokeRuntimeCallingConvention calling_convention;
    378   locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
    379   locations->SetOut(Location::FpuRegisterLocation(XMM0));
    380   // Needs to be EAX for the invoke.
    381   locations->AddTemp(Location::RegisterLocation(EAX));
    382 }
    383 
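// The immediate passed to roundsd/roundss selects the SSE4.1 rounding mode:
// 0 rounds to nearest (even) as needed for Math.rint, 1 rounds toward negative
// infinity for Math.floor, and 2 rounds toward positive infinity for Math.ceil,
// matching the constants used by the visitors below.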
    384 static void GenSSE41FPToFPIntrinsic(CodeGeneratorX86* codegen,
    385                                    HInvoke* invoke,
    386                                    X86Assembler* assembler,
    387                                    int round_mode) {
    388   LocationSummary* locations = invoke->GetLocations();
    389   if (locations->WillCall()) {
    390     InvokeOutOfLineIntrinsic(codegen, invoke);
    391   } else {
    392     XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
    393     XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
    394     __ roundsd(out, in, Immediate(round_mode));
    395   }
    396 }
    397 
    398 void IntrinsicLocationsBuilderX86::VisitMathCeil(HInvoke* invoke) {
    399   CreateSSE41FPToFPLocations(allocator_, invoke, codegen_);
    400 }
    401 
    402 void IntrinsicCodeGeneratorX86::VisitMathCeil(HInvoke* invoke) {
    403   GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 2);
    404 }
    405 
    406 void IntrinsicLocationsBuilderX86::VisitMathFloor(HInvoke* invoke) {
    407   CreateSSE41FPToFPLocations(allocator_, invoke, codegen_);
    408 }
    409 
    410 void IntrinsicCodeGeneratorX86::VisitMathFloor(HInvoke* invoke) {
    411   GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 1);
    412 }
    413 
    414 void IntrinsicLocationsBuilderX86::VisitMathRint(HInvoke* invoke) {
    415   CreateSSE41FPToFPLocations(allocator_, invoke, codegen_);
    416 }
    417 
    418 void IntrinsicCodeGeneratorX86::VisitMathRint(HInvoke* invoke) {
    419   GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 0);
    420 }
    421 
    422 void IntrinsicLocationsBuilderX86::VisitMathRoundFloat(HInvoke* invoke) {
    423   // Do we have instruction support?
    424   if (codegen_->GetInstructionSetFeatures().HasSSE4_1()) {
    425     HInvokeStaticOrDirect* static_or_direct = invoke->AsInvokeStaticOrDirect();
    426     DCHECK(static_or_direct != nullptr);
    427     LocationSummary* locations =
    428         new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    429     locations->SetInAt(0, Location::RequiresFpuRegister());
    430     if (static_or_direct->HasSpecialInput() &&
    431         invoke->InputAt(
    432             static_or_direct->GetSpecialInputIndex())->IsX86ComputeBaseMethodAddress()) {
    433       locations->SetInAt(1, Location::RequiresRegister());
    434     }
    435     locations->SetOut(Location::RequiresRegister());
    436     locations->AddTemp(Location::RequiresFpuRegister());
    437     locations->AddTemp(Location::RequiresFpuRegister());
    438     return;
    439   }
    440 
    441   // We have to fall back to a call to the intrinsic.
    442   LocationSummary* locations =
    443       new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnMainOnly);
    444   InvokeRuntimeCallingConvention calling_convention;
    445   locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
    446   locations->SetOut(Location::RegisterLocation(EAX));
    447   // Needs to be EAX for the invoke.
    448   locations->AddTemp(Location::RegisterLocation(EAX));
    449 }
    450 
    451 void IntrinsicCodeGeneratorX86::VisitMathRoundFloat(HInvoke* invoke) {
    452   LocationSummary* locations = invoke->GetLocations();
    453   if (locations->WillCall()) {  // TODO: can we reach this?
    454     InvokeOutOfLineIntrinsic(codegen_, invoke);
    455     return;
    456   }
    457 
    458   XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
    459   XmmRegister t1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
    460   XmmRegister t2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
    461   Register out = locations->Out().AsRegister<Register>();
    462   NearLabel skip_incr, done;
    463   X86Assembler* assembler = GetAssembler();
    464 
    465   // Since no direct x86 rounding instruction matches the required semantics,
    466   // this intrinsic is implemented as follows:
    467   //  result = floor(in);
    468   //  if (in - result >= 0.5f)
    469   //    result = result + 1.0f;
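  // For example, with in = -2.5f: floor gives -3.0f, and -2.5f - (-3.0f) == 0.5f,
  // so the result is incremented to -2, matching Math.round's round-half-up
  // semantics.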
    470   __ movss(t2, in);
    471   __ roundss(t1, in, Immediate(1));
    472   __ subss(t2, t1);
    473   if (locations->GetInputCount() == 2 && locations->InAt(1).IsValid()) {
    474     // Direct constant area available.
    475     HX86ComputeBaseMethodAddress* method_address =
    476         invoke->InputAt(1)->AsX86ComputeBaseMethodAddress();
    477     Register constant_area = locations->InAt(1).AsRegister<Register>();
    478     __ comiss(t2, codegen_->LiteralInt32Address(bit_cast<int32_t, float>(0.5f),
    479                                                 method_address,
    480                                                 constant_area));
    481     __ j(kBelow, &skip_incr);
    482     __ addss(t1, codegen_->LiteralInt32Address(bit_cast<int32_t, float>(1.0f),
    483                                                method_address,
    484                                                constant_area));
    485     __ Bind(&skip_incr);
    486   } else {
    487     // No constant area: go through stack.
    488     __ pushl(Immediate(bit_cast<int32_t, float>(0.5f)));
    489     __ pushl(Immediate(bit_cast<int32_t, float>(1.0f)));
    490     __ comiss(t2, Address(ESP, 4));
    491     __ j(kBelow, &skip_incr);
    492     __ addss(t1, Address(ESP, 0));
    493     __ Bind(&skip_incr);
    494     __ addl(ESP, Immediate(8));
    495   }
    496 
    497   // Final conversion to an integer. Unfortunately this also does not have a
    498   // direct x86 instruction, since NaN should map to 0 and large positive
    499   // values need to be clipped to the extreme value.
    500   __ movl(out, Immediate(kPrimIntMax));
    501   __ cvtsi2ss(t2, out);
    502   __ comiss(t1, t2);
    503   __ j(kAboveEqual, &done);  // clipped to max (already in out), does not jump on unordered
    504   __ movl(out, Immediate(0));  // does not change flags
    505   __ j(kUnordered, &done);  // NaN mapped to 0 (just moved in out)
    506   __ cvttss2si(out, t1);
    507   __ Bind(&done);
    508 }
    509 
    510 static void CreateFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
    511   LocationSummary* locations =
    512       new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
    513   InvokeRuntimeCallingConvention calling_convention;
    514   locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
    515   locations->SetOut(Location::FpuRegisterLocation(XMM0));
    516 }
    517 
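// GenFPToFPCall forwards the intrinsic to a quick entrypoint that expects its
// double argument(s) at the bottom of the stack and returns its result on the
// x87 FP stack, hence the fstpl/movsd sequence below to move the value back
// into XMM0.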
    518 static void GenFPToFPCall(HInvoke* invoke, CodeGeneratorX86* codegen, QuickEntrypointEnum entry) {
    519   LocationSummary* locations = invoke->GetLocations();
    520   DCHECK(locations->WillCall());
    521   DCHECK(invoke->IsInvokeStaticOrDirect());
    522   X86Assembler* assembler = codegen->GetAssembler();
    523 
    524   // We need some place to pass the parameters.
    525   __ subl(ESP, Immediate(16));
    526   __ cfi().AdjustCFAOffset(16);
    527 
    528   // Pass the parameters at the bottom of the stack.
    529   __ movsd(Address(ESP, 0), XMM0);
    530 
    531   // If we have a second parameter, pass it next.
    532   if (invoke->GetNumberOfArguments() == 2) {
    533     __ movsd(Address(ESP, 8), XMM1);
    534   }
    535 
    536   // Now do the actual call.
    537   codegen->InvokeRuntime(entry, invoke, invoke->GetDexPc());
    538 
    539   // Extract the return value from the FP stack.
    540   __ fstpl(Address(ESP, 0));
    541   __ movsd(XMM0, Address(ESP, 0));
    542 
    543   // And clean up the stack.
    544   __ addl(ESP, Immediate(16));
    545   __ cfi().AdjustCFAOffset(-16);
    546 }
    547 
    548 static void CreateLowestOneBitLocations(ArenaAllocator* allocator, bool is_long, HInvoke* invoke) {
    549   LocationSummary* locations =
    550       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
    551   if (is_long) {
    552     locations->SetInAt(0, Location::RequiresRegister());
    553   } else {
    554     locations->SetInAt(0, Location::Any());
    555   }
    556   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
    557 }
    558 
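// Integer.lowestOneBit / Long.lowestOneBit isolate the least significant set
// bit using the classic x & -x identity: e.g. for x = 0b0110, -x is ...1010 in
// two's complement, so x & -x == 0b0010. In the 64-bit register-pair case the
// negation is carried across the pair with the negl/adcl/negl sequence below.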
    559 static void GenLowestOneBit(X86Assembler* assembler,
    560                             CodeGeneratorX86* codegen,
    561                             bool is_long,
    562                             HInvoke* invoke) {
    563   LocationSummary* locations = invoke->GetLocations();
    564   Location src = locations->InAt(0);
    565   Location out_loc = locations->Out();
    566 
    567   if (invoke->InputAt(0)->IsConstant()) {
    568     // Evaluate this at compile time.
    569     int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
    570     if (value == 0) {
    571       if (is_long) {
    572         __ xorl(out_loc.AsRegisterPairLow<Register>(), out_loc.AsRegisterPairLow<Register>());
    573         __ xorl(out_loc.AsRegisterPairHigh<Register>(), out_loc.AsRegisterPairHigh<Register>());
    574       } else {
    575         __ xorl(out_loc.AsRegister<Register>(), out_loc.AsRegister<Register>());
    576       }
    577       return;
    578     }
    579     // Nonzero value.
    580     value = is_long ? CTZ(static_cast<uint64_t>(value))
    581                     : CTZ(static_cast<uint32_t>(value));
    582     if (is_long) {
    583       if (value >= 32) {
    584         int shift = value - 32;
    585         codegen->Load32BitValue(out_loc.AsRegisterPairLow<Register>(), 0);
    586         codegen->Load32BitValue(out_loc.AsRegisterPairHigh<Register>(), 1 << shift);
    587       } else {
    588         codegen->Load32BitValue(out_loc.AsRegisterPairLow<Register>(), 1 << value);
    589         codegen->Load32BitValue(out_loc.AsRegisterPairHigh<Register>(), 0);
    590       }
    591     } else {
    592       codegen->Load32BitValue(out_loc.AsRegister<Register>(), 1 << value);
    593     }
    594     return;
    595   }
    596   // Handle the non-constant case.
    597   if (is_long) {
    598     DCHECK(src.IsRegisterPair());
    599     Register src_lo = src.AsRegisterPairLow<Register>();
    600     Register src_hi = src.AsRegisterPairHigh<Register>();
    601 
    602     Register out_lo = out_loc.AsRegisterPairLow<Register>();
    603     Register out_hi = out_loc.AsRegisterPairHigh<Register>();
    604 
    605     __ movl(out_lo, src_lo);
    606     __ movl(out_hi, src_hi);
    607 
    608     __ negl(out_lo);
    609     __ adcl(out_hi, Immediate(0));
    610     __ negl(out_hi);
    611 
    612     __ andl(out_lo, src_lo);
    613     __ andl(out_hi, src_hi);
    614   } else {
    615     if (codegen->GetInstructionSetFeatures().HasAVX2() && src.IsRegister()) {
    616       Register out = out_loc.AsRegister<Register>();
    617       __ blsi(out, src.AsRegister<Register>());
    618     } else {
    619       Register out = out_loc.AsRegister<Register>();
    620       // Compute out = src & -src to isolate the lowest set bit.
    621       if (src.IsRegister()) {
    622         __ movl(out, src.AsRegister<Register>());
    623       } else {
    624         DCHECK(src.IsStackSlot());
    625         __ movl(out, Address(ESP, src.GetStackIndex()));
    626       }
    627       __ negl(out);
    628 
    629       if (src.IsRegister()) {
    630         __ andl(out, src.AsRegister<Register>());
    631       } else {
    632         __ andl(out, Address(ESP, src.GetStackIndex()));
    633       }
    634     }
    635   }
    636 }
    637 
    638 void IntrinsicLocationsBuilderX86::VisitMathCos(HInvoke* invoke) {
    639   CreateFPToFPCallLocations(allocator_, invoke);
    640 }
    641 
    642 void IntrinsicCodeGeneratorX86::VisitMathCos(HInvoke* invoke) {
    643   GenFPToFPCall(invoke, codegen_, kQuickCos);
    644 }
    645 
    646 void IntrinsicLocationsBuilderX86::VisitMathSin(HInvoke* invoke) {
    647   CreateFPToFPCallLocations(allocator_, invoke);
    648 }
    649 
    650 void IntrinsicCodeGeneratorX86::VisitMathSin(HInvoke* invoke) {
    651   GenFPToFPCall(invoke, codegen_, kQuickSin);
    652 }
    653 
    654 void IntrinsicLocationsBuilderX86::VisitMathAcos(HInvoke* invoke) {
    655   CreateFPToFPCallLocations(allocator_, invoke);
    656 }
    657 
    658 void IntrinsicCodeGeneratorX86::VisitMathAcos(HInvoke* invoke) {
    659   GenFPToFPCall(invoke, codegen_, kQuickAcos);
    660 }
    661 
    662 void IntrinsicLocationsBuilderX86::VisitMathAsin(HInvoke* invoke) {
    663   CreateFPToFPCallLocations(allocator_, invoke);
    664 }
    665 
    666 void IntrinsicCodeGeneratorX86::VisitMathAsin(HInvoke* invoke) {
    667   GenFPToFPCall(invoke, codegen_, kQuickAsin);
    668 }
    669 
    670 void IntrinsicLocationsBuilderX86::VisitMathAtan(HInvoke* invoke) {
    671   CreateFPToFPCallLocations(allocator_, invoke);
    672 }
    673 
    674 void IntrinsicCodeGeneratorX86::VisitMathAtan(HInvoke* invoke) {
    675   GenFPToFPCall(invoke, codegen_, kQuickAtan);
    676 }
    677 
    678 void IntrinsicLocationsBuilderX86::VisitMathCbrt(HInvoke* invoke) {
    679   CreateFPToFPCallLocations(allocator_, invoke);
    680 }
    681 
    682 void IntrinsicCodeGeneratorX86::VisitMathCbrt(HInvoke* invoke) {
    683   GenFPToFPCall(invoke, codegen_, kQuickCbrt);
    684 }
    685 
    686 void IntrinsicLocationsBuilderX86::VisitMathCosh(HInvoke* invoke) {
    687   CreateFPToFPCallLocations(allocator_, invoke);
    688 }
    689 
    690 void IntrinsicCodeGeneratorX86::VisitMathCosh(HInvoke* invoke) {
    691   GenFPToFPCall(invoke, codegen_, kQuickCosh);
    692 }
    693 
    694 void IntrinsicLocationsBuilderX86::VisitMathExp(HInvoke* invoke) {
    695   CreateFPToFPCallLocations(allocator_, invoke);
    696 }
    697 
    698 void IntrinsicCodeGeneratorX86::VisitMathExp(HInvoke* invoke) {
    699   GenFPToFPCall(invoke, codegen_, kQuickExp);
    700 }
    701 
    702 void IntrinsicLocationsBuilderX86::VisitMathExpm1(HInvoke* invoke) {
    703   CreateFPToFPCallLocations(allocator_, invoke);
    704 }
    705 
    706 void IntrinsicCodeGeneratorX86::VisitMathExpm1(HInvoke* invoke) {
    707   GenFPToFPCall(invoke, codegen_, kQuickExpm1);
    708 }
    709 
    710 void IntrinsicLocationsBuilderX86::VisitMathLog(HInvoke* invoke) {
    711   CreateFPToFPCallLocations(allocator_, invoke);
    712 }
    713 
    714 void IntrinsicCodeGeneratorX86::VisitMathLog(HInvoke* invoke) {
    715   GenFPToFPCall(invoke, codegen_, kQuickLog);
    716 }
    717 
    718 void IntrinsicLocationsBuilderX86::VisitMathLog10(HInvoke* invoke) {
    719   CreateFPToFPCallLocations(allocator_, invoke);
    720 }
    721 
    722 void IntrinsicCodeGeneratorX86::VisitMathLog10(HInvoke* invoke) {
    723   GenFPToFPCall(invoke, codegen_, kQuickLog10);
    724 }
    725 
    726 void IntrinsicLocationsBuilderX86::VisitMathSinh(HInvoke* invoke) {
    727   CreateFPToFPCallLocations(allocator_, invoke);
    728 }
    729 
    730 void IntrinsicCodeGeneratorX86::VisitMathSinh(HInvoke* invoke) {
    731   GenFPToFPCall(invoke, codegen_, kQuickSinh);
    732 }
    733 
    734 void IntrinsicLocationsBuilderX86::VisitMathTan(HInvoke* invoke) {
    735   CreateFPToFPCallLocations(allocator_, invoke);
    736 }
    737 
    738 void IntrinsicCodeGeneratorX86::VisitMathTan(HInvoke* invoke) {
    739   GenFPToFPCall(invoke, codegen_, kQuickTan);
    740 }
    741 
    742 void IntrinsicLocationsBuilderX86::VisitMathTanh(HInvoke* invoke) {
    743   CreateFPToFPCallLocations(allocator_, invoke);
    744 }
    745 
    746 void IntrinsicCodeGeneratorX86::VisitMathTanh(HInvoke* invoke) {
    747   GenFPToFPCall(invoke, codegen_, kQuickTanh);
    748 }
    749 
    750 void IntrinsicLocationsBuilderX86::VisitIntegerLowestOneBit(HInvoke* invoke) {
    751   CreateLowestOneBitLocations(allocator_, /*is_long=*/ false, invoke);
    752 }
    753 void IntrinsicCodeGeneratorX86::VisitIntegerLowestOneBit(HInvoke* invoke) {
    754   GenLowestOneBit(GetAssembler(), codegen_, /*is_long=*/ false, invoke);
    755 }
    756 
    757 void IntrinsicLocationsBuilderX86::VisitLongLowestOneBit(HInvoke* invoke) {
    758   CreateLowestOneBitLocations(allocator_, /*is_long=*/ true, invoke);
    759 }
    760 
    761 void IntrinsicCodeGeneratorX86::VisitLongLowestOneBit(HInvoke* invoke) {
    762   GenLowestOneBit(GetAssembler(), codegen_, /*is_long=*/ true, invoke);
    763 }
    764 
    765 static void CreateFPFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
    766   LocationSummary* locations =
    767       new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
    768   InvokeRuntimeCallingConvention calling_convention;
    769   locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
    770   locations->SetInAt(1, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(1)));
    771   locations->SetOut(Location::FpuRegisterLocation(XMM0));
    772 }
    773 
    774 void IntrinsicLocationsBuilderX86::VisitMathAtan2(HInvoke* invoke) {
    775   CreateFPFPToFPCallLocations(allocator_, invoke);
    776 }
    777 
    778 void IntrinsicCodeGeneratorX86::VisitMathAtan2(HInvoke* invoke) {
    779   GenFPToFPCall(invoke, codegen_, kQuickAtan2);
    780 }
    781 
    782 void IntrinsicLocationsBuilderX86::VisitMathPow(HInvoke* invoke) {
    783   CreateFPFPToFPCallLocations(allocator_, invoke);
    784 }
    785 
    786 void IntrinsicCodeGeneratorX86::VisitMathPow(HInvoke* invoke) {
    787   GenFPToFPCall(invoke, codegen_, kQuickPow);
    788 }
    789 
    790 void IntrinsicLocationsBuilderX86::VisitMathHypot(HInvoke* invoke) {
    791   CreateFPFPToFPCallLocations(allocator_, invoke);
    792 }
    793 
    794 void IntrinsicCodeGeneratorX86::VisitMathHypot(HInvoke* invoke) {
    795   GenFPToFPCall(invoke, codegen_, kQuickHypot);
    796 }
    797 
    798 void IntrinsicLocationsBuilderX86::VisitMathNextAfter(HInvoke* invoke) {
    799   CreateFPFPToFPCallLocations(allocator_, invoke);
    800 }
    801 
    802 void IntrinsicCodeGeneratorX86::VisitMathNextAfter(HInvoke* invoke) {
    803   GenFPToFPCall(invoke, codegen_, kQuickNextAfter);
    804 }
    805 
    806 void IntrinsicLocationsBuilderX86::VisitSystemArrayCopyChar(HInvoke* invoke) {
    807   // At least two of srcPos, destPos and length need to be integer constants,
    808   // or else we won't have enough free registers.
    809   HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstant();
    810   HIntConstant* dest_pos = invoke->InputAt(3)->AsIntConstant();
    811   HIntConstant* length = invoke->InputAt(4)->AsIntConstant();
    812 
    813   int num_constants =
    814       ((src_pos != nullptr) ? 1 : 0)
    815       + ((dest_pos != nullptr) ? 1 : 0)
    816       + ((length != nullptr) ? 1 : 0);
    817 
    818   if (num_constants < 2) {
    819     // Not enough free registers.
    820     return;
    821   }
    822 
    823   // As long as we are checking, we might as well check to see if the src and dest
    824   // positions are >= 0.
    825   if ((src_pos != nullptr && src_pos->GetValue() < 0) ||
    826       (dest_pos != nullptr && dest_pos->GetValue() < 0)) {
    827     // We will have to fail anyway.
    828     return;
    829   }
    830 
    831   // And since we are already checking, check the length too.
    832   if (length != nullptr) {
    833     int32_t len = length->GetValue();
    834     if (len < 0) {
    835       // Just call as normal.
    836       return;
    837     }
    838   }
    839 
    840   // Okay, it is safe to generate inline code.
    841   LocationSummary* locations =
    842       new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
    843   // arraycopy(Object src, int srcPos, Object dest, int destPos, int length).
    844   locations->SetInAt(0, Location::RequiresRegister());
    845   locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1)));
    846   locations->SetInAt(2, Location::RequiresRegister());
    847   locations->SetInAt(3, Location::RegisterOrConstant(invoke->InputAt(3)));
    848   locations->SetInAt(4, Location::RegisterOrConstant(invoke->InputAt(4)));
    849 
    850   // And we need some temporaries.  We will use REP MOVSW, so we need fixed registers.
    851   locations->AddTemp(Location::RegisterLocation(ESI));
    852   locations->AddTemp(Location::RegisterLocation(EDI));
    853   locations->AddTemp(Location::RegisterLocation(ECX));
    854 }
    855 
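// Emits the System.arraycopy range checks for one side of the copy: in the
// general case, that pos >= 0, pos <= length(input), and
// length(input) - pos >= length, branching to the slow path on any violation.
// When pos is a constant it was already validated in the locations builder, so
// some of these checks are elided.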
    856 static void CheckPosition(X86Assembler* assembler,
    857                           Location pos,
    858                           Register input,
    859                           Location length,
    860                           SlowPathCode* slow_path,
    861                           Register temp,
    862                           bool length_is_input_length = false) {
    863   // Where is the length in the Array?
    864   const uint32_t length_offset = mirror::Array::LengthOffset().Uint32Value();
    865 
    866   if (pos.IsConstant()) {
    867     int32_t pos_const = pos.GetConstant()->AsIntConstant()->GetValue();
    868     if (pos_const == 0) {
    869       if (!length_is_input_length) {
    870         // Check that length(input) >= length.
    871         if (length.IsConstant()) {
    872           __ cmpl(Address(input, length_offset),
    873                   Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
    874         } else {
    875           __ cmpl(Address(input, length_offset), length.AsRegister<Register>());
    876         }
    877         __ j(kLess, slow_path->GetEntryLabel());
    878       }
    879     } else {
    880       // Check that length(input) >= pos.
    881       __ movl(temp, Address(input, length_offset));
    882       __ subl(temp, Immediate(pos_const));
    883       __ j(kLess, slow_path->GetEntryLabel());
    884 
    885       // Check that (length(input) - pos) >= length.
    886       if (length.IsConstant()) {
    887         __ cmpl(temp, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
    888       } else {
    889         __ cmpl(temp, length.AsRegister<Register>());
    890       }
    891       __ j(kLess, slow_path->GetEntryLabel());
    892     }
    893   } else if (length_is_input_length) {
    894     // The only way the copy can succeed is if pos is zero.
    895     Register pos_reg = pos.AsRegister<Register>();
    896     __ testl(pos_reg, pos_reg);
    897     __ j(kNotEqual, slow_path->GetEntryLabel());
    898   } else {
    899     // Check that pos >= 0.
    900     Register pos_reg = pos.AsRegister<Register>();
    901     __ testl(pos_reg, pos_reg);
    902     __ j(kLess, slow_path->GetEntryLabel());
    903 
    904     // Check that pos <= length(input).
    905     __ cmpl(Address(input, length_offset), pos_reg);
    906     __ j(kLess, slow_path->GetEntryLabel());
    907 
    908     // Check that (length(input) - pos) >= length.
    909     __ movl(temp, Address(input, length_offset));
    910     __ subl(temp, pos_reg);
    911     if (length.IsConstant()) {
    912       __ cmpl(temp, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
    913     } else {
    914       __ cmpl(temp, length.AsRegister<Register>());
    915     }
    916     __ j(kLess, slow_path->GetEntryLabel());
    917   }
    918 }
    919 
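// Code generation for the char[] System.arraycopy fast path: bail out to the
// slow path on aliasing (src == dest), null arrays, a negative length, or
// out-of-range positions, then compute the source and destination base
// addresses and copy `count` 16-bit chars with rep movsw.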
    920 void IntrinsicCodeGeneratorX86::VisitSystemArrayCopyChar(HInvoke* invoke) {
    921   X86Assembler* assembler = GetAssembler();
    922   LocationSummary* locations = invoke->GetLocations();
    923 
    924   Register src = locations->InAt(0).AsRegister<Register>();
    925   Location srcPos = locations->InAt(1);
    926   Register dest = locations->InAt(2).AsRegister<Register>();
    927   Location destPos = locations->InAt(3);
    928   Location length = locations->InAt(4);
    929 
    930   // Temporaries that we need for MOVSW.
    931   Register src_base = locations->GetTemp(0).AsRegister<Register>();
    932   DCHECK_EQ(src_base, ESI);
    933   Register dest_base = locations->GetTemp(1).AsRegister<Register>();
    934   DCHECK_EQ(dest_base, EDI);
    935   Register count = locations->GetTemp(2).AsRegister<Register>();
    936   DCHECK_EQ(count, ECX);
    937 
    938   SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86(invoke);
    939   codegen_->AddSlowPath(slow_path);
    940 
    941   // Bail out if the source and destination are the same (to handle overlap).
    942   __ cmpl(src, dest);
    943   __ j(kEqual, slow_path->GetEntryLabel());
    944 
    945   // Bail out if the source is null.
    946   __ testl(src, src);
    947   __ j(kEqual, slow_path->GetEntryLabel());
    948 
    949   // Bail out if the destination is null.
    950   __ testl(dest, dest);
    951   __ j(kEqual, slow_path->GetEntryLabel());
    952 
    953   // If the length is negative, bail out.
    954   // We have already checked in the LocationsBuilder for the constant case.
    955   if (!length.IsConstant()) {
    956     __ testl(length.AsRegister<Register>(), length.AsRegister<Register>());
    957     __ j(kLess, slow_path->GetEntryLabel());
    958   }
    959 
    960   // We need the count in ECX.
    961   if (length.IsConstant()) {
    962     __ movl(count, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
    963   } else {
    964     __ movl(count, length.AsRegister<Register>());
    965   }
    966 
    967   // Validity checks: source. Use src_base as a temporary register.
    968   CheckPosition(assembler, srcPos, src, Location::RegisterLocation(count), slow_path, src_base);
    969 
    970   // Validity checks: dest. Use src_base as a temporary register.
    971   CheckPosition(assembler, destPos, dest, Location::RegisterLocation(count), slow_path, src_base);
    972 
    973   // Okay, everything checks out.  Finally time to do the copy.
    974   // Check assumption that sizeof(Char) is 2 (used in scaling below).
    975   const size_t char_size = DataType::Size(DataType::Type::kUint16);
    976   DCHECK_EQ(char_size, 2u);
    977 
    978   const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value();
    979 
    980   if (srcPos.IsConstant()) {
    981     int32_t srcPos_const = srcPos.GetConstant()->AsIntConstant()->GetValue();
    982     __ leal(src_base, Address(src, char_size * srcPos_const + data_offset));
    983   } else {
    984     __ leal(src_base, Address(src, srcPos.AsRegister<Register>(),
    985                               ScaleFactor::TIMES_2, data_offset));
    986   }
    987   if (destPos.IsConstant()) {
    988     int32_t destPos_const = destPos.GetConstant()->AsIntConstant()->GetValue();
    989 
    990     __ leal(dest_base, Address(dest, char_size * destPos_const + data_offset));
    991   } else {
    992     __ leal(dest_base, Address(dest, destPos.AsRegister<Register>(),
    993                                ScaleFactor::TIMES_2, data_offset));
    994   }
    995 
    996   // Do the move.
    997   __ rep_movsw();
    998 
    999   __ Bind(slow_path->GetExitLabel());
   1000 }
   1001 
   1002 void IntrinsicLocationsBuilderX86::VisitStringCompareTo(HInvoke* invoke) {
   1003   // The inputs plus one temp.
   1004   LocationSummary* locations = new (allocator_) LocationSummary(
   1005       invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
   1006   InvokeRuntimeCallingConvention calling_convention;
   1007   locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
   1008   locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   1009   locations->SetOut(Location::RegisterLocation(EAX));
   1010 }
   1011 
   1012 void IntrinsicCodeGeneratorX86::VisitStringCompareTo(HInvoke* invoke) {
   1013   X86Assembler* assembler = GetAssembler();
   1014   LocationSummary* locations = invoke->GetLocations();
   1015 
   1016   // Note that the null check must have been done earlier.
   1017   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
   1018 
   1019   Register argument = locations->InAt(1).AsRegister<Register>();
   1020   __ testl(argument, argument);
   1021   SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86(invoke);
   1022   codegen_->AddSlowPath(slow_path);
   1023   __ j(kEqual, slow_path->GetEntryLabel());
   1024 
   1025   codegen_->InvokeRuntime(kQuickStringCompareTo, invoke, invoke->GetDexPc(), slow_path);
   1026   __ Bind(slow_path->GetExitLabel());
   1027 }
   1028 
   1029 void IntrinsicLocationsBuilderX86::VisitStringEquals(HInvoke* invoke) {
   1030   LocationSummary* locations =
   1031       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   1032   locations->SetInAt(0, Location::RequiresRegister());
   1033   locations->SetInAt(1, Location::RequiresRegister());
   1034 
   1035   // Request temporary registers, ECX and EDI needed for repe_cmpsl instruction.
   1036   locations->AddTemp(Location::RegisterLocation(ECX));
   1037   locations->AddTemp(Location::RegisterLocation(EDI));
   1038 
   1039   // Set the output; ESI is needed for the repe_cmpsl instruction anyway.
   1040   locations->SetOut(Location::RegisterLocation(ESI), Location::kOutputOverlap);
   1041 }
   1042 
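// String.equals fast path: the generated code answers the cheap cases first
// (a null or non-String argument returns false, identical references return
// true, and a mismatch of the count field, which holds the length plus the
// compression flag, returns false) and only then compares the character data
// four bytes at a time with repe cmpsl.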
   1043 void IntrinsicCodeGeneratorX86::VisitStringEquals(HInvoke* invoke) {
   1044   X86Assembler* assembler = GetAssembler();
   1045   LocationSummary* locations = invoke->GetLocations();
   1046 
   1047   Register str = locations->InAt(0).AsRegister<Register>();
   1048   Register arg = locations->InAt(1).AsRegister<Register>();
   1049   Register ecx = locations->GetTemp(0).AsRegister<Register>();
   1050   Register edi = locations->GetTemp(1).AsRegister<Register>();
   1051   Register esi = locations->Out().AsRegister<Register>();
   1052 
   1053   NearLabel end, return_true, return_false;
   1054 
   1055   // Get offsets of count, value, and class fields within a string object.
   1056   const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
   1057   const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
   1058   const uint32_t class_offset = mirror::Object::ClassOffset().Uint32Value();
   1059 
   1060   // Note that the null check must have been done earlier.
   1061   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
   1062 
   1063   StringEqualsOptimizations optimizations(invoke);
   1064   if (!optimizations.GetArgumentNotNull()) {
   1065     // Check if input is null, return false if it is.
   1066     __ testl(arg, arg);
   1067     __ j(kEqual, &return_false);
   1068   }
   1069 
   1070   if (!optimizations.GetArgumentIsString()) {
   1071     // Instanceof check for the argument by comparing class fields.
   1072     // All string objects must have the same type since String cannot be subclassed.
   1073     // Receiver must be a string object, so its class field is equal to all strings' class fields.
   1074     // If the argument is a string object, its class field must be equal to receiver's class field.
   1075     //
   1076     // As the String class is expected to be non-movable, we can read the class
   1077     // field from String.equals' arguments without read barriers.
   1078     AssertNonMovableStringClass();
   1079     // Also, because we use the loaded class references only to compare them, we
   1080     // don't need to unpoison them.
   1081     // /* HeapReference<Class> */ ecx = str->klass_
   1082     __ movl(ecx, Address(str, class_offset));
   1083     // if (ecx != /* HeapReference<Class> */ arg->klass_) return false
   1084     __ cmpl(ecx, Address(arg, class_offset));
   1085     __ j(kNotEqual, &return_false);
   1086   }
   1087 
   1088   // Reference equality check, return true if same reference.
   1089   __ cmpl(str, arg);
   1090   __ j(kEqual, &return_true);
   1091 
   1092   // Load length and compression flag of receiver string.
   1093   __ movl(ecx, Address(str, count_offset));
   1094   // Check if lengths and compression flags are equal, return false if they're not.
   1095   // Two identical strings will always have same compression style since
   1096   // compression style is decided on alloc.
   1097   __ cmpl(ecx, Address(arg, count_offset));
   1098   __ j(kNotEqual, &return_false);
   1099   // Return true if strings are empty. Even with string compression `count == 0` means empty.
   1100   static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
   1101                 "Expecting 0=compressed, 1=uncompressed");
   1102   __ jecxz(&return_true);
   1103 
   1104   if (mirror::kUseStringCompression) {
   1105     NearLabel string_uncompressed;
   1106     // Extract length and differentiate between both compressed or both uncompressed.
   1107     // Different compression style is cut above.
   1108     __ shrl(ecx, Immediate(1));
   1109     __ j(kCarrySet, &string_uncompressed);
   1110     // Divide string length by 2, rounding up, and continue as if uncompressed.
   1111     __ addl(ecx, Immediate(1));
   1112     __ shrl(ecx, Immediate(1));
   1113     __ Bind(&string_uncompressed);
   1114   }
   1115   // Load starting addresses of string values into ESI/EDI as required for repe_cmpsl instruction.
   1116   __ leal(esi, Address(str, value_offset));
   1117   __ leal(edi, Address(arg, value_offset));
   1118 
   1119   // Divide string length by 2 to compare characters 2 at a time and adjust for lengths not
   1120   // divisible by 2.
   1121   __ addl(ecx, Immediate(1));
   1122   __ shrl(ecx, Immediate(1));
   1123 
   1124   // Assertions that must hold in order to compare strings 2 characters (uncompressed)
   1125   // or 4 characters (compressed) at a time.
   1126   DCHECK_ALIGNED(value_offset, 4);
   1127   static_assert(IsAligned<4>(kObjectAlignment), "String of odd length is not zero padded");
   1128 
   1129   // Loop to compare strings two characters at a time starting at the beginning of the string.
   1130   __ repe_cmpsl();
   1131   // If strings are not equal, zero flag will be cleared.
   1132   __ j(kNotEqual, &return_false);
   1133 
   1134   // Return true and exit the function.
   1135   // If loop does not result in returning false, we return true.
   1136   __ Bind(&return_true);
   1137   __ movl(esi, Immediate(1));
   1138   __ jmp(&end);
   1139 
   1140   // Return false and exit the function.
   1141   __ Bind(&return_false);
   1142   __ xorl(esi, esi);
   1143   __ Bind(&end);
   1144 }
   1145 
   1146 static void CreateStringIndexOfLocations(HInvoke* invoke,
   1147                                          ArenaAllocator* allocator,
   1148                                          bool start_at_zero) {
   1149   LocationSummary* locations = new (allocator) LocationSummary(invoke,
   1150                                                                LocationSummary::kCallOnSlowPath,
   1151                                                                kIntrinsified);
   1152   // The data needs to be in EDI for scasw, so request that the string is placed there anyway.
   1153   locations->SetInAt(0, Location::RegisterLocation(EDI));
   1154   // If we look for a constant char, we'll still have to copy it into EAX, so just request the
   1155   // allocator to do that anyway. We can still handle the constant case by checking the
   1156   // parameter of the instruction explicitly.
   1157   // Note: This works as we don't clobber EAX anywhere.
   1158   locations->SetInAt(1, Location::RegisterLocation(EAX));
   1159   if (!start_at_zero) {
   1160     locations->SetInAt(2, Location::RequiresRegister());          // The starting index.
   1161   }
   1162   // As we clobber EDI during execution anyway, also use it as the output.
   1163   locations->SetOut(Location::SameAsFirstInput());
   1164 
   1165   // repne scasw uses ECX as the counter.
   1166   locations->AddTemp(Location::RegisterLocation(ECX));
   1167   // Need another temporary to be able to compute the result.
   1168   locations->AddTemp(Location::RequiresRegister());
   1169   if (mirror::kUseStringCompression) {
   1170     // Need another temporary to be able to save unflagged string length.
   1171     locations->AddTemp(Location::RequiresRegister());
   1172   }
   1173 }
   1174 
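// String.indexOf is built around repne scasw: the locations above pin the
// string to EDI, the searched char to EAX, and the scan counter to ECX. Code
// points that do not fit in 16 bits are routed to a slow path, and with string
// compression enabled the length and starting address are first adjusted for
// the compressed (8-bit) layout.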
   1175 static void GenerateStringIndexOf(HInvoke* invoke,
   1176                                   X86Assembler* assembler,
   1177                                   CodeGeneratorX86* codegen,
   1178                                   bool start_at_zero) {
   1179   LocationSummary* locations = invoke->GetLocations();
   1180 
   1181   // Note that the null check must have been done earlier.
   1182   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
   1183 
   1184   Register string_obj = locations->InAt(0).AsRegister<Register>();
   1185   Register search_value = locations->InAt(1).AsRegister<Register>();
   1186   Register counter = locations->GetTemp(0).AsRegister<Register>();
   1187   Register string_length = locations->GetTemp(1).AsRegister<Register>();
   1188   Register out = locations->Out().AsRegister<Register>();
   1189   // Only used when string compression feature is on.
   1190   Register string_length_flagged;
   1191 
   1192   // Check our assumptions for registers.
   1193   DCHECK_EQ(string_obj, EDI);
   1194   DCHECK_EQ(search_value, EAX);
   1195   DCHECK_EQ(counter, ECX);
   1196   DCHECK_EQ(out, EDI);
   1197 
   1198   // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
   1199   // or directly dispatch for a large constant, or omit slow-path for a small constant or a char.
   1200   SlowPathCode* slow_path = nullptr;
   1201   HInstruction* code_point = invoke->InputAt(1);
   1202   if (code_point->IsIntConstant()) {
   1203     if (static_cast<uint32_t>(code_point->AsIntConstant()->GetValue()) >
   1204         std::numeric_limits<uint16_t>::max()) {
   1205       // Always needs the slow-path. We could directly dispatch to it, but this case should be
   1206       // rare, so for simplicity just put the full slow-path down and branch unconditionally.
   1207       slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathX86(invoke);
   1208       codegen->AddSlowPath(slow_path);
   1209       __ jmp(slow_path->GetEntryLabel());
   1210       __ Bind(slow_path->GetExitLabel());
   1211       return;
   1212     }
   1213   } else if (code_point->GetType() != DataType::Type::kUint16) {
   1214     __ cmpl(search_value, Immediate(std::numeric_limits<uint16_t>::max()));
   1215     slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathX86(invoke);
   1216     codegen->AddSlowPath(slow_path);
   1217     __ j(kAbove, slow_path->GetEntryLabel());
   1218   }
   1219 
   1220   // From here down, we know that we are looking for a char that fits in 16 bits.
   1221   // Location of reference to data array within the String object.
   1222   int32_t value_offset = mirror::String::ValueOffset().Int32Value();
   1223   // Location of count within the String object.
   1224   int32_t count_offset = mirror::String::CountOffset().Int32Value();
   1225 
   1226   // Load the count field of the string containing the length and compression flag.
   1227   __ movl(string_length, Address(string_obj, count_offset));
   1228 
   1229   // Do a zero-length check. Even with string compression `count == 0` means empty.
   1230   static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
   1231                 "Expecting 0=compressed, 1=uncompressed");
   1232   // TODO: Support jecxz.
   1233   NearLabel not_found_label;
   1234   __ testl(string_length, string_length);
   1235   __ j(kEqual, &not_found_label);
   1236 
   1237   if (mirror::kUseStringCompression) {
   1238     string_length_flagged = locations->GetTemp(2).AsRegister<Register>();
   1239     __ movl(string_length_flagged, string_length);
   1240     // Extract the length and shift out the least significant bit used as compression flag.
   1241     __ shrl(string_length, Immediate(1));
   1242   }
   1243 
   1244   if (start_at_zero) {
   1245     // Number of chars to scan is the same as the string length.
   1246     __ movl(counter, string_length);
   1247 
   1248     // Move to the start of the string.
   1249     __ addl(string_obj, Immediate(value_offset));
   1250   } else {
   1251     Register start_index = locations->InAt(2).AsRegister<Register>();
   1252 
   1253     // Do a start_index check.
   1254     __ cmpl(start_index, string_length);
   1255     __ j(kGreaterEqual, &not_found_label);
   1256 
   1257     // Ensure we have a start index >= 0.
   1258     __ xorl(counter, counter);
   1259     __ cmpl(start_index, Immediate(0));
   1260     __ cmovl(kGreater, counter, start_index);
   1261 
   1262     if (mirror::kUseStringCompression) {
   1263       NearLabel modify_counter, offset_uncompressed_label;
   1264       __ testl(string_length_flagged, Immediate(1));
   1265       __ j(kNotZero, &offset_uncompressed_label);
   1266       // Move to the start of the string: string_obj + value_offset + start_index.
   1267       __ leal(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_1, value_offset));
   1268       __ jmp(&modify_counter);
   1269 
   1270       // Move to the start of the string: string_obj + value_offset + 2 * start_index.
   1271       __ Bind(&offset_uncompressed_label);
   1272       __ leal(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_2, value_offset));
   1273 
   1274       // Now update ecx (the repne scasw work counter). We have string.length - start_index left to
   1275       // compare.
   1276       __ Bind(&modify_counter);
   1277     } else {
   1278       __ leal(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_2, value_offset));
   1279     }
   1280     __ negl(counter);
   1281     __ leal(counter, Address(string_length, counter, ScaleFactor::TIMES_1, 0));
   1282   }
   1283 
   1284   if (mirror::kUseStringCompression) {
   1285     NearLabel uncompressed_string_comparison;
   1286     NearLabel comparison_done;
   1287     __ testl(string_length_flagged, Immediate(1));
   1288     __ j(kNotZero, &uncompressed_string_comparison);
   1289 
   1290     // Check if EAX (search_value) is ASCII.
   1291     __ cmpl(search_value, Immediate(127));
   1292     __ j(kGreater, &not_found_label);
   1293     // Comparing byte-per-byte.
   1294     __ repne_scasb();
   1295     __ jmp(&comparison_done);
   1296 
   1297     // Everything is set up for repne scasw:
   1298     //   * Comparison address in EDI.
   1299     //   * Counter in ECX.
   1300     __ Bind(&uncompressed_string_comparison);
   1301     __ repne_scasw();
   1302     __ Bind(&comparison_done);
   1303   } else {
   1304     __ repne_scasw();
   1305   }
   1306   // Did we find a match?
   1307   __ j(kNotEqual, &not_found_label);
   1308 
   1309   // Yes, we matched.  Compute the index of the result.
   1310   __ subl(string_length, counter);
   1311   __ leal(out, Address(string_length, -1));
   1312 
   1313   NearLabel done;
   1314   __ jmp(&done);
   1315 
   1316   // Failed to match; return -1.
   1317   __ Bind(&not_found_label);
   1318   __ movl(out, Immediate(-1));
   1319 
   1320   // And join up at the end.
   1321   __ Bind(&done);
   1322   if (slow_path != nullptr) {
   1323     __ Bind(slow_path->GetExitLabel());
   1324   }
   1325 }
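
// Illustrative sketch, not part of the ART sources: the scan that the repne scasb / repne scasw
// sequence above implements, written as plain C++ for reference. The packed `count` layout
// (bit 0 clear = compressed, as the static_assert above checks) follows kUseStringCompression;
// the function and parameter names are invented for this sketch.
static int32_t IndexOfSketch(const uint8_t* compressed_data,
                             const uint16_t* uncompressed_data,
                             uint32_t packed_count,
                             uint16_t search_value,
                             int32_t start_index) {
  const bool compressed = (packed_count & 1u) == 0u;
  const int32_t length = static_cast<int32_t>(packed_count >> 1);
  if (length == 0 || start_index >= length) {
    return -1;
  }
  if (start_index < 0) {
    start_index = 0;  // Matches the cmovl(kGreater, counter, start_index) clamp above.
  }
  if (compressed && search_value > 127) {
    return -1;  // An ASCII-compressed string cannot contain a char above 0x7F.
  }
  for (int32_t i = start_index; i < length; ++i) {
    const uint16_t c = compressed ? compressed_data[i] : uncompressed_data[i];
    if (c == search_value) {
      return i;
    }
  }
  return -1;
}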
   1326 
   1327 void IntrinsicLocationsBuilderX86::VisitStringIndexOf(HInvoke* invoke) {
   1328   CreateStringIndexOfLocations(invoke, allocator_, /* start_at_zero= */ true);
   1329 }
   1330 
   1331 void IntrinsicCodeGeneratorX86::VisitStringIndexOf(HInvoke* invoke) {
   1332   GenerateStringIndexOf(invoke, GetAssembler(), codegen_, /* start_at_zero= */ true);
   1333 }
   1334 
   1335 void IntrinsicLocationsBuilderX86::VisitStringIndexOfAfter(HInvoke* invoke) {
   1336   CreateStringIndexOfLocations(invoke, allocator_, /* start_at_zero= */ false);
   1337 }
   1338 
   1339 void IntrinsicCodeGeneratorX86::VisitStringIndexOfAfter(HInvoke* invoke) {
   1340   GenerateStringIndexOf(invoke, GetAssembler(), codegen_, /* start_at_zero= */ false);
   1341 }
   1342 
   1343 void IntrinsicLocationsBuilderX86::VisitStringNewStringFromBytes(HInvoke* invoke) {
   1344   LocationSummary* locations = new (allocator_) LocationSummary(
   1345       invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
   1346   InvokeRuntimeCallingConvention calling_convention;
   1347   locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
   1348   locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   1349   locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
   1350   locations->SetInAt(3, Location::RegisterLocation(calling_convention.GetRegisterAt(3)));
   1351   locations->SetOut(Location::RegisterLocation(EAX));
   1352 }
   1353 
   1354 void IntrinsicCodeGeneratorX86::VisitStringNewStringFromBytes(HInvoke* invoke) {
   1355   X86Assembler* assembler = GetAssembler();
   1356   LocationSummary* locations = invoke->GetLocations();
   1357 
   1358   Register byte_array = locations->InAt(0).AsRegister<Register>();
   1359   __ testl(byte_array, byte_array);
   1360   SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86(invoke);
   1361   codegen_->AddSlowPath(slow_path);
   1362   __ j(kEqual, slow_path->GetEntryLabel());
   1363 
   1364   codegen_->InvokeRuntime(kQuickAllocStringFromBytes, invoke, invoke->GetDexPc());
   1365   CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>();
   1366   __ Bind(slow_path->GetExitLabel());
   1367 }
   1368 
   1369 void IntrinsicLocationsBuilderX86::VisitStringNewStringFromChars(HInvoke* invoke) {
   1370   LocationSummary* locations =
   1371       new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
   1372   InvokeRuntimeCallingConvention calling_convention;
   1373   locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
   1374   locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   1375   locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
   1376   locations->SetOut(Location::RegisterLocation(EAX));
   1377 }
   1378 
   1379 void IntrinsicCodeGeneratorX86::VisitStringNewStringFromChars(HInvoke* invoke) {
   1380   // No need to emit code checking whether `locations->InAt(2)` is a null
   1381   // pointer, as callers of the native method
   1382   //
   1383   //   java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data)
   1384   //
   1385   // all include a null check on `data` before calling that method.
   1386   codegen_->InvokeRuntime(kQuickAllocStringFromChars, invoke, invoke->GetDexPc());
   1387   CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>();
   1388 }
   1389 
   1390 void IntrinsicLocationsBuilderX86::VisitStringNewStringFromString(HInvoke* invoke) {
   1391   LocationSummary* locations = new (allocator_) LocationSummary(
   1392       invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
   1393   InvokeRuntimeCallingConvention calling_convention;
   1394   locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
   1395   locations->SetOut(Location::RegisterLocation(EAX));
   1396 }
   1397 
   1398 void IntrinsicCodeGeneratorX86::VisitStringNewStringFromString(HInvoke* invoke) {
   1399   X86Assembler* assembler = GetAssembler();
   1400   LocationSummary* locations = invoke->GetLocations();
   1401 
   1402   Register string_to_copy = locations->InAt(0).AsRegister<Register>();
   1403   __ testl(string_to_copy, string_to_copy);
   1404   SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86(invoke);
   1405   codegen_->AddSlowPath(slow_path);
   1406   __ j(kEqual, slow_path->GetEntryLabel());
   1407 
   1408   codegen_->InvokeRuntime(kQuickAllocStringFromString, invoke, invoke->GetDexPc());
   1409   CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>();
   1410   __ Bind(slow_path->GetExitLabel());
   1411 }
   1412 
   1413 void IntrinsicLocationsBuilderX86::VisitStringGetCharsNoCheck(HInvoke* invoke) {
   1414   // public void getChars(int srcBegin, int srcEnd, char[] dst, int dstBegin);
   1415   LocationSummary* locations =
   1416       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   1417   locations->SetInAt(0, Location::RequiresRegister());
   1418   locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1)));
   1419   // Place srcEnd in ECX to save a move below.
   1420   locations->SetInAt(2, Location::RegisterLocation(ECX));
   1421   locations->SetInAt(3, Location::RequiresRegister());
   1422   locations->SetInAt(4, Location::RequiresRegister());
   1423 
   1424   // And we need some temporaries.  We will use REP MOVSW, so we need fixed registers.
   1425   // We don't have enough registers to also reserve ECX, so it is saved and restored manually below.
   1426   locations->AddTemp(Location::RegisterLocation(ESI));
   1427   locations->AddTemp(Location::RegisterLocation(EDI));
   1428 }
   1429 
   1430 void IntrinsicCodeGeneratorX86::VisitStringGetCharsNoCheck(HInvoke* invoke) {
   1431   X86Assembler* assembler = GetAssembler();
   1432   LocationSummary* locations = invoke->GetLocations();
   1433 
   1434   size_t char_component_size = DataType::Size(DataType::Type::kUint16);
   1435   // Location of data in char array buffer.
   1436   const uint32_t data_offset = mirror::Array::DataOffset(char_component_size).Uint32Value();
   1437   // Location of char array data in string.
   1438   const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
   1439 
   1440   // public void getChars(int srcBegin, int srcEnd, char[] dst, int dstBegin);
   1441   Register obj = locations->InAt(0).AsRegister<Register>();
   1442   Location srcBegin = locations->InAt(1);
   1443   int srcBegin_value =
   1444     srcBegin.IsConstant() ? srcBegin.GetConstant()->AsIntConstant()->GetValue() : 0;
   1445   Register srcEnd = locations->InAt(2).AsRegister<Register>();
   1446   Register dst = locations->InAt(3).AsRegister<Register>();
   1447   Register dstBegin = locations->InAt(4).AsRegister<Register>();
   1448 
   1449   // Check assumption that sizeof(Char) is 2 (used in scaling below).
   1450   const size_t char_size = DataType::Size(DataType::Type::kUint16);
   1451   DCHECK_EQ(char_size, 2u);
   1452 
   1453   // Compute the number of chars (words) to move.
   1454   // Save ECX, since we don't know if it will be used later.
   1455   __ pushl(ECX);
   1456   int stack_adjust = kX86WordSize;
   1457   __ cfi().AdjustCFAOffset(stack_adjust);
   1458   DCHECK_EQ(srcEnd, ECX);
   1459   if (srcBegin.IsConstant()) {
   1460     __ subl(ECX, Immediate(srcBegin_value));
   1461   } else {
   1462     DCHECK(srcBegin.IsRegister());
   1463     __ subl(ECX, srcBegin.AsRegister<Register>());
   1464   }
   1465 
   1466   NearLabel done;
   1467   if (mirror::kUseStringCompression) {
   1468     // Location of count within the String object.
   1469     const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
   1470     const size_t c_char_size = DataType::Size(DataType::Type::kInt8);
   1471     DCHECK_EQ(c_char_size, 1u);
   1472     __ pushl(EAX);
   1473     __ cfi().AdjustCFAOffset(stack_adjust);
   1474 
   1475     NearLabel copy_loop, copy_uncompressed;
   1476     __ testl(Address(obj, count_offset), Immediate(1));
   1477     static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
   1478                   "Expecting 0=compressed, 1=uncompressed");
   1479     __ j(kNotZero, &copy_uncompressed);
   1480     // Compute the source address: the string's value offset plus srcBegin,
   1481     // scaled by one byte per char since the string is compressed.
   1482     __ leal(ESI, CodeGeneratorX86::ArrayAddress(obj, srcBegin, TIMES_1, value_offset));
   1483 
   1484     // Start the loop to copy String's value to Array of Char.
   1485     __ leal(EDI, Address(dst, dstBegin, ScaleFactor::TIMES_2, data_offset));
   1486     __ Bind(&copy_loop);
   1487     __ jecxz(&done);
   1488     // Use EAX temporary (convert byte from ESI to word).
   1489     // TODO: Use LODSB/STOSW (not supported by X86Assembler) with AH initialized to 0.
   1490     __ movzxb(EAX, Address(ESI, 0));
   1491     __ movw(Address(EDI, 0), EAX);
   1492     __ leal(EDI, Address(EDI, char_size));
   1493     __ leal(ESI, Address(ESI, c_char_size));
   1494     // TODO: Add support for LOOP to X86Assembler.
   1495     __ subl(ECX, Immediate(1));
   1496     __ jmp(&copy_loop);
   1497     __ Bind(&copy_uncompressed);
   1498   }
   1499 
   1500   // Do the copy for uncompressed string.
   1501   // Compute the address of the destination buffer.
   1502   __ leal(EDI, Address(dst, dstBegin, ScaleFactor::TIMES_2, data_offset));
   1503   __ leal(ESI, CodeGeneratorX86::ArrayAddress(obj, srcBegin, TIMES_2, value_offset));
   1504   __ rep_movsw();
   1505 
   1506   __ Bind(&done);
   1507   if (mirror::kUseStringCompression) {
   1508     // Restore EAX.
   1509     __ popl(EAX);
   1510     __ cfi().AdjustCFAOffset(-stack_adjust);
   1511   }
   1512   // Restore ECX.
   1513   __ popl(ECX);
   1514   __ cfi().AdjustCFAOffset(-stack_adjust);
   1515 }
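
// Illustrative sketch, not part of the ART sources: the copy that the code generated above
// performs. The compressed path widens each byte to a 16-bit char (the movzxb/movw loop);
// the uncompressed path is a straight 16-bit copy (rep movsw). Names are invented here.
static void GetCharsNoCheckSketch(const void* src_value_data,
                                  bool compressed,
                                  int32_t src_begin,
                                  int32_t src_end,
                                  uint16_t* dst,
                                  int32_t dst_begin) {
  const int32_t count = src_end - src_begin;
  if (compressed) {
    const uint8_t* src = static_cast<const uint8_t*>(src_value_data) + src_begin;
    for (int32_t i = 0; i < count; ++i) {
      dst[dst_begin + i] = src[i];  // Zero-extend each byte, like movzxb + movw.
    }
  } else {
    const uint16_t* src = static_cast<const uint16_t*>(src_value_data) + src_begin;
    for (int32_t i = 0; i < count; ++i) {
      dst[dst_begin + i] = src[i];  // One 16-bit word per iteration, like rep movsw.
    }
  }
}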
   1516 
   1517 static void GenPeek(LocationSummary* locations, DataType::Type size, X86Assembler* assembler) {
   1518   Register address = locations->InAt(0).AsRegisterPairLow<Register>();
   1519   Location out_loc = locations->Out();
   1520   // x86 allows unaligned access. We do not have to check the input or use specific instructions
   1521   // to avoid a SIGBUS.
   1522   switch (size) {
   1523     case DataType::Type::kInt8:
   1524       __ movsxb(out_loc.AsRegister<Register>(), Address(address, 0));
   1525       break;
   1526     case DataType::Type::kInt16:
   1527       __ movsxw(out_loc.AsRegister<Register>(), Address(address, 0));
   1528       break;
   1529     case DataType::Type::kInt32:
   1530       __ movl(out_loc.AsRegister<Register>(), Address(address, 0));
   1531       break;
   1532     case DataType::Type::kInt64:
   1533       __ movl(out_loc.AsRegisterPairLow<Register>(), Address(address, 0));
   1534       __ movl(out_loc.AsRegisterPairHigh<Register>(), Address(address, 4));
   1535       break;
   1536     default:
   1537       LOG(FATAL) << "Type not recognized for peek: " << size;
   1538       UNREACHABLE();
   1539   }
   1540 }
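
// Illustrative sketch, not part of the ART sources: what a Memory.peekIntNative call computes.
// Only the low word of the 64-bit address is used on 32-bit x86 (AsRegisterPairLow above), and
// the load may be unaligned, which x86 tolerates. The helper name is invented for this sketch.
static int32_t PeekIntSketch(int64_t address) {
  const int32_t* ptr = reinterpret_cast<const int32_t*>(static_cast<uint32_t>(address));
  return *ptr;  // movl out, [address]; a 64-bit peek reads two adjacent 32-bit words.
}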
   1541 
   1542 void IntrinsicLocationsBuilderX86::VisitMemoryPeekByte(HInvoke* invoke) {
   1543   CreateLongToIntLocations(allocator_, invoke);
   1544 }
   1545 
   1546 void IntrinsicCodeGeneratorX86::VisitMemoryPeekByte(HInvoke* invoke) {
   1547   GenPeek(invoke->GetLocations(), DataType::Type::kInt8, GetAssembler());
   1548 }
   1549 
   1550 void IntrinsicLocationsBuilderX86::VisitMemoryPeekIntNative(HInvoke* invoke) {
   1551   CreateLongToIntLocations(allocator_, invoke);
   1552 }
   1553 
   1554 void IntrinsicCodeGeneratorX86::VisitMemoryPeekIntNative(HInvoke* invoke) {
   1555   GenPeek(invoke->GetLocations(), DataType::Type::kInt32, GetAssembler());
   1556 }
   1557 
   1558 void IntrinsicLocationsBuilderX86::VisitMemoryPeekLongNative(HInvoke* invoke) {
   1559   CreateLongToLongLocations(allocator_, invoke);
   1560 }
   1561 
   1562 void IntrinsicCodeGeneratorX86::VisitMemoryPeekLongNative(HInvoke* invoke) {
   1563   GenPeek(invoke->GetLocations(), DataType::Type::kInt64, GetAssembler());
   1564 }
   1565 
   1566 void IntrinsicLocationsBuilderX86::VisitMemoryPeekShortNative(HInvoke* invoke) {
   1567   CreateLongToIntLocations(allocator_, invoke);
   1568 }
   1569 
   1570 void IntrinsicCodeGeneratorX86::VisitMemoryPeekShortNative(HInvoke* invoke) {
   1571   GenPeek(invoke->GetLocations(), DataType::Type::kInt16, GetAssembler());
   1572 }
   1573 
   1574 static void CreateLongIntToVoidLocations(ArenaAllocator* allocator,
   1575                                          DataType::Type size,
   1576                                          HInvoke* invoke) {
   1577   LocationSummary* locations =
   1578       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   1579   locations->SetInAt(0, Location::RequiresRegister());
   1580   HInstruction* value = invoke->InputAt(1);
   1581   if (size == DataType::Type::kInt8) {
   1582     locations->SetInAt(1, Location::ByteRegisterOrConstant(EDX, value));
   1583   } else {
   1584     locations->SetInAt(1, Location::RegisterOrConstant(value));
   1585   }
   1586 }
   1587 
   1588 static void GenPoke(LocationSummary* locations, DataType::Type size, X86Assembler* assembler) {
   1589   Register address = locations->InAt(0).AsRegisterPairLow<Register>();
   1590   Location value_loc = locations->InAt(1);
   1591   // x86 allows unaligned access. We do not have to check the input or use specific instructions
   1592   // to avoid a SIGBUS.
   1593   switch (size) {
   1594     case DataType::Type::kInt8:
   1595       if (value_loc.IsConstant()) {
   1596         __ movb(Address(address, 0),
   1597                 Immediate(value_loc.GetConstant()->AsIntConstant()->GetValue()));
   1598       } else {
   1599         __ movb(Address(address, 0), value_loc.AsRegister<ByteRegister>());
   1600       }
   1601       break;
   1602     case DataType::Type::kInt16:
   1603       if (value_loc.IsConstant()) {
   1604         __ movw(Address(address, 0),
   1605                 Immediate(value_loc.GetConstant()->AsIntConstant()->GetValue()));
   1606       } else {
   1607         __ movw(Address(address, 0), value_loc.AsRegister<Register>());
   1608       }
   1609       break;
   1610     case DataType::Type::kInt32:
   1611       if (value_loc.IsConstant()) {
   1612         __ movl(Address(address, 0),
   1613                 Immediate(value_loc.GetConstant()->AsIntConstant()->GetValue()));
   1614       } else {
   1615         __ movl(Address(address, 0), value_loc.AsRegister<Register>());
   1616       }
   1617       break;
   1618     case DataType::Type::kInt64:
   1619       if (value_loc.IsConstant()) {
   1620         int64_t value = value_loc.GetConstant()->AsLongConstant()->GetValue();
   1621         __ movl(Address(address, 0), Immediate(Low32Bits(value)));
   1622         __ movl(Address(address, 4), Immediate(High32Bits(value)));
   1623       } else {
   1624         __ movl(Address(address, 0), value_loc.AsRegisterPairLow<Register>());
   1625         __ movl(Address(address, 4), value_loc.AsRegisterPairHigh<Register>());
   1626       }
   1627       break;
   1628     default:
   1629       LOG(FATAL) << "Type not recognized for poke: " << size;
   1630       UNREACHABLE();
   1631   }
   1632 }
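
// Illustrative sketch, not part of the ART sources: the store performed by the corresponding
// Memory.pokeIntNative intrinsic, mirroring PeekIntSketch above. As with the peek, only the low
// word of the address is used and the (possibly unaligned) store is a single movl.
static void PokeIntSketch(int64_t address, int32_t value) {
  int32_t* ptr = reinterpret_cast<int32_t*>(static_cast<uint32_t>(address));
  *ptr = value;  // A 64-bit poke writes Low32Bits(value) and then High32Bits(value).
}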
   1633 
   1634 void IntrinsicLocationsBuilderX86::VisitMemoryPokeByte(HInvoke* invoke) {
   1635   CreateLongIntToVoidLocations(allocator_, DataType::Type::kInt8, invoke);
   1636 }
   1637 
   1638 void IntrinsicCodeGeneratorX86::VisitMemoryPokeByte(HInvoke* invoke) {
   1639   GenPoke(invoke->GetLocations(), DataType::Type::kInt8, GetAssembler());
   1640 }
   1641 
   1642 void IntrinsicLocationsBuilderX86::VisitMemoryPokeIntNative(HInvoke* invoke) {
   1643   CreateLongIntToVoidLocations(allocator_, DataType::Type::kInt32, invoke);
   1644 }
   1645 
   1646 void IntrinsicCodeGeneratorX86::VisitMemoryPokeIntNative(HInvoke* invoke) {
   1647   GenPoke(invoke->GetLocations(), DataType::Type::kInt32, GetAssembler());
   1648 }
   1649 
   1650 void IntrinsicLocationsBuilderX86::VisitMemoryPokeLongNative(HInvoke* invoke) {
   1651   CreateLongIntToVoidLocations(allocator_, DataType::Type::kInt64, invoke);
   1652 }
   1653 
   1654 void IntrinsicCodeGeneratorX86::VisitMemoryPokeLongNative(HInvoke* invoke) {
   1655   GenPoke(invoke->GetLocations(), DataType::Type::kInt64, GetAssembler());
   1656 }
   1657 
   1658 void IntrinsicLocationsBuilderX86::VisitMemoryPokeShortNative(HInvoke* invoke) {
   1659   CreateLongIntToVoidLocations(allocator_, DataType::Type::kInt16, invoke);
   1660 }
   1661 
   1662 void IntrinsicCodeGeneratorX86::VisitMemoryPokeShortNative(HInvoke* invoke) {
   1663   GenPoke(invoke->GetLocations(), DataType::Type::kInt16, GetAssembler());
   1664 }
   1665 
   1666 void IntrinsicLocationsBuilderX86::VisitThreadCurrentThread(HInvoke* invoke) {
   1667   LocationSummary* locations =
   1668       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   1669   locations->SetOut(Location::RequiresRegister());
   1670 }
   1671 
   1672 void IntrinsicCodeGeneratorX86::VisitThreadCurrentThread(HInvoke* invoke) {
   1673   Register out = invoke->GetLocations()->Out().AsRegister<Register>();
   1674   GetAssembler()->fs()->movl(out, Address::Absolute(Thread::PeerOffset<kX86PointerSize>()));
   1675 }
   1676 
   1677 static void GenUnsafeGet(HInvoke* invoke,
   1678                          DataType::Type type,
   1679                          bool is_volatile,
   1680                          CodeGeneratorX86* codegen) {
   1681   X86Assembler* assembler = down_cast<X86Assembler*>(codegen->GetAssembler());
   1682   LocationSummary* locations = invoke->GetLocations();
   1683   Location base_loc = locations->InAt(1);
   1684   Register base = base_loc.AsRegister<Register>();
   1685   Location offset_loc = locations->InAt(2);
   1686   Register offset = offset_loc.AsRegisterPairLow<Register>();
   1687   Location output_loc = locations->Out();
   1688 
   1689   switch (type) {
   1690     case DataType::Type::kInt32: {
   1691       Register output = output_loc.AsRegister<Register>();
   1692       __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
   1693       break;
   1694     }
   1695 
   1696     case DataType::Type::kReference: {
   1697       Register output = output_loc.AsRegister<Register>();
   1698       if (kEmitCompilerReadBarrier) {
   1699         if (kUseBakerReadBarrier) {
   1700           Address src(base, offset, ScaleFactor::TIMES_1, 0);
   1701           codegen->GenerateReferenceLoadWithBakerReadBarrier(
   1702               invoke, output_loc, base, src, /* needs_null_check= */ false);
   1703         } else {
   1704           __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
   1705           codegen->GenerateReadBarrierSlow(
   1706               invoke, output_loc, output_loc, base_loc, 0U, offset_loc);
   1707         }
   1708       } else {
   1709         __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
   1710         __ MaybeUnpoisonHeapReference(output);
   1711       }
   1712       break;
   1713     }
   1714 
   1715     case DataType::Type::kInt64: {
   1716         Register output_lo = output_loc.AsRegisterPairLow<Register>();
   1717         Register output_hi = output_loc.AsRegisterPairHigh<Register>();
   1718         if (is_volatile) {
   1719           // Need to use an XMM register to read the 64-bit value atomically.
   1720           XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
   1721           __ movsd(temp, Address(base, offset, ScaleFactor::TIMES_1, 0));
   1722           __ movd(output_lo, temp);
   1723           __ psrlq(temp, Immediate(32));
   1724           __ movd(output_hi, temp);
   1725         } else {
   1726           __ movl(output_lo, Address(base, offset, ScaleFactor::TIMES_1, 0));
   1727           __ movl(output_hi, Address(base, offset, ScaleFactor::TIMES_1, 4));
   1728         }
   1729       }
   1730       break;
   1731 
   1732     default:
   1733       LOG(FATAL) << "Unsupported op size " << type;
   1734       UNREACHABLE();
   1735   }
   1736 }
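
// Illustrative sketch, not part of the ART sources: the volatile Int64 path above loads the
// whole value with one movsd (atomic on x86) and then splits it into the output register pair
// with psrlq/movd. In plain C++ the split looks like this; the names are invented.
static void SplitAtomicallyLoadedInt64Sketch(uint64_t value, uint32_t* out_lo, uint32_t* out_hi) {
  *out_lo = static_cast<uint32_t>(value);        // movd output_lo, temp
  *out_hi = static_cast<uint32_t>(value >> 32);  // psrlq temp, 32; movd output_hi, temp
}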
   1737 
   1738 static void CreateIntIntIntToIntLocations(ArenaAllocator* allocator,
   1739                                           HInvoke* invoke,
   1740                                           DataType::Type type,
   1741                                           bool is_volatile) {
   1742   bool can_call = kEmitCompilerReadBarrier &&
   1743       (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject ||
   1744        invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile);
   1745   LocationSummary* locations =
   1746       new (allocator) LocationSummary(invoke,
   1747                                       can_call
   1748                                           ? LocationSummary::kCallOnSlowPath
   1749                                           : LocationSummary::kNoCall,
   1750                                       kIntrinsified);
   1751   if (can_call && kUseBakerReadBarrier) {
   1752     locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
   1753   }
   1754   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   1755   locations->SetInAt(1, Location::RequiresRegister());
   1756   locations->SetInAt(2, Location::RequiresRegister());
   1757   if (type == DataType::Type::kInt64) {
   1758     if (is_volatile) {
   1759       // Need an XMM register to read the volatile 64-bit value atomically.
   1760       locations->AddTemp(Location::RequiresFpuRegister());
   1761       locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
   1762     } else {
   1763       locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
   1764     }
   1765   } else {
   1766     locations->SetOut(Location::RequiresRegister(),
   1767                       (can_call ? Location::kOutputOverlap : Location::kNoOutputOverlap));
   1768   }
   1769 }
   1770 
   1771 void IntrinsicLocationsBuilderX86::VisitUnsafeGet(HInvoke* invoke) {
   1772   CreateIntIntIntToIntLocations(
   1773       allocator_, invoke, DataType::Type::kInt32, /* is_volatile= */ false);
   1774 }
   1775 void IntrinsicLocationsBuilderX86::VisitUnsafeGetVolatile(HInvoke* invoke) {
   1776   CreateIntIntIntToIntLocations(allocator_, invoke, DataType::Type::kInt32, /* is_volatile= */ true);
   1777 }
   1778 void IntrinsicLocationsBuilderX86::VisitUnsafeGetLong(HInvoke* invoke) {
   1779   CreateIntIntIntToIntLocations(
   1780       allocator_, invoke, DataType::Type::kInt64, /* is_volatile= */ false);
   1781 }
   1782 void IntrinsicLocationsBuilderX86::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
   1783   CreateIntIntIntToIntLocations(allocator_, invoke, DataType::Type::kInt64, /* is_volatile= */ true);
   1784 }
   1785 void IntrinsicLocationsBuilderX86::VisitUnsafeGetObject(HInvoke* invoke) {
   1786   CreateIntIntIntToIntLocations(
   1787       allocator_, invoke, DataType::Type::kReference, /* is_volatile= */ false);
   1788 }
   1789 void IntrinsicLocationsBuilderX86::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
   1790   CreateIntIntIntToIntLocations(
   1791       allocator_, invoke, DataType::Type::kReference, /* is_volatile= */ true);
   1792 }
   1793 
   1794 
   1795 void IntrinsicCodeGeneratorX86::VisitUnsafeGet(HInvoke* invoke) {
   1796   GenUnsafeGet(invoke, DataType::Type::kInt32, /* is_volatile= */ false, codegen_);
   1797 }
   1798 void IntrinsicCodeGeneratorX86::VisitUnsafeGetVolatile(HInvoke* invoke) {
   1799   GenUnsafeGet(invoke, DataType::Type::kInt32, /* is_volatile= */ true, codegen_);
   1800 }
   1801 void IntrinsicCodeGeneratorX86::VisitUnsafeGetLong(HInvoke* invoke) {
   1802   GenUnsafeGet(invoke, DataType::Type::kInt64, /* is_volatile= */ false, codegen_);
   1803 }
   1804 void IntrinsicCodeGeneratorX86::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
   1805   GenUnsafeGet(invoke, DataType::Type::kInt64, /* is_volatile= */ true, codegen_);
   1806 }
   1807 void IntrinsicCodeGeneratorX86::VisitUnsafeGetObject(HInvoke* invoke) {
   1808   GenUnsafeGet(invoke, DataType::Type::kReference, /* is_volatile= */ false, codegen_);
   1809 }
   1810 void IntrinsicCodeGeneratorX86::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
   1811   GenUnsafeGet(invoke, DataType::Type::kReference, /* is_volatile= */ true, codegen_);
   1812 }
   1813 
   1814 
   1815 static void CreateIntIntIntIntToVoidPlusTempsLocations(ArenaAllocator* allocator,
   1816                                                        DataType::Type type,
   1817                                                        HInvoke* invoke,
   1818                                                        bool is_volatile) {
   1819   LocationSummary* locations =
   1820       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   1821   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   1822   locations->SetInAt(1, Location::RequiresRegister());
   1823   locations->SetInAt(2, Location::RequiresRegister());
   1824   locations->SetInAt(3, Location::RequiresRegister());
   1825   if (type == DataType::Type::kReference) {
   1826     // Need temp registers for card-marking.
   1827     locations->AddTemp(Location::RequiresRegister());  // Possibly used for reference poisoning too.
   1828     // Ensure the value is in a byte register.
   1829     locations->AddTemp(Location::RegisterLocation(ECX));
   1830   } else if (type == DataType::Type::kInt64 && is_volatile) {
   1831     locations->AddTemp(Location::RequiresFpuRegister());
   1832     locations->AddTemp(Location::RequiresFpuRegister());
   1833   }
   1834 }
   1835 
   1836 void IntrinsicLocationsBuilderX86::VisitUnsafePut(HInvoke* invoke) {
   1837   CreateIntIntIntIntToVoidPlusTempsLocations(
   1838       allocator_, DataType::Type::kInt32, invoke, /* is_volatile= */ false);
   1839 }
   1840 void IntrinsicLocationsBuilderX86::VisitUnsafePutOrdered(HInvoke* invoke) {
   1841   CreateIntIntIntIntToVoidPlusTempsLocations(
   1842       allocator_, DataType::Type::kInt32, invoke, /* is_volatile= */ false);
   1843 }
   1844 void IntrinsicLocationsBuilderX86::VisitUnsafePutVolatile(HInvoke* invoke) {
   1845   CreateIntIntIntIntToVoidPlusTempsLocations(
   1846       allocator_, DataType::Type::kInt32, invoke, /* is_volatile= */ true);
   1847 }
   1848 void IntrinsicLocationsBuilderX86::VisitUnsafePutObject(HInvoke* invoke) {
   1849   CreateIntIntIntIntToVoidPlusTempsLocations(
   1850       allocator_, DataType::Type::kReference, invoke, /* is_volatile= */ false);
   1851 }
   1852 void IntrinsicLocationsBuilderX86::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
   1853   CreateIntIntIntIntToVoidPlusTempsLocations(
   1854       allocator_, DataType::Type::kReference, invoke, /* is_volatile= */ false);
   1855 }
   1856 void IntrinsicLocationsBuilderX86::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
   1857   CreateIntIntIntIntToVoidPlusTempsLocations(
   1858       allocator_, DataType::Type::kReference, invoke, /* is_volatile= */ true);
   1859 }
   1860 void IntrinsicLocationsBuilderX86::VisitUnsafePutLong(HInvoke* invoke) {
   1861   CreateIntIntIntIntToVoidPlusTempsLocations(
   1862       allocator_, DataType::Type::kInt64, invoke, /* is_volatile= */ false);
   1863 }
   1864 void IntrinsicLocationsBuilderX86::VisitUnsafePutLongOrdered(HInvoke* invoke) {
   1865   CreateIntIntIntIntToVoidPlusTempsLocations(
   1866       allocator_, DataType::Type::kInt64, invoke, /* is_volatile= */ false);
   1867 }
   1868 void IntrinsicLocationsBuilderX86::VisitUnsafePutLongVolatile(HInvoke* invoke) {
   1869   CreateIntIntIntIntToVoidPlusTempsLocations(
   1870       allocator_, DataType::Type::kInt64, invoke, /* is_volatile= */ true);
   1871 }
   1872 
   1873 // Ordered puts need no special handling: they require an AnyStore barrier, which the x86
   1874 // memory model already provides.
   1875 static void GenUnsafePut(LocationSummary* locations,
   1876                          DataType::Type type,
   1877                          bool is_volatile,
   1878                          CodeGeneratorX86* codegen) {
   1879   X86Assembler* assembler = down_cast<X86Assembler*>(codegen->GetAssembler());
   1880   Register base = locations->InAt(1).AsRegister<Register>();
   1881   Register offset = locations->InAt(2).AsRegisterPairLow<Register>();
   1882   Location value_loc = locations->InAt(3);
   1883 
   1884   if (type == DataType::Type::kInt64) {
   1885     Register value_lo = value_loc.AsRegisterPairLow<Register>();
   1886     Register value_hi = value_loc.AsRegisterPairHigh<Register>();
   1887     if (is_volatile) {
   1888       XmmRegister temp1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
   1889       XmmRegister temp2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
   1890       __ movd(temp1, value_lo);
   1891       __ movd(temp2, value_hi);
   1892       __ punpckldq(temp1, temp2);
   1893       __ movsd(Address(base, offset, ScaleFactor::TIMES_1, 0), temp1);
   1894     } else {
   1895       __ movl(Address(base, offset, ScaleFactor::TIMES_1, 0), value_lo);
   1896       __ movl(Address(base, offset, ScaleFactor::TIMES_1, 4), value_hi);
   1897     }
   1898   } else if (kPoisonHeapReferences && type == DataType::Type::kReference) {
   1899     Register temp = locations->GetTemp(0).AsRegister<Register>();
   1900     __ movl(temp, value_loc.AsRegister<Register>());
   1901     __ PoisonHeapReference(temp);
   1902     __ movl(Address(base, offset, ScaleFactor::TIMES_1, 0), temp);
   1903   } else {
   1904     __ movl(Address(base, offset, ScaleFactor::TIMES_1, 0), value_loc.AsRegister<Register>());
   1905   }
   1906 
   1907   if (is_volatile) {
   1908     codegen->MemoryFence();
   1909   }
   1910 
   1911   if (type == DataType::Type::kReference) {
   1912     bool value_can_be_null = true;  // TODO: Worth finding out this information?
   1913     codegen->MarkGCCard(locations->GetTemp(0).AsRegister<Register>(),
   1914                         locations->GetTemp(1).AsRegister<Register>(),
   1915                         base,
   1916                         value_loc.AsRegister<Register>(),
   1917                         value_can_be_null);
   1918   }
   1919 }
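
// Illustrative sketch, not part of the ART sources: the volatile Int64 store above first builds
// one 64-bit value from the register pair (movd/movd/punpckldq) so it can be written with a
// single atomic movsd. The plain C++ equivalent of that packing step, with an invented name:
static uint64_t PackInt64ForAtomicStoreSketch(uint32_t value_lo, uint32_t value_hi) {
  return (static_cast<uint64_t>(value_hi) << 32) | value_lo;  // punpckldq temp1, temp2
}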
   1920 
   1921 void IntrinsicCodeGeneratorX86::VisitUnsafePut(HInvoke* invoke) {
   1922   GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt32, /* is_volatile= */ false, codegen_);
   1923 }
   1924 void IntrinsicCodeGeneratorX86::VisitUnsafePutOrdered(HInvoke* invoke) {
   1925   GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt32, /* is_volatile= */ false, codegen_);
   1926 }
   1927 void IntrinsicCodeGeneratorX86::VisitUnsafePutVolatile(HInvoke* invoke) {
   1928   GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt32, /* is_volatile= */ true, codegen_);
   1929 }
   1930 void IntrinsicCodeGeneratorX86::VisitUnsafePutObject(HInvoke* invoke) {
   1931   GenUnsafePut(
   1932       invoke->GetLocations(), DataType::Type::kReference, /* is_volatile= */ false, codegen_);
   1933 }
   1934 void IntrinsicCodeGeneratorX86::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
   1935   GenUnsafePut(
   1936       invoke->GetLocations(), DataType::Type::kReference, /* is_volatile= */ false, codegen_);
   1937 }
   1938 void IntrinsicCodeGeneratorX86::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
   1939   GenUnsafePut(
   1940       invoke->GetLocations(), DataType::Type::kReference, /* is_volatile= */ true, codegen_);
   1941 }
   1942 void IntrinsicCodeGeneratorX86::VisitUnsafePutLong(HInvoke* invoke) {
   1943   GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt64, /* is_volatile= */ false, codegen_);
   1944 }
   1945 void IntrinsicCodeGeneratorX86::VisitUnsafePutLongOrdered(HInvoke* invoke) {
   1946   GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt64, /* is_volatile= */ false, codegen_);
   1947 }
   1948 void IntrinsicCodeGeneratorX86::VisitUnsafePutLongVolatile(HInvoke* invoke) {
   1949   GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt64, /* is_volatile= */ true, codegen_);
   1950 }
   1951 
   1952 static void CreateIntIntIntIntIntToInt(ArenaAllocator* allocator,
   1953                                        DataType::Type type,
   1954                                        HInvoke* invoke) {
   1955   bool can_call = kEmitCompilerReadBarrier &&
   1956       kUseBakerReadBarrier &&
   1957       (invoke->GetIntrinsic() == Intrinsics::kUnsafeCASObject);
   1958   LocationSummary* locations =
   1959       new (allocator) LocationSummary(invoke,
   1960                                       can_call
   1961                                           ? LocationSummary::kCallOnSlowPath
   1962                                           : LocationSummary::kNoCall,
   1963                                       kIntrinsified);
   1964   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   1965   locations->SetInAt(1, Location::RequiresRegister());
   1966   // Offset is a long, but in 32-bit mode we only need the low word.
   1967   // Can we update the invoke here to remove a TypeConvert to Long?
   1968   locations->SetInAt(2, Location::RequiresRegister());
   1969   // Expected value must be in EAX or EDX:EAX.
   1970   // For long, new value must be in ECX:EBX.
   1971   if (type == DataType::Type::kInt64) {
   1972     locations->SetInAt(3, Location::RegisterPairLocation(EAX, EDX));
   1973     locations->SetInAt(4, Location::RegisterPairLocation(EBX, ECX));
   1974   } else {
   1975     locations->SetInAt(3, Location::RegisterLocation(EAX));
   1976     locations->SetInAt(4, Location::RequiresRegister());
   1977   }
   1978 
   1979   // Force a byte register for the output.
   1980   locations->SetOut(Location::RegisterLocation(EAX));
   1981   if (type == DataType::Type::kReference) {
   1982     // Need temporary registers for card-marking, and possibly for
   1983     // (Baker) read barrier.
   1984     locations->AddTemp(Location::RequiresRegister());  // Possibly used for reference poisoning too.
   1985     // Need a byte register for marking.
   1986     locations->AddTemp(Location::RegisterLocation(ECX));
   1987   }
   1988 }
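
// Illustrative sketch, not part of the ART sources: the semantics LOCK CMPXCHG gives the CAS
// intrinsics below. The expected value must sit in EAX (EDX:EAX for CMPXCHG8B), ZF reports
// success, and the whole read-compare-write is one atomic step; the sketch shows the logic
// without the atomicity. The helper name is invented.
static bool CompareAndSwapSketch(int32_t* field, int32_t expected, int32_t new_value) {
  if (*field != expected) {
    return false;  // ZF clear; setb(kZero) then yields 0.
  }
  *field = new_value;
  return true;     // ZF set; setb(kZero) yields 1.
}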
   1989 
   1990 void IntrinsicLocationsBuilderX86::VisitUnsafeCASInt(HInvoke* invoke) {
   1991   CreateIntIntIntIntIntToInt(allocator_, DataType::Type::kInt32, invoke);
   1992 }
   1993 
   1994 void IntrinsicLocationsBuilderX86::VisitUnsafeCASLong(HInvoke* invoke) {
   1995   CreateIntIntIntIntIntToInt(allocator_, DataType::Type::kInt64, invoke);
   1996 }
   1997 
   1998 void IntrinsicLocationsBuilderX86::VisitUnsafeCASObject(HInvoke* invoke) {
   1999   // The only read barrier implementation supporting the
   2000   // UnsafeCASObject intrinsic is the Baker-style read barriers.
   2001   if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
   2002     return;
   2003   }
   2004 
   2005   CreateIntIntIntIntIntToInt(allocator_, DataType::Type::kReference, invoke);
   2006 }
   2007 
   2008 static void GenCAS(DataType::Type type, HInvoke* invoke, CodeGeneratorX86* codegen) {
   2009   X86Assembler* assembler = down_cast<X86Assembler*>(codegen->GetAssembler());
   2010   LocationSummary* locations = invoke->GetLocations();
   2011 
   2012   Register base = locations->InAt(1).AsRegister<Register>();
   2013   Register offset = locations->InAt(2).AsRegisterPairLow<Register>();
   2014   Location out = locations->Out();
   2015   DCHECK_EQ(out.AsRegister<Register>(), EAX);
   2016 
   2017   // The address of the field within the holding object.
   2018   Address field_addr(base, offset, ScaleFactor::TIMES_1, 0);
   2019 
   2020   if (type == DataType::Type::kReference) {
   2021     // The only read barrier implementation supporting the
   2022     // UnsafeCASObject intrinsic is the Baker-style read barriers.
   2023     DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
   2024 
   2025     Location temp1_loc = locations->GetTemp(0);
   2026     Register temp1 = temp1_loc.AsRegister<Register>();
   2027     Register temp2 = locations->GetTemp(1).AsRegister<Register>();
   2028 
   2029     Register expected = locations->InAt(3).AsRegister<Register>();
   2030     // Ensure `expected` is in EAX (required by the CMPXCHG instruction).
   2031     DCHECK_EQ(expected, EAX);
   2032     Register value = locations->InAt(4).AsRegister<Register>();
   2033 
   2034     // Mark card for object assuming new value is stored.
   2035     bool value_can_be_null = true;  // TODO: Worth finding out this information?
   2036     codegen->MarkGCCard(temp1, temp2, base, value, value_can_be_null);
   2037 
   2038     if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
   2039       // Need to make sure the reference stored in the field is a to-space
   2040       // one before attempting the CAS or the CAS could fail incorrectly.
   2041       codegen->GenerateReferenceLoadWithBakerReadBarrier(
   2042           invoke,
   2043           temp1_loc,  // Unused, used only as a "temporary" within the read barrier.
   2044           base,
   2045           field_addr,
   2046           /* needs_null_check= */ false,
   2047           /* always_update_field= */ true,
   2048           &temp2);
   2049     }
   2050 
   2051     bool base_equals_value = (base == value);
   2052     if (kPoisonHeapReferences) {
   2053       if (base_equals_value) {
   2054         // If `base` and `value` are the same register location, move
   2055         // `value` to a temporary register.  This way, poisoning
   2056         // `value` won't invalidate `base`.
   2057         value = temp1;
   2058         __ movl(value, base);
   2059       }
   2060 
   2061       // Check that the register allocator did not assign the location
   2062       // of `expected` (EAX) to `value` nor to `base`, so that heap
   2063       // poisoning (when enabled) works as intended below.
   2064       // - If `value` were equal to `expected`, both references would
   2065       //   be poisoned twice, meaning they would not be poisoned at
   2066       //   all, as heap poisoning uses address negation.
   2067       // - If `base` were equal to `expected`, poisoning `expected`
   2068       //   would invalidate `base`.
   2069       DCHECK_NE(value, expected);
   2070       DCHECK_NE(base, expected);
   2071 
   2072       __ PoisonHeapReference(expected);
   2073       __ PoisonHeapReference(value);
   2074     }
   2075 
   2076     __ LockCmpxchgl(field_addr, value);
   2077 
   2078     // LOCK CMPXCHG has full barrier semantics, and we don't need
   2079     // scheduling barriers at this time.
   2080 
   2081     // Convert ZF into the Boolean result.
   2082     __ setb(kZero, out.AsRegister<Register>());
   2083     __ movzxb(out.AsRegister<Register>(), out.AsRegister<ByteRegister>());
   2084 
   2085     // If heap poisoning is enabled, we need to unpoison the values
   2086     // that were poisoned earlier.
   2087     if (kPoisonHeapReferences) {
   2088       if (base_equals_value) {
   2089         // `value` has been moved to a temporary register, no need to
   2090         // unpoison it.
   2091       } else {
   2092         // Ensure `value` is different from `out`, so that unpoisoning
   2093         // the former does not invalidate the latter.
   2094         DCHECK_NE(value, out.AsRegister<Register>());
   2095         __ UnpoisonHeapReference(value);
   2096       }
   2097       // Do not unpoison the reference contained in register
   2098       // `expected`, as it is the same as register `out` (EAX).
   2099     }
   2100   } else {
   2101     if (type == DataType::Type::kInt32) {
   2102       // Ensure the expected value is in EAX (required by the CMPXCHG
   2103       // instruction).
   2104       DCHECK_EQ(locations->InAt(3).AsRegister<Register>(), EAX);
   2105       __ LockCmpxchgl(field_addr, locations->InAt(4).AsRegister<Register>());
   2106     } else if (type == DataType::Type::kInt64) {
   2107       // Ensure the expected value is in EAX:EDX and that the new
   2108       // value is in EBX:ECX (required by the CMPXCHG8B instruction).
   2109       DCHECK_EQ(locations->InAt(3).AsRegisterPairLow<Register>(), EAX);
   2110       DCHECK_EQ(locations->InAt(3).AsRegisterPairHigh<Register>(), EDX);
   2111       DCHECK_EQ(locations->InAt(4).AsRegisterPairLow<Register>(), EBX);
   2112       DCHECK_EQ(locations->InAt(4).AsRegisterPairHigh<Register>(), ECX);
   2113       __ LockCmpxchg8b(field_addr);
   2114     } else {
   2115       LOG(FATAL) << "Unexpected CAS type " << type;
   2116     }
   2117 
   2118     // LOCK CMPXCHG/LOCK CMPXCHG8B have full barrier semantics, and we
   2119     // don't need scheduling barriers at this time.
   2120 
   2121     // Convert ZF into the Boolean result.
   2122     __ setb(kZero, out.AsRegister<Register>());
   2123     __ movzxb(out.AsRegister<Register>(), out.AsRegister<ByteRegister>());
   2124   }
   2125 }
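
// Illustrative sketch, not part of the ART sources: the heap reference poisoning referred to
// above. With kPoisonHeapReferences, poisoning negates the 32-bit reference, so the operation is
// its own inverse; poisoning the same register twice would cancel out, which is why the code
// asserts that `value`, `expected` and `base` do not alias. Names are invented for this sketch.
static uint32_t PoisonReferenceSketch(uint32_t reference) {
  return 0u - reference;  // Two's-complement negation, as negl does.
}
static uint32_t UnpoisonReferenceSketch(uint32_t poisoned) {
  return PoisonReferenceSketch(poisoned);  // Negating twice restores the original reference.
}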
   2126 
   2127 void IntrinsicCodeGeneratorX86::VisitUnsafeCASInt(HInvoke* invoke) {
   2128   GenCAS(DataType::Type::kInt32, invoke, codegen_);
   2129 }
   2130 
   2131 void IntrinsicCodeGeneratorX86::VisitUnsafeCASLong(HInvoke* invoke) {
   2132   GenCAS(DataType::Type::kInt64, invoke, codegen_);
   2133 }
   2134 
   2135 void IntrinsicCodeGeneratorX86::VisitUnsafeCASObject(HInvoke* invoke) {
   2136   // The only read barrier implementation supporting the
   2137   // UnsafeCASObject intrinsic is the Baker-style read barriers.
   2138   DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
   2139 
   2140   GenCAS(DataType::Type::kReference, invoke, codegen_);
   2141 }
   2142 
   2143 void IntrinsicLocationsBuilderX86::VisitIntegerReverse(HInvoke* invoke) {
   2144   LocationSummary* locations =
   2145       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   2146   locations->SetInAt(0, Location::RequiresRegister());
   2147   locations->SetOut(Location::SameAsFirstInput());
   2148   locations->AddTemp(Location::RequiresRegister());
   2149 }
   2150 
   2151 static void SwapBits(Register reg, Register temp, int32_t shift, int32_t mask,
   2152                      X86Assembler* assembler) {
   2153   Immediate imm_shift(shift);
   2154   Immediate imm_mask(mask);
   2155   __ movl(temp, reg);
   2156   __ shrl(reg, imm_shift);
   2157   __ andl(temp, imm_mask);
   2158   __ andl(reg, imm_mask);
   2159   __ shll(temp, imm_shift);
   2160   __ orl(reg, temp);
   2161 }
   2162 
   2163 void IntrinsicCodeGeneratorX86::VisitIntegerReverse(HInvoke* invoke) {
   2164   X86Assembler* assembler = GetAssembler();
   2165   LocationSummary* locations = invoke->GetLocations();
   2166 
   2167   Register reg = locations->InAt(0).AsRegister<Register>();
   2168   Register temp = locations->GetTemp(0).AsRegister<Register>();
   2169 
   2170   /*
   2171    * Use one bswap instruction to reverse the byte order first, then use 3 rounds of
   2172    * bit swapping to reverse the bits in the number x. Using bswap saves instructions
   2173    * compared to the generic libcore (luni) implementation, which needs 5 rounds of bit swapping.
   2174    * x = bswap x
   2175    * x = (x & 0x55555555) << 1 | (x >> 1) & 0x55555555;
   2176    * x = (x & 0x33333333) << 2 | (x >> 2) & 0x33333333;
   2177    * x = (x & 0x0F0F0F0F) << 4 | (x >> 4) & 0x0F0F0F0F;
   2178    */
   2179   __ bswapl(reg);
   2180   SwapBits(reg, temp, 1, 0x55555555, assembler);
   2181   SwapBits(reg, temp, 2, 0x33333333, assembler);
   2182   SwapBits(reg, temp, 4, 0x0f0f0f0f, assembler);
   2183 }
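
// Illustrative sketch, not part of the ART sources: the reversal computed by the code above,
// i.e. one byte swap followed by the three SwapBits rounds from the comment. __builtin_bswap32
// stands in for the bswapl instruction; the function name is invented.
static uint32_t ReverseBitsSketch(uint32_t x) {
  x = __builtin_bswap32(x);                                   // bswapl
  x = ((x & 0x55555555u) << 1) | ((x >> 1) & 0x55555555u);    // SwapBits(..., 1, 0x55555555)
  x = ((x & 0x33333333u) << 2) | ((x >> 2) & 0x33333333u);    // SwapBits(..., 2, 0x33333333)
  x = ((x & 0x0f0f0f0fu) << 4) | ((x >> 4) & 0x0f0f0f0fu);    // SwapBits(..., 4, 0x0f0f0f0f)
  return x;
}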
   2184 
   2185 void IntrinsicLocationsBuilderX86::VisitLongReverse(HInvoke* invoke) {
   2186   LocationSummary* locations =
   2187       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   2188   locations->SetInAt(0, Location::RequiresRegister());
   2189   locations->SetOut(Location::SameAsFirstInput());
   2190   locations->AddTemp(Location::RequiresRegister());
   2191 }
   2192 
   2193 void IntrinsicCodeGeneratorX86::VisitLongReverse(HInvoke* invoke) {
   2194   X86Assembler* assembler = GetAssembler();
   2195   LocationSummary* locations = invoke->GetLocations();
   2196 
   2197   Register reg_low = locations->InAt(0).AsRegisterPairLow<Register>();
   2198   Register reg_high = locations->InAt(0).AsRegisterPairHigh<Register>();
   2199   Register temp = locations->GetTemp(0).AsRegister<Register>();
   2200 
   2201   // We want to swap the high and low words, then bswap each one, and then do the same
   2202   // as a 32-bit reverse.
   2203   // Exchange high and low.
   2204   __ movl(temp, reg_low);
   2205   __ movl(reg_low, reg_high);
   2206   __ movl(reg_high, temp);
   2207 
   2208   // bit-reverse low
   2209   __ bswapl(reg_low);
   2210   SwapBits(reg_low, temp, 1, 0x55555555, assembler);
   2211   SwapBits(reg_low, temp, 2, 0x33333333, assembler);
   2212   SwapBits(reg_low, temp, 4, 0x0f0f0f0f, assembler);
   2213 
   2214   // bit-reverse high
   2215   __ bswapl(reg_high);
   2216   SwapBits(reg_high, temp, 1, 0x55555555, assembler);
   2217   SwapBits(reg_high, temp, 2, 0x33333333, assembler);
   2218   SwapBits(reg_high, temp, 4, 0x0f0f0f0f, assembler);
   2219 }
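
// Illustrative sketch, not part of the ART sources: the 64-bit reversal above is the 32-bit
// reversal applied to each half after swapping the halves, reusing ReverseBitsSketch from the
// sketch above.
static uint64_t ReverseBits64Sketch(uint64_t x) {
  const uint32_t lo = static_cast<uint32_t>(x);
  const uint32_t hi = static_cast<uint32_t>(x >> 32);
  // The old low word becomes the new high word and vice versa, each bit-reversed.
  return (static_cast<uint64_t>(ReverseBitsSketch(lo)) << 32) | ReverseBitsSketch(hi);
}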
   2220 
   2221 static void CreateBitCountLocations(
   2222     ArenaAllocator* allocator, CodeGeneratorX86* codegen, HInvoke* invoke, bool is_long) {
   2223   if (!codegen->GetInstructionSetFeatures().HasPopCnt()) {
   2224     // Do nothing if there is no popcnt support. This results in generating
   2225     // a call for the intrinsic rather than direct code.
   2226     return;
   2227   }
   2228   LocationSummary* locations =
   2229       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   2230   if (is_long) {
   2231     locations->AddTemp(Location::RequiresRegister());
   2232   }
   2233   locations->SetInAt(0, Location::Any());
   2234   locations->SetOut(Location::RequiresRegister());
   2235 }
   2236 
   2237 static void GenBitCount(X86Assembler* assembler,
   2238                         CodeGeneratorX86* codegen,
   2239                         HInvoke* invoke, bool is_long) {
   2240   LocationSummary* locations = invoke->GetLocations();
   2241   Location src = locations->InAt(0);
   2242   Register out = locations->Out().AsRegister<Register>();
   2243 
   2244   if (invoke->InputAt(0)->IsConstant()) {
   2245     // Evaluate this at compile time.
   2246     int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
   2247     int32_t result = is_long
   2248         ? POPCOUNT(static_cast<uint64_t>(value))
   2249         : POPCOUNT(static_cast<uint32_t>(value));
   2250     codegen->Load32BitValue(out, result);
   2251     return;
   2252   }
   2253 
   2254   // Handle the non-constant cases.
   2255   if (!is_long) {
   2256     if (src.IsRegister()) {
   2257       __ popcntl(out, src.AsRegister<Register>());
   2258     } else {
   2259       DCHECK(src.IsStackSlot());
   2260       __ popcntl(out, Address(ESP, src.GetStackIndex()));
   2261     }
   2262   } else {
   2263     // The 64-bit case needs to worry about two parts.
   2264     Register temp = locations->GetTemp(0).AsRegister<Register>();
   2265     if (src.IsRegisterPair()) {
   2266       __ popcntl(temp, src.AsRegisterPairLow<Register>());
   2267       __ popcntl(out, src.AsRegisterPairHigh<Register>());
   2268     } else {
   2269       DCHECK(src.IsDoubleStackSlot());
   2270       __ popcntl(temp, Address(ESP, src.GetStackIndex()));
   2271       __ popcntl(out, Address(ESP, src.GetHighStackIndex(kX86WordSize)));
   2272     }
   2273     __ addl(out, temp);
   2274   }
   2275 }
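
// Illustrative sketch, not part of the ART sources: the 64-bit population count above is simply
// the sum of two 32-bit popcnts (one per half of the register pair or stack slot).
// __builtin_popcount stands in for the popcntl instruction.
static int32_t BitCount64Sketch(uint64_t x) {
  return __builtin_popcount(static_cast<uint32_t>(x)) +        // popcntl temp, low
         __builtin_popcount(static_cast<uint32_t>(x >> 32));   // popcntl out, high; addl out, temp
}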
   2276 
   2277 void IntrinsicLocationsBuilderX86::VisitIntegerBitCount(HInvoke* invoke) {
   2278   CreateBitCountLocations(allocator_, codegen_, invoke, /* is_long= */ false);
   2279 }
   2280 
   2281 void IntrinsicCodeGeneratorX86::VisitIntegerBitCount(HInvoke* invoke) {
   2282   GenBitCount(GetAssembler(), codegen_, invoke, /* is_long= */ false);
   2283 }
   2284 
   2285 void IntrinsicLocationsBuilderX86::VisitLongBitCount(HInvoke* invoke) {
   2286   CreateBitCountLocations(allocator_, codegen_, invoke, /* is_long= */ true);
   2287 }
   2288 
   2289 void IntrinsicCodeGeneratorX86::VisitLongBitCount(HInvoke* invoke) {
   2290   GenBitCount(GetAssembler(), codegen_, invoke, /* is_long= */ true);
   2291 }
   2292 
   2293 static void CreateLeadingZeroLocations(ArenaAllocator* allocator, HInvoke* invoke, bool is_long) {
   2294   LocationSummary* locations =
   2295       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   2296   if (is_long) {
   2297     locations->SetInAt(0, Location::RequiresRegister());
   2298   } else {
   2299     locations->SetInAt(0, Location::Any());
   2300   }
   2301   locations->SetOut(Location::RequiresRegister());
   2302 }
   2303 
   2304 static void GenLeadingZeros(X86Assembler* assembler,
   2305                             CodeGeneratorX86* codegen,
   2306                             HInvoke* invoke, bool is_long) {
   2307   LocationSummary* locations = invoke->GetLocations();
   2308   Location src = locations->InAt(0);
   2309   Register out = locations->Out().AsRegister<Register>();
   2310 
   2311   if (invoke->InputAt(0)->IsConstant()) {
   2312     // Evaluate this at compile time.
   2313     int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
   2314     if (value == 0) {
   2315       value = is_long ? 64 : 32;
   2316     } else {
   2317       value = is_long ? CLZ(static_cast<uint64_t>(value)) : CLZ(static_cast<uint32_t>(value));
   2318     }
   2319     codegen->Load32BitValue(out, value);
   2320     return;
   2321   }
   2322 
   2323   // Handle the non-constant cases.
   2324   if (!is_long) {
   2325     if (src.IsRegister()) {
   2326       __ bsrl(out, src.AsRegister<Register>());
   2327     } else {
   2328       DCHECK(src.IsStackSlot());
   2329       __ bsrl(out, Address(ESP, src.GetStackIndex()));
   2330     }
   2331 
   2332     // BSR sets ZF if the input was zero, in which case the output is undefined.
   2333     NearLabel all_zeroes, done;
   2334     __ j(kEqual, &all_zeroes);
   2335 
   2336     // Correct the result from BSR to get the final CLZ result.
   2337     __ xorl(out, Immediate(31));
   2338     __ jmp(&done);
   2339 
   2340     // Fix the zero case with the expected result.
   2341     __ Bind(&all_zeroes);
   2342     __ movl(out, Immediate(32));
   2343 
   2344     __ Bind(&done);
   2345     return;
   2346   }
   2347 
   2348   // The 64-bit case needs to handle both halves of the register pair.
   2349   DCHECK(src.IsRegisterPair());
   2350   Register src_lo = src.AsRegisterPairLow<Register>();
   2351   Register src_hi = src.AsRegisterPairHigh<Register>();
   2352   NearLabel handle_low, done, all_zeroes;
   2353 
   2354   // Is the high word zero?
   2355   __ testl(src_hi, src_hi);
   2356   __ j(kEqual, &handle_low);
   2357 
   2358   // High word is not zero. We know that the BSR result is defined in this case.
   2359   __ bsrl(out, src_hi);
   2360 
   2361   // Correct the result from BSR to get the final CLZ result.
   2362   __ xorl(out, Immediate(31));
   2363   __ jmp(&done);
   2364 
   2365   // High word was zero.  We have to compute the low word count and add 32.
   2366   __ Bind(&handle_low);
   2367   __ bsrl(out, src_lo);
   2368   __ j(kEqual, &all_zeroes);
   2369 
   2370   // We had a valid result.  Use an XOR to both correct the result and add 32.
   2371   __ xorl(out, Immediate(63));
   2372   __ jmp(&done);
   2373 
   2374   // All zero case.
   2375   __ Bind(&all_zeroes);
   2376   __ movl(out, Immediate(64));
   2377 
   2378   __ Bind(&done);
   2379 }
   2380 
   2381 void IntrinsicLocationsBuilderX86::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
   2382   CreateLeadingZeroLocations(allocator_, invoke, /* is_long= */ false);
   2383 }
   2384 
   2385 void IntrinsicCodeGeneratorX86::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
   2386   GenLeadingZeros(GetAssembler(), codegen_, invoke, /* is_long= */ false);
   2387 }
   2388 
   2389 void IntrinsicLocationsBuilderX86::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
   2390   CreateLeadingZeroLocations(allocator_, invoke, /* is_long= */ true);
   2391 }
   2392 
   2393 void IntrinsicCodeGeneratorX86::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
   2394   GenLeadingZeros(GetAssembler(), codegen_, invoke, /* is_long= */ true);
   2395 }
   2396 
   2397 static void CreateTrailingZeroLocations(ArenaAllocator* allocator, HInvoke* invoke, bool is_long) {
   2398   LocationSummary* locations =
   2399       new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   2400   if (is_long) {
   2401     locations->SetInAt(0, Location::RequiresRegister());
   2402   } else {
   2403     locations->SetInAt(0, Location::Any());
   2404   }
   2405   locations->SetOut(Location::RequiresRegister());
   2406 }
   2407 
   2408 static void GenTrailingZeros(X86Assembler* assembler,
   2409                              CodeGeneratorX86* codegen,
   2410                              HInvoke* invoke, bool is_long) {
   2411   LocationSummary* locations = invoke->GetLocations();
   2412   Location src = locations->InAt(0);
   2413   Register out = locations->Out().AsRegister<Register>();
   2414 
   2415   if (invoke->InputAt(0)->IsConstant()) {
   2416     // Evaluate this at compile time.
   2417     int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
   2418     if (value == 0) {
   2419       value = is_long ? 64 : 32;
   2420     } else {
   2421       value = is_long ? CTZ(static_cast<uint64_t>(value)) : CTZ(static_cast<uint32_t>(value));
   2422     }
   2423     codegen->Load32BitValue(out, value);
   2424     return;
   2425   }
   2426 
   2427   // Handle the non-constant cases.
   2428   if (!is_long) {
   2429     if (src.IsRegister()) {
   2430       __ bsfl(out, src.AsRegister<Register>());
   2431     } else {
   2432       DCHECK(src.IsStackSlot());
   2433       __ bsfl(out, Address(ESP, src.GetStackIndex()));
   2434     }
   2435 
    2436     // BSF sets ZF if the input was zero, in which case its output is undefined.
   2437     NearLabel done;
   2438     __ j(kNotEqual, &done);
   2439 
   2440     // Fix the zero case with the expected result.
   2441     __ movl(out, Immediate(32));
   2442 
   2443     __ Bind(&done);
   2444     return;
   2445   }
   2446 
    2447   // The 64-bit case needs to consider both halves of the register pair.
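           // In effect (illustrative sketch of the code emitted below):
           //   if (lo != 0) return bsf(lo);
           //   if (hi != 0) return bsf(hi) + 32;
           //   return 64;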
   2448   DCHECK(src.IsRegisterPair());
   2449   Register src_lo = src.AsRegisterPairLow<Register>();
   2450   Register src_hi = src.AsRegisterPairHigh<Register>();
   2451   NearLabel done, all_zeroes;
   2452 
   2453   // If the low word is zero, then ZF will be set.  If not, we have the answer.
   2454   __ bsfl(out, src_lo);
   2455   __ j(kNotEqual, &done);
   2456 
   2457   // Low word was zero.  We have to compute the high word count and add 32.
   2458   __ bsfl(out, src_hi);
   2459   __ j(kEqual, &all_zeroes);
   2460 
   2461   // We had a valid result.  Add 32 to account for the low word being zero.
   2462   __ addl(out, Immediate(32));
   2463   __ jmp(&done);
   2464 
   2465   // All zero case.
   2466   __ Bind(&all_zeroes);
   2467   __ movl(out, Immediate(64));
   2468 
   2469   __ Bind(&done);
   2470 }
   2471 
   2472 void IntrinsicLocationsBuilderX86::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
   2473   CreateTrailingZeroLocations(allocator_, invoke, /* is_long= */ false);
   2474 }
   2475 
   2476 void IntrinsicCodeGeneratorX86::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
   2477   GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long= */ false);
   2478 }
   2479 
   2480 void IntrinsicLocationsBuilderX86::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
   2481   CreateTrailingZeroLocations(allocator_, invoke, /* is_long= */ true);
   2482 }
   2483 
   2484 void IntrinsicCodeGeneratorX86::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
   2485   GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long= */ true);
   2486 }
   2487 
   2488 static bool IsSameInput(HInstruction* instruction, size_t input0, size_t input1) {
   2489   return instruction->InputAt(input0) == instruction->InputAt(input1);
   2490 }
   2491 
   2492 // Compute base address for the System.arraycopy intrinsic in `base`.
   2493 static void GenSystemArrayCopyBaseAddress(X86Assembler* assembler,
   2494                                           DataType::Type type,
   2495                                           const Register& array,
   2496                                           const Location& pos,
   2497                                           const Register& base) {
    2498   // At the moment this routine is only used by the SystemArrayCopy intrinsic,
    2499   // so `type` is always DataType::Type::kReference. It could be extended to other
    2500   // element types (e.g. DataType::Type::kUint16) to also cover SystemArrayCopyChar.
   2501   DCHECK_EQ(type, DataType::Type::kReference);
   2502   const int32_t element_size = DataType::Size(type);
   2503   const ScaleFactor scale_factor = static_cast<ScaleFactor>(DataType::SizeShift(type));
   2504   const uint32_t data_offset = mirror::Array::DataOffset(element_size).Uint32Value();
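           // Both branches below compute, with a single LEA:
           //   base = array + data_offset + pos * element_size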
   2505 
   2506   if (pos.IsConstant()) {
   2507     int32_t constant = pos.GetConstant()->AsIntConstant()->GetValue();
   2508     __ leal(base, Address(array, element_size * constant + data_offset));
   2509   } else {
   2510     __ leal(base, Address(array, pos.AsRegister<Register>(), scale_factor, data_offset));
   2511   }
   2512 }
   2513 
   2514 // Compute end source address for the System.arraycopy intrinsic in `end`.
   2515 static void GenSystemArrayCopyEndAddress(X86Assembler* assembler,
   2516                                          DataType::Type type,
   2517                                          const Location& copy_length,
   2518                                          const Register& base,
   2519                                          const Register& end) {
    2520   // At the moment this routine is only used by the SystemArrayCopy intrinsic,
    2521   // so `type` is always DataType::Type::kReference. It could be extended to other
    2522   // element types (e.g. DataType::Type::kUint16) to also cover SystemArrayCopyChar.
   2523   DCHECK_EQ(type, DataType::Type::kReference);
   2524   const int32_t element_size = DataType::Size(type);
   2525   const ScaleFactor scale_factor = static_cast<ScaleFactor>(DataType::SizeShift(type));
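           // A single LEA computes end = base + copy_length * element_size.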
   2526 
   2527   if (copy_length.IsConstant()) {
   2528     int32_t constant = copy_length.GetConstant()->AsIntConstant()->GetValue();
   2529     __ leal(end, Address(base, element_size * constant));
   2530   } else {
   2531     __ leal(end, Address(base, copy_length.AsRegister<Register>(), scale_factor, 0));
   2532   }
   2533 }
   2534 
   2535 void IntrinsicLocationsBuilderX86::VisitSystemArrayCopy(HInvoke* invoke) {
   2536   // The only read barrier implementation supporting the
   2537   // SystemArrayCopy intrinsic is the Baker-style read barriers.
   2538   if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
   2539     return;
   2540   }
   2541 
   2542   CodeGenerator::CreateSystemArrayCopyLocationSummary(invoke);
   2543   if (invoke->GetLocations() != nullptr) {
   2544     // Need a byte register for marking.
   2545     invoke->GetLocations()->SetTempAt(1, Location::RegisterLocation(ECX));
   2546 
   2547     static constexpr size_t kSrc = 0;
   2548     static constexpr size_t kSrcPos = 1;
   2549     static constexpr size_t kDest = 2;
   2550     static constexpr size_t kDestPos = 3;
   2551     static constexpr size_t kLength = 4;
   2552 
   2553     if (!invoke->InputAt(kSrcPos)->IsIntConstant() &&
   2554         !invoke->InputAt(kDestPos)->IsIntConstant() &&
   2555         !invoke->InputAt(kLength)->IsIntConstant()) {
   2556       if (!IsSameInput(invoke, kSrcPos, kDestPos) &&
   2557           !IsSameInput(invoke, kSrcPos, kLength) &&
   2558           !IsSameInput(invoke, kDestPos, kLength) &&
   2559           !IsSameInput(invoke, kSrc, kDest)) {
    2560         // Not enough registers; allow the length to also live in a stack slot.
   2561         invoke->GetLocations()->SetInAt(kLength, Location::Any());
   2562       }
   2563     }
   2564   }
   2565 }
   2566 
   2567 void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) {
   2568   // The only read barrier implementation supporting the
   2569   // SystemArrayCopy intrinsic is the Baker-style read barriers.
   2570   DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
   2571 
   2572   X86Assembler* assembler = GetAssembler();
   2573   LocationSummary* locations = invoke->GetLocations();
   2574 
   2575   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   2576   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
   2577   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
   2578   uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
   2579   uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
   2580 
   2581   Register src = locations->InAt(0).AsRegister<Register>();
   2582   Location src_pos = locations->InAt(1);
   2583   Register dest = locations->InAt(2).AsRegister<Register>();
   2584   Location dest_pos = locations->InAt(3);
   2585   Location length_arg = locations->InAt(4);
   2586   Location length = length_arg;
   2587   Location temp1_loc = locations->GetTemp(0);
   2588   Register temp1 = temp1_loc.AsRegister<Register>();
   2589   Location temp2_loc = locations->GetTemp(1);
   2590   Register temp2 = temp2_loc.AsRegister<Register>();
   2591 
   2592   SlowPathCode* intrinsic_slow_path =
   2593       new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86(invoke);
   2594   codegen_->AddSlowPath(intrinsic_slow_path);
   2595 
   2596   NearLabel conditions_on_positions_validated;
   2597   SystemArrayCopyOptimizations optimizations(invoke);
   2598 
    2599   // If source and destination are the same array and src_pos < dest_pos, go to the
    2600   // slow path: a forward copy would overwrite source elements before reading them.
   2601   if (src_pos.IsConstant()) {
   2602     int32_t src_pos_constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
   2603     if (dest_pos.IsConstant()) {
   2604       int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
   2605       if (optimizations.GetDestinationIsSource()) {
   2606         // Checked when building locations.
   2607         DCHECK_GE(src_pos_constant, dest_pos_constant);
   2608       } else if (src_pos_constant < dest_pos_constant) {
   2609         __ cmpl(src, dest);
   2610         __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   2611       }
   2612     } else {
   2613       if (!optimizations.GetDestinationIsSource()) {
   2614         __ cmpl(src, dest);
   2615         __ j(kNotEqual, &conditions_on_positions_validated);
   2616       }
   2617       __ cmpl(dest_pos.AsRegister<Register>(), Immediate(src_pos_constant));
   2618       __ j(kGreater, intrinsic_slow_path->GetEntryLabel());
   2619     }
   2620   } else {
   2621     if (!optimizations.GetDestinationIsSource()) {
   2622       __ cmpl(src, dest);
   2623       __ j(kNotEqual, &conditions_on_positions_validated);
   2624     }
   2625     if (dest_pos.IsConstant()) {
   2626       int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
   2627       __ cmpl(src_pos.AsRegister<Register>(), Immediate(dest_pos_constant));
   2628       __ j(kLess, intrinsic_slow_path->GetEntryLabel());
   2629     } else {
   2630       __ cmpl(src_pos.AsRegister<Register>(), dest_pos.AsRegister<Register>());
   2631       __ j(kLess, intrinsic_slow_path->GetEntryLabel());
   2632     }
   2633   }
   2634 
   2635   __ Bind(&conditions_on_positions_validated);
   2636 
   2637   if (!optimizations.GetSourceIsNotNull()) {
   2638     // Bail out if the source is null.
   2639     __ testl(src, src);
   2640     __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   2641   }
   2642 
   2643   if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) {
   2644     // Bail out if the destination is null.
   2645     __ testl(dest, dest);
   2646     __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   2647   }
   2648 
   2649   Location temp3_loc = locations->GetTemp(2);
   2650   Register temp3 = temp3_loc.AsRegister<Register>();
   2651   if (length.IsStackSlot()) {
   2652     __ movl(temp3, Address(ESP, length.GetStackIndex()));
   2653     length = Location::RegisterLocation(temp3);
   2654   }
   2655 
   2656   // If the length is negative, bail out.
   2657   // We have already checked in the LocationsBuilder for the constant case.
   2658   if (!length.IsConstant() &&
   2659       !optimizations.GetCountIsSourceLength() &&
   2660       !optimizations.GetCountIsDestinationLength()) {
   2661     __ testl(length.AsRegister<Register>(), length.AsRegister<Register>());
   2662     __ j(kLess, intrinsic_slow_path->GetEntryLabel());
   2663   }
   2664 
   2665   // Validity checks: source.
   2666   CheckPosition(assembler,
   2667                 src_pos,
   2668                 src,
   2669                 length,
   2670                 intrinsic_slow_path,
   2671                 temp1,
   2672                 optimizations.GetCountIsSourceLength());
   2673 
   2674   // Validity checks: dest.
   2675   CheckPosition(assembler,
   2676                 dest_pos,
   2677                 dest,
   2678                 length,
   2679                 intrinsic_slow_path,
   2680                 temp1,
   2681                 optimizations.GetCountIsDestinationLength());
   2682 
   2683   if (!optimizations.GetDoesNotNeedTypeCheck()) {
   2684     // Check whether all elements of the source array are assignable to the component
    2685     // type of the destination array. We do two checks: either the classes are the
    2686     // same, or the destination is Object[]. If neither check succeeds, we go to the
    2687     // slow path.
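             // Roughly (illustrative sketch): when dest->klass_ != src->klass_, the copy is
             // still fine without per-element checks if dest is Object[], i.e. when
             // dest->klass_->component_type_->super_class_ == null; otherwise take the slow path.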
   2688 
   2689     if (!optimizations.GetSourceIsNonPrimitiveArray()) {
   2690       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
   2691         // /* HeapReference<Class> */ temp1 = src->klass_
   2692         codegen_->GenerateFieldLoadWithBakerReadBarrier(
   2693             invoke, temp1_loc, src, class_offset, /* needs_null_check= */ false);
    2694         // Bail out if the source is not a non-primitive array.
   2695         // /* HeapReference<Class> */ temp1 = temp1->component_type_
   2696         codegen_->GenerateFieldLoadWithBakerReadBarrier(
   2697             invoke, temp1_loc, temp1, component_offset, /* needs_null_check= */ false);
   2698         __ testl(temp1, temp1);
   2699         __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   2700         // If heap poisoning is enabled, `temp1` has been unpoisoned
    2701         // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
   2702       } else {
   2703         // /* HeapReference<Class> */ temp1 = src->klass_
   2704         __ movl(temp1, Address(src, class_offset));
   2705         __ MaybeUnpoisonHeapReference(temp1);
    2706         // Bail out if the source is not a non-primitive array.
   2707         // /* HeapReference<Class> */ temp1 = temp1->component_type_
   2708         __ movl(temp1, Address(temp1, component_offset));
   2709         __ testl(temp1, temp1);
   2710         __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   2711         __ MaybeUnpoisonHeapReference(temp1);
   2712       }
   2713       __ cmpw(Address(temp1, primitive_offset), Immediate(Primitive::kPrimNot));
   2714       __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
   2715     }
   2716 
   2717     if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
   2718       if (length.Equals(Location::RegisterLocation(temp3))) {
   2719         // When Baker read barriers are enabled, register `temp3`,
   2720         // which in the present case contains the `length` parameter,
   2721         // will be overwritten below.  Make the `length` location
   2722         // reference the original stack location; it will be moved
   2723         // back to `temp3` later if necessary.
   2724         DCHECK(length_arg.IsStackSlot());
   2725         length = length_arg;
   2726       }
   2727 
   2728       // /* HeapReference<Class> */ temp1 = dest->klass_
   2729       codegen_->GenerateFieldLoadWithBakerReadBarrier(
   2730           invoke, temp1_loc, dest, class_offset, /* needs_null_check= */ false);
   2731 
   2732       if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
    2733         // Bail out if the destination is not a non-primitive array.
   2734         //
   2735         // Register `temp1` is not trashed by the read barrier emitted
   2736         // by GenerateFieldLoadWithBakerReadBarrier below, as that
   2737         // method produces a call to a ReadBarrierMarkRegX entry point,
   2738         // which saves all potentially live registers, including
    2739         // temporaries such as `temp1`.
   2740         // /* HeapReference<Class> */ temp2 = temp1->component_type_
   2741         codegen_->GenerateFieldLoadWithBakerReadBarrier(
   2742             invoke, temp2_loc, temp1, component_offset, /* needs_null_check= */ false);
   2743         __ testl(temp2, temp2);
   2744         __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   2745         // If heap poisoning is enabled, `temp2` has been unpoisoned
    2746         // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
   2747         __ cmpw(Address(temp2, primitive_offset), Immediate(Primitive::kPrimNot));
   2748         __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
   2749       }
   2750 
   2751       // For the same reason given earlier, `temp1` is not trashed by the
   2752       // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below.
   2753       // /* HeapReference<Class> */ temp2 = src->klass_
   2754       codegen_->GenerateFieldLoadWithBakerReadBarrier(
   2755           invoke, temp2_loc, src, class_offset, /* needs_null_check= */ false);
   2756       // Note: if heap poisoning is on, we are comparing two unpoisoned references here.
   2757       __ cmpl(temp1, temp2);
   2758 
   2759       if (optimizations.GetDestinationIsTypedObjectArray()) {
   2760         NearLabel do_copy;
   2761         __ j(kEqual, &do_copy);
   2762         // /* HeapReference<Class> */ temp1 = temp1->component_type_
   2763         codegen_->GenerateFieldLoadWithBakerReadBarrier(
   2764             invoke, temp1_loc, temp1, component_offset, /* needs_null_check= */ false);
   2765         // We do not need to emit a read barrier for the following
   2766         // heap reference load, as `temp1` is only used in a
   2767         // comparison with null below, and this reference is not
   2768         // kept afterwards.
   2769         __ cmpl(Address(temp1, super_offset), Immediate(0));
   2770         __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
   2771         __ Bind(&do_copy);
   2772       } else {
   2773         __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
   2774       }
   2775     } else {
   2776       // Non read barrier code.
   2777 
   2778       // /* HeapReference<Class> */ temp1 = dest->klass_
   2779       __ movl(temp1, Address(dest, class_offset));
   2780       if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
   2781         __ MaybeUnpoisonHeapReference(temp1);
    2782         // Bail out if the destination is not a non-primitive array.
   2783         // /* HeapReference<Class> */ temp2 = temp1->component_type_
   2784         __ movl(temp2, Address(temp1, component_offset));
   2785         __ testl(temp2, temp2);
   2786         __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   2787         __ MaybeUnpoisonHeapReference(temp2);
   2788         __ cmpw(Address(temp2, primitive_offset), Immediate(Primitive::kPrimNot));
   2789         __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
   2790         // Re-poison the heap reference to make the compare instruction below
   2791         // compare two poisoned references.
   2792         __ PoisonHeapReference(temp1);
   2793       }
   2794 
   2795       // Note: if heap poisoning is on, we are comparing two poisoned references here.
   2796       __ cmpl(temp1, Address(src, class_offset));
   2797 
   2798       if (optimizations.GetDestinationIsTypedObjectArray()) {
   2799         NearLabel do_copy;
   2800         __ j(kEqual, &do_copy);
   2801         __ MaybeUnpoisonHeapReference(temp1);
   2802         // /* HeapReference<Class> */ temp1 = temp1->component_type_
   2803         __ movl(temp1, Address(temp1, component_offset));
   2804         __ MaybeUnpoisonHeapReference(temp1);
   2805         __ cmpl(Address(temp1, super_offset), Immediate(0));
   2806         __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
   2807         __ Bind(&do_copy);
   2808       } else {
   2809         __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
   2810       }
   2811     }
   2812   } else if (!optimizations.GetSourceIsNonPrimitiveArray()) {
   2813     DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
    2814     // Bail out if the source is not a non-primitive array.
   2815     if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
   2816       // /* HeapReference<Class> */ temp1 = src->klass_
   2817       codegen_->GenerateFieldLoadWithBakerReadBarrier(
   2818           invoke, temp1_loc, src, class_offset, /* needs_null_check= */ false);
   2819       // /* HeapReference<Class> */ temp1 = temp1->component_type_
   2820       codegen_->GenerateFieldLoadWithBakerReadBarrier(
   2821           invoke, temp1_loc, temp1, component_offset, /* needs_null_check= */ false);
   2822       __ testl(temp1, temp1);
   2823       __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   2824       // If heap poisoning is enabled, `temp1` has been unpoisoned
    2825       // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
   2826     } else {
   2827       // /* HeapReference<Class> */ temp1 = src->klass_
   2828       __ movl(temp1, Address(src, class_offset));
   2829       __ MaybeUnpoisonHeapReference(temp1);
   2830       // /* HeapReference<Class> */ temp1 = temp1->component_type_
   2831       __ movl(temp1, Address(temp1, component_offset));
   2832       __ testl(temp1, temp1);
   2833       __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   2834       __ MaybeUnpoisonHeapReference(temp1);
   2835     }
   2836     __ cmpw(Address(temp1, primitive_offset), Immediate(Primitive::kPrimNot));
   2837     __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
   2838   }
   2839 
   2840   const DataType::Type type = DataType::Type::kReference;
   2841   const int32_t element_size = DataType::Size(type);
   2842 
   2843   // Compute the base source address in `temp1`.
   2844   GenSystemArrayCopyBaseAddress(GetAssembler(), type, src, src_pos, temp1);
   2845 
   2846   if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
   2847     // If it is needed (in the case of the fast-path loop), the base
   2848     // destination address is computed later, as `temp2` is used for
   2849     // intermediate computations.
   2850 
   2851     // Compute the end source address in `temp3`.
   2852     if (length.IsStackSlot()) {
    2853       // Location `length` points at a stack slot again, as register
    2854       // `temp3` (which previously held the length parameter) has been
    2855       // overwritten; restore it now.
   2856       DCHECK(length.Equals(length_arg));
   2857       __ movl(temp3, Address(ESP, length.GetStackIndex()));
   2858       length = Location::RegisterLocation(temp3);
   2859     }
   2860     GenSystemArrayCopyEndAddress(GetAssembler(), type, length, temp1, temp3);
   2861 
   2862     // SystemArrayCopy implementation for Baker read barriers (see
   2863     // also CodeGeneratorX86::GenerateReferenceLoadWithBakerReadBarrier):
   2864     //
   2865     //   if (src_ptr != end_ptr) {
    2866     //     uint32_t rb_state = LockWord(src->monitor_).ReadBarrierState();
   2867     //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
   2868     //     bool is_gray = (rb_state == ReadBarrier::GrayState());
   2869     //     if (is_gray) {
   2870     //       // Slow-path copy.
   2871     //       for (size_t i = 0; i != length; ++i) {
   2872     //         dest_array[dest_pos + i] =
   2873     //             MaybePoison(ReadBarrier::Mark(MaybeUnpoison(src_array[src_pos + i])));
   2874     //       }
   2875     //     } else {
   2876     //       // Fast-path copy.
   2877     //       do {
   2878     //         *dest_ptr++ = *src_ptr++;
   2879     //       } while (src_ptr != end_ptr)
   2880     //     }
   2881     //   }
   2882 
   2883     NearLabel loop, done;
   2884 
   2885     // Don't enter copy loop if `length == 0`.
   2886     __ cmpl(temp1, temp3);
   2887     __ j(kEqual, &done);
   2888 
   2889     // Given the numeric representation, it's enough to check the low bit of the rb_state.
   2890     static_assert(ReadBarrier::NonGrayState() == 0, "Expecting non-gray to have value 0");
   2891     static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
   2892     constexpr uint32_t gray_byte_position = LockWord::kReadBarrierStateShift / kBitsPerByte;
   2893     constexpr uint32_t gray_bit_position = LockWord::kReadBarrierStateShift % kBitsPerByte;
   2894     constexpr int32_t test_value = static_cast<int8_t>(1 << gray_bit_position);
   2895 
   2896     // if (rb_state == ReadBarrier::GrayState())
   2897     //   goto slow_path;
   2898     // At this point, just do the "if" and make sure that flags are preserved until the branch.
   2899     __ testb(Address(src, monitor_offset + gray_byte_position), Immediate(test_value));
   2900 
   2901     // Load fence to prevent load-load reordering.
   2902     // Note that this is a no-op, thanks to the x86 memory model.
   2903     codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny);
   2904 
   2905     // Slow path used to copy array when `src` is gray.
   2906     SlowPathCode* read_barrier_slow_path =
   2907         new (codegen_->GetScopedAllocator()) ReadBarrierSystemArrayCopySlowPathX86(invoke);
   2908     codegen_->AddSlowPath(read_barrier_slow_path);
   2909 
   2910     // We have done the "if" of the gray bit check above, now branch based on the flags.
   2911     __ j(kNotZero, read_barrier_slow_path->GetEntryLabel());
   2912 
   2913     // Fast-path copy.
   2914     // Compute the base destination address in `temp2`.
   2915     GenSystemArrayCopyBaseAddress(GetAssembler(), type, dest, dest_pos, temp2);
   2916     // Iterate over the arrays and do a raw copy of the objects. We don't need to
   2917     // poison/unpoison.
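             // Each iteration uses a push/pop pair to move one element memory-to-memory
             // without needing a spare register; the CFA offset is adjusted around the
             // push so the unwind information stays correct.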
   2918     __ Bind(&loop);
   2919     __ pushl(Address(temp1, 0));
   2920     __ cfi().AdjustCFAOffset(4);
   2921     __ popl(Address(temp2, 0));
   2922     __ cfi().AdjustCFAOffset(-4);
   2923     __ addl(temp1, Immediate(element_size));
   2924     __ addl(temp2, Immediate(element_size));
   2925     __ cmpl(temp1, temp3);
   2926     __ j(kNotEqual, &loop);
   2927 
   2928     __ Bind(read_barrier_slow_path->GetExitLabel());
   2929     __ Bind(&done);
   2930   } else {
   2931     // Non read barrier code.
   2932     // Compute the base destination address in `temp2`.
   2933     GenSystemArrayCopyBaseAddress(GetAssembler(), type, dest, dest_pos, temp2);
   2934     // Compute the end source address in `temp3`.
   2935     GenSystemArrayCopyEndAddress(GetAssembler(), type, length, temp1, temp3);
   2936     // Iterate over the arrays and do a raw copy of the objects. We don't need to
   2937     // poison/unpoison.
   2938     NearLabel loop, done;
   2939     __ cmpl(temp1, temp3);
   2940     __ j(kEqual, &done);
   2941     __ Bind(&loop);
   2942     __ pushl(Address(temp1, 0));
   2943     __ cfi().AdjustCFAOffset(4);
   2944     __ popl(Address(temp2, 0));
   2945     __ cfi().AdjustCFAOffset(-4);
   2946     __ addl(temp1, Immediate(element_size));
   2947     __ addl(temp2, Immediate(element_size));
   2948     __ cmpl(temp1, temp3);
   2949     __ j(kNotEqual, &loop);
   2950     __ Bind(&done);
   2951   }
   2952 
   2953   // We only need one card marking on the destination array.
   2954   codegen_->MarkGCCard(temp1, temp2, dest, Register(kNoRegister), /* value_can_be_null= */ false);
   2955 
   2956   __ Bind(intrinsic_slow_path->GetExitLabel());
   2957 }
   2958 
   2959 void IntrinsicLocationsBuilderX86::VisitIntegerValueOf(HInvoke* invoke) {
   2960   DCHECK(invoke->IsInvokeStaticOrDirect());
   2961   InvokeRuntimeCallingConvention calling_convention;
   2962   IntrinsicVisitor::ComputeIntegerValueOfLocations(
   2963       invoke,
   2964       codegen_,
   2965       Location::RegisterLocation(EAX),
   2966       Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
   2967 
   2968   LocationSummary* locations = invoke->GetLocations();
   2969   if (locations != nullptr) {
   2970     HInvokeStaticOrDirect* invoke_static_or_direct = invoke->AsInvokeStaticOrDirect();
   2971     if (invoke_static_or_direct->HasSpecialInput() &&
   2972         invoke->InputAt(invoke_static_or_direct->GetSpecialInputIndex())
   2973             ->IsX86ComputeBaseMethodAddress()) {
   2974       locations->SetInAt(invoke_static_or_direct->GetSpecialInputIndex(),
   2975                          Location::RequiresRegister());
   2976     }
   2977   }
   2978 }
   2979 
   2980 void IntrinsicCodeGeneratorX86::VisitIntegerValueOf(HInvoke* invoke) {
   2981   DCHECK(invoke->IsInvokeStaticOrDirect());
   2982   IntrinsicVisitor::IntegerValueOfInfo info =
   2983       IntrinsicVisitor::ComputeIntegerValueOfInfo(invoke, codegen_->GetCompilerOptions());
   2984   LocationSummary* locations = invoke->GetLocations();
   2985   X86Assembler* assembler = GetAssembler();
   2986 
   2987   Register out = locations->Out().AsRegister<Register>();
   2988   InvokeRuntimeCallingConvention calling_convention;
   2989   if (invoke->InputAt(0)->IsConstant()) {
   2990     int32_t value = invoke->InputAt(0)->AsIntConstant()->GetValue();
   2991     if (static_cast<uint32_t>(value - info.low) < info.length) {
   2992       // Just embed the j.l.Integer in the code.
   2993       DCHECK_NE(info.value_boot_image_reference, IntegerValueOfInfo::kInvalidReference);
   2994       codegen_->LoadBootImageAddress(
   2995           out, info.value_boot_image_reference, invoke->AsInvokeStaticOrDirect());
   2996     } else {
   2997       DCHECK(locations->CanCall());
   2998       // Allocate and initialize a new j.l.Integer.
   2999       // TODO: If we JIT, we could allocate the j.l.Integer now, and store it in the
   3000       // JIT object table.
   3001       codegen_->AllocateInstanceForIntrinsic(invoke->AsInvokeStaticOrDirect(),
   3002                                              info.integer_boot_image_offset);
   3003       __ movl(Address(out, info.value_offset), Immediate(value));
   3004     }
   3005   } else {
   3006     DCHECK(locations->CanCall());
   3007     Register in = locations->InAt(0).AsRegister<Register>();
   3008     // Check bounds of our cache.
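             // The LEA computes `value - info.low`; comparing it unsigned against
             // `info.length` covers both bounds at once (values below `info.low` wrap
             // around to large unsigned numbers), so a single branch suffices.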
   3009     __ leal(out, Address(in, -info.low));
   3010     __ cmpl(out, Immediate(info.length));
   3011     NearLabel allocate, done;
   3012     __ j(kAboveEqual, &allocate);
   3013     // If the value is within the bounds, load the j.l.Integer directly from the array.
   3014     constexpr size_t kElementSize = sizeof(mirror::HeapReference<mirror::Object>);
   3015     static_assert((1u << TIMES_4) == sizeof(mirror::HeapReference<mirror::Object>),
   3016                   "Check heap reference size.");
   3017     if (codegen_->GetCompilerOptions().IsBootImage()) {
   3018       DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
   3019       size_t method_address_index = invoke->AsInvokeStaticOrDirect()->GetSpecialInputIndex();
   3020       HX86ComputeBaseMethodAddress* method_address =
   3021           invoke->InputAt(method_address_index)->AsX86ComputeBaseMethodAddress();
   3022       DCHECK(method_address != nullptr);
   3023       Register method_address_reg =
   3024           invoke->GetLocations()->InAt(method_address_index).AsRegister<Register>();
   3025       __ movl(out, Address(method_address_reg, out, TIMES_4, CodeGeneratorX86::kDummy32BitOffset));
   3026       codegen_->RecordBootImageIntrinsicPatch(method_address, info.array_data_boot_image_reference);
   3027     } else {
   3028       // Note: We're about to clobber the index in `out`, so we need to use `in` and
   3029       // adjust the offset accordingly.
   3030       uint32_t mid_array_boot_image_offset =
   3031               info.array_data_boot_image_reference - info.low * kElementSize;
   3032       codegen_->LoadBootImageAddress(
   3033           out, mid_array_boot_image_offset, invoke->AsInvokeStaticOrDirect());
   3034       DCHECK_NE(out, in);
   3035       __ movl(out, Address(out, in, TIMES_4, 0));
   3036     }
   3037     __ MaybeUnpoisonHeapReference(out);
   3038     __ jmp(&done);
   3039     __ Bind(&allocate);
   3040     // Otherwise allocate and initialize a new j.l.Integer.
   3041     codegen_->AllocateInstanceForIntrinsic(invoke->AsInvokeStaticOrDirect(),
   3042                                            info.integer_boot_image_offset);
   3043     __ movl(Address(out, info.value_offset), in);
   3044     __ Bind(&done);
   3045   }
   3046 }
   3047 
   3048 void IntrinsicLocationsBuilderX86::VisitThreadInterrupted(HInvoke* invoke) {
   3049   LocationSummary* locations =
   3050       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   3051   locations->SetOut(Location::RequiresRegister());
   3052 }
   3053 
   3054 void IntrinsicCodeGeneratorX86::VisitThreadInterrupted(HInvoke* invoke) {
   3055   X86Assembler* assembler = GetAssembler();
   3056   Register out = invoke->GetLocations()->Out().AsRegister<Register>();
   3057   Address address = Address::Absolute(Thread::InterruptedOffset<kX86PointerSize>().Int32Value());
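           // In effect (sketch): read the thread-local interrupted flag; only when it is set,
           // clear it and issue a memory fence, so the common (not interrupted) case stays cheap.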
   3058   NearLabel done;
   3059   __ fs()->movl(out, address);
   3060   __ testl(out, out);
   3061   __ j(kEqual, &done);
   3062   __ fs()->movl(address, Immediate(0));
   3063   codegen_->MemoryFence();
   3064   __ Bind(&done);
   3065 }
   3066 
   3067 void IntrinsicLocationsBuilderX86::VisitReachabilityFence(HInvoke* invoke) {
   3068   LocationSummary* locations =
   3069       new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
   3070   locations->SetInAt(0, Location::Any());
   3071 }
   3072 
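         // No code needs to be emitted here: giving the argument a live location above is
         // enough to keep the referenced object reachable up to this point.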
   3073 void IntrinsicCodeGeneratorX86::VisitReachabilityFence(HInvoke* invoke ATTRIBUTE_UNUSED) { }
   3074 
   3075 UNIMPLEMENTED_INTRINSIC(X86, MathRoundDouble)
   3076 UNIMPLEMENTED_INTRINSIC(X86, ReferenceGetReferent)
   3077 UNIMPLEMENTED_INTRINSIC(X86, FloatIsInfinite)
   3078 UNIMPLEMENTED_INTRINSIC(X86, DoubleIsInfinite)
   3079 UNIMPLEMENTED_INTRINSIC(X86, IntegerHighestOneBit)
   3080 UNIMPLEMENTED_INTRINSIC(X86, LongHighestOneBit)
   3081 UNIMPLEMENTED_INTRINSIC(X86, CRC32Update)
   3082 UNIMPLEMENTED_INTRINSIC(X86, CRC32UpdateBytes)
   3083 UNIMPLEMENTED_INTRINSIC(X86, CRC32UpdateByteBuffer)
   3084 
   3085 UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOf);
   3086 UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOfAfter);
   3087 UNIMPLEMENTED_INTRINSIC(X86, StringBufferAppend);
   3088 UNIMPLEMENTED_INTRINSIC(X86, StringBufferLength);
   3089 UNIMPLEMENTED_INTRINSIC(X86, StringBufferToString);
   3090 UNIMPLEMENTED_INTRINSIC(X86, StringBuilderAppend);
   3091 UNIMPLEMENTED_INTRINSIC(X86, StringBuilderLength);
   3092 UNIMPLEMENTED_INTRINSIC(X86, StringBuilderToString);
   3093 
   3094 // 1.8.
   3095 UNIMPLEMENTED_INTRINSIC(X86, UnsafeGetAndAddInt)
   3096 UNIMPLEMENTED_INTRINSIC(X86, UnsafeGetAndAddLong)
   3097 UNIMPLEMENTED_INTRINSIC(X86, UnsafeGetAndSetInt)
   3098 UNIMPLEMENTED_INTRINSIC(X86, UnsafeGetAndSetLong)
   3099 UNIMPLEMENTED_INTRINSIC(X86, UnsafeGetAndSetObject)
   3100 
   3101 UNREACHABLE_INTRINSICS(X86)
   3102 
   3103 #undef __
   3104 
   3105 }  // namespace x86
   3106 }  // namespace art
   3107