/* Copyright (c) 2009, 2010, 2011, 2012 ARM Ltd.

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
``Software''), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */

#include <stdio.h>

#include <ffi.h>
#include <ffi_common.h>

#include <stdlib.h>
#include <string.h>		/* For memcpy and memset.  */

/* Stack alignment requirement in bytes */
#if defined (__APPLE__)
#define AARCH64_STACK_ALIGN 1
#else
#define AARCH64_STACK_ALIGN 16
#endif
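
/* Note: Apple's arm64 ABI packs stack arguments at their natural
   alignment rather than in 8-byte slots, so cif->bytes is left
   unrounded there (alignment of 1); the 16-byte alignment of the
   stack pointer itself is still maintained when the call frame is
   allocated.  */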

#define N_X_ARG_REG 8
#define N_V_ARG_REG 8

#define AARCH64_FFI_WITH_V (1 << AARCH64_FFI_WITH_V_BIT)

union _d
{
  UINT64 d;
  UINT32 s[2];
};

struct call_context
{
  UINT64 x [AARCH64_N_XREG];
  struct
  {
    union _d d[2];
  } v [AARCH64_N_VREG];
};

#if defined (__clang__) && defined (__APPLE__)
extern void
sys_icache_invalidate (void *start, size_t len);
#endif

static inline void
ffi_clear_cache (void *start, void *end)
{
#if defined (__clang__) && defined (__APPLE__)
  sys_icache_invalidate (start, (char *)end - (char *)start);
#elif defined (__GNUC__)
  __builtin___clear_cache (start, end);
#else
#error "Missing builtin to flush instruction cache"
#endif
}

static void *
get_x_addr (struct call_context *context, unsigned n)
{
  return &context->x[n];
}

static void *
get_s_addr (struct call_context *context, unsigned n)
{
#if defined __AARCH64EB__
  return &context->v[n].d[1].s[1];
#else
  return &context->v[n].d[0].s[0];
#endif
}

static void *
get_d_addr (struct call_context *context, unsigned n)
{
#if defined __AARCH64EB__
  return &context->v[n].d[1];
#else
  return &context->v[n].d[0];
#endif
}

static void *
get_v_addr (struct call_context *context, unsigned n)
{
  return &context->v[n];
}

/* Return the memory location at which a basic type would reside
   were it to have been stored in register n.  */

static void *
get_basic_type_addr (unsigned short type, struct call_context *context,
		     unsigned n)
{
  switch (type)
    {
    case FFI_TYPE_FLOAT:
      return get_s_addr (context, n);
    case FFI_TYPE_DOUBLE:
      return get_d_addr (context, n);
#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
    case FFI_TYPE_LONGDOUBLE:
      return get_v_addr (context, n);
#endif
    case FFI_TYPE_UINT8:
    case FFI_TYPE_SINT8:
    case FFI_TYPE_UINT16:
    case FFI_TYPE_SINT16:
    case FFI_TYPE_UINT32:
    case FFI_TYPE_SINT32:
    case FFI_TYPE_INT:
    case FFI_TYPE_POINTER:
    case FFI_TYPE_UINT64:
    case FFI_TYPE_SINT64:
      return get_x_addr (context, n);
    case FFI_TYPE_VOID:
      return NULL;
    default:
      FFI_ASSERT (0);
      return NULL;
    }
}

/* Return the alignment width for each of the basic types.  */

static size_t
get_basic_type_alignment (unsigned short type)
{
  switch (type)
    {
    case FFI_TYPE_FLOAT:
    case FFI_TYPE_DOUBLE:
      return sizeof (UINT64);
#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
    case FFI_TYPE_LONGDOUBLE:
      return sizeof (long double);
#endif
    case FFI_TYPE_UINT8:
    case FFI_TYPE_SINT8:
#if defined (__APPLE__)
      return sizeof (UINT8);
#endif
    case FFI_TYPE_UINT16:
    case FFI_TYPE_SINT16:
#if defined (__APPLE__)
      return sizeof (UINT16);
#endif
    case FFI_TYPE_UINT32:
    case FFI_TYPE_INT:
    case FFI_TYPE_SINT32:
#if defined (__APPLE__)
      return sizeof (UINT32);
#endif
    case FFI_TYPE_POINTER:
    case FFI_TYPE_UINT64:
    case FFI_TYPE_SINT64:
      return sizeof (UINT64);

    default:
      FFI_ASSERT (0);
      return 0;
    }
}

/* Return the size in bytes for each of the basic types.  */

static size_t
get_basic_type_size (unsigned short type)
{
  switch (type)
    {
    case FFI_TYPE_FLOAT:
      return sizeof (UINT32);
    case FFI_TYPE_DOUBLE:
      return sizeof (UINT64);
#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
    case FFI_TYPE_LONGDOUBLE:
      return sizeof (long double);
#endif
    case FFI_TYPE_UINT8:
      return sizeof (UINT8);
    case FFI_TYPE_SINT8:
      return sizeof (SINT8);
    case FFI_TYPE_UINT16:
      return sizeof (UINT16);
    case FFI_TYPE_SINT16:
      return sizeof (SINT16);
    case FFI_TYPE_UINT32:
      return sizeof (UINT32);
    case FFI_TYPE_INT:
    case FFI_TYPE_SINT32:
      return sizeof (SINT32);
    case FFI_TYPE_POINTER:
    case FFI_TYPE_UINT64:
      return sizeof (UINT64);
    case FFI_TYPE_SINT64:
      return sizeof (SINT64);

    default:
      FFI_ASSERT (0);
      return 0;
    }
}

extern void
ffi_call_SYSV (unsigned (*)(struct call_context *context, unsigned char *,
			    extended_cif *),
               struct call_context *context,
               extended_cif *,
               size_t,
               void (*fn)(void));

extern void
ffi_closure_SYSV (ffi_closure *);

/* Test for an FFI floating point representation.  */

static unsigned
is_floating_type (unsigned short type)
{
  return (type == FFI_TYPE_FLOAT || type == FFI_TYPE_DOUBLE
	  || type == FFI_TYPE_LONGDOUBLE);
}

/* Test for a homogeneous structure.  */

static unsigned short
get_homogeneous_type (ffi_type *ty)
{
  if (ty->type == FFI_TYPE_STRUCT && ty->elements)
    {
      unsigned i;
      unsigned short candidate_type
	= get_homogeneous_type (ty->elements[0]);
      for (i = 1; ty->elements[i]; i++)
	{
	  unsigned short iteration_type = 0;
	  /* If we have a nested struct, we must find its homogeneous type.
	     If that matches our candidate type, we are still
	     homogeneous.  */
	  if (ty->elements[i]->type == FFI_TYPE_STRUCT
	      && ty->elements[i]->elements)
	    {
	      iteration_type = get_homogeneous_type (ty->elements[i]);
	    }
	  else
	    {
	      iteration_type = ty->elements[i]->type;
	    }

	  /* If we are not homogeneous, return FFI_TYPE_STRUCT.  */
	  if (candidate_type != iteration_type)
	    return FFI_TYPE_STRUCT;
	}
      return candidate_type;
    }

  /* Base case: there are no more levels of nesting, so this is a
     basic type, and therefore trivially homogeneous in that type.  */
  return ty->type;
}

/* Determine the number of elements within a STRUCT.

   Note, we must handle nested structs.

   If ty is not a STRUCT this function will return 0.  */

static unsigned
element_count (ffi_type *ty)
{
  if (ty->type == FFI_TYPE_STRUCT && ty->elements)
    {
      unsigned n;
      unsigned elems = 0;
      for (n = 0; ty->elements[n]; n++)
	{
	  if (ty->elements[n]->type == FFI_TYPE_STRUCT
	      && ty->elements[n]->elements)
	    elems += element_count (ty->elements[n]);
	  else
	    elems++;
	}
      return elems;
    }
  return 0;
}

/* Test for a homogeneous floating point aggregate.

   A homogeneous floating point aggregate is a homogeneous aggregate of
   a half-, single- or double-precision floating point type with one
   to four elements.  Note that this includes nested structs of the
   basic type.  */

static int
is_hfa (ffi_type *ty)
{
  if (ty->type == FFI_TYPE_STRUCT
      && ty->elements[0]
      && is_floating_type (get_homogeneous_type (ty)))
    {
      unsigned n = element_count (ty);
      return n >= 1 && n <= 4;
    }
  return 0;
}

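/* Illustrative examples (in terms of the C structs an ffi_type might
   describe, not code in this file):

     struct { float x, y; }                        -- HFA of two floats
     struct { double a; struct { double b; } n; }  -- HFA of two doubles

   For these, is_hfa() returns 1 and get_homogeneous_type() returns
   FFI_TYPE_FLOAT or FFI_TYPE_DOUBLE respectively, whereas

     struct { float f; double d; }                 -- mixed, inhomogeneous
     struct { double d0, d1, d2, d3, d4; }         -- five elements

   are not HFAs: the first is inhomogeneous and the second exceeds the
   four-element limit.  */
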
/* Test if an ffi_type is a candidate for passing in a register.

   This test does not check that sufficient registers of the
   appropriate class are actually available, merely that if
   sufficient registers are available then the argument will be
   passed in register(s).

   Note that an ffi_type that is deemed to be a register candidate
   will always be returned in registers.

   Returns 1 if a register candidate else 0.  */

static int
is_register_candidate (ffi_type *ty)
{
  switch (ty->type)
    {
    case FFI_TYPE_VOID:
    case FFI_TYPE_FLOAT:
    case FFI_TYPE_DOUBLE:
#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
    case FFI_TYPE_LONGDOUBLE:
#endif
    case FFI_TYPE_UINT8:
    case FFI_TYPE_UINT16:
    case FFI_TYPE_UINT32:
    case FFI_TYPE_UINT64:
    case FFI_TYPE_POINTER:
    case FFI_TYPE_SINT8:
    case FFI_TYPE_SINT16:
    case FFI_TYPE_SINT32:
    case FFI_TYPE_INT:
    case FFI_TYPE_SINT64:
      return 1;

    case FFI_TYPE_STRUCT:
      if (is_hfa (ty))
        {
          return 1;
        }
      else if (ty->size > 16)
        {
          /* Too large.  Will be replaced with a pointer to memory.  The
             pointer MAY be passed in a register, but the value will
             not.  This test specifically fails since the argument will
             never be passed by value in registers.  */
          return 0;
        }
      else
        {
          /* Might be passed in registers depending on the number of
             registers required.  */
          return (ty->size + 7) / 8 < N_X_ARG_REG;
        }
      break;

    default:
      FFI_ASSERT (0);
      break;
    }

  return 0;
}

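/* For example (illustrative only): an int, a double, or a 16-byte
   struct of two UINT64s are all register candidates; a 24-byte struct
   is not, since it is always replaced by a pointer to a
   caller-allocated copy, even though that pointer itself may travel
   in an X register.  */
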
/* Test if an ffi_type argument or result is a candidate for a vector
   register.  */

static int
is_v_register_candidate (ffi_type *ty)
{
  return is_floating_type (ty->type)
	   || (ty->type == FFI_TYPE_STRUCT && is_hfa (ty));
}

/* Representation of the procedure call argument marshalling
   state.

   The terse state variable names match the names used in the AARCH64
   PCS.  */

struct arg_state
{
  unsigned ngrn;                /* Next general-purpose register number.  */
  unsigned nsrn;                /* Next vector register number.  */
  size_t nsaa;                  /* Next stack offset.  */

#if defined (__APPLE__)
  unsigned allocating_variadic;
#endif
};

/* Initialize a procedure call argument marshalling state.  */
static void
arg_init (struct arg_state *state, size_t call_frame_size)
{
  state->ngrn = 0;
  state->nsrn = 0;
  state->nsaa = 0;

#if defined (__APPLE__)
  state->allocating_variadic = 0;
#endif
}

/* Return the number of available consecutive core argument
   registers.  */

static unsigned
available_x (struct arg_state *state)
{
  return N_X_ARG_REG - state->ngrn;
}

/* Return the number of available consecutive vector argument
   registers.  */

static unsigned
available_v (struct arg_state *state)
{
  return N_V_ARG_REG - state->nsrn;
}

static void *
allocate_to_x (struct call_context *context, struct arg_state *state)
{
  FFI_ASSERT (state->ngrn < N_X_ARG_REG);
  return get_x_addr (context, (state->ngrn)++);
}

static void *
allocate_to_s (struct call_context *context, struct arg_state *state)
{
  FFI_ASSERT (state->nsrn < N_V_ARG_REG);
  return get_s_addr (context, (state->nsrn)++);
}

static void *
allocate_to_d (struct call_context *context, struct arg_state *state)
{
  FFI_ASSERT (state->nsrn < N_V_ARG_REG);
  return get_d_addr (context, (state->nsrn)++);
}

static void *
allocate_to_v (struct call_context *context, struct arg_state *state)
{
  FFI_ASSERT (state->nsrn < N_V_ARG_REG);
  return get_v_addr (context, (state->nsrn)++);
}

/* Allocate an aligned slot on the stack and return a pointer to it.  */
static void *
allocate_to_stack (struct arg_state *state, void *stack, size_t alignment,
		   size_t size)
{
  void *allocation;

  /* Round up the NSAA to the larger of 8 or the natural
     alignment of the argument's type.  */
  state->nsaa = ALIGN (state->nsaa, alignment);
#if defined (__APPLE__)
  if (state->allocating_variadic)
    state->nsaa = ALIGN (state->nsaa, 8);
#else
  state->nsaa = ALIGN (state->nsaa, 8);
#endif

  allocation = stack + state->nsaa;

  state->nsaa += size;
  return allocation;
}

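/* Worked example (illustrative): on the standard AAPCS64 path, with
   NSAA currently 4, allocating a UINT16 (alignment 2, size 2) first
   rounds NSAA to ALIGN (4, 2) = 4 and then to ALIGN (4, 8) = 8, so
   the slot is stack + 8 and NSAA becomes 10.  On Apple platforms the
   same non-variadic argument packs at its natural alignment and
   would land at stack + 4, leaving NSAA at 6.  */
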
static void
copy_basic_type (void *dest, void *source, unsigned short type)
{
  /* This is necessary to ensure that basic types are copied widened
     (sign- or zero-extended) to 64 bits as libffi expects.  */
  switch (type)
    {
    case FFI_TYPE_FLOAT:
      *(float *) dest = *(float *) source;
      break;
    case FFI_TYPE_DOUBLE:
      *(double *) dest = *(double *) source;
      break;
#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
    case FFI_TYPE_LONGDOUBLE:
      *(long double *) dest = *(long double *) source;
      break;
#endif
    case FFI_TYPE_UINT8:
      *(ffi_arg *) dest = *(UINT8 *) source;
      break;
    case FFI_TYPE_SINT8:
      *(ffi_sarg *) dest = *(SINT8 *) source;
      break;
    case FFI_TYPE_UINT16:
      *(ffi_arg *) dest = *(UINT16 *) source;
      break;
    case FFI_TYPE_SINT16:
      *(ffi_sarg *) dest = *(SINT16 *) source;
      break;
    case FFI_TYPE_UINT32:
      *(ffi_arg *) dest = *(UINT32 *) source;
      break;
    case FFI_TYPE_INT:
    case FFI_TYPE_SINT32:
      *(ffi_sarg *) dest = *(SINT32 *) source;
      break;
    case FFI_TYPE_POINTER:
    case FFI_TYPE_UINT64:
      *(ffi_arg *) dest = *(UINT64 *) source;
      break;
    case FFI_TYPE_SINT64:
      *(ffi_sarg *) dest = *(SINT64 *) source;
      break;
    case FFI_TYPE_VOID:
      break;

    default:
      FFI_ASSERT (0);
    }
}

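/* For example, an SINT8 holding -1 is widened through ffi_sarg to
   0xffffffffffffffff in the 64-bit destination slot, while a UINT8
   holding 0xff zero-extends to 0x00000000000000ff.  */
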
static void
copy_hfa_to_reg_or_stack (void *memory,
			  ffi_type *ty,
			  struct call_context *context,
			  unsigned char *stack,
			  struct arg_state *state)
{
  unsigned elems = element_count (ty);
  if (available_v (state) < elems)
    {
      /* There are insufficient V registers.  Further V register
	 allocations are prevented, the NSAA is adjusted (by
	 allocate_to_stack ()) and the argument is copied to memory at
	 the adjusted NSAA.  */
      state->nsrn = N_V_ARG_REG;
      memcpy (allocate_to_stack (state, stack, ty->alignment, ty->size),
	      memory,
	      ty->size);
    }
  else
    {
      unsigned i;
      unsigned short type = get_homogeneous_type (ty);
      for (i = 0; i < elems; i++)
	{
	  void *reg = allocate_to_v (context, state);
	  copy_basic_type (reg, memory, type);
	  memory += get_basic_type_size (type);
	}
    }
}

/* Either allocate an appropriate register for the argument type, or if
   none are available, allocate a stack slot and return a pointer
   to the allocated space.  */

static void *
allocate_to_register_or_stack (struct call_context *context,
			       unsigned char *stack,
			       struct arg_state *state,
			       unsigned short type)
{
  size_t alignment = get_basic_type_alignment (type);
  size_t size = alignment;
  switch (type)
    {
    case FFI_TYPE_FLOAT:
      /* This is the only case for which the allocated stack size
	 should not match the alignment of the type.  */
      size = sizeof (UINT32);
      /* Fall through.  */
    case FFI_TYPE_DOUBLE:
      if (state->nsrn < N_V_ARG_REG)
	return allocate_to_d (context, state);
      state->nsrn = N_V_ARG_REG;
      break;
#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
    case FFI_TYPE_LONGDOUBLE:
      if (state->nsrn < N_V_ARG_REG)
	return allocate_to_v (context, state);
      state->nsrn = N_V_ARG_REG;
      break;
#endif
    case FFI_TYPE_UINT8:
    case FFI_TYPE_SINT8:
    case FFI_TYPE_UINT16:
    case FFI_TYPE_SINT16:
    case FFI_TYPE_UINT32:
    case FFI_TYPE_SINT32:
    case FFI_TYPE_INT:
    case FFI_TYPE_POINTER:
    case FFI_TYPE_UINT64:
    case FFI_TYPE_SINT64:
      if (state->ngrn < N_X_ARG_REG)
	return allocate_to_x (context, state);
      state->ngrn = N_X_ARG_REG;
      break;
    default:
      FFI_ASSERT (0);
    }

  return allocate_to_stack (state, stack, alignment, size);
}

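/* Note (illustrative): a float that spills to the stack therefore
   occupies a 4-byte slot placed at an 8-byte-aligned NSAA, whereas a
   double occupies a full 8-byte slot; every other basic type
   allocates exactly its alignment in bytes.  */
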
/* Copy a value to an appropriate register, or if none are
   available, to the stack.  */

static void
copy_to_register_or_stack (struct call_context *context,
			   unsigned char *stack,
			   struct arg_state *state,
			   void *value,
			   unsigned short type)
{
  copy_basic_type (
	  allocate_to_register_or_stack (context, stack, state, type),
	  value,
	  type);
}

/* Marshall the arguments from FFI representation to procedure call
   context and stack.  */

static unsigned
aarch64_prep_args (struct call_context *context, unsigned char *stack,
		   extended_cif *ecif)
{
  unsigned i;
  struct arg_state state;

  arg_init (&state, ALIGN(ecif->cif->bytes, 16));

  for (i = 0; i < ecif->cif->nargs; i++)
    {
      ffi_type *ty = ecif->cif->arg_types[i];
      switch (ty->type)
	{
	case FFI_TYPE_VOID:
	  FFI_ASSERT (0);
	  break;

	/* If the argument is a basic type the argument is allocated to an
	   appropriate register, or if none are available, to the stack.  */
	case FFI_TYPE_FLOAT:
	case FFI_TYPE_DOUBLE:
#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
	case FFI_TYPE_LONGDOUBLE:
#endif
	case FFI_TYPE_UINT8:
	case FFI_TYPE_SINT8:
	case FFI_TYPE_UINT16:
	case FFI_TYPE_SINT16:
	case FFI_TYPE_UINT32:
	case FFI_TYPE_INT:
	case FFI_TYPE_SINT32:
	case FFI_TYPE_POINTER:
	case FFI_TYPE_UINT64:
	case FFI_TYPE_SINT64:
	  copy_to_register_or_stack (context, stack, &state,
				     ecif->avalue[i], ty->type);
	  break;

	case FFI_TYPE_STRUCT:
	  if (is_hfa (ty))
	    {
	      copy_hfa_to_reg_or_stack (ecif->avalue[i], ty, context,
					stack, &state);
	    }
	  else if (ty->size > 16)
	    {
	      /* If the argument is a composite type that is larger than 16
		 bytes, then the argument has been copied to memory, and
		 the argument is replaced by a pointer to the copy.  */

	      copy_to_register_or_stack (context, stack, &state,
					 &(ecif->avalue[i]), FFI_TYPE_POINTER);
	    }
	  else if (available_x (&state) >= (ty->size + 7) / 8)
	    {
	      /* If the argument is a composite type and the size in
		 double-words is not more than the number of available
		 X registers, then the argument is copied into consecutive
		 X registers.  */
	      unsigned j;
	      for (j = 0; j < (ty->size + 7) / 8; j++)
		{
		  memcpy (allocate_to_x (context, &state),
			  &(((UINT64 *) ecif->avalue[i])[j]),
			  sizeof (UINT64));
		}
	    }
	  else
	    {
	      /* Otherwise, there are insufficient X registers.  Further X
		 register allocations are prevented, the NSAA is adjusted
		 (by allocate_to_stack ()) and the argument is copied to
		 memory at the adjusted NSAA.  */
	      state.ngrn = N_X_ARG_REG;

	      memcpy (allocate_to_stack (&state, stack, ty->alignment,
					 ty->size), ecif->avalue[i], ty->size);
	    }
	  break;

	default:
	  FFI_ASSERT (0);
	  break;
	}

#if defined (__APPLE__)
      if (i + 1 == ecif->cif->aarch64_nfixedargs)
	{
	  state.ngrn = N_X_ARG_REG;
	  state.nsrn = N_V_ARG_REG;

	  state.allocating_variadic = 1;
	}
#endif
    }

  return ecif->cif->aarch64_flags;
}

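/* Illustrative walk-through (not code in this file): for a call such
   as f (int a, double b, struct { double x, y; } c) the marshalling
   above places a in x0, b in d0, and c (an HFA of two doubles) in d1
   and d2; a large struct argument would instead be passed as a
   pointer to a copy, and arguments beyond the eighth register of
   either class would be written to the stack area at the NSAA.  */
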
ffi_status
ffi_prep_cif_machdep (ffi_cif *cif)
{
  /* Round the stack up to a multiple of the stack alignment requirement.  */
  cif->bytes =
    (cif->bytes + (AARCH64_STACK_ALIGN - 1)) & ~ (AARCH64_STACK_ALIGN - 1);

  /* Initialize our flags.  We are interested in whether this CIF will
     touch a vector register; if so we enable context save and load for
     those registers, otherwise not.  This is intended to be friendly
     to lazy float context switching in the kernel.  */
  cif->aarch64_flags = 0;

  if (is_v_register_candidate (cif->rtype))
    {
      cif->aarch64_flags |= AARCH64_FFI_WITH_V;
    }
  else
    {
      unsigned i;
      for (i = 0; i < cif->nargs; i++)
        if (is_v_register_candidate (cif->arg_types[i]))
          {
            cif->aarch64_flags |= AARCH64_FFI_WITH_V;
            break;
          }
    }

  return FFI_OK;
}

#if defined (__APPLE__)

/* Perform Apple-specific cif processing for variadic calls.  */
ffi_status ffi_prep_cif_machdep_var(ffi_cif *cif,
				    unsigned int nfixedargs,
				    unsigned int ntotalargs)
{
  cif->aarch64_nfixedargs = nfixedargs;

  return ffi_prep_cif_machdep(cif);
}

#endif

/* Call a function with the provided arguments and capture the return
   value.  */
void
ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
{
  extended_cif ecif;

  ecif.cif = cif;
  ecif.avalue = avalue;
  ecif.rvalue = rvalue;

  switch (cif->abi)
    {
    case FFI_SYSV:
      {
	struct call_context context;
	size_t stack_bytes;

	/* Figure out the total amount of stack space we need.  The
	   call frame space above needs to be 16-byte aligned to ensure
	   correct alignment of the first object placed in it, hence
	   the ALIGN applied to cif->bytes.  */
	stack_bytes = ALIGN(cif->bytes, 16);

	memset (&context, 0, sizeof (context));
	if (is_register_candidate (cif->rtype))
	  {
	    ffi_call_SYSV (aarch64_prep_args, &context, &ecif, stack_bytes, fn);
	    switch (cif->rtype->type)
	      {
	      case FFI_TYPE_VOID:
	      case FFI_TYPE_FLOAT:
	      case FFI_TYPE_DOUBLE:
#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
	      case FFI_TYPE_LONGDOUBLE:
#endif
	      case FFI_TYPE_UINT8:
	      case FFI_TYPE_SINT8:
	      case FFI_TYPE_UINT16:
	      case FFI_TYPE_SINT16:
	      case FFI_TYPE_UINT32:
	      case FFI_TYPE_SINT32:
	      case FFI_TYPE_POINTER:
	      case FFI_TYPE_UINT64:
	      case FFI_TYPE_INT:
	      case FFI_TYPE_SINT64:
		{
		  void *addr = get_basic_type_addr (cif->rtype->type,
						    &context, 0);
		  copy_basic_type (rvalue, addr, cif->rtype->type);
		  break;
		}

	      case FFI_TYPE_STRUCT:
		if (is_hfa (cif->rtype))
		  {
		    unsigned j;
		    unsigned short type = get_homogeneous_type (cif->rtype);
		    unsigned elems = element_count (cif->rtype);
		    for (j = 0; j < elems; j++)
		      {
			void *reg = get_basic_type_addr (type, &context, j);
			copy_basic_type (rvalue, reg, type);
			rvalue += get_basic_type_size (type);
		      }
		  }
		else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG)
		  {
		    size_t size = ALIGN (cif->rtype->size, sizeof (UINT64));
		    memcpy (rvalue, get_x_addr (&context, 0), size);
		  }
		else
		  {
		    FFI_ASSERT (0);
		  }
		break;

	      default:
		FFI_ASSERT (0);
		break;
	      }
	  }
	else
	  {
	    memcpy (get_x_addr (&context, 8), &rvalue, sizeof (UINT64));
	    ffi_call_SYSV (aarch64_prep_args, &context, &ecif,
			   stack_bytes, fn);
	  }
	break;
      }

    default:
      FFI_ASSERT (0);
      break;
    }
}

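/* Example use of the public entry point above, e.g. calling
   puts ("Hello") through libffi (a sketch, outside this file):

     #include <stdio.h>
     #include <ffi.h>

     int main (void)
     {
       ffi_cif cif;
       ffi_type *args[1] = { &ffi_type_pointer };
       void *values[1];
       char *s = "Hello";
       ffi_arg rc;

       values[0] = &s;
       if (ffi_prep_cif (&cif, FFI_DEFAULT_ABI, 1,
			 &ffi_type_sint, args) == FFI_OK)
	 ffi_call (&cif, FFI_FN (puts), &rc, values);
       return 0;
     }
*/
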
static unsigned char trampoline [] =
{ 0x70, 0x00, 0x00, 0x58,	/* ldr	x16, 1f	*/
  0x91, 0x00, 0x00, 0x10,	/* adr	x17, 2f	*/
  0x00, 0x02, 0x1f, 0xd6	/* br	x16	*/
};

/* Build a trampoline.  */

#define FFI_INIT_TRAMPOLINE(TRAMP,FUN,CTX,FLAGS)			\
  ({unsigned char *__tramp = (unsigned char*)(TRAMP);			\
    UINT64  __fun = (UINT64)(FUN);					\
    UINT64  __ctx = (UINT64)(CTX);					\
    UINT64  __flags = (UINT64)(FLAGS);					\
    memcpy (__tramp, trampoline, sizeof (trampoline));			\
    memcpy (__tramp + 12, &__fun, sizeof (__fun));			\
    memcpy (__tramp + 20, &__ctx, sizeof (__ctx));			\
    memcpy (__tramp + 28, &__flags, sizeof (__flags));			\
    ffi_clear_cache(__tramp, __tramp + FFI_TRAMPOLINE_SIZE);		\
  })

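/* Resulting trampoline layout (byte offsets, sketched from the
   instruction encodings above):

     0   ldr x16, #+12    ; load the 64-bit function pointer below
     4   adr x17, #+16    ; x17 = address of the context word below
     8   br  x16
     12  UINT64 fun       ; normally &ffi_closure_SYSV
     20  UINT64 ctx       ; the closure's code location
     28  UINT64 flags     ; cif->aarch64_flags

   The instruction cache is then flushed over the whole
   FFI_TRAMPOLINE_SIZE region so the newly written code is visible.  */
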
ffi_status
ffi_prep_closure_loc (ffi_closure* closure,
                      ffi_cif* cif,
                      void (*fun)(ffi_cif*,void*,void**,void*),
                      void *user_data,
                      void *codeloc)
{
  if (cif->abi != FFI_SYSV)
    return FFI_BAD_ABI;

  FFI_INIT_TRAMPOLINE (&closure->tramp[0], &ffi_closure_SYSV, codeloc,
		       cif->aarch64_flags);

  closure->cif  = cif;
  closure->user_data = user_data;
  closure->fun  = fun;

  return FFI_OK;
}

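/* Example use via the public closure API (a sketch, outside this
   file; `add_bias' is a hypothetical handler):

     #include <stdio.h>
     #include <ffi.h>

     static void
     add_bias (ffi_cif *cif, void *ret, void **args, void *user_data)
     {
       *(ffi_arg *) ret = *(int *) args[0] + *(int *) user_data;
     }

     int main (void)
     {
       ffi_cif cif;
       ffi_type *args[1] = { &ffi_type_sint };
       void *code;
       ffi_closure *closure
	 = ffi_closure_alloc (sizeof (ffi_closure), &code);
       int bias = 5;

       if (closure
	   && ffi_prep_cif (&cif, FFI_DEFAULT_ABI, 1,
			    &ffi_type_sint, args) == FFI_OK
	   && ffi_prep_closure_loc (closure, &cif, add_bias,
				    &bias, code) == FFI_OK)
	 {
	   int (*fn) (int) = (int (*)(int)) code;
	   printf ("%d\n", fn (37));   // prints 42
	 }
       ffi_closure_free (closure);
       return 0;
     }
*/
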
/* Primary handler to set up and invoke a function within a closure.

   A closure when invoked enters via the assembler wrapper
   ffi_closure_SYSV().  The wrapper allocates a call context on the
   stack, saves the interesting registers (from the perspective of
   the calling convention) into the context then passes control to
   ffi_closure_SYSV_inner() passing the saved context and a pointer to
   the stack at the point ffi_closure_SYSV() was invoked.

   On the return path the assembler wrapper will reload call context
   registers.

   ffi_closure_SYSV_inner() marshals the call context into ffi value
   descriptors, invokes the wrapped function, then marshals the return
   value back into the call context.  */

void FFI_HIDDEN
ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
			void *stack)
{
  ffi_cif *cif = closure->cif;
  void **avalue = (void**) alloca (cif->nargs * sizeof (void*));
  void *rvalue = NULL;
  unsigned i;
  struct arg_state state;

  arg_init (&state, ALIGN(cif->bytes, 16));

  for (i = 0; i < cif->nargs; i++)
    {
      ffi_type *ty = cif->arg_types[i];

      switch (ty->type)
	{
	case FFI_TYPE_VOID:
	  FFI_ASSERT (0);
	  break;

	case FFI_TYPE_UINT8:
	case FFI_TYPE_SINT8:
	case FFI_TYPE_UINT16:
	case FFI_TYPE_SINT16:
	case FFI_TYPE_UINT32:
	case FFI_TYPE_SINT32:
	case FFI_TYPE_INT:
	case FFI_TYPE_POINTER:
	case FFI_TYPE_UINT64:
	case FFI_TYPE_SINT64:
	case FFI_TYPE_FLOAT:
	case FFI_TYPE_DOUBLE:
#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
	case FFI_TYPE_LONGDOUBLE:
#endif
	  avalue[i] = allocate_to_register_or_stack (context, stack,
						     &state, ty->type);
	  break;

	case FFI_TYPE_STRUCT:
	  if (is_hfa (ty))
	    {
	      unsigned n = element_count (ty);
	      if (available_v (&state) < n)
		{
		  state.nsrn = N_V_ARG_REG;
		  avalue[i] = allocate_to_stack (&state, stack, ty->alignment,
						 ty->size);
		}
	      else
		{
		  switch (get_homogeneous_type (ty))
		    {
		    case FFI_TYPE_FLOAT:
		      {
			/* Eeek! We need a pointer to the structure,
			   however the homogeneous float elements are
			   being passed in individual S registers,
			   therefore the structure is not represented as
			   a contiguous sequence of bytes in our saved
			   register context.  We need to fake up a copy
			   of the structure laid out in memory
			   correctly.  The fake can be tossed once the
			   closure function has returned hence alloca()
			   is sufficient.  */
			unsigned j;
			UINT32 *p = avalue[i] = alloca (ty->size);
			for (j = 0; j < n; j++)
			  memcpy (&p[j],
				  allocate_to_s (context, &state),
				  sizeof (*p));
			break;
		      }

		    case FFI_TYPE_DOUBLE:
		      {
			/* As for floats above: the homogeneous double
			   elements are being passed in individual D
			   registers, so the structure is not contiguous
			   in our saved register context and a copy must
			   be faked up with alloca().  */
			unsigned j;
			UINT64 *p = avalue[i] = alloca (ty->size);
			for (j = 0; j < n; j++)
			  memcpy (&p[j],
				  allocate_to_d (context, &state),
				  sizeof (*p));
			break;
		      }

#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
		    case FFI_TYPE_LONGDOUBLE:
		      /* Each long double fills an entire V register, and
			 consecutive V registers are contiguous in the
			 saved context, so the structure can be used in
			 place.  */
		      avalue[i] = get_v_addr (context, state.nsrn);
		      state.nsrn += n;
		      break;
#endif

		    default:
		      FFI_ASSERT (0);
		      break;
		    }
		}
	    }
	  else if (ty->size > 16)
	    {
	      /* Replace a composite type of size greater than 16 bytes
		 with a pointer.  */
	      memcpy (&avalue[i],
		      allocate_to_register_or_stack (context, stack,
						     &state, FFI_TYPE_POINTER),
		      sizeof (avalue[i]));
	    }
	  else if (available_x (&state) >= (ty->size + 7) / 8)
	    {
	      avalue[i] = get_x_addr (context, state.ngrn);
	      state.ngrn += (ty->size + 7) / 8;
	    }
	  else
	    {
	      state.ngrn = N_X_ARG_REG;

	      avalue[i] = allocate_to_stack (&state, stack, ty->alignment,
					     ty->size);
	    }
	  break;

	default:
	  FFI_ASSERT (0);
	  break;
	}
    }

  /* Figure out where the return value will be passed, either in
     registers or in a memory block allocated by the caller and passed
     in x8.  */

  if (is_register_candidate (cif->rtype))
    {
      /* Register candidates are *always* returned in registers.  */

      /* Allocate a scratchpad for the return value; we will let the
         callee scribble the result into the scratchpad then move the
         contents into the appropriate return value location for the
         calling convention.  */
      rvalue = alloca (cif->rtype->size);
      (closure->fun) (cif, rvalue, avalue, closure->user_data);

      /* Copy the return value into the call context so that it is returned
         as expected to our caller.  */
      switch (cif->rtype->type)
        {
        case FFI_TYPE_VOID:
          break;

        case FFI_TYPE_UINT8:
        case FFI_TYPE_UINT16:
        case FFI_TYPE_UINT32:
        case FFI_TYPE_POINTER:
        case FFI_TYPE_UINT64:
        case FFI_TYPE_SINT8:
        case FFI_TYPE_SINT16:
        case FFI_TYPE_INT:
        case FFI_TYPE_SINT32:
        case FFI_TYPE_SINT64:
        case FFI_TYPE_FLOAT:
        case FFI_TYPE_DOUBLE:
#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
        case FFI_TYPE_LONGDOUBLE:
#endif
	  {
	    void *addr = get_basic_type_addr (cif->rtype->type, context, 0);
	    copy_basic_type (addr, rvalue, cif->rtype->type);
	    break;
	  }
        case FFI_TYPE_STRUCT:
          if (is_hfa (cif->rtype))
	    {
	      unsigned j;
	      unsigned short type = get_homogeneous_type (cif->rtype);
	      unsigned elems = element_count (cif->rtype);
	      for (j = 0; j < elems; j++)
		{
		  void *reg = get_basic_type_addr (type, context, j);
		  copy_basic_type (reg, rvalue, type);
		  rvalue += get_basic_type_size (type);
		}
	    }
          else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG)
            {
              size_t size = ALIGN (cif->rtype->size, sizeof (UINT64));
              memcpy (get_x_addr (context, 0), rvalue, size);
            }
          else
            {
              FFI_ASSERT (0);
            }
          break;

        default:
          FFI_ASSERT (0);
          break;
        }
    }
  else
    {
      memcpy (&rvalue, get_x_addr (context, 8), sizeof (UINT64));
      (closure->fun) (cif, rvalue, avalue, closure->user_data);
    }
}