/* -----------------------------------------------------------------------
   unix64.S - Copyright (c) 2002  Bo Thorsen <bo (at) suse.de>
	      Copyright (c) 2008  Red Hat, Inc

   x86-64 Foreign Function Interface

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   ``Software''), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
   DEALINGS IN THE SOFTWARE.
   ----------------------------------------------------------------------- */

#ifdef __x86_64__
#define LIBFFI_ASM
#include <fficonfig.h>
#include <ffi.h>

	.text

/* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
		    void *raddr, void (*fnaddr)(void));

   A bit of trickiness here -- ARGS+BYTES is the base of the stack frame
   for this function.  This has been allocated by ffi_call.  We also
   deallocate some of the stack that has been alloca'd.  */

	.align	2
	.globl	ffi_call_unix64
	.type	ffi_call_unix64,@function

ffi_call_unix64:
.LUW0:
	movq	(%rsp), %r10		/* Load return address.  */
	leaq	(%rdi, %rsi), %rax	/* Find local stack base.  */
	movq	%rdx, (%rax)		/* Save flags.  */
	movq	%rcx, 8(%rax)		/* Save raddr.  */
	movq	%rbp, 16(%rax)		/* Save old frame pointer.  */
	movq	%r10, 24(%rax)		/* Relocate return address.  */
	movq	%rax, %rbp		/* Finalize local stack frame.  */
.LUW1:
	movq	%rdi, %r10		/* Save a copy of the register area.  */
	movq	%r8, %r11		/* Save a copy of the target fn.  */
	movl	%r9d, %eax		/* Set number of SSE registers.  */

	/* Load up all argument registers.  */
	movq	(%r10), %rdi
	movq	8(%r10), %rsi
	movq	16(%r10), %rdx
	movq	24(%r10), %rcx
	movq	32(%r10), %r8
	movq	40(%r10), %r9
	testl	%eax, %eax
	jnz	.Lload_sse
.Lret_from_load_sse:

	/* Deallocate the reg arg area.  */
	leaq	176(%r10), %rsp

	/* Call the user function.  */
	call	*%r11

	/* Deallocate stack arg area; local stack frame in redzone.  */
	leaq	24(%rbp), %rsp

	movq	0(%rbp), %rcx		/* Reload flags.  */
	movq	8(%rbp), %rdi		/* Reload raddr.  */
	movq	16(%rbp), %rbp		/* Reload old frame pointer.  */
.LUW2:

	/* The first byte of the flags contains the FFI_TYPE.  */
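	/* A sketch of how the flags word is consumed from here on,
	   pieced together from this file's own uses (the authoritative
	   encoding lives in ffi_prep_cif_machdep):
	       bits 0-7    FFI_TYPE_* code of the return value, used
			   below as the index into .Lstore_table
	       bits 8-10   placement of a two-word structure return
			   (see .Lst_struct)
	       bits 12-31  true byte size of a returned structure  */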
	movzbl	%cl, %r10d
	leaq	.Lstore_table(%rip), %r11
	movslq	(%r11, %r10, 4), %r10
	addq	%r11, %r10
	jmp	*%r10

	.section .rodata
.Lstore_table:
	.long	.Lst_void-.Lstore_table		/* FFI_TYPE_VOID */
	.long	.Lst_sint32-.Lstore_table	/* FFI_TYPE_INT */
	.long	.Lst_float-.Lstore_table	/* FFI_TYPE_FLOAT */
	.long	.Lst_double-.Lstore_table	/* FFI_TYPE_DOUBLE */
	.long	.Lst_ldouble-.Lstore_table	/* FFI_TYPE_LONGDOUBLE */
	.long	.Lst_uint8-.Lstore_table	/* FFI_TYPE_UINT8 */
	.long	.Lst_sint8-.Lstore_table	/* FFI_TYPE_SINT8 */
	.long	.Lst_uint16-.Lstore_table	/* FFI_TYPE_UINT16 */
	.long	.Lst_sint16-.Lstore_table	/* FFI_TYPE_SINT16 */
	.long	.Lst_uint32-.Lstore_table	/* FFI_TYPE_UINT32 */
	.long	.Lst_sint32-.Lstore_table	/* FFI_TYPE_SINT32 */
	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_UINT64 */
	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_SINT64 */
	.long	.Lst_struct-.Lstore_table	/* FFI_TYPE_STRUCT */
	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_POINTER */

	.text
	.align	2
.Lst_void:
	ret
	.align	2

.Lst_uint8:
	movzbq	%al, %rax
	movq	%rax, (%rdi)
	ret
	.align	2
.Lst_sint8:
	movsbq	%al, %rax
	movq	%rax, (%rdi)
	ret
	.align	2
.Lst_uint16:
	movzwq	%ax, %rax
	movq	%rax, (%rdi)
	ret
	.align	2
.Lst_sint16:
	movswq	%ax, %rax
	movq	%rax, (%rdi)
	ret
	.align	2
.Lst_uint32:
	movl	%eax, %eax
	movq	%rax, (%rdi)
	ret
	.align	2
.Lst_sint32:
	cltq
	movq	%rax, (%rdi)
	ret
	.align	2
.Lst_int64:
	movq	%rax, (%rdi)
	ret

	.align	2
.Lst_float:
	movss	%xmm0, (%rdi)
	ret
	.align	2
.Lst_double:
	movsd	%xmm0, (%rdi)
	ret
.Lst_ldouble:
	fstpt	(%rdi)
	ret

	.align	2
.Lst_struct:
	leaq	-20(%rsp), %rsi		/* Scratch area in redzone.  */

	/* We have to locate the values now, and since we don't want to
	   write too much data into the user's return value, we spill the
	   value to a 16 byte scratch area first.  Bits 8, 9, and 10
	   control where the values are located.  Only one of the three
	   bits will be set; see ffi_prep_cif_machdep for the pattern.  */
	movd	%xmm0, %r10
	movd	%xmm1, %r11
	testl	$0x100, %ecx
	cmovnz	%rax, %rdx
	cmovnz	%r10, %rax
	testl	$0x200, %ecx
	cmovnz	%r10, %rdx
	testl	$0x400, %ecx
	cmovnz	%r10, %rax
	cmovnz	%r11, %rdx
	movq	%rax, (%rsi)
	movq	%rdx, 8(%rsi)

	/* Bits 12-31 contain the true size of the structure.  Copy from
	   the scratch area to the true destination.  */
	shrl	$12, %ecx
	rep movsb
	ret

	/* Many times we can avoid loading any SSE registers at all.
	   It's not worth an indirect jump to load the exact set of
	   SSE registers needed; zero or all is a good compromise.  */
	.align	2
.LUW3:
.Lload_sse:
	movdqa	48(%r10), %xmm0
	movdqa	64(%r10), %xmm1
	movdqa	80(%r10), %xmm2
	movdqa	96(%r10), %xmm3
	movdqa	112(%r10), %xmm4
	movdqa	128(%r10), %xmm5
	movdqa	144(%r10), %xmm6
	movdqa	160(%r10), %xmm7
	jmp	.Lret_from_load_sse

.LUW4:
	.size	ffi_call_unix64,.-ffi_call_unix64

	.align	2
	.globl	ffi_closure_unix64
	.type	ffi_closure_unix64,@function

ffi_closure_unix64:
.LUW5:
	/* The carry flag is set by the trampoline iff SSE registers
	   are used.  Don't clobber it before the branch instruction.  */
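	/* A sketch of the 200-byte frame built below, inferred from the
	   stores here and from the pointers later handed to
	   ffi_closure_unix64_inner (a reading aid, not a separate spec):
	       0-47     the six integer argument registers
	       48-175   the eight 16-byte SSE argument registers
	       176-199  return-value scratch, read back at -24(%rsp)
			once the frame has been deallocated
	   208(%rsp), i.e. frame size plus the return address, is the
	   first stack-passed argument.  */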
	leaq	-200(%rsp), %rsp
.LUW6:
	movq	%rdi, (%rsp)
	movq	%rsi, 8(%rsp)
	movq	%rdx, 16(%rsp)
	movq	%rcx, 24(%rsp)
	movq	%r8, 32(%rsp)
	movq	%r9, 40(%rsp)
	jc	.Lsave_sse
.Lret_from_save_sse:

	movq	%r10, %rdi
	leaq	176(%rsp), %rsi
	movq	%rsp, %rdx
	leaq	208(%rsp), %rcx
	call	ffi_closure_unix64_inner@PLT

	/* Deallocate stack frame early; return value is now in redzone.  */
	addq	$200, %rsp
.LUW7:

	/* The first byte of the return value contains the FFI_TYPE.  */
	movzbl	%al, %r10d
	leaq	.Lload_table(%rip), %r11
	movslq	(%r11, %r10, 4), %r10
	addq	%r11, %r10
	jmp	*%r10

	.section .rodata
.Lload_table:
	.long	.Lld_void-.Lload_table		/* FFI_TYPE_VOID */
	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_INT */
	.long	.Lld_float-.Lload_table		/* FFI_TYPE_FLOAT */
	.long	.Lld_double-.Lload_table	/* FFI_TYPE_DOUBLE */
	.long	.Lld_ldouble-.Lload_table	/* FFI_TYPE_LONGDOUBLE */
	.long	.Lld_int8-.Lload_table		/* FFI_TYPE_UINT8 */
	.long	.Lld_int8-.Lload_table		/* FFI_TYPE_SINT8 */
	.long	.Lld_int16-.Lload_table		/* FFI_TYPE_UINT16 */
	.long	.Lld_int16-.Lload_table		/* FFI_TYPE_SINT16 */
	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_UINT32 */
	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_SINT32 */
	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_UINT64 */
	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_SINT64 */
	.long	.Lld_struct-.Lload_table	/* FFI_TYPE_STRUCT */
	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_POINTER */

	.text
	.align	2
.Lld_void:
	ret

	.align	2
.Lld_int8:
	movzbl	-24(%rsp), %eax
	ret
	.align	2
.Lld_int16:
	movzwl	-24(%rsp), %eax
	ret
	.align	2
.Lld_int32:
	movl	-24(%rsp), %eax
	ret
	.align	2
.Lld_int64:
	movq	-24(%rsp), %rax
	ret

	.align	2
.Lld_float:
	movss	-24(%rsp), %xmm0
	ret
	.align	2
.Lld_double:
	movsd	-24(%rsp), %xmm0
	ret
	.align	2
.Lld_ldouble:
	fldt	-24(%rsp)
	ret

	.align	2
.Lld_struct:
	/* There are four possibilities here, %rax/%rdx, %xmm0/%rax,
	   %rax/%xmm0, %xmm0/%xmm1.  We collapse two by always loading
	   both rdx and xmm1 with the second word.  For the remaining,
	   bit 8 set means xmm0 gets the second word, and bit 9 means
	   that rax gets the second word.  */
	movq	-24(%rsp), %rcx
	movq	-16(%rsp), %rdx
	movq	-16(%rsp), %xmm1
	testl	$0x100, %eax
	cmovnz	%rdx, %rcx
	movd	%rcx, %xmm0
	testl	$0x200, %eax
	movq	-24(%rsp), %rax
	cmovnz	%rdx, %rax
	ret

	/* See the comment above .Lload_sse; the same logic applies here.  */
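	/* The offsets used below mirror the 48(%r10)..160(%r10) range
	   loaded by .Lload_sse above, so the call and closure paths share
	   one register-save layout; presumably this is the same structure
	   that ffi_closure_unix64_inner decodes on the C side.  */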
	.align	2
.LUW8:
.Lsave_sse:
	movdqa	%xmm0, 48(%rsp)
	movdqa	%xmm1, 64(%rsp)
	movdqa	%xmm2, 80(%rsp)
	movdqa	%xmm3, 96(%rsp)
	movdqa	%xmm4, 112(%rsp)
	movdqa	%xmm5, 128(%rsp)
	movdqa	%xmm6, 144(%rsp)
	movdqa	%xmm7, 160(%rsp)
	jmp	.Lret_from_save_sse

.LUW9:
	.size	ffi_closure_unix64,.-ffi_closure_unix64

	.section	.eh_frame,"a",@progbits
.Lframe1:
	.long	.LECIE1-.LSCIE1		/* CIE Length */
.LSCIE1:
	.long	0			/* CIE Identifier Tag */
	.byte	1			/* CIE Version */
	.ascii "zR\0"			/* CIE Augmentation */
	.uleb128 1			/* CIE Code Alignment Factor */
	.sleb128 -8			/* CIE Data Alignment Factor */
	.byte	0x10			/* CIE RA Column */
	.uleb128 1			/* Augmentation size */
	.byte	0x1b			/* FDE Encoding (pcrel sdata4) */
	.byte	0xc			/* DW_CFA_def_cfa, %rsp offset 8 */
	.uleb128 7
	.uleb128 8
	.byte	0x80+16			/* DW_CFA_offset, %rip offset 1*-8 */
	.uleb128 1
	.align	8
.LECIE1:
.LSFDE1:
	.long	.LEFDE1-.LASFDE1	/* FDE Length */
.LASFDE1:
	.long	.LASFDE1-.Lframe1	/* FDE CIE offset */
	.long	.LUW0-.			/* FDE initial location */
	.long	.LUW4-.LUW0		/* FDE address range */
	.uleb128 0x0			/* Augmentation size */

	.byte	0x4			/* DW_CFA_advance_loc4 */
	.long	.LUW1-.LUW0

	/* New stack frame based off rbp.  This is a bit of unwind
	   trickery in that the CFA *has* changed.  There is no easy way
	   to describe it correctly on entry to the function.  Fortunately,
	   it doesn't matter too much since at all points we can correctly
	   unwind back to ffi_call.  Note that the location to which we
	   moved the return address is (the new) CFA-8, so from the
	   perspective of the unwind info, it hasn't moved.  */
	.byte	0xc			/* DW_CFA_def_cfa, %rbp offset 32 */
	.uleb128 6
	.uleb128 32
	.byte	0x80+6			/* DW_CFA_offset, %rbp offset 2*-8 */
	.uleb128 2
	.byte	0xa			/* DW_CFA_remember_state */

	.byte	0x4			/* DW_CFA_advance_loc4 */
	.long	.LUW2-.LUW1
	.byte	0xc			/* DW_CFA_def_cfa, %rsp offset 8 */
	.uleb128 7
	.uleb128 8
	.byte	0xc0+6			/* DW_CFA_restore, %rbp */

	.byte	0x4			/* DW_CFA_advance_loc4 */
	.long	.LUW3-.LUW2
	.byte	0xb			/* DW_CFA_restore_state */

	.align	8
.LEFDE1:
.LSFDE3:
	.long	.LEFDE3-.LASFDE3	/* FDE Length */
.LASFDE3:
	.long	.LASFDE3-.Lframe1	/* FDE CIE offset */
	.long	.LUW5-.			/* FDE initial location */
	.long	.LUW9-.LUW5		/* FDE address range */
	.uleb128 0x0			/* Augmentation size */

	.byte	0x4			/* DW_CFA_advance_loc4 */
	.long	.LUW6-.LUW5
	.byte	0xe			/* DW_CFA_def_cfa_offset */
	.uleb128 208
	.byte	0xa			/* DW_CFA_remember_state */

	.byte	0x4			/* DW_CFA_advance_loc4 */
	.long	.LUW7-.LUW6
	.byte	0xe			/* DW_CFA_def_cfa_offset */
	.uleb128 8

	.byte	0x4			/* DW_CFA_advance_loc4 */
	.long	.LUW8-.LUW7
	.byte	0xb			/* DW_CFA_restore_state */

	.align	8
.LEFDE3:

#endif /* __x86_64__ */

#if defined __ELF__ && defined __linux__
	.section	.note.GNU-stack,"",@progbits
#endif