Home | History | Annotate | Download | only in x86_64
      1 /*
      2 Copyright (c) 2014, Intel Corporation
      3 All rights reserved.
      4 
      5 Redistribution and use in source and binary forms, with or without
      6 modification, are permitted provided that the following conditions are met:
      7 
      8     * Redistributions of source code must retain the above copyright notice,
      9     * this list of conditions and the following disclaimer.
     10 
     11     * Redistributions in binary form must reproduce the above copyright notice,
     12     * this list of conditions and the following disclaimer in the documentation
     13     * and/or other materials provided with the distribution.
     14 
     15     * Neither the name of Intel Corporation nor the names of its contributors
     16     * may be used to endorse or promote products derived from this software
     17     * without specific prior written permission.
     18 
     19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     23 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
     26 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 */
     30 
     31 /******************************************************************************/
     32 //                     ALGORITHM DESCRIPTION
     33 //                     ---------------------
     34 //
     35 // X87 version (the one the hypot code below follows):
     36 // Use 80-bit FPU precision fmul, fsqrt to compute square and sqrt.
     37 //
     38 // SSE version (described for reference; the code below does not use it):
     39 // Swap x, y if |x|<|y|
     40 // Write x = 2^k*x' and scale y by the same factor: y' = y*2^(-k)
     41 // Get S ~ sqrt(x^2+y^2)  (leading 1 + leading 25 mantissa bits)
     42 //
     43 // Get D = ( RN(x^2+y^2) - S^2 ) + ( x^2 - RN(x^2) )
     44 //                               + ( y^2 - (RN(x^2+y^2)-RN(x^2)) )
     45 //
     46 // Result is 2^k*(S + Se),  where Se = S*e
     47 //        S*e is approximated as (D/2S)*( 1 - (D/2S)^2*1.0/S )
     48 //
     49 // Return 2^k*(S+Se)
     50 //
     51 // For |y/x|<2^(-64), return x
     52 //
     53 // For cases where maximum biased exponent is either greater than 7fdh or
     54 // below 32, take a special path to check for special cases (0, NaN, Inf),
     55 // possible overflow, and more accurate computation for denormal results
     56 //
     57 // Special cases:
     58 //  hypot(x,y), hypot(y,x), and hypot(x,-y) are equivalent
     59 //  hypot(x,+-0) is equivalent to fabs(x)
     60 //  hypot(x,y) = y if (x==NaN or x==INF) and y==INF
     61 //  hypot(x,y) = x if (x==NaN or x==INF) and y!=INF (even if y==NaN!)
     62 //  hypot(x,y) = y if (x!=NaN and x!=INF) and (y==NaN or y==INF)
     63 //
     64 /******************************************************************************/
     65 
     66 #include <private/bionic_asm.h>
     67 # -- Begin  hypot
        //
        // double hypot(double x, double y)
        //
        // In:   %xmm0 = x, %xmm1 = y (low 64-bit lanes).
        // Out:  %xmm0 = result, loaded from the double written at 16(%rsp).
        // Uses: %eax/%rax, %ecx, %edx, %xmm0-%xmm3, the x87 register stack and
        //       64 bytes of stack scratch (%rsp is restored before ret).
        //
        // Scratch layout relative to %rsp after the subq:
        //    0(%rsp)  candidate result rounded to double
        //   16(%rsp)  80-bit copy of the candidate result, later the final double
        //   32(%rsp)  |x|            40(%rsp)  |y|
        //   48(%rsp)  x as passed    56(%rsp)  y as passed
        //
        // Path selection uses the top 16 bits of |x| and |y| (sign bit already
        // cleared, then the 11-bit biased exponent and the 4 leading mantissa
        // bits), called "top word" in the comments below:
        //   both top words <= 0x5FD0        -> sqrt(x*x + y*y) on the x87 stack
        //   either top word >= 0x7FF0       -> Inf/NaN handling (.L_2TAG_PACKET_3.0.1)
        //   top words more than 928 apart   -> |x| + |y|   (928 = 58 exponent steps)
        //   otherwise                       -> sqrt(x*x + y*y); when the two top words
        //                                      sum to >= 0xBFA0 the result also passes
        //                                      through .L_2TAG_PACKET_5.0.1
        //
        // ENTRY/END come from private/bionic_asm.h, which is not shown here.
        //
     68 ENTRY(hypot)
     69 # parameter 1: %xmm0
     70 # parameter 2: %xmm1
     71 ..B1.1:
     72 ..___tag_value_hypot.1:                         // start label used by the hand-encoded .eh_frame record
     73 ..___tag_value_hypot.3:                         // also used by the .eh_frame record (no code between .1 and .3)
     74 ..B1.2:
     75         subq      $64, %rsp                     // reserve 64 bytes of scratch
     76         movapd    static_const_table(%rip), %xmm3   // both lanes = 0x7FFFFFFFFFFFFFFF (all bits except the sign bit)
     77         movsd     %xmm0, 48(%rsp)               // keep x exactly as passed
     78         movsd     %xmm1, 56(%rsp)               // keep y exactly as passed
     79         andpd     %xmm3, %xmm0                  // %xmm0 = |x|
     80         andpd     %xmm3, %xmm1                  // %xmm1 = |y|
     81         pextrw    $3, %xmm0, %eax               // %eax = bits 63:48 of |x|
     82         pextrw    $3, %xmm1, %edx               // %edx = bits 63:48 of |y|
     83         cmpl      $24528, %eax                  // 24528 = 0x5FD0
     84         ja        .L_2TAG_PACKET_0.0.1
     85         cmpl      $24528, %edx
     86         ja        .L_2TAG_PACKET_0.0.1
        // Plain path: sqrt(x*x + y*y) from the saved x and y.
     87 .L_2TAG_PACKET_1.0.1:
     88         fldl      48(%rsp)                      // st0 = x
     89         fldl      56(%rsp)                      // st0 = y, st1 = x
     90         fxch      %st(1)                        // st0 = x, st1 = y
     91         fmul      %st(0), %st                   // st0 = x*x
     92         fxch      %st(1)                        // st0 = y, st1 = x*x
     93         nop
     94         fmul      %st(0), %st                   // st0 = y*y
     95         faddp     %st, %st(1)                   // st0 = x*x + y*y (one entry left on the stack)
     96         fsqrt                                   // st0 = sqrt(x*x + y*y)
     97         jmp       .L_2TAG_PACKET_2.0.1
        // At least one top word is above 0x5FD0.
     98 .L_2TAG_PACKET_0.0.1:
     99         cmpl      $32752, %eax                  // 32752 = 0x7FF0: exponent field all ones
    100         movl      %eax, %ecx                    // mov does not disturb the flags set by the cmpl
    101         jae       .L_2TAG_PACKET_3.0.1          // x is Inf or NaN
    102         subl      %edx, %ecx                    // %ecx = top(|x|) - top(|y|)
    103         cmpl      $32752, %edx
    104         jae       .L_2TAG_PACKET_3.0.1          // y is Inf or NaN
    105         addl      $928, %ecx                    // shift the difference so one unsigned compare tests |difference| > 928
    106         addl      %edx, %eax                    // %eax = top(|x|) + top(|y|)
    107         cmpl      $1856, %ecx                   // 1856 = 2 * 928
    108         ja        .L_2TAG_PACKET_4.0.1          // top words more than 928 apart
    109         cmpl      $49056, %eax                  // 49056 = 0xBFA0 = 2 * 0x5FD0
    110         jb        .L_2TAG_PACKET_1.0.1          // sum below the bound: use the plain path
                // Same sum-of-squares sequence as .L_2TAG_PACKET_1.0.1, but it
                // falls into the exponent check below instead of jumping to the exit.
    111         fldl      48(%rsp)
    112         fldl      56(%rsp)
    113         fxch      %st(1)
    114         fmul      %st(0), %st
    115         fxch      %st(1)
    116         nop
    117         fmul      %st(0), %st
    118         faddp     %st, %st(1)
    119         fsqrt
        // Candidate result is in st(0).
    120 .L_2TAG_PACKET_5.0.1:
    121         fstl      (%rsp)                        // 0(%rsp) = st0 rounded to double (no pop)
    122         fstpt     16(%rsp)                      // 16(%rsp) = st0 in 80-bit format; pop
    123         xorl      %eax, %eax
    124         movw      24(%rsp), %ax                 // sign and 15-bit exponent word of the 80-bit value
    125         cmpl      $17407, %eax                  // 17407 = 0x43FF = 0x3FFF + 1024
    126         jae       .L_2TAG_PACKET_6.0.1          // exponent too large for a finite double
    127         fldl      (%rsp)                        // reload the rounded double
    128         jmp       .L_2TAG_PACKET_7.0.1
        // Top words far apart: candidate result is |x| + |y|.
    129 .L_2TAG_PACKET_4.0.1:
    130         movsd     %xmm0, 32(%rsp)               // 32(%rsp) = |x|
    131         movsd     %xmm1, 40(%rsp)               // 40(%rsp) = |y|
    132         fldl      32(%rsp)
    133         faddl     40(%rsp)                      // st0 = |x| + |y|
    134         jmp       .L_2TAG_PACKET_5.0.1
        // NOTE(review): this arm is instruction-for-instruction the same as the
        // fall-through after the jae above, so the exponent check does not
        // change what is returned.
    135 .L_2TAG_PACKET_6.0.1:
    136         fldl      (%rsp)
    137         jmp       .L_2TAG_PACKET_7.0.1
        // Inf/NaN handling: reached only when a top word is >= 0x7FF0.
    138 .L_2TAG_PACKET_3.0.1:
    139         shufpd    $0, %xmm1, %xmm0              // %xmm0 = { |x| (low lane), |y| (high lane) }
    140         movdqa    %xmm0, %xmm2
    141         movdqa    16+static_const_table(%rip), %xmm3   // both lanes = 0x7FF0000000000000 (+Inf bit pattern)
    142         movsd     %xmm0, 32(%rsp)               // 32(%rsp) = |x|
    143         movsd     %xmm1, 40(%rsp)               // 40(%rsp) = |y|
    144         cmppd     $3, %xmm0, %xmm2              // predicate 3 (unordered): lane mask set where the lane is NaN
    145         cmppd     $0, %xmm0, %xmm3              // predicate 0 (equal): lane mask set where the lane equals +Inf
    146         movmskpd  %xmm2, %edx                   // bit 0: x is NaN, bit 1: y is NaN
    147         movmskpd  %xmm3, %rax                   // bit 0: |x| is Inf, bit 1: |y| is Inf
    148         testl     %edx, %edx
    149         je        .L_2TAG_PACKET_8.0.1          // neither operand is NaN
    150         fldl      32(%rsp)
    151         fmull     40(%rsp)                      // st0 = |x| * |y| (at least one operand is NaN on this path)
    152         testq     $1, %rax
    153         jne       .L_2TAG_PACKET_9.0.1          // x is infinite: return |x|
    154         testq     $2, %rax
    155         jne       .L_2TAG_PACKET_10.0.1         // y is infinite: return |y|
    156         jmp       .L_2TAG_PACKET_2.0.1          // no infinity: return the product
        // No NaN, so the operand that brought us here is an infinity: return |x| + |y|.
    157 .L_2TAG_PACKET_8.0.1:
    158         fldl      32(%rsp)
    159         faddl     40(%rsp)
    160         jmp       .L_2TAG_PACKET_2.0.1
        // x infinite, y NaN: drop the product and return |x|.
    161 .L_2TAG_PACKET_9.0.1:
    162         fstpl     40(%rsp)                      // pop the product (the store only empties st0; the slot is not read again)
    163         fldl      32(%rsp)                      // st0 = |x|
    164         jmp       .L_2TAG_PACKET_7.0.1
        // y infinite, x NaN: drop the product and return |y|.
    165 .L_2TAG_PACKET_10.0.1:
    166         fstpl     32(%rsp)                      // pop the product (the slot is not read again)
    167         fldl      40(%rsp)                      // st0 = |y|
    168         jmp       .L_2TAG_PACKET_7.0.1
        // Common exit: st(0) holds the value to return.
    169 .L_2TAG_PACKET_2.0.1:
    170 .L_2TAG_PACKET_7.0.1:
    171         fstpl     16(%rsp)                      // write st0 as a double and pop, leaving the x87 stack as on entry
    172         movq      16(%rsp), %xmm0               // return value
    173         addq      $64, %rsp                     // release the scratch area
    174         ret
    175 ..B1.3:
    176 ..___tag_value_hypot.4:                         // end label used by the hand-encoded .eh_frame record
    177 END(hypot)
    178 # -- End  hypot
    179 	.section .rodata, "a"
    180 	.align 16
    181 	.align 16                       // repeats the directive above; no further effect
        // 32-byte constant table read by hypot with two 16-byte loads
        // (movapd at offset 0, movdqa at offset 16).
        //   bytes  0..15: two copies of 0x7FFFFFFFFFFFFFFF - every bit except bit 63,
        //                 used with andpd to clear the sign of a double
        //   bytes 16..31: two copies of 0x7FF0000000000000 - the bit pattern of
        //                 +Infinity as a double, used with cmppd to detect infinities
        // Each 64-bit value is written as two little-endian .long halves, low half first.
    182 static_const_table:
    183 	.long	4294967295              // 0xFFFFFFFF  mask, lane 0, low half
    184 	.long	2147483647              // 0x7FFFFFFF  mask, lane 0, high half
    185 	.long	4294967295              // 0xFFFFFFFF  mask, lane 1, low half
    186 	.long	2147483647              // 0x7FFFFFFF  mask, lane 1, high half
    187 	.long	0                       // 0x00000000  +Inf, lane 0, low half
    188 	.long	2146435072              // 0x7FF00000  +Inf, lane 0, high half
    189 	.long	0                       // 0x00000000  +Inf, lane 1, low half
    190 	.long	2146435072              // 0x7FF00000  +Inf, lane 1, high half
    191 	.type	static_const_table,@object
    192 	.size	static_const_table,32   // 8 .long entries * 4 bytes
    193 	.data                           // nothing is emitted here before the next .section
    194 	.section .note.GNU-stack, ""    // empty section; conventionally marks the object as not needing an executable stack
    195 // -- Begin DWARF2 SEGMENT .eh_frame
        // Hand-encoded unwind data for hypot: one CIE followed by one FDE.
        // Multi-byte directives are emitted little-endian, so the byte order in
        // memory is the reverse of how each constant reads below.
    196 	.section .eh_frame,"a",@progbits
    197 .eh_frame_seg:
    198 	.align 1
        // ---- CIE ----
    199 	.4byte 0x00000014               // length of the CIE that follows: 20 bytes
    200 	.8byte 0x00527a0100000000       // bytes 00 00 00 00 | 01 | 7a 52 00: CIE id 0, version 1, augmentation "zR"
    201 	.8byte 0x08070c1b01107801       // bytes 01 | 78 | 10 | 01 1b | 0c 07 08: code align 1, data align -8, return-address column 16, 1 augmentation byte 0x1b (pc-relative 4-byte pointers), DW_CFA_def_cfa reg 7 offset 8
    202 	.4byte 0x00000190               // bytes 90 01 | 00 00: DW_CFA_offset reg 16 at CFA-8, then two DW_CFA_nop
        // ---- FDE ----
    203 	.4byte 0x00000014               // length of the FDE that follows: 20 bytes
    204 	.4byte 0x0000001c               // distance back from this field to the start of the CIE (28 bytes)
    205 	.4byte ..___tag_value_hypot.1-. // start of the covered code, relative to this field
    206 	.4byte ..___tag_value_hypot.4-..___tag_value_hypot.1   // size of the covered code
    207 	.2byte 0x0400                   // bytes 00 | 04: no FDE augmentation data, then DW_CFA_advance_loc4 ...
    208 	.4byte ..___tag_value_hypot.3-..___tag_value_hypot.1   // ... by this delta (the two labels are adjacent in hypot, so 0)
    209 	.2byte 0x100e                   // bytes 0e 10: DW_CFA_def_cfa_offset 16
        // NOTE(review): hypot executes "subq $64, %rsp" right after
        // ..___tag_value_hypot.3, which would put the CFA at %rsp+72 inside the
        // function body rather than the %rsp+16 recorded here, and nothing
        // describes the state after the closing "addq $64, %rsp". Confirm
        // against what ENTRY/END in private/bionic_asm.h emit before relying on
        // this record for unwinding.
    210 # End
    211