Home | History | Annotate | Download | only in include
      1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
      2    Copyright (C) 1991-2014 Free Software Foundation, Inc.
      3 
      4    This file is part of the GNU C Library.
      5 
      6    The GNU C Library is free software; you can redistribute it and/or
      7    modify it under the terms of the GNU Lesser General Public
      8    License as published by the Free Software Foundation; either
      9    version 2.1 of the License, or (at your option) any later version.
     10 
     11    In addition to the permissions in the GNU Lesser General Public
     12    License, the Free Software Foundation gives you unlimited
     13    permission to link the compiled version of this file into
     14    combinations with other programs, and to distribute those
     15    combinations without any restriction coming from the use of this
     16    file.  (The Lesser General Public License restrictions do apply in
     17    other respects; for example, they cover modification of the file,
     18    and distribution when not linked into a combine executable.)
     19 
     20    The GNU C Library is distributed in the hope that it will be useful,
     21    but WITHOUT ANY WARRANTY; without even the implied warranty of
     22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     23    Lesser General Public License for more details.
     24 
     25    You should have received a copy of the GNU Lesser General Public
     26    License along with the GNU C Library; if not, see
     27    <http://www.gnu.org/licenses/>.  */
     28 
     29 /* You have to define the following before including this file:
     30 
     31    UWtype -- An unsigned type, default type for operations (typically a "word")
     32    UHWtype -- An unsigned type, at least half the size of UWtype.
     33    UDWtype -- An unsigned type, at least twice as large as UWtype
     34    W_TYPE_SIZE -- size in bits of UWtype
     35 
     36    UQItype -- Unsigned 8 bit type.
     37    SItype, USItype -- Signed and unsigned 32 bit types.
     38    DItype, UDItype -- Signed and unsigned 64 bit types.
     39 
     40    On a 32 bit machine UWtype should typically be USItype;
     41    on a 64 bit machine, UWtype should typically be UDItype.  */
     42 
     43 #define __BITS4 (W_TYPE_SIZE / 4)	/* A quarter of the word size, in bits.  */
        /* Half-word helpers.  __ll_B is 1 shifted left by half the word size,
           __ll_lowpart (t) masks T (converted to UWtype) down to the bits below
           __ll_B, and __ll_highpart (t) shifts T right by half the word size.
           W_TYPE_SIZE is only looked up when these macros are expanded, so the
           default supplied further down still applies to them.  */
     44 #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
     45 #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
     46 #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
     47 
     48 #ifndef W_TYPE_SIZE
        /* The includer configured nothing: fall back to a 32-bit word, using the
           32-bit USItype for both the word and the half-word type and the
           64-bit UDItype for the double word.  */
     49 #define W_TYPE_SIZE	32
     50 #define UWtype		USItype
     51 #define UHWtype		USItype
     52 #define UDWtype		UDItype
     53 #endif
     54 
     55 /* Used in glibc only.  When the includer has not provided attribute_hidden,
           define it to nothing so that the declaration below still parses.  */
     56 #ifndef attribute_hidden
     57 #define attribute_hidden
     58 #endif
     59 
        /* 256-entry byte table, declared here and defined elsewhere.  Below it is
           indexed by the alpha count_leading_zeros used when __alpha_cix__ is not
           defined.  */
     60 extern const UQItype __clz_tab[256] attribute_hidden;
     61 
     62 /* Define auxiliary asm macros.
     63 
     64    1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
     65    UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
     66    word product in HIGH_PROD and LOW_PROD.
     67 
     68    2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
     69    UDWtype product.  This is just a variant of umul_ppmm.
     70 
     71    3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
     72    denominator) divides a UDWtype, composed by the UWtype integers
     73    HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
     74    in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
     75    than DENOMINATOR for correct operation.  If, in addition, the most
     76    significant bit of DENOMINATOR must be 1, then the pre-processor symbol
     77    UDIV_NEEDS_NORMALIZATION is defined to 1.
     78 
     79    4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
     80    denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
     81    is rounded towards 0.
     82 
     83    5) count_leading_zeros(count, x) counts the number of zero-bits from the
     84    msb to the first nonzero bit in the UWtype X.  This is the number of
     85    steps X needs to be shifted left to set the msb.  Undefined for X == 0,
     86    unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
     87 
     88    6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
     89    from the least significant end.
     90 
     91    7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
     92    high_addend_2, low_addend_2) adds two two-word UWtype integers, composed
     93    by HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
     94    respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
     95    (i.e. carry out) is not stored anywhere, and is lost.
     96 
     97    8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
     98    high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
     99    composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
    100    LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
    101    and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
    102    and is lost.
    103 
    104    If any of these macros are left undefined for a particular CPU,
    105    C macros are used.  */
    106 
    107 /* The CPUs come in alphabetical order below.
    108 
    109    Please add support for more CPUs here, or improve the current support
    110    for the CPUs below!
    111    (E.g. WE32100, IBM360.)  */
    112 
    113 #if defined (__GNUC__) && !defined (NO_ASM)
    114 
    115 /* We sometimes need to clobber "cc" with gcc2, but that would not be
    116    understood by gcc1.  Use cpp to avoid major code duplication.
           __CLOBBER_CC expands to a whole clobber section (a colon followed by
           "cc") and is placed directly after the input operands of an asm.
           __AND_CLOBBER_CC expands to a comma followed by "cc"; presumably it is
           appended to an asm that already has a clobber list.  Both expand to
           nothing before GCC 2.  */
    117 #if __GNUC__ < 2
    118 #define __CLOBBER_CC
    119 #define __AND_CLOBBER_CC
    120 #else /* __GNUC__ >= 2 */
    121 #define __CLOBBER_CC : "cc"
    122 #define __AND_CLOBBER_CC , "cc"
    123 #endif /* __GNUC__ < 2 */
    124 
    125 #if defined (__aarch64__)
        /* AArch64: only the bit-counting macros are provided, and both simply
           assign the result of a GCC builtin to COUNT.  A 32-bit word uses
           __builtin_clz/__builtin_ctz, a 64-bit word uses the "ll" variants.
           COUNT_LEADING_ZEROS_0 (the X == 0 case described in the interface
           comment above) is defined as the word size.  */
    126 
    127 #if W_TYPE_SIZE == 32
    128 #define count_leading_zeros(COUNT, X)	((COUNT) = __builtin_clz (X))
    129 #define count_trailing_zeros(COUNT, X)   ((COUNT) = __builtin_ctz (X))
    130 #define COUNT_LEADING_ZEROS_0 32
    131 #endif /* W_TYPE_SIZE == 32 */
    132 
    133 #if W_TYPE_SIZE == 64
    134 #define count_leading_zeros(COUNT, X)	((COUNT) = __builtin_clzll (X))
    135 #define count_trailing_zeros(COUNT, X)   ((COUNT) = __builtin_ctzll (X))
    136 #define COUNT_LEADING_ZEROS_0 64
    137 #endif /* W_TYPE_SIZE == 64 */
    138 
    139 #endif /* __aarch64__ */
    140 
    141 #if defined (__alpha) && W_TYPE_SIZE == 64
        /* Alpha, 64-bit words.  umul_ppmm copies both operands into locals (so
           each argument is evaluated once), takes the high product word from
           __builtin_alpha_umulh and the low word from an ordinary multiply.  */
    142 #define umul_ppmm(ph, pl, m0, m1) \
    143   do {									\
    144     UDItype __m0 = (m0), __m1 = (m1);					\
    145     (ph) = __builtin_alpha_umulh (__m0, __m1);				\
    146     (pl) = __m0 * __m1;							\
    147   } while (0)
    148 #define UMUL_TIME 46
    149 #ifndef LONGLONG_STANDALONE
        /* Division is done out of line unless LONGLONG_STANDALONE is defined:
           __udiv_qrnnd returns the quotient and stores the remainder through
           its first (pointer) argument.  */
    150 #define udiv_qrnnd(q, r, n1, n0, d) \
    151   do { UDItype __r;							\
    152     (q) = __udiv_qrnnd (&__r, (n1), (n0), (d));				\
    153     (r) = __r;								\
    154   } while (0)
    155 extern UDItype __udiv_qrnnd (UDItype *, UDItype, UDItype, UDItype);
    156 #define UDIV_TIME 220
    157 #endif /* LONGLONG_STANDALONE */
    158 #ifdef __alpha_cix__
        /* With __alpha_cix__ defined the GCC "long" builtins are used directly.  */
    159 #define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clzl (X))
    160 #define count_trailing_zeros(COUNT,X)	((COUNT) = __builtin_ctzl (X))
    161 #define COUNT_LEADING_ZEROS_0 64
    162 #else
        /* Otherwise count_leading_zeros works a byte at a time.
           __builtin_alpha_cmpbge (0, x) produces an 8-bit mask (NOTE(review):
           presumably one bit per zero byte of X -- confirm against the Alpha
           builtin documentation).  __clz_tab applied to the inverted mask, minus
           one, gives a byte index __a; that byte is extracted with
           __builtin_alpha_extbl and the result is
           64 - (__clz_tab[byte] + __a * 8).  */
    163 #define count_leading_zeros(COUNT,X) \
    164   do {									\
    165     UDItype __xr = (X), __t, __a;					\
    166     __t = __builtin_alpha_cmpbge (0, __xr);				\
    167     __a = __clz_tab[__t ^ 0xff] - 1;					\
    168     __t = __builtin_alpha_extbl (__xr, __a);				\
    169     (COUNT) = 64 - (__clz_tab[__t] + __a*8);				\
    170   } while (0)
        /* count_trailing_zeros: ~__t & -~__t keeps only the lowest set bit of
           the inverted byte mask.  The three mask tests (0xCC, 0xF0, 0xAA) turn
           that single bit into its index 0..7, which selects the byte to
           extract.  __a <<= 3 scales the byte index to a bit count, and the
           same isolate-and-index sequence on the extracted byte adds the bit
           position inside that byte.  */
    171 #define count_trailing_zeros(COUNT,X) \
    172   do {									\
    173     UDItype __xr = (X), __t, __a;					\
    174     __t = __builtin_alpha_cmpbge (0, __xr);				\
    175     __t = ~__t & -~__t;							\
    176     __a = ((__t & 0xCC) != 0) * 2;					\
    177     __a += ((__t & 0xF0) != 0) * 4;					\
    178     __a += ((__t & 0xAA) != 0);						\
    179     __t = __builtin_alpha_extbl (__xr, __a);				\
    180     __a <<= 3;								\
    181     __t &= -__t;							\
    182     __a += ((__t & 0xCC) != 0) * 2;					\
    183     __a += ((__t & 0xF0) != 0) * 4;					\
    184     __a += ((__t & 0xAA) != 0);						\
    185     (COUNT) = __a;							\
    186   } while (0)
    187 #endif /* __alpha_cix__ */
    188 #endif /* __alpha */
    189 
    190 #if defined (__arc__) && W_TYPE_SIZE == 32
        /* ARC, 32-bit words.  Two-word add: the low words (%4, %5) are added
           first into SL (%1), then the high words (%2, %3) into SH (%0).  SL is
           early-clobber ("=&r") because it is written before the high-word
           inputs are read.  NOTE(review): the ".f" suffix presumably makes the
           first instruction set the carry that adc consumes -- confirm against
           the ARC instruction set reference.  */
    191 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    192   __asm__ ("add.f	%1, %4, %5\n\tadc	%0, %2, %3"		\
    193 	   : "=r" ((USItype) (sh)),					\
    194 	     "=&r" ((USItype) (sl))					\
    195 	   : "%r" ((USItype) (ah)),					\
    196 	     "rIJ" ((USItype) (bh)),					\
    197 	     "%r" ((USItype) (al)),					\
    198 	     "rIJ" ((USItype) (bl)))
        /* Two-word subtract with the same operand layout as add_ssaaaa, but
           without the commutative ("%") marker on the minuend operands.  */
    199 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    200   __asm__ ("sub.f	%1, %4, %5\n\tsbc	%0, %2, %3"		\
    201 	   : "=r" ((USItype) (sh)),					\
    202 	     "=&r" ((USItype) (sl))					\
    203 	   : "r" ((USItype) (ah)),					\
    204 	     "rIJ" ((USItype) (bh)),					\
    205 	     "r" ((USItype) (al)),					\
    206 	     "rIJ" ((USItype) (bl)))
    207 
    /* 32x32->64 unsigned multiply: both operands are converted to USItype and
       the product is formed in UDItype.  The arguments are parenthesized so
       that a compound argument such as __umulsidi3 (a + b, c) is converted and
       multiplied as a whole instead of having the casts and the '*' bind to
       only part of it.  */
    #define __umulsidi3(u,v) ((UDItype)(USItype)(u)*(USItype)(v))
    209 #ifdef __ARC_NORM__
        /* Only when __ARC_NORM__ is defined.  The asm runs norm.f on X and then
           replaces the result with -1 if the "mi" condition holds; COUNT is that
           value plus one.  The asm declares a "cc" clobber.  */
    210 #define count_leading_zeros(count, x) \
    211   do									\
    212     {									\
    213       SItype c_;							\
    214 									\
    215       __asm__ ("norm.f\t%0,%1\n\tmov.mi\t%0,-1" : "=r" (c_) : "r" (x) : "cc");\
    216       (count) = c_ + 1;							\
    217     }									\
    218   while (0)
    219 #define COUNT_LEADING_ZEROS_0 32
    220 #endif
    221 #endif
    222 
    223 #if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
    224  && W_TYPE_SIZE == 32
        /* ARM with 32-bit words, when either __thumb2__ is defined or __thumb__
           is not.  Two-word add/subtract: the low words (%4, %5) go first into
           SL (%1, early-clobber), then the high words (%2, %3) into SH (%0).
           Both asms end with __CLOBBER_CC.  */
    225 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    226   __asm__ ("adds	%1, %4, %5\n\tadc	%0, %2, %3"		\
    227 	   : "=r" ((USItype) (sh)),					\
    228 	     "=&r" ((USItype) (sl))					\
    229 	   : "%r" ((USItype) (ah)),					\
    230 	     "rI" ((USItype) (bh)),					\
    231 	     "%r" ((USItype) (al)),					\
    232 	     "rI" ((USItype) (bl)) __CLOBBER_CC)
    233 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    234   __asm__ ("subs	%1, %4, %5\n\tsbc	%0, %2, %3"		\
    235 	   : "=r" ((USItype) (sh)),					\
    236 	     "=&r" ((USItype) (sl))					\
    237 	   : "r" ((USItype) (ah)),					\
    238 	     "rI" ((USItype) (bh)),					\
    239 	     "r" ((USItype) (al)),					\
    240 	     "rI" ((USItype) (bl)) __CLOBBER_CC)
        /* For architecture versions 2, 2A and 3 the product is assembled by
           hand: each operand is split into 16-bit halves (lsr #16 / bic), the
           four partial products are formed with mul, and the two middle
           products are folded into the high and low result words with carry.
           __t0..__t2 are scratch registers.  */
    241 # if defined(__ARM_ARCH_2__) || defined(__ARM_ARCH_2A__) \
    242      || defined(__ARM_ARCH_3__)
    243 #  define umul_ppmm(xh, xl, a, b)					\
    244   do {									\
    245     register USItype __t0, __t1, __t2;					\
    246     __asm__ ("%@ Inlined umul_ppmm\n"					\
    247 	   "	mov	%2, %5, lsr #16\n"				\
    248 	   "	mov	%0, %6, lsr #16\n"				\
    249 	   "	bic	%3, %5, %2, lsl #16\n"				\
    250 	   "	bic	%4, %6, %0, lsl #16\n"				\
    251 	   "	mul	%1, %3, %4\n"					\
    252 	   "	mul	%4, %2, %4\n"					\
    253 	   "	mul	%3, %0, %3\n"					\
    254 	   "	mul	%0, %2, %0\n"					\
    255 	   "	adds	%3, %4, %3\n"					\
    256 	   "	addcs	%0, %0, #65536\n"				\
    257 	   "	adds	%1, %1, %3, lsl #16\n"				\
    258 	   "	adc	%0, %0, %3, lsr #16"				\
    259 	   : "=&r" ((USItype) (xh)),					\
    260 	     "=r" ((USItype) (xl)),					\
    261 	     "=&r" (__t0), "=&r" (__t1), "=r" (__t2)			\
    262 	   : "r" ((USItype) (a)),					\
    263 	     "r" ((USItype) (b)) __CLOBBER_CC );			\
    264   } while (0)
    265 #  define UMUL_TIME 20
    266 # else
        /* Every other architecture version: a plain C 64-bit multiply whose low
           and high 32-bit halves are stored to XL and XH.  */
    267 #  define umul_ppmm(xh, xl, a, b)					\
    268   do {									\
    269     /* Generate umull, under compiler control.  */			\
    270     register UDItype __t0 = (UDItype)(USItype)(a) * (USItype)(b);	\
    271     (xl) = (USItype)__t0;						\
    272     (xh) = (USItype)(__t0 >> 32);					\
    273   } while (0)
    274 #  define UMUL_TIME 3
    275 # endif
    276 # define UDIV_TIME 100
    277 #endif /* __arm__ */
    278 
    279 #if defined(__arm__)
    280 /* Let gcc decide how best to implement count_leading_zeros.
           Unlike the arithmetic macros above, this group is conditional on
           __arm__ alone: it does not test __thumb__, __thumb2__ or W_TYPE_SIZE.  */
    281 #define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clz (X))
    282 #define count_trailing_zeros(COUNT,X)   ((COUNT) = __builtin_ctz (X))
    283 #define COUNT_LEADING_ZEROS_0 32
    284 #endif
    285 
    286 #if defined (__AVR__)
        /* AVR: bit counting through GCC builtins, selected by word size.  A
           16-bit word uses __builtin_clz/__builtin_ctz, a 32-bit word the "l"
           variants and a 64-bit word the "ll" variants.  COUNT_LEADING_ZEROS_0
           equals the word size in each case.  */
    287 
    288 #if W_TYPE_SIZE == 16
    289 #define count_leading_zeros(COUNT,X)  ((COUNT) = __builtin_clz (X))
    290 #define count_trailing_zeros(COUNT,X) ((COUNT) = __builtin_ctz (X))
    291 #define COUNT_LEADING_ZEROS_0 16
    292 #endif /* W_TYPE_SIZE == 16 */
    293 
    294 #if W_TYPE_SIZE == 32
    295 #define count_leading_zeros(COUNT,X)  ((COUNT) = __builtin_clzl (X))
    296 #define count_trailing_zeros(COUNT,X) ((COUNT) = __builtin_ctzl (X))
    297 #define COUNT_LEADING_ZEROS_0 32
    298 #endif /* W_TYPE_SIZE == 32 */
    299 
    300 #if W_TYPE_SIZE == 64
    301 #define count_leading_zeros(COUNT,X)  ((COUNT) = __builtin_clzll (X))
    302 #define count_trailing_zeros(COUNT,X) ((COUNT) = __builtin_ctzll (X))
    303 #define COUNT_LEADING_ZEROS_0 64
    304 #endif /* W_TYPE_SIZE == 64 */
    305 
    306 #endif /* defined (__AVR__) */
    307 
    308 #if defined (__CRIS__)
        /* CRIS: what is provided depends on __CRIS_arch_version.
           count_leading_zeros is defined from version 3, count_trailing_zeros
           from version 8, and an inline C __umulsidi3 from version 10.  Below
           version 10, __umulsidi3 is declared as an external function instead;
           the self-referential #define presumably keeps "defined (__umulsidi3)"
           tests true for that case.  */
    309 
    310 #if __CRIS_arch_version >= 3
    311 #define count_leading_zeros(COUNT, X) ((COUNT) = __builtin_clz (X))
    312 #define COUNT_LEADING_ZEROS_0 32
    313 #endif /* __CRIS_arch_version >= 3 */
    314 
    315 #if __CRIS_arch_version >= 8
    316 #define count_trailing_zeros(COUNT, X) ((COUNT) = __builtin_ctz (X))
    317 #endif /* __CRIS_arch_version >= 8 */
    318 
    319 #if __CRIS_arch_version >= 10
    320 #define __umulsidi3(u,v) ((UDItype)(USItype) (u) * (UDItype)(USItype) (v))
    321 #else
    322 #define __umulsidi3 __umulsidi3
    323 extern UDItype __umulsidi3 (USItype, USItype);
    324 #endif /* __CRIS_arch_version >= 10 */
    325 
    /* umul_ppmm for CRIS: form the full 64-bit product of U and V with
       __umulsidi3, then store its low 32 bits in W0 and its high 32 bits
       in W1.  */
    #define umul_ppmm(w1, w0, u, v)					\
      do								\
        {								\
          UDItype __prod = __umulsidi3 (u, v);			\
          (w0) = (USItype) __prod;					\
          (w1) = (USItype) (__prod >> 32);				\
        }								\
      while (0)
    332 
    333 /* FIXME: defining add_ssaaaa and sub_ddmmss should be advantageous for
    334    DFmode ("double" intrinsics, avoiding two of the three insns handling
    335    carry), but defining them as open-code C composing and doing the
    336    operation in DImode (UDImode) shows that the DImode needs work:
    337    register pressure from requiring neighboring registers and the
    338    traffic to and from them come to dominate, in the 4.7 series.  */
    339 
    340 #endif /* defined (__CRIS__) */
    341 
    342 #if defined (__hppa) && W_TYPE_SIZE == 32
        /* HP PA-RISC, 32-bit words.  Two-word add/subtract: the low words
           (%4, %5) are combined first into SL (%1, early-clobber), then the high
           words (%2, %3) into SH (%0) with addc/subb.  */
    343 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    344   __asm__ ("add %4,%5,%1\n\taddc %2,%3,%0"				\
    345 	   : "=r" ((USItype) (sh)),					\
    346 	     "=&r" ((USItype) (sl))					\
    347 	   : "%rM" ((USItype) (ah)),					\
    348 	     "rM" ((USItype) (bh)),					\
    349 	     "%rM" ((USItype) (al)),					\
    350 	     "rM" ((USItype) (bl)))
    351 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    352   __asm__ ("sub %4,%5,%1\n\tsubb %2,%3,%0"				\
    353 	   : "=r" ((USItype) (sh)),					\
    354 	     "=&r" ((USItype) (sl))					\
    355 	   : "rM" ((USItype) (ah)),					\
    356 	     "rM" ((USItype) (bh)),					\
    357 	     "rM" ((USItype) (al)),					\
    358 	     "rM" ((USItype) (bl)))
    359 #if defined (_PA_RISC1_1)
        /* Only with _PA_RISC1_1: xmpyu writes the 64-bit product into a union,
           from which the first struct member is returned as the high word and
           the second as the low word.  Without _PA_RISC1_1 no umul_ppmm is
           defined here and only the UMUL_TIME estimate changes.  */
    360 #define umul_ppmm(w1, w0, u, v) \
    361   do {									\
    362     union								\
    363       {									\
    364 	UDItype __f;							\
    365 	struct {USItype __w1, __w0;} __w1w0;				\
    366       } __t;								\
    367     __asm__ ("xmpyu %1,%2,%0"						\
    368 	     : "=x" (__t.__f)						\
    369 	     : "x" ((USItype) (u)),					\
    370 	       "x" ((USItype) (v)));					\
    371     (w1) = __t.__w1w0.__w1;						\
    372     (w0) = __t.__w1w0.__w0;						\
    373      } while (0)
    374 #define UMUL_TIME 8
    375 #else
    376 #define UMUL_TIME 30
    377 #endif
    378 #define UDIV_TIME 40
        /* count_leading_zeros as a binary search in asm.  COUNT starts at 1; for
           field widths 16, 8, 4 and 2 the upper part of the remaining value is
           tested and, as the inline comments say, the value is either shifted
           down or the width is added to COUNT.  The last extracted bit is then
           subtracted.  __tmp is a working copy of X: it is tied to the input
           through the "1" constraint and is modified by the asm.  */
    379 #define count_leading_zeros(count, x) \
    380   do {									\
    381     USItype __tmp;							\
    382     __asm__ (								\
    383        "ldi		1,%0\n"						\
    384 "	extru,=		%1,15,16,%%r0		; Bits 31..16 zero?\n"	\
    385 "	extru,tr	%1,15,16,%1		; No.  Shift down, skip add.\n"\
    386 "	ldo		16(%0),%0		; Yes.  Perform add.\n"	\
    387 "	extru,=		%1,23,8,%%r0		; Bits 15..8 zero?\n"	\
    388 "	extru,tr	%1,23,8,%1		; No.  Shift down, skip add.\n"\
    389 "	ldo		8(%0),%0		; Yes.  Perform add.\n"	\
    390 "	extru,=		%1,27,4,%%r0		; Bits 7..4 zero?\n"	\
    391 "	extru,tr	%1,27,4,%1		; No.  Shift down, skip add.\n"\
    392 "	ldo		4(%0),%0		; Yes.  Perform add.\n"	\
    393 "	extru,=		%1,29,2,%%r0		; Bits 3..2 zero?\n"	\
    394 "	extru,tr	%1,29,2,%1		; No.  Shift down, skip add.\n"\
    395 "	ldo		2(%0),%0		; Yes.  Perform add.\n"	\
    396 "	extru		%1,30,1,%1		; Extract bit 1.\n"	\
    397 "	sub		%0,%1,%0		; Subtract it.\n"	\
    398 	: "=r" (count), "=r" (__tmp) : "1" (x));			\
    399   } while (0)
    400 #endif
    401 
    402 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
        /* i370 / s390 / MVS, 32-bit words: signed multiply and divide only.
           Without __zarch__ the 64-bit operand is a DItype overlaid with a
           {high, low} pair of USItype; "mr" leaves the product there and "dr"
           leaves the quotient in the low half and the remainder in the high
           half.  */
    403 #if !defined (__zarch__)
    404 #define smul_ppmm(xh, xl, m0, m1) \
    405   do {									\
    406     union {DItype __ll;							\
    407 	   struct {USItype __h, __l;} __i;				\
    408 	  } __x;							\
    409     __asm__ ("lr %N0,%1\n\tmr %0,%2"					\
    410 	     : "=&r" (__x.__ll)						\
    411 	     : "r" (m0), "r" (m1));					\
    412     (xh) = __x.__i.__h; (xl) = __x.__i.__l;				\
    413   } while (0)
    414 #define sdiv_qrnnd(q, r, n1, n0, d) \
    415   do {									\
    416     union {DItype __ll;							\
    417 	   struct {USItype __h, __l;} __i;				\
    418 	  } __x;							\
    419     __x.__i.__h = n1; __x.__i.__l = n0;					\
    420     __asm__ ("dr %0,%2"							\
    421 	     : "=r" (__x.__ll)						\
    422 	     : "0" (__x.__ll), "r" (d));				\
    423     (q) = __x.__i.__l; (r) = __x.__i.__h;				\
    424   } while (0)
    425 #else
        /* With __zarch__ the same instructions are used, but the operands are
           held in two local register variables bound to registers "0" and "1",
           and the asm names %r0 explicitly.  The results are read back as
           high = __r0, low = __r1 for the multiply and remainder = __r0,
           quotient = __r1 for the divide.  */
    426 #define smul_ppmm(xh, xl, m0, m1) \
    427   do {                                                                  \
    428     register SItype __r0 __asm__ ("0");					\
    429     register SItype __r1 __asm__ ("1") = (m0);				\
    430 									\
    431     __asm__ ("mr\t%%r0,%3"                                              \
    432 	     : "=r" (__r0), "=r" (__r1)					\
    433 	     : "r"  (__r1),  "r" (m1));					\
    434     (xh) = __r0; (xl) = __r1;						\
    435   } while (0)
    436 
    437 #define sdiv_qrnnd(q, r, n1, n0, d) \
    438   do {									\
    439     register SItype __r0 __asm__ ("0") = (n1);				\
    440     register SItype __r1 __asm__ ("1") = (n0);				\
    441 									\
    442     __asm__ ("dr\t%%r0,%4"                                              \
    443 	     : "=r" (__r0), "=r" (__r1)					\
    444 	     : "r" (__r0), "r" (__r1), "r" (d));			\
    445     (q) = __r1; (r) = __r0;						\
    446   } while (0)
    447 #endif /* __zarch__ */
    448 #endif
    449 
    450 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
        /* 32-bit x86.  Each template is written with {...|...} alternatives so
           that one string provides both operand orders.  In add_ssaaaa and
           sub_ddmmss the first source of each half is tied to the matching
           output ("0"/"1"), so the instructions update SH and SL in place; the
           low half is done first and SL is early-clobber.  */
    451 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    452   __asm__ ("add{l} {%5,%1|%1,%5}\n\tadc{l} {%3,%0|%0,%3}"		\
    453 	   : "=r" ((USItype) (sh)),					\
    454 	     "=&r" ((USItype) (sl))					\
    455 	   : "%0" ((USItype) (ah)),					\
    456 	     "g" ((USItype) (bh)),					\
    457 	     "%1" ((USItype) (al)),					\
    458 	     "g" ((USItype) (bl)))
    459 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    460   __asm__ ("sub{l} {%5,%1|%1,%5}\n\tsbb{l} {%3,%0|%0,%3}"		\
    461 	   : "=r" ((USItype) (sh)),					\
    462 	     "=&r" ((USItype) (sl))					\
    463 	   : "0" ((USItype) (ah)),					\
    464 	     "g" ((USItype) (bh)),					\
    465 	     "1" ((USItype) (al)),					\
    466 	     "g" ((USItype) (bl)))
        /* mul: U is tied to the "a" output, which receives the low product word
           W0; the high word W1 comes back through the "d" output.  */
    467 #define umul_ppmm(w1, w0, u, v) \
    468   __asm__ ("mul{l} %3"							\
    469 	   : "=a" ((USItype) (w0)),					\
    470 	     "=d" ((USItype) (w1))					\
    471 	   : "%0" ((USItype) (u)),					\
    472 	     "rm" ((USItype) (v)))
        /* div: N0 is tied to the "a" operand and N1 to the "d" operand; the
           quotient is returned through "a" and the remainder through "d".  */
    473 #define udiv_qrnnd(q, r, n1, n0, dv) \
    474   __asm__ ("div{l} %4"							\
    475 	   : "=a" ((USItype) (q)),					\
    476 	     "=d" ((USItype) (r))					\
    477 	   : "0" ((USItype) (n0)),					\
    478 	     "1" ((USItype) (n1)),					\
    479 	     "rm" ((USItype) (dv)))
        /* No COUNT_LEADING_ZEROS_0 is defined in this section.  */
    480 #define count_leading_zeros(count, x)	((count) = __builtin_clz (x))
    481 #define count_trailing_zeros(count, x)	((count) = __builtin_ctz (x))
    482 #define UMUL_TIME 40
    483 #define UDIV_TIME 40
    484 #endif /* 80x86 */
    485 
    486 #if defined (__x86_64__) && W_TYPE_SIZE == 64
        /* x86-64 with 64-bit words.  Same structure as the 32-bit x86 macros,
           using the {q} instruction suffix, UDItype operands and the "rme"
           constraint for the second source of add_ssaaaa/sub_ddmmss.  */
    487 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    488   __asm__ ("add{q} {%5,%1|%1,%5}\n\tadc{q} {%3,%0|%0,%3}"		\
    489 	   : "=r" ((UDItype) (sh)),					\
    490 	     "=&r" ((UDItype) (sl))					\
    491 	   : "%0" ((UDItype) (ah)),					\
    492 	     "rme" ((UDItype) (bh)),					\
    493 	     "%1" ((UDItype) (al)),					\
    494 	     "rme" ((UDItype) (bl)))
    495 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    496   __asm__ ("sub{q} {%5,%1|%1,%5}\n\tsbb{q} {%3,%0|%0,%3}"		\
    497 	   : "=r" ((UDItype) (sh)),					\
    498 	     "=&r" ((UDItype) (sl))					\
    499 	   : "0" ((UDItype) (ah)),					\
    500 	     "rme" ((UDItype) (bh)),					\
    501 	     "1" ((UDItype) (al)),					\
    502 	     "rme" ((UDItype) (bl)))
        /* mul: low product word W0 through the "a" output, high word W1 through
           the "d" output.  */
    503 #define umul_ppmm(w1, w0, u, v) \
    504   __asm__ ("mul{q} %3"							\
    505 	   : "=a" ((UDItype) (w0)),					\
    506 	     "=d" ((UDItype) (w1))					\
    507 	   : "%0" ((UDItype) (u)),					\
    508 	     "rm" ((UDItype) (v)))
        /* div: N0 tied to "a", N1 tied to "d"; quotient returned through "a",
           remainder through "d".  */
    509 #define udiv_qrnnd(q, r, n1, n0, dv) \
    510   __asm__ ("div{q} %4"							\
    511 	   : "=a" ((UDItype) (q)),					\
    512 	     "=d" ((UDItype) (r))					\
    513 	   : "0" ((UDItype) (n0)),					\
    514 	     "1" ((UDItype) (n1)),					\
    515 	     "rm" ((UDItype) (dv)))
        /* No COUNT_LEADING_ZEROS_0 is defined in this section.  */
    516 #define count_leading_zeros(count, x)	((count) = __builtin_clzll (x))
    517 #define count_trailing_zeros(count, x)	((count) = __builtin_ctzll (x))
    518 #define UMUL_TIME 40
    519 #define UDIV_TIME 40
    520 #endif /* x86_64 */
    521 
    522 #if defined (__i960__) && W_TYPE_SIZE == 32
        /* i960, 32-bit words.  Both macros are GCC statement expressions around
           one "emul" instruction.  umul_ppmm receives the 64-bit product in a
           union whose struct lists the low word first, and stores __h to W1 and
           __l to W0.  */
    523 #define umul_ppmm(w1, w0, u, v) \
    524   ({union {UDItype __ll;						\
    525 	   struct {USItype __l, __h;} __i;				\
    526 	  } __xx;							\
    527   __asm__ ("emul	%2,%1,%0"					\
    528 	   : "=d" (__xx.__ll)						\
    529 	   : "%dI" ((USItype) (u)),					\
    530 	     "dI" ((USItype) (v)));					\
    531   (w1) = __xx.__i.__h; (w0) = __xx.__i.__l;})
        /* __umulsidi3 is the same multiply, yielding the UDItype product as the
           value of the statement expression.  */
    532 #define __umulsidi3(u, v) \
    533   ({UDItype __w;							\
    534     __asm__ ("emul	%2,%1,%0"					\
    535 	     : "=d" (__w)						\
    536 	     : "%dI" ((USItype) (u)),					\
    537 	       "dI" ((USItype) (v)));					\
    538     __w; })
    539 #endif /* __i960__ */
    540 
    541 #if defined (__ia64) && W_TYPE_SIZE == 64
    /* This form encourages gcc (pre-release 3.4 at least) to emit predicated
       "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
       code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
       register, which takes an extra cycle.

       Two-word subtract: (sh,sl) = (ah,al) - (bh,bl), with the borrow out of
       the high word discarded.  Every input is first copied into a UWtype
       local, so each argument is evaluated exactly once (arguments with side
       effects are safe), the borrow test is always an unsigned word
       comparison, and the outputs may name the same objects as the inputs.  */
    #define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
      do {									\
        UWtype __sub_ah = (ah), __sub_al = (al);				\
        UWtype __sub_bh = (bh), __sub_bl = (bl);				\
        if (__sub_al < __sub_bl)						\
          (sh) = __sub_ah - __sub_bh - 1;					\
        else								\
          (sh) = __sub_ah - __sub_bh;					\
        (sl) = __sub_al - __sub_bl;						\
      } while (0)
    556 
    557 /* Do both product parts in assembly, since that gives better code with
    558    all gcc versions.  Some callers will just use the upper part, and in
    559    that situation we waste an instruction, but not any cycles.
           xma.hu writes the high word PH (early-clobber, since it is written
           before the second instruction reads the inputs) and xma.l the low
           word PL; all four operands use the "f" constraint.  */
    560 #define umul_ppmm(ph, pl, m0, m1)					\
    561   __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"		\
    562 	   : "=&f" (ph), "=f" (pl)					\
    563 	   : "f" (m0), "f" (m1))
        /* count_leading_zeros: mux1 @rev and czx1.l are used to derive _c, a
           multiple of 8 by which _x is shifted right (NOTE(review): presumably
           this locates the most significant nonzero byte -- confirm against the
           IA-64 instruction reference).  The remaining value is then narrowed
           with the >= 16 and >= 4 tests and a final >> 1, giving the index _c
           of the top set bit; the result is W_TYPE_SIZE - 1 - _c.  */
    564 #define count_leading_zeros(count, x)					\
    565   do {									\
    566     UWtype _x = (x), _y, _a, _c;					\
    567     __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));		\
    568     __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));		\
    569     _c = (_a - 1) << 3;							\
    570     _x >>= _c;								\
    571     if (_x >= 1 << 4)							\
    572       _x >>= 4, _c += 4;						\
    573     if (_x >= 1 << 2)							\
    574       _x >>= 2, _c += 2;						\
    575     _c += _x >> 1;							\
    576     (count) =  W_TYPE_SIZE - 1 - _c;					\
    577   } while (0)
    578 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
    579    based, and we don't need a special case for x==0 here.
           (x - 1) & ~x has a one exactly in each bit position below the lowest
           set bit of x, so its population count is the trailing-zero count.  */
    580 #define count_trailing_zeros(count, x)					\
    581   do {									\
    582     UWtype __ctz_x = (x);						\
    583     __asm__ ("popcnt %0 = %1"						\
    584 	     : "=r" (count)						\
    585 	     : "r" ((__ctz_x-1) & ~__ctz_x));				\
    586   } while (0)
    587 #define UMUL_TIME 14
    588 #endif
    589 
    590 #if defined (__M32R__) && W_TYPE_SIZE == 32
        /* M32R, 32-bit words.  Both macros first execute "cmp %0,%0" (see the
           comment inside each macro), then apply addx/subx to the low words and
           to the high words in turn.  AH and AL are tied to the outputs
           ("0"/"1"), so the instructions update SH and SL in place, and "cbit"
           is declared as clobbered.  */
    591 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    592   /* The cmp clears the condition bit.  */ \
    593   __asm__ ("cmp %0,%0\n\taddx %1,%5\n\taddx %0,%3"			\
    594 	   : "=r" ((USItype) (sh)),					\
    595 	     "=&r" ((USItype) (sl))					\
    596 	   : "0" ((USItype) (ah)),					\
    597 	     "r" ((USItype) (bh)),					\
    598 	     "1" ((USItype) (al)),					\
    599 	     "r" ((USItype) (bl))					\
    600 	   : "cbit")
    601 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    602   /* The cmp clears the condition bit.  */ \
    603   __asm__ ("cmp %0,%0\n\tsubx %1,%5\n\tsubx %0,%3"			\
    604 	   : "=r" ((USItype) (sh)),					\
    605 	     "=&r" ((USItype) (sl))					\
    606 	   : "0" ((USItype) (ah)),					\
    607 	     "r" ((USItype) (bh)),					\
    608 	     "1" ((USItype) (al)),					\
    609 	     "r" ((USItype) (bl))					\
    610 	   : "cbit")
    611 #endif /* __M32R__ */
    612 
    613 #if defined (__mc68000__) && W_TYPE_SIZE == 32
        /* m68k, 32-bit words.  Two-word add/subtract in data registers: the low
           words are combined first (add/sub into SL, early-clobber), then the
           high words with addx/subx into SH.  AH and AL are tied to the
           outputs, so the instructions update SH and SL in place.  */
    614 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    615   __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"				\
    616 	   : "=d" ((USItype) (sh)),					\
    617 	     "=&d" ((USItype) (sl))					\
    618 	   : "%0" ((USItype) (ah)),					\
    619 	     "d" ((USItype) (bh)),					\
    620 	     "%1" ((USItype) (al)),					\
    621 	     "g" ((USItype) (bl)))
    622 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    623   __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"				\
    624 	   : "=d" ((USItype) (sh)),					\
    625 	     "=&d" ((USItype) (sl))					\
    626 	   : "0" ((USItype) (ah)),					\
    627 	     "d" ((USItype) (bh)),					\
    628 	     "1" ((USItype) (al)),					\
    629 	     "g" ((USItype) (bl)))
    630 
    631 /* The '020, '030, '040, '060 and CPU32 have 32x32->64 and 64/32->32q-32r.
           Note that the test below nevertheless excludes __mc68060__, so a
           build with __mc68060__ defined takes one of the software multiply
           sequences further down.  */
    632 #if (defined (__mc68020__) && !defined (__mc68060__))
        /* Hardware forms: the register pair is written "%1:%0".  For the
           multiply W0 (%0) is the low word and W1 (%1) the high word; for the
           divides Q is %0 and R is %1, with N0 tied to %0 and N1 tied to %1.  */
    633 #define umul_ppmm(w1, w0, u, v) \
    634   __asm__ ("mulu%.l %3,%1:%0"						\
    635 	   : "=d" ((USItype) (w0)),					\
    636 	     "=d" ((USItype) (w1))					\
    637 	   : "%0" ((USItype) (u)),					\
    638 	     "dmi" ((USItype) (v)))
    639 #define UMUL_TIME 45
    640 #define udiv_qrnnd(q, r, n1, n0, d) \
    641   __asm__ ("divu%.l %4,%1:%0"						\
    642 	   : "=d" ((USItype) (q)),					\
    643 	     "=d" ((USItype) (r))					\
    644 	   : "0" ((USItype) (n0)),					\
    645 	     "1" ((USItype) (n1)),					\
    646 	     "dmi" ((USItype) (d)))
    647 #define UDIV_TIME 90
    648 #define sdiv_qrnnd(q, r, n1, n0, d) \
    649   __asm__ ("divs%.l %4,%1:%0"						\
    650 	   : "=d" ((USItype) (q)),					\
    651 	     "=d" ((USItype) (r))					\
    652 	   : "0" ((USItype) (n0)),					\
    653 	     "1" ((USItype) (n1)),					\
    654 	     "dmi" ((USItype) (d)))
    655 
    656 #elif defined (__mcoldfire__) /* not mc68020 */
    657 
        /* ColdFire: 32x32->64 multiply built from four "mulu" partial products
           of the swapped 16-bit halves.  d0-d4 are used as fixed scratch
           registers and are declared clobbered.  The sequence matches the
           non-ColdFire one below except that the low word of d0 is cleared
           with clr instead of eor.  */
    658 #define umul_ppmm(xh, xl, a, b) \
    659   __asm__ ("| Inlined umul_ppmm\n"					\
    660 	   "	move%.l	%2,%/d0\n"					\
    661 	   "	move%.l	%3,%/d1\n"					\
    662 	   "	move%.l	%/d0,%/d2\n"					\
    663 	   "	swap	%/d0\n"						\
    664 	   "	move%.l	%/d1,%/d3\n"					\
    665 	   "	swap	%/d1\n"						\
    666 	   "	move%.w	%/d2,%/d4\n"					\
    667 	   "	mulu	%/d3,%/d4\n"					\
    668 	   "	mulu	%/d1,%/d2\n"					\
    669 	   "	mulu	%/d0,%/d3\n"					\
    670 	   "	mulu	%/d0,%/d1\n"					\
    671 	   "	move%.l	%/d4,%/d0\n"					\
    672 	   "	clr%.w	%/d0\n"						\
    673 	   "	swap	%/d0\n"						\
    674 	   "	add%.l	%/d0,%/d2\n"					\
    675 	   "	add%.l	%/d3,%/d2\n"					\
    676 	   "	jcc	1f\n"						\
    677 	   "	add%.l	%#65536,%/d1\n"					\
    678 	   "1:	swap	%/d2\n"						\
    679 	   "	moveq	%#0,%/d0\n"					\
    680 	   "	move%.w	%/d2,%/d0\n"					\
    681 	   "	move%.w	%/d4,%/d2\n"					\
    682 	   "	move%.l	%/d2,%1\n"					\
    683 	   "	add%.l	%/d1,%/d0\n"					\
    684 	   "	move%.l	%/d0,%0"					\
    685 	   : "=g" ((USItype) (xh)),					\
    686 	     "=g" ((USItype) (xl))					\
    687 	   : "g" ((USItype) (a)),					\
    688 	     "g" ((USItype) (b))					\
    689 	   : "d0", "d1", "d2", "d3", "d4")
    690 #define UMUL_TIME 100
    691 #define UDIV_TIME 400
    692 #else /* not ColdFire */
    693 /* %/ inserts REGISTER_PREFIX, %# inserts IMMEDIATE_PREFIX.
           Same partial-product multiply as the ColdFire variant, with d0-d4 as
           clobbered scratch registers; here the low word of d0 is cleared with
           eor.  */
    694 #define umul_ppmm(xh, xl, a, b) \
    695   __asm__ ("| Inlined umul_ppmm\n"					\
    696 	   "	move%.l	%2,%/d0\n"					\
    697 	   "	move%.l	%3,%/d1\n"					\
    698 	   "	move%.l	%/d0,%/d2\n"					\
    699 	   "	swap	%/d0\n"						\
    700 	   "	move%.l	%/d1,%/d3\n"					\
    701 	   "	swap	%/d1\n"						\
    702 	   "	move%.w	%/d2,%/d4\n"					\
    703 	   "	mulu	%/d3,%/d4\n"					\
    704 	   "	mulu	%/d1,%/d2\n"					\
    705 	   "	mulu	%/d0,%/d3\n"					\
    706 	   "	mulu	%/d0,%/d1\n"					\
    707 	   "	move%.l	%/d4,%/d0\n"					\
    708 	   "	eor%.w	%/d0,%/d0\n"					\
    709 	   "	swap	%/d0\n"						\
    710 	   "	add%.l	%/d0,%/d2\n"					\
    711 	   "	add%.l	%/d3,%/d2\n"					\
    712 	   "	jcc	1f\n"						\
    713 	   "	add%.l	%#65536,%/d1\n"					\
    714 	   "1:	swap	%/d2\n"						\
    715 	   "	moveq	%#0,%/d0\n"					\
    716 	   "	move%.w	%/d2,%/d0\n"					\
    717 	   "	move%.w	%/d4,%/d2\n"					\
    718 	   "	move%.l	%/d2,%1\n"					\
    719 	   "	add%.l	%/d1,%/d0\n"					\
    720 	   "	move%.l	%/d0,%0"					\
    721 	   : "=g" ((USItype) (xh)),					\
    722 	     "=g" ((USItype) (xl))					\
    723 	   : "g" ((USItype) (a)),					\
    724 	     "g" ((USItype) (b))					\
    725 	   : "d0", "d1", "d2", "d3", "d4")
    726 #define UMUL_TIME 100
    727 #define UDIV_TIME 400
    728 
    729 #endif /* not mc68020 */
    730 
    731 /* The '020, '030, '040 and '060 have bitfield insns.
    732    cpu32 disguises as a 68020, but lacks them.
           Hence the test for __mc68020__ without __mcpu32__.  The bfffo form
           passes the constant 0 as operand 2 and uses it for both fields inside
           the braces; no COUNT_LEADING_ZEROS_0 is defined for this variant.  */
    733 #if defined (__mc68020__) && !defined (__mcpu32__)
    734 #define count_leading_zeros(count, x) \
    735   __asm__ ("bfffo %1{%b2:%b2},%0"					\
    736 	   : "=d" ((USItype) (count))					\
    737 	   : "od" ((USItype) (x)), "n" (0))
    738 /* Some ColdFire architectures have a ff1 instruction supported via
    739    __builtin_clz. */
    740 #elif defined (__mcfisaaplus__) || defined (__mcfisac__)
    741 #define count_leading_zeros(count,x) ((count) = __builtin_clz (x))
    742 #define COUNT_LEADING_ZEROS_0 32
    743 #endif
    744 #endif /* mc68000 */
    745 
    746 #if defined (__m88000__) && W_TYPE_SIZE == 32
    747 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    748   __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"			\
    749 	   : "=r" ((USItype) (sh)),					\
    750 	     "=&r" ((USItype) (sl))					\
    751 	   : "%rJ" ((USItype) (ah)),					\
    752 	     "rJ" ((USItype) (bh)),					\
    753 	     "%rJ" ((USItype) (al)),					\
    754 	     "rJ" ((USItype) (bl)))
    755 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    756   __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"			\
    757 	   : "=r" ((USItype) (sh)),					\
    758 	     "=&r" ((USItype) (sl))					\
    759 	   : "rJ" ((USItype) (ah)),					\
    760 	     "rJ" ((USItype) (bh)),					\
    761 	     "rJ" ((USItype) (al)),					\
    762 	     "rJ" ((USItype) (bl)))
    /* count_leading_zeros: run ff1 on X into a temporary, then store
       (temporary ^ 31) in COUNT.
       NOTE(review): ff1 presumably yields the bit index of the most
       significant set bit, which the XOR turns into a leading-zero count
       -- confirm against the m88k manual.  */
    763 #define count_leading_zeros(count, x) \
    764   do {									\
    765     USItype __cbtmp;							\
    766     __asm__ ("ff1 %0,%1"						\
    767 	     : "=r" (__cbtmp)						\
    768 	     : "r" ((USItype) (x)));					\
    769     (count) = __cbtmp ^ 31;						\
    770   } while (0)
    771 #define COUNT_LEADING_ZEROS_0 63 /* sic */
    772 #if defined (__mc88110__)
    773 #define umul_ppmm(wh, wl, u, v) \
    774   do {									\
    775     union {UDItype __ll;						\
    776 	   struct {USItype __h, __l;} __i;				\
    777 	  } __xx;							\
    778     __asm__ ("mulu.d	%0,%1,%2"					\
    779 	     : "=r" (__xx.__ll)						\
    780 	     : "r" ((USItype) (u)),					\
    781 	       "r" ((USItype) (v)));					\
    782     (wh) = __xx.__i.__h;						\
    783     (wl) = __xx.__i.__l;						\
    784   } while (0)
    /* udiv_qrnnd: pack N1 (high) and N0 (low) into a double word through
       the union, divide it by D with divu.d to obtain the quotient, then
       recompute the remainder as N0 - quotient * D.  Q receives the
       quotient and R the remainder.  N0 and D are each expanded twice, so
       arguments with side effects are evaluated twice.  */
    785 #define udiv_qrnnd(q, r, n1, n0, d) \
    786   ({union {UDItype __ll;						\
    787 	   struct {USItype __h, __l;} __i;				\
    788 	  } __xx;							\
    789   USItype __q;								\
    790   __xx.__i.__h = (n1); __xx.__i.__l = (n0);				\
    791   __asm__ ("divu.d %0,%1,%2"						\
    792 	   : "=r" (__q)							\
    793 	   : "r" (__xx.__ll),						\
    794 	     "r" ((USItype) (d)));					\
    795   (r) = (n0) - __q * (d); (q) = __q; })
    796 #define UMUL_TIME 5
    797 #define UDIV_TIME 25
    798 #else
    799 #define UMUL_TIME 17
    800 #define UDIV_TIME 150
    801 #endif /* __mc88110__ */
    802 #endif /* __m88000__ */
    803 
    804 #if defined (__mn10300__)
    805 # if defined (__AM33__)
    806 #  define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clz (X))
    807 #  define umul_ppmm(w1, w0, u, v)		\
    808     asm("mulu %3,%2,%1,%0" : "=r"(w0), "=r"(w1) : "r"(u), "r"(v))
    809 #  define smul_ppmm(w1, w0, u, v)		\
    810     asm("mul %3,%2,%1,%0" : "=r"(w0), "=r"(w1) : "r"(u), "r"(v))
    811 # else
    812 #  define umul_ppmm(w1, w0, u, v)		\
    813     asm("nop; nop; mulu %3,%0" : "=d"(w0), "=z"(w1) : "%0"(u), "d"(v))
    814 #  define smul_ppmm(w1, w0, u, v)		\
    815     asm("nop; nop; mul %3,%0" : "=d"(w0), "=z"(w1) : "%0"(u), "d"(v))
    816 # endif
    /* add_ssaaaa: (SH,SL) = (AH,AL) + (BH,BL).  Both operands are
       assembled into double words through DWunion, added with ordinary C
       arithmetic, and the sum is split back into its two halves.  */
    # define add_ssaaaa(sh, sl, ah, al, bh, bl) \
      do { \
        DWunion __sum, __x, __y; \
        __x.s.low = (al); \
        __x.s.high = (ah); \
        __y.s.low = (bl); \
        __y.s.high = (bh); \
        __sum.ll = __x.ll + __y.ll; \
        (sl) = __sum.s.low; \
        (sh) = __sum.s.high; \
      } while (0)
    /* sub_ddmmss: (SH,SL) = (AH,AL) - (BH,BL).  Both operands are
       assembled into double words through DWunion, subtracted with
       ordinary C arithmetic, and the difference is split back into its
       two halves.  */
    # define sub_ddmmss(sh, sl, ah, al, bh, bl) \
      do { \
        DWunion __diff, __x, __y; \
        __x.s.low = (al); \
        __x.s.high = (ah); \
        __y.s.low = (bl); \
        __y.s.high = (bh); \
        __diff.ll = __x.ll - __y.ll; \
        (sl) = __diff.s.low; \
        (sh) = __diff.s.high; \
      } while (0)
    833 # define udiv_qrnnd(q, r, nh, nl, d)		\
    834   asm("divu %2,%0" : "=D"(q), "=z"(r) : "D"(d), "0"(nl), "1"(nh))
    835 # define sdiv_qrnnd(q, r, nh, nl, d)		\
    836   asm("div %2,%0" : "=D"(q), "=z"(r) : "D"(d), "0"(nl), "1"(nh))
    837 # define UMUL_TIME 3
    838 # define UDIV_TIME 38
    839 #endif
    840 
    841 #if defined (__mips__) && W_TYPE_SIZE == 32
    /* umul_ppmm: W1:W0 = U * V.  Both operands are truncated to USItype,
       multiplied in UDItype, and the product is split into its high (W1)
       and low (W0) 32-bit halves.  */
    #define umul_ppmm(w1, w0, u, v) \
      do { \
        USItype __lhs = (USItype) (u); \
        USItype __rhs = (USItype) (v); \
        UDItype __prod = (UDItype) __lhs * __rhs; \
        (w1) = (USItype) (__prod >> 32); \
        (w0) = (USItype) __prod; \
      } while (0)
    848 #define UMUL_TIME 10
    849 #define UDIV_TIME 100
    850 
    851 #if (__mips == 32 || __mips == 64) && ! defined (__mips16)
    852 #define count_leading_zeros(COUNT,X)	((COUNT) = __builtin_clz (X))
    853 #define COUNT_LEADING_ZEROS_0 32
    854 #endif
    855 #endif /* __mips__ */
    856 
    857 #if defined (__ns32000__) && W_TYPE_SIZE == 32
    858 #define umul_ppmm(w1, w0, u, v) \
    859   ({union {UDItype __ll;						\
    860 	   struct {USItype __l, __h;} __i;				\
    861 	  } __xx;							\
    862   __asm__ ("meid %2,%0"							\
    863 	   : "=g" (__xx.__ll)						\
    864 	   : "%0" ((USItype) (u)),					\
    865 	     "g" ((USItype) (v)));					\
    866   (w1) = __xx.__i.__h; (w0) = __xx.__i.__l;})
    867 #define __umulsidi3(u, v) \
    868   ({UDItype __w;							\
    869     __asm__ ("meid %2,%0"						\
    870 	     : "=g" (__w)						\
    871 	     : "%0" ((USItype) (u)),					\
    872 	       "g" ((USItype) (v)));					\
    873     __w; })
    /* udiv_qrnnd: load N1 (high) and N0 (low) into a double word through
       the union, run deid with D on that double word in place (the input
       is tied to the output by constraint "0"), then read R from the low
       half and Q from the high half.  */
    874 #define udiv_qrnnd(q, r, n1, n0, d) \
    875   ({union {UDItype __ll;						\
    876 	   struct {USItype __l, __h;} __i;				\
    877 	  } __xx;							\
    878   __xx.__i.__h = (n1); __xx.__i.__l = (n0);				\
    879   __asm__ ("deid %2,%0"							\
    880 	   : "=g" (__xx.__ll)						\
    881 	   : "0" (__xx.__ll),						\
    882 	     "g" ((USItype) (d)));					\
    883   (r) = __xx.__i.__l; (q) = __xx.__i.__h; })
    884 #define count_trailing_zeros(count,x) \
    885   do {									\
    886     __asm__ ("ffsd     %2,%0"						\
    887 	    : "=r" ((USItype) (count))					\
    888 	    : "0" ((USItype) 0),					\
    889 	      "r" ((USItype) (x)));					\
    890   } while (0)
    891 #endif /* __ns32000__ */
    892 
    893 /* FIXME: We should test _IBMR2 here when we add assembly support for the
    894    system vendor compilers.
    895    FIXME: What's needed for gcc PowerPC VxWorks?  __vxworks__ is not good
    896    enough, since that hits ARM and m68k too.  */
    897 #if (defined (_ARCH_PPC)	/* AIX */				\
    898      || defined (__powerpc__)	/* gcc */				\
    899      || defined (__POWERPC__)	/* BEOS */				\
    900      || defined (__ppc__)	/* Darwin */				\
    901      || (defined (PPC) && ! defined (CPU_FAMILY)) /* gcc 2.7.x GNU&SysV */    \
    902      || (defined (PPC) && defined (CPU_FAMILY)    /* VxWorks */               \
    903 	 && CPU_FAMILY == PPC)                                                \
    904      ) && W_TYPE_SIZE == 32
    /* add_ssaaaa: (SH,SL) = (AH,AL) + (BH,BL).  The low words are always
       added with a carry-setting add.  For the high word, a BH that is a
       compile-time constant 0 selects addze and a BH of all ones selects
       addme, so BH is not passed to the asm at all in those cases;
       otherwise the general adde form with BH as an operand is used.
       NOTE(review): the %I modifier presumably selects the immediate form
       of the low add when BL matches the "I" constraint -- confirm.  */
    905 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    906   do {									\
    907     if (__builtin_constant_p (bh) && (bh) == 0)				\
    908       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"		\
    909 	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    910     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
    911       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"		\
    912 	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    913     else								\
    914       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"		\
    915 	     : "=r" (sh), "=&r" (sl)					\
    916 	     : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
    917   } while (0)
    918 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    919   do {									\
    920     if (__builtin_constant_p (ah) && (ah) == 0)				\
    921       __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"	\
    922 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    923     else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)		\
    924       __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"	\
    925 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    926     else if (__builtin_constant_p (bh) && (bh) == 0)			\
    927       __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"		\
    928 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    929     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)		\
    930       __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"		\
    931 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    932     else								\
    933       __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"	\
    934 	       : "=r" (sh), "=&r" (sl)					\
    935 	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		\
    936   } while (0)
    937 #define count_leading_zeros(count, x) \
    938   __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
    939 #define COUNT_LEADING_ZEROS_0 32
    940 #if defined (_ARCH_PPC) || defined (__powerpc__) || defined (__POWERPC__) \
    941   || defined (__ppc__)                                                    \
    942   || (defined (PPC) && ! defined (CPU_FAMILY)) /* gcc 2.7.x GNU&SysV */       \
    943   || (defined (PPC) && defined (CPU_FAMILY)    /* VxWorks */                  \
    944 	 && CPU_FAMILY == PPC)
    /* umul_ppmm: PH:PL = M0 * M1, both treated as unsigned 32-bit words.
       M0 and M1 are evaluated exactly once into USItype temporaries, and
       the asm reads those temporaries rather than the raw macro arguments,
       so argument side effects are not repeated and both halves of the
       product are derived from the same values.  PH is produced by the
       mulhwu insn; PL is the ordinary (wrapping) C product.  */
    #define umul_ppmm(ph, pl, m0, m1) \
      do { \
        USItype __m0 = (m0), __m1 = (m1); \
        __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
        (pl) = __m0 * __m1; \
      } while (0)
    951 #define UMUL_TIME 15
    /* smul_ppmm: PH:PL = M0 * M1, both treated as signed 32-bit words.
       M0 and M1 are evaluated exactly once into SItype temporaries, and
       the asm reads those temporaries rather than the raw macro arguments,
       so argument side effects are not repeated.  PH is produced by the
       mulhw insn.  The low word is computed in unsigned arithmetic, whose
       wraparound is well defined, and converted back to SItype; a signed
       multiply here could overflow, which is undefined behavior.  */
    #define smul_ppmm(ph, pl, m0, m1) \
      do { \
        SItype __m0 = (m0), __m1 = (m1); \
        __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
        (pl) = (SItype) ((USItype) __m0 * (USItype) __m1); \
      } while (0)
    958 #define SMUL_TIME 14
    959 #define UDIV_TIME 120
    960 #endif
    961 #endif /* 32-bit POWER architecture variants.  */
    962 
    963 /* We should test _IBMR2 here when we add assembly support for the system
    964    vendor compilers.  */
    965 #if (defined (_ARCH_PPC64) || defined (__powerpc64__)) && W_TYPE_SIZE == 64
    966 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
    967   do {									\
    968     if (__builtin_constant_p (bh) && (bh) == 0)				\
    969       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"		\
    970 	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    971     else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
    972       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"		\
    973 	     : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
    974     else								\
    975       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"		\
    976 	     : "=r" (sh), "=&r" (sl)					\
    977 	     : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl));		\
    978   } while (0)
    979 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
    980   do {									\
    981     if (__builtin_constant_p (ah) && (ah) == 0)				\
    982       __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"	\
    983 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    984     else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)		\
    985       __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"	\
    986 	       : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
    987     else if (__builtin_constant_p (bh) && (bh) == 0)			\
    988       __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"		\
    989 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    990     else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)		\
    991       __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"		\
    992 	       : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
    993     else								\
    994       __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"	\
    995 	       : "=r" (sh), "=&r" (sl)					\
    996 	       : "r" (ah), "r" (bh), "rI" (al), "r" (bl));		\
    997   } while (0)
    998 #define count_leading_zeros(count, x) \
    999   __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
   1000 #define COUNT_LEADING_ZEROS_0 64
   /* umul_ppmm: PH:PL = M0 * M1, both treated as unsigned 64-bit words.
      M0 and M1 are evaluated exactly once into UDItype temporaries, and
      the asm reads those temporaries rather than the raw macro arguments,
      so argument side effects are not repeated and both halves of the
      product are derived from the same values.  PH is produced by the
      mulhdu insn; PL is the ordinary (wrapping) C product.  */
   #define umul_ppmm(ph, pl, m0, m1) \
     do { \
       UDItype __m0 = (m0), __m1 = (m1); \
       __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
       (pl) = __m0 * __m1; \
     } while (0)
   1007 #define UMUL_TIME 15
   /* smul_ppmm: PH:PL = M0 * M1, both treated as signed 64-bit words.
      M0 and M1 are evaluated exactly once into DItype temporaries, and
      the asm reads those temporaries rather than the raw macro arguments,
      so argument side effects are not repeated.  PH is produced by the
      mulhd insn.  The low word is computed in unsigned arithmetic, whose
      wraparound is well defined, and converted back to DItype; a signed
      multiply here could overflow, which is undefined behavior.  */
   #define smul_ppmm(ph, pl, m0, m1) \
     do { \
       DItype __m0 = (m0), __m1 = (m1); \
       __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
       (pl) = (DItype) ((UDItype) __m0 * (UDItype) __m1); \
     } while (0)
   1014 #define SMUL_TIME 14  /* ??? */
   1015 #define UDIV_TIME 120 /* ??? */
   1016 #endif /* 64-bit PowerPC.  */
   1017 
   1018 #if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32
   1019 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1020   __asm__ ("a %1,%5\n\tae %0,%3"					\
   1021 	   : "=r" ((USItype) (sh)),					\
   1022 	     "=&r" ((USItype) (sl))					\
   1023 	   : "%0" ((USItype) (ah)),					\
   1024 	     "r" ((USItype) (bh)),					\
   1025 	     "%1" ((USItype) (al)),					\
   1026 	     "r" ((USItype) (bl)))
   1027 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1028   __asm__ ("s %1,%5\n\tse %0,%3"					\
   1029 	   : "=r" ((USItype) (sh)),					\
   1030 	     "=&r" ((USItype) (sl))					\
   1031 	   : "0" ((USItype) (ah)),					\
   1032 	     "r" ((USItype) (bh)),					\
   1033 	     "1" ((USItype) (al)),					\
   1034 	     "r" ((USItype) (bl)))
   1035 #define umul_ppmm(ph, pl, m0, m1) \
   1036   do {									\
   1037     USItype __m0 = (m0), __m1 = (m1);					\
   1038     __asm__ (								\
   1039        "s	r2,r2\n"						\
   1040 "	mts	r10,%2\n"						\
   1041 "	m	r2,%3\n"						\
   1042 "	m	r2,%3\n"						\
   1043 "	m	r2,%3\n"						\
   1044 "	m	r2,%3\n"						\
   1045 "	m	r2,%3\n"						\
   1046 "	m	r2,%3\n"						\
   1047 "	m	r2,%3\n"						\
   1048 "	m	r2,%3\n"						\
   1049 "	m	r2,%3\n"						\
   1050 "	m	r2,%3\n"						\
   1051 "	m	r2,%3\n"						\
   1052 "	m	r2,%3\n"						\
   1053 "	m	r2,%3\n"						\
   1054 "	m	r2,%3\n"						\
   1055 "	m	r2,%3\n"						\
   1056 "	m	r2,%3\n"						\
   1057 "	cas	%0,r2,r0\n"						\
   1058 "	mfs	r10,%1"							\
   1059 	     : "=r" ((USItype) (ph)),					\
   1060 	       "=r" ((USItype) (pl))					\
   1061 	     : "%r" (__m0),						\
   1062 		"r" (__m1)						\
   1063 	     : "r2");							\
   1064     (ph) += ((((SItype) __m0 >> 31) & __m1)				\
   1065 	     + (((SItype) __m1 >> 31) & __m0));				\
   1066   } while (0)
   1067 #define UMUL_TIME 20
   1068 #define UDIV_TIME 200
   /* count_leading_zeros: if X >= 0x10000, clz is run on X >> 16 and its
      result is stored in COUNT; otherwise clz is run on X itself and 16 is
      added to the result.
      NOTE(review): this split suggests the clz insn only examines the low
      16 bits of its operand -- confirm.  X is expanded in the test and in
      both branches.  */
   1069 #define count_leading_zeros(count, x) \
   1070   do {									\
   1071     if ((x) >= 0x10000)							\
   1072       __asm__ ("clz	%0,%1"						\
   1073 	       : "=r" ((USItype) (count))				\
   1074 	       : "r" ((USItype) (x) >> 16));				\
   1075     else								\
   1076       {									\
   1077 	__asm__ ("clz	%0,%1"						\
   1078 		 : "=r" ((USItype) (count))				\
   1079 		 : "r" ((USItype) (x)));					\
   1080 	(count) += 16;							\
   1081       }									\
   1082   } while (0)
   1083 #endif
   1084 
   1085 #if defined(__sh__) && !__SHMEDIA__ && W_TYPE_SIZE == 32
   1086 #ifndef __sh1__
   1087 #define umul_ppmm(w1, w0, u, v) \
   1088   __asm__ (								\
   1089        "dmulu.l	%2,%3\n\tsts%M1	macl,%1\n\tsts%M0	mach,%0"	\
   1090 	   : "=r<" ((USItype)(w1)),					\
   1091 	     "=r<" ((USItype)(w0))					\
   1092 	   : "r" ((USItype)(u)),					\
   1093 	     "r" ((USItype)(v))						\
   1094 	   : "macl", "mach")
   1095 #define UMUL_TIME 5
   1096 #endif
   1097 
   1098 /* This is the same algorithm as __udiv_qrnnd_c.  */
   1099 #define UDIV_NEEDS_NORMALIZATION 1
   1100 
   /* udiv_qrnnd: the division is carried out by calling the hidden helper
      __udiv_qrnnd_16 twice from within the asm (the two jsr insns), with
      its address passed in as operand 5.  N1 is tied to the register of R
      (constraint "1"); r1, r2, r4, r5, r6, pr and t are declared clobbered.
      The in-line comment below gives the register assignment the asm
      relies on.  */
   1101 #define udiv_qrnnd(q, r, n1, n0, d) \
   1102   do {									\
   1103     extern UWtype __udiv_qrnnd_16 (UWtype, UWtype)			\
   1104 			__attribute__ ((visibility ("hidden")));	\
   1105     /* r0: rn r1: qn */ /* r0: n1 r4: n0 r5: d r6: d1 */ /* r2: __m */	\
   1106     __asm__ (								\
   1107 	"mov%M4 %4,r5\n"						\
   1108 "	swap.w %3,r4\n"							\
   1109 "	swap.w r5,r6\n"							\
   1110 "	jsr @%5\n"							\
   1111 "	shll16 r6\n"							\
   1112 "	swap.w r4,r4\n"							\
   1113 "	jsr @%5\n"							\
   1114 "	swap.w r1,%0\n"							\
   1115 "	or r1,%0"							\
   1116 	: "=r" (q), "=&z" (r)						\
   1117 	: "1" (n1), "r" (n0), "rm" (d), "r" (&__udiv_qrnnd_16)		\
   1118 	: "r1", "r2", "r4", "r5", "r6", "pr", "t");			\
   1119   } while (0)
   1120 
   1121 #define UDIV_TIME 80
   1122 
   /* sub_ddmmss: (SH,SL) = (AH,AL) - (BH,BL).  The T bit is cleared, the
      low words are subtracted with subc, and the high words are subtracted
      with a second subc that consumes the borrow.  SL is marked
      earlyclobber ("=&r") because it is written by the first subc while
      BH has not yet been read; without the marker the compiler could place
      BH in the register of SL when AL and BH hold the same value.  */
   #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
     __asm__ ("clrt;subc %5,%1; subc %4,%0" \
              : "=r" (sh), "=&r" (sl) \
              : "0" (ah), "1" (al), "r" (bh), "r" (bl) : "t")
   1127 
   1128 #endif /* __sh__ */
   1129 
   1130 #if defined (__SH5__) && __SHMEDIA__ && W_TYPE_SIZE == 32
   /* __umulsidi3: full 32x32->64 unsigned product of U and V.  Both
      arguments are parenthesized so that an expression argument such as
      `a + b' is converted and multiplied as a unit.  */
   #define __umulsidi3(u,v) ((UDItype)(USItype)(u)*(USItype)(v))
   /* count_leading_zeros: zero-extend X into a double-word temporary, run
      nsb on it, and store the result minus 31 in COUNT.
      NOTE(review): nsb presumably returns the number of redundant sign
      bits of the 64-bit value, i.e. 31 plus the 32-bit leading-zero count
      -- confirm against the SHmedia manual.  */
   1132 #define count_leading_zeros(count, x) \
   1133   do									\
   1134     {									\
   1135       UDItype x_ = (USItype)(x);					\
   1136       SItype c_;							\
   1137 									\
   1138       __asm__ ("nsb %1, %0" : "=r" (c_) : "r" (x_));			\
   1139       (count) = c_ - 31;						\
   1140     }									\
   1141   while (0)
   1142 #define COUNT_LEADING_ZEROS_0 32
   1143 #endif
   1144 
   1145 #if defined (__sparc__) && !defined (__arch64__) && !defined (__sparcv9) \
   1146     && W_TYPE_SIZE == 32
   1147 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1148   __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"				\
   1149 	   : "=r" ((USItype) (sh)),					\
   1150 	     "=&r" ((USItype) (sl))					\
   1151 	   : "%rJ" ((USItype) (ah)),					\
   1152 	     "rI" ((USItype) (bh)),					\
   1153 	     "%rJ" ((USItype) (al)),					\
   1154 	     "rI" ((USItype) (bl))					\
   1155 	   __CLOBBER_CC)
   1156 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1157   __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"				\
   1158 	   : "=r" ((USItype) (sh)),					\
   1159 	     "=&r" ((USItype) (sl))					\
   1160 	   : "rJ" ((USItype) (ah)),					\
   1161 	     "rI" ((USItype) (bh)),					\
   1162 	     "rJ" ((USItype) (al)),					\
   1163 	     "rI" ((USItype) (bl))					\
   1164 	   __CLOBBER_CC)
   1165 #if defined (__sparc_v9__)
   1166 #define umul_ppmm(w1, w0, u, v) \
   1167   do {									\
   1168     register USItype __g1 asm ("g1");					\
   1169     __asm__ ("umul\t%2,%3,%1\n\t"					\
   1170 	     "srlx\t%1, 32, %0"						\
   1171 	     : "=r" ((USItype) (w1)),					\
   1172 	       "=r" (__g1)						\
   1173 	     : "r" ((USItype) (u)),					\
   1174 	       "r" ((USItype) (v)));					\
   1175     (w0) = __g1;							\
   1176   } while (0)
   1177 #define udiv_qrnnd(__q, __r, __n1, __n0, __d) \
   1178   __asm__ ("mov\t%2,%%y\n\t"						\
   1179 	   "udiv\t%3,%4,%0\n\t"						\
   1180 	   "umul\t%0,%4,%1\n\t"						\
   1181 	   "sub\t%3,%1,%1"						\
   1182 	   : "=&r" ((USItype) (__q)),					\
   1183 	     "=&r" ((USItype) (__r))					\
   1184 	   : "r" ((USItype) (__n1)),					\
   1185 	     "r" ((USItype) (__n0)),					\
   1186 	     "r" ((USItype) (__d)))
   1187 #else
   1188 #if defined (__sparc_v8__)
   1189 #define umul_ppmm(w1, w0, u, v) \
   1190   __asm__ ("umul %2,%3,%1;rd %%y,%0"					\
   1191 	   : "=r" ((USItype) (w1)),					\
   1192 	     "=r" ((USItype) (w0))					\
   1193 	   : "r" ((USItype) (u)),					\
   1194 	     "r" ((USItype) (v)))
   1195 #define udiv_qrnnd(__q, __r, __n1, __n0, __d) \
   1196   __asm__ ("mov %2,%%y;nop;nop;nop;udiv %3,%4,%0;umul %0,%4,%1;sub %3,%1,%1"\
   1197 	   : "=&r" ((USItype) (__q)),					\
   1198 	     "=&r" ((USItype) (__r))					\
   1199 	   : "r" ((USItype) (__n1)),					\
   1200 	     "r" ((USItype) (__n0)),					\
   1201 	     "r" ((USItype) (__d)))
   1202 #else
   1203 #if defined (__sparclite__)
   1204 /* This has hardware multiply but not divide.  It also has two additional
   1205    instructions scan (ffs from high bit) and divscc.  */
   1206 #define umul_ppmm(w1, w0, u, v) \
   1207   __asm__ ("umul %2,%3,%1;rd %%y,%0"					\
   1208 	   : "=r" ((USItype) (w1)),					\
   1209 	     "=r" ((USItype) (w0))					\
   1210 	   : "r" ((USItype) (u)),					\
   1211 	     "r" ((USItype) (v)))
   1212 #define udiv_qrnnd(q, r, n1, n0, d) \
   1213   __asm__ ("! Inlined udiv_qrnnd\n"					\
   1214 "	wr	%%g0,%2,%%y	! Not a delayed write for sparclite\n"	\
   1215 "	tst	%%g0\n"							\
   1216 "	divscc	%3,%4,%%g1\n"						\
   1217 "	divscc	%%g1,%4,%%g1\n"						\
   1218 "	divscc	%%g1,%4,%%g1\n"						\
   1219 "	divscc	%%g1,%4,%%g1\n"						\
   1220 "	divscc	%%g1,%4,%%g1\n"						\
   1221 "	divscc	%%g1,%4,%%g1\n"						\
   1222 "	divscc	%%g1,%4,%%g1\n"						\
   1223 "	divscc	%%g1,%4,%%g1\n"						\
   1224 "	divscc	%%g1,%4,%%g1\n"						\
   1225 "	divscc	%%g1,%4,%%g1\n"						\
   1226 "	divscc	%%g1,%4,%%g1\n"						\
   1227 "	divscc	%%g1,%4,%%g1\n"						\
   1228 "	divscc	%%g1,%4,%%g1\n"						\
   1229 "	divscc	%%g1,%4,%%g1\n"						\
   1230 "	divscc	%%g1,%4,%%g1\n"						\
   1231 "	divscc	%%g1,%4,%%g1\n"						\
   1232 "	divscc	%%g1,%4,%%g1\n"						\
   1233 "	divscc	%%g1,%4,%%g1\n"						\
   1234 "	divscc	%%g1,%4,%%g1\n"						\
   1235 "	divscc	%%g1,%4,%%g1\n"						\
   1236 "	divscc	%%g1,%4,%%g1\n"						\
   1237 "	divscc	%%g1,%4,%%g1\n"						\
   1238 "	divscc	%%g1,%4,%%g1\n"						\
   1239 "	divscc	%%g1,%4,%%g1\n"						\
   1240 "	divscc	%%g1,%4,%%g1\n"						\
   1241 "	divscc	%%g1,%4,%%g1\n"						\
   1242 "	divscc	%%g1,%4,%%g1\n"						\
   1243 "	divscc	%%g1,%4,%%g1\n"						\
   1244 "	divscc	%%g1,%4,%%g1\n"						\
   1245 "	divscc	%%g1,%4,%%g1\n"						\
   1246 "	divscc	%%g1,%4,%%g1\n"						\
   1247 "	divscc	%%g1,%4,%0\n"						\
   1248 "	rd	%%y,%1\n"						\
   1249 "	bl,a 1f\n"							\
   1250 "	add	%1,%4,%1\n"						\
   1251 "1:	! End of inline udiv_qrnnd"					\
   1252 	   : "=r" ((USItype) (q)),					\
   1253 	     "=r" ((USItype) (r))					\
   1254 	   : "r" ((USItype) (n1)),					\
   1255 	     "r" ((USItype) (n0)),					\
   1256 	     "rI" ((USItype) (d))					\
   1257 	   : "g1" __AND_CLOBBER_CC)
   1258 #define UDIV_TIME 37
   1259 #define count_leading_zeros(count, x) \
   1260   do {                                                                  \
   1261   __asm__ ("scan %1,1,%0"                                               \
   1262 	   : "=r" ((USItype) (count))                                   \
   1263 	   : "r" ((USItype) (x)));					\
   1264   } while (0)
   1265 /* Early sparclites return 63 for an argument of 0, but they warn that future
   1266    implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
   1267    undefined.  */
   1268 #else
   1269 /* SPARC without integer multiplication and divide instructions.
   1270    (i.e. at least Sun4/20,40,60,65,75,110,260,280,330,360,380,470,490) */
   1271 #define umul_ppmm(w1, w0, u, v) \
   1272   __asm__ ("! Inlined umul_ppmm\n"					\
   1273 "	wr	%%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr\n"\
   1274 "	sra	%3,31,%%o5	! Don't move this insn\n"		\
   1275 "	and	%2,%%o5,%%o5	! Don't move this insn\n"		\
   1276 "	andcc	%%g0,0,%%g1	! Don't move this insn\n"		\
   1277 "	mulscc	%%g1,%3,%%g1\n"						\
   1278 "	mulscc	%%g1,%3,%%g1\n"						\
   1279 "	mulscc	%%g1,%3,%%g1\n"						\
   1280 "	mulscc	%%g1,%3,%%g1\n"						\
   1281 "	mulscc	%%g1,%3,%%g1\n"						\
   1282 "	mulscc	%%g1,%3,%%g1\n"						\
   1283 "	mulscc	%%g1,%3,%%g1\n"						\
   1284 "	mulscc	%%g1,%3,%%g1\n"						\
   1285 "	mulscc	%%g1,%3,%%g1\n"						\
   1286 "	mulscc	%%g1,%3,%%g1\n"						\
   1287 "	mulscc	%%g1,%3,%%g1\n"						\
   1288 "	mulscc	%%g1,%3,%%g1\n"						\
   1289 "	mulscc	%%g1,%3,%%g1\n"						\
   1290 "	mulscc	%%g1,%3,%%g1\n"						\
   1291 "	mulscc	%%g1,%3,%%g1\n"						\
   1292 "	mulscc	%%g1,%3,%%g1\n"						\
   1293 "	mulscc	%%g1,%3,%%g1\n"						\
   1294 "	mulscc	%%g1,%3,%%g1\n"						\
   1295 "	mulscc	%%g1,%3,%%g1\n"						\
   1296 "	mulscc	%%g1,%3,%%g1\n"						\
   1297 "	mulscc	%%g1,%3,%%g1\n"						\
   1298 "	mulscc	%%g1,%3,%%g1\n"						\
   1299 "	mulscc	%%g1,%3,%%g1\n"						\
   1300 "	mulscc	%%g1,%3,%%g1\n"						\
   1301 "	mulscc	%%g1,%3,%%g1\n"						\
   1302 "	mulscc	%%g1,%3,%%g1\n"						\
   1303 "	mulscc	%%g1,%3,%%g1\n"						\
   1304 "	mulscc	%%g1,%3,%%g1\n"						\
   1305 "	mulscc	%%g1,%3,%%g1\n"						\
   1306 "	mulscc	%%g1,%3,%%g1\n"						\
   1307 "	mulscc	%%g1,%3,%%g1\n"						\
   1308 "	mulscc	%%g1,%3,%%g1\n"						\
   1309 "	mulscc	%%g1,0,%%g1\n"						\
   1310 "	add	%%g1,%%o5,%0\n"						\
   1311 "	rd	%%y,%1"							\
   1312 	   : "=r" ((USItype) (w1)),					\
   1313 	     "=r" ((USItype) (w0))					\
   1314 	   : "%rI" ((USItype) (u)),					\
   1315 	     "r" ((USItype) (v))						\
   1316 	   : "g1", "o5" __AND_CLOBBER_CC)
   1317 #define UMUL_TIME 39		/* 39 instructions */
   1318 /* It's quite necessary to add this much assembler for the sparc.
   1319    The default udiv_qrnnd (in C) is more than 10 times slower!  */
   1320 #define udiv_qrnnd(__q, __r, __n1, __n0, __d) \
   1321   __asm__ ("! Inlined udiv_qrnnd\n"					\
   1322 "	mov	32,%%g1\n"						\
   1323 "	subcc	%1,%2,%%g0\n"						\
   1324 "1:	bcs	5f\n"							\
   1325 "	 addxcc %0,%0,%0	! shift n1n0 and a q-bit in lsb\n"	\
   1326 "	sub	%1,%2,%1	! this kills msb of n\n"		\
   1327 "	addx	%1,%1,%1	! so this can't give carry\n"		\
   1328 "	subcc	%%g1,1,%%g1\n"						\
   1329 "2:	bne	1b\n"							\
   1330 "	 subcc	%1,%2,%%g0\n"						\
   1331 "	bcs	3f\n"							\
   1332 "	 addxcc %0,%0,%0	! shift n1n0 and a q-bit in lsb\n"	\
   1333 "	b	3f\n"							\
   1334 "	 sub	%1,%2,%1	! this kills msb of n\n"		\
   1335 "4:	sub	%1,%2,%1\n"						\
   1336 "5:	addxcc	%1,%1,%1\n"						\
   1337 "	bcc	2b\n"							\
   1338 "	 subcc	%%g1,1,%%g1\n"						\
   1339 "! Got carry from n.  Subtract next step to cancel this carry.\n"	\
   1340 "	bne	4b\n"							\
   1341 "	 addcc	%0,%0,%0	! shift n1n0 and a 0-bit in lsb\n"	\
   1342 "	sub	%1,%2,%1\n"						\
   1343 "3:	xnor	%0,0,%0\n"						\
   1344 "	! End of inline udiv_qrnnd"					\
   1345 	   : "=&r" ((USItype) (__q)),					\
   1346 	     "=&r" ((USItype) (__r))					\
   1347 	   : "r" ((USItype) (__d)),					\
   1348 	     "1" ((USItype) (__n1)),					\
   1349 	     "0" ((USItype) (__n0)) : "g1" __AND_CLOBBER_CC)
   1350 #define UDIV_TIME (3+7*32)	/* 7 instructions/iteration. 32 iterations.  */
   1351 #endif /* __sparclite__ */
   1352 #endif /* __sparc_v8__ */
   1353 #endif /* __sparc_v9__ */
   1354 #endif /* sparc32 */
   1355 
   1356 #if ((defined (__sparc__) && defined (__arch64__)) || defined (__sparcv9)) \
   1357     && W_TYPE_SIZE == 64
   1358 #define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
   1359   do {									\
   1360     UDItype __carry = 0;						\
   1361     __asm__ ("addcc\t%r5,%6,%1\n\t"					\
   1362 	     "add\t%r3,%4,%0\n\t"					\
   1363 	     "movcs\t%%xcc, 1, %2\n\t"					\
   1364 	     "add\t%0, %2, %0"						\
   1365 	     : "=r" ((UDItype)(sh)),				      	\
   1366 	       "=&r" ((UDItype)(sl)),				      	\
   1367 	       "+r" (__carry)				      		\
   1368 	     : "%rJ" ((UDItype)(ah)),				     	\
   1369 	       "rI" ((UDItype)(bh)),				      	\
   1370 	       "%rJ" ((UDItype)(al)),				     	\
   1371 	       "rI" ((UDItype)(bl))				       	\
   1372 	     __CLOBBER_CC);						\
   1373   } while (0)
   1374 
   /* sub_ddmmss: (SH,SL) = (AH,AL) - (BH,BL).  subcc subtracts the low
      words and sets the condition codes, sub subtracts the high words,
      movcs sets __carry to 1 when the low subtraction borrowed, and the
      final sub removes that borrow from the high word.
      The minuend operands AH and AL carry no '%' (commutative) modifier:
      subtraction is not commutative, so the compiler must not be allowed
      to exchange AH with BH or AL with BL.  */
   #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
     do { \
       UDItype __carry = 0; \
       __asm__ ("subcc\t%r5,%6,%1\n\t" \
                "sub\t%r3,%4,%0\n\t" \
                "movcs\t%%xcc, 1, %2\n\t" \
                "sub\t%0, %2, %0" \
                : "=r" ((UDItype)(sh)), \
                  "=&r" ((UDItype)(sl)), \
                  "+r" (__carry) \
                : "rJ" ((UDItype)(ah)), \
                  "rI" ((UDItype)(bh)), \
                  "rJ" ((UDItype)(al)), \
                  "rI" ((UDItype)(bl)) \
                __CLOBBER_CC); \
     } while (0)
   1391 
   1392 #define umul_ppmm(wh, wl, u, v)						\
   1393   do {									\
   1394 	  UDItype tmp1, tmp2, tmp3, tmp4;				\
   1395 	  __asm__ __volatile__ (					\
   1396 		   "srl %7,0,%3\n\t"					\
   1397 		   "mulx %3,%6,%1\n\t"					\
   1398 		   "srlx %6,32,%2\n\t"					\
   1399 		   "mulx %2,%3,%4\n\t"					\
   1400 		   "sllx %4,32,%5\n\t"					\
   1401 		   "srl %6,0,%3\n\t"					\
   1402 		   "sub %1,%5,%5\n\t"					\
   1403 		   "srlx %5,32,%5\n\t"					\
   1404 		   "addcc %4,%5,%4\n\t"					\
   1405 		   "srlx %7,32,%5\n\t"					\
   1406 		   "mulx %3,%5,%3\n\t"					\
   1407 		   "mulx %2,%5,%5\n\t"					\
   1408 		   "sethi %%hi(0x80000000),%2\n\t"			\
   1409 		   "addcc %4,%3,%4\n\t"					\
   1410 		   "srlx %4,32,%4\n\t"					\
   1411 		   "add %2,%2,%2\n\t"					\
   1412 		   "movcc %%xcc,%%g0,%2\n\t"				\
   1413 		   "addcc %5,%4,%5\n\t"					\
   1414 		   "sllx %3,32,%3\n\t"					\
   1415 		   "add %1,%3,%1\n\t"					\
   1416 		   "add %5,%2,%0"					\
   1417 	   : "=r" ((UDItype)(wh)),					\
   1418 	     "=&r" ((UDItype)(wl)),					\
   1419 	     "=&r" (tmp1), "=&r" (tmp2), "=&r" (tmp3), "=&r" (tmp4)	\
   1420 	   : "r" ((UDItype)(u)),					\
   1421 	     "r" ((UDItype)(v))						\
   1422 	   __CLOBBER_CC);						\
   1423   } while (0)
   1424 #define UMUL_TIME 96
   1425 #define UDIV_TIME 230
   1426 #endif /* sparc64 */
   1427 
   1428 #if defined (__vax__) && W_TYPE_SIZE == 32
   1429 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1430   __asm__ ("addl2 %5,%1\n\tadwc %3,%0"					\
   1431 	   : "=g" ((USItype) (sh)),					\
   1432 	     "=&g" ((USItype) (sl))					\
   1433 	   : "%0" ((USItype) (ah)),					\
   1434 	     "g" ((USItype) (bh)),					\
   1435 	     "%1" ((USItype) (al)),					\
   1436 	     "g" ((USItype) (bl)))
   1437 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1438   __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"					\
   1439 	   : "=g" ((USItype) (sh)),					\
   1440 	     "=&g" ((USItype) (sl))					\
   1441 	   : "0" ((USItype) (ah)),					\
   1442 	     "g" ((USItype) (bh)),					\
   1443 	     "1" ((USItype) (al)),					\
   1444 	     "g" ((USItype) (bl)))
   /* umul_ppmm: XH:XL = M0 * M1 (unsigned).  emul writes a double-word
      product into the union, whose halves are copied to XH and XL.  XH is
      then increased by M1 when bit 31 of M0 is set and by M0 when bit 31
      of M1 is set.
      NOTE(review): that correction implies emul multiplies its operands
      as signed values -- confirm against the VAX manual.  */
   1445 #define umul_ppmm(xh, xl, m0, m1) \
   1446   do {									\
   1447     union {								\
   1448 	UDItype __ll;							\
   1449 	struct {USItype __l, __h;} __i;					\
   1450       } __xx;								\
   1451     USItype __m0 = (m0), __m1 = (m1);					\
   1452     __asm__ ("emul %1,%2,$0,%0"						\
   1453 	     : "=r" (__xx.__ll)						\
   1454 	     : "g" (__m0),						\
   1455 	       "g" (__m1));						\
   1456     (xh) = __xx.__i.__h;						\
   1457     (xl) = __xx.__i.__l;						\
   1458     (xh) += ((((SItype) __m0 >> 31) & __m1)				\
   1459 	     + (((SItype) __m1 >> 31) & __m0));				\
   1460   } while (0)
   1461 #define sdiv_qrnnd(q, r, n1, n0, d) \
   1462   do {									\
   1463     union {DItype __ll;							\
   1464 	   struct {SItype __l, __h;} __i;				\
   1465 	  } __xx;							\
   1466     __xx.__i.__h = n1; __xx.__i.__l = n0;				\
   1467     __asm__ ("ediv %3,%2,%0,%1"						\
   1468 	     : "=g" (q), "=g" (r)					\
   1469 	     : "g" (__xx.__ll), "g" (d));				\
   1470   } while (0)
   1471 #endif /* __vax__ */
   1472 
   1473 #ifdef _TMS320C6X
   /* add_ssaaaa: (SH,SL) = (AH,AL) + (BH,BL).  addu adds the low words AL
      and BL into a double-word temporary; its low 32 bits become SL, and
      its bits above 32 (the carry out of the low addition) are added to
      AH + BH to form SH.  */
   1474 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1475   do									\
   1476     {									\
   1477       UDItype __ll;							\
   1478       __asm__ ("addu .l1 %1, %2, %0"					\
   1479 	       : "=a" (__ll) : "a" (al), "a" (bl));			\
   1480       (sl) = (USItype)__ll;						\
   1481       (sh) = ((USItype)(__ll >> 32)) + (ah) + (bh);			\
   1482     }									\
   1483   while (0)
   1484 
   1485 #ifdef _TMS320C6400_PLUS
   /* __umulsidi3: full 32x32->64 unsigned product of U and V.  Both
      arguments are parenthesized so that an expression argument such as
      `a + b' is converted and multiplied as a unit.  */
   #define __umulsidi3(u,v) ((UDItype)(USItype)(u)*(USItype)(v))
   /* umul_ppmm: W1:W0 = U * V.  Both operands are truncated to USItype,
      multiplied in UDItype, and the product is split into its high (W1)
      and low (W0) 32-bit halves.  */
   #define umul_ppmm(w1, w0, u, v) \
     do { \
       USItype __lhs = (USItype) (u); \
       USItype __rhs = (USItype) (v); \
       UDItype __prod = (UDItype) __lhs * __rhs; \
       (w1) = (USItype) (__prod >> 32); \
       (w0) = (USItype) __prod; \
     } while (0)
   1493 #endif  /* _TMS320C6400_PLUS */
   1494 
/* Leading-zero count through the compiler builtin.
   NOTE(review): GCC documents __builtin_clz as undefined for a zero
   argument, and no COUNT_LEADING_ZEROS_0 is defined alongside this macro --
   confirm that callers never pass x == 0.  */
#define count_leading_zeros(count, x)	((count) = __builtin_clz (x))
#ifdef _TMS320C6400
/* Trailing-zero count through the compiler builtin, provided only when
   _TMS320C6400 is defined.  */
#define count_trailing_zeros(count, x)	((count) = __builtin_ctz (x))
#endif
/* Cost figures for multiply and divide.
   NOTE(review): their unit and their consumers are not visible in this part
   of the file -- confirm against the code that reads them.  */
#define UMUL_TIME 4
#define UDIV_TIME 40
   1501 #endif /* _TMS320C6X */
   1502 
   1503 #if defined (__xtensa__) && W_TYPE_SIZE == 32
   1504 /* This code is not Xtensa-configuration-specific, so rely on the compiler
   1505    to expand builtin functions depending on what configuration features
   1506    are available.  This avoids library calls when the operation can be
   1507    performed in-line.  */
   1508 #define umul_ppmm(w1, w0, u, v)						\
   1509   do {									\
   1510     DWunion __w;							\
   1511     __w.ll = __builtin_umulsidi3 (u, v);				\
   1512     w1 = __w.s.high;							\
   1513     w0 = __w.s.low;							\
   1514   } while (0)
/* The widening multiply and both bit-count primitives forward directly to
   compiler builtins.
   NOTE(review): GCC documents __builtin_clz and __builtin_ctz as undefined
   for a zero argument -- confirm callers never pass X == 0.  */
#define __umulsidi3(u, v)		__builtin_umulsidi3 (u, v)
#define count_leading_zeros(COUNT, X)	((COUNT) = __builtin_clz (X))
#define count_trailing_zeros(COUNT, X)	((COUNT) = __builtin_ctz (X))
   1518 #endif /* __xtensa__ */
   1519 
   1520 #if defined xstormy16
/* NOTE(review): this prototype names __stormy16_count_leading_zeros, but the
   count_leading_zeros macro defined right after it calls __clzhi2 instead --
   confirm whether this declaration is still needed or should name
   __clzhi2.  */
extern UHItype __stormy16_count_leading_zeros (UHItype);
   1522 #define count_leading_zeros(count, x)					\
   1523   do									\
   1524     {									\
   1525       UHItype size;							\
   1526 									\
   1527       /* We assume that W_TYPE_SIZE is a multiple of 16...  */		\
   1528       for ((count) = 0, size = W_TYPE_SIZE; size; size -= 16)		\
   1529 	{								\
   1530 	  UHItype c;							\
   1531 									\
   1532 	  c = __clzhi2 ((x) >> (size - 16));				\
   1533 	  (count) += c;							\
   1534 	  if (c != 16)							\
   1535 	    break;							\
   1536 	}								\
   1537     }									\
   1538   while (0)
   1539 #define COUNT_LEADING_ZEROS_0 W_TYPE_SIZE
   1540 #endif
   1541 
   1542 #if defined (__z8000__) && W_TYPE_SIZE == 16
/* Two-word add for Z8000 with 16-bit words: (sh,sl) = (ah,al) + (bh,bl).
   "add" combines the low words (%1 and %5) and "adc" then combines the high
   words (%0 and %3), presumably taking in the carry from the first
   instruction.  ah and al are tied to the outputs by matching constraints;
   the "%" in front of them marks each as commutative with the operand that
   follows it.  sl is an early-clobber output.  */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add	%H1,%H5\n\tadc	%H0,%H3"				\
	   : "=r" ((unsigned int)(sh)),					\
	     "=&r" ((unsigned int)(sl))					\
	   : "%0" ((unsigned int)(ah)),					\
	     "r" ((unsigned int)(bh)),					\
	     "%1" ((unsigned int)(al)),					\
	     "rQR" ((unsigned int)(bl)))
/* Two-word subtract for Z8000 with 16-bit words:
   (sh,sl) = (ah,al) - (bh,bl).
   "sub" handles the low words (%1 and %5) and "sbc" the high words (%0 and
   %3), presumably consuming the borrow from the first instruction.  ah and
   al are tied to the outputs by the "0" and "1" matching constraints (not
   commutative here, unlike the add); sl is an early-clobber output.  */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub	%H1,%H5\n\tsbc	%H0,%H3"				\
	   : "=r" ((unsigned int)(sh)),					\
	     "=&r" ((unsigned int)(sl))					\
	   : "0" ((unsigned int)(ah)),					\
	     "r" ((unsigned int)(bh)),					\
	     "1" ((unsigned int)(al)),					\
	     "rQR" ((unsigned int)(bl)))
   1559 #define umul_ppmm(xh, xl, m0, m1) \
   1560   do {									\
   1561     union {long int __ll;						\
   1562 	   struct {unsigned int __h, __l;} __i;				\
   1563 	  } __xx;							\
   1564     unsigned int __m0 = (m0), __m1 = (m1);				\
   1565     __asm__ ("mult	%S0,%H3"					\
   1566 	     : "=r" (__xx.__i.__h),					\
   1567 	       "=r" (__xx.__i.__l)					\
   1568 	     : "%1" (__m0),						\
   1569 	       "rQR" (__m1));						\
   1570     (xh) = __xx.__i.__h; (xl) = __xx.__i.__l;				\
   1571     (xh) += ((((signed int) __m0 >> 15) & __m1)				\
   1572 	     + (((signed int) __m1 >> 15) & __m0));			\
   1573   } while (0)
   1574 #endif /* __z8000__ */
   1575 
   1576 #endif /* __GNUC__ */
   1577 
   1578 /* If this machine has no inline assembler, use C macros.  */
   1579 
   1580 #if !defined (add_ssaaaa)
   1581 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   1582   do {									\
   1583     UWtype __x;								\
   1584     __x = (al) + (bl);							\
   1585     (sh) = (ah) + (bh) + (__x < (al));					\
   1586     (sl) = __x;								\
   1587   } while (0)
   1588 #endif
   1589 
   1590 #if !defined (sub_ddmmss)
   1591 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   1592   do {									\
   1593     UWtype __x;								\
   1594     __x = (al) - (bl);							\
   1595     (sh) = (ah) - (bh) - (__x > (al));					\
   1596     (sl) = __x;								\
   1597   } while (0)
   1598 #endif
   1599 
   1600 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
   1601    smul_ppmm.  */
   1602 #if !defined (umul_ppmm) && defined (smul_ppmm)
   1603 #define umul_ppmm(w1, w0, u, v)						\
   1604   do {									\
   1605     UWtype __w1;							\
   1606     UWtype __xm0 = (u), __xm1 = (v);					\
   1607     smul_ppmm (__w1, w0, __xm0, __xm1);					\
   1608     (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)		\
   1609 		+ (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);		\
   1610   } while (0)
   1611 #endif
   1612 
   1613 /* If we still don't have umul_ppmm, define it using plain C.  */
   1614 #if !defined (umul_ppmm)
   1615 #define umul_ppmm(w1, w0, u, v)						\
   1616   do {									\
   1617     UWtype __x0, __x1, __x2, __x3;					\
   1618     UHWtype __ul, __vl, __uh, __vh;					\
   1619 									\
   1620     __ul = __ll_lowpart (u);						\
   1621     __uh = __ll_highpart (u);						\
   1622     __vl = __ll_lowpart (v);						\
   1623     __vh = __ll_highpart (v);						\
   1624 									\
   1625     __x0 = (UWtype) __ul * __vl;					\
   1626     __x1 = (UWtype) __ul * __vh;					\
   1627     __x2 = (UWtype) __uh * __vl;					\
   1628     __x3 = (UWtype) __uh * __vh;					\
   1629 									\
   1630     __x1 += __ll_highpart (__x0);/* this can't give carry */		\
   1631     __x1 += __x2;		/* but this indeed can */		\
   1632     if (__x1 < __x2)		/* did we get it? */			\
   1633       __x3 += __ll_B;		/* yes, add it in the proper pos.  */	\
   1634 									\
   1635     (w1) = __x3 + __ll_highpart (__x1);					\
   1636     (w0) = __ll_lowpart (__x1) * __ll_B + __ll_lowpart (__x0);		\
   1637   } while (0)
   1638 #endif
   1639 
   1640 #if !defined (__umulsidi3)
   1641 #define __umulsidi3(u, v) \
   1642   ({DWunion __w;							\
   1643     umul_ppmm (__w.s.high, __w.s.low, u, v);				\
   1644     __w.ll; })
   1645 #endif
   1646 
   1647 /* Define this unconditionally, so it can be used for debugging.  */
   1648 #define __udiv_qrnnd_c(q, r, n1, n0, d) \
   1649   do {									\
   1650     UWtype __d1, __d0, __q1, __q0;					\
   1651     UWtype __r1, __r0, __m;						\
   1652     __d1 = __ll_highpart (d);						\
   1653     __d0 = __ll_lowpart (d);						\
   1654 									\
   1655     __r1 = (n1) % __d1;							\
   1656     __q1 = (n1) / __d1;							\
   1657     __m = (UWtype) __q1 * __d0;						\
   1658     __r1 = __r1 * __ll_B | __ll_highpart (n0);				\
   1659     if (__r1 < __m)							\
   1660       {									\
   1661 	__q1--, __r1 += (d);						\
   1662 	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
   1663 	  if (__r1 < __m)						\
   1664 	    __q1--, __r1 += (d);					\
   1665       }									\
   1666     __r1 -= __m;							\
   1667 									\
   1668     __r0 = __r1 % __d1;							\
   1669     __q0 = __r1 / __d1;							\
   1670     __m = (UWtype) __q0 * __d0;						\
   1671     __r0 = __r0 * __ll_B | __ll_lowpart (n0);				\
   1672     if (__r0 < __m)							\
   1673       {									\
   1674 	__q0--, __r0 += (d);						\
   1675 	if (__r0 >= (d))						\
   1676 	  if (__r0 < __m)						\
   1677 	    __q0--, __r0 += (d);					\
   1678       }									\
   1679     __r0 -= __m;							\
   1680 									\
   1681     (q) = (UWtype) __q1 * __ll_B | __q0;				\
   1682     (r) = __r0;								\
   1683   } while (0)
   1684 
   1685 /* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
   1686    __udiv_w_sdiv (defined in libgcc or elsewhere).  */
   1687 #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
   1688 #define udiv_qrnnd(q, r, nh, nl, d) \
   1689   do {									\
   1690     extern UWtype __udiv_w_sdiv (UWtype *, UWtype, UWtype, UWtype);	\
   1691     UWtype __r;								\
   1692     (q) = __udiv_w_sdiv (&__r, nh, nl, d);				\
   1693     (r) = __r;								\
   1694   } while (0)
   1695 #endif
   1696 
   1697 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
   1698 #if !defined (udiv_qrnnd)
   1699 #define UDIV_NEEDS_NORMALIZATION 1
   1700 #define udiv_qrnnd __udiv_qrnnd_c
   1701 #endif
   1702 
   1703 #if !defined (count_leading_zeros)
   1704 #define count_leading_zeros(count, x) \
   1705   do {									\
   1706     UWtype __xr = (x);							\
   1707     UWtype __a;								\
   1708 									\
   1709     if (W_TYPE_SIZE <= 32)						\
   1710       {									\
   1711 	__a = __xr < ((UWtype)1<<2*__BITS4)				\
   1712 	  ? (__xr < ((UWtype)1<<__BITS4) ? 0 : __BITS4)			\
   1713 	  : (__xr < ((UWtype)1<<3*__BITS4) ?  2*__BITS4 : 3*__BITS4);	\
   1714       }									\
   1715     else								\
   1716       {									\
   1717 	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)			\
   1718 	  if (((__xr >> __a) & 0xff) != 0)				\
   1719 	    break;							\
   1720       }									\
   1721 									\
   1722     (count) = W_TYPE_SIZE - (__clz_tab[__xr >> __a] + __a);		\
   1723   } while (0)
   1724 #define COUNT_LEADING_ZEROS_0 W_TYPE_SIZE
   1725 #endif
   1726 
   1727 #if !defined (count_trailing_zeros)
   1728 /* Define count_trailing_zeros using count_leading_zeros.  The latter might be
   1729    defined in asm, but if it is not, the C version above is good enough.  */
   1730 #define count_trailing_zeros(count, x) \
   1731   do {									\
   1732     UWtype __ctz_x = (x);						\
   1733     UWtype __ctz_c;							\
   1734     count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x);			\
   1735     (count) = W_TYPE_SIZE - 1 - __ctz_c;				\
   1736   } while (0)
   1737 #endif
   1738 
   1739 #ifndef UDIV_NEEDS_NORMALIZATION
   1740 #define UDIV_NEEDS_NORMALIZATION 0
   1741 #endif
   1742