Home | History | Annotate | Download | only in runtime
      1 #include "rs_core.rsh"
      2 #include "rs_f16_util.h"
      3 
      4 extern float2 __attribute__((overloadable)) convert_float2(int2 c);
      5 extern float3 __attribute__((overloadable)) convert_float3(int3 c);
      6 extern float4 __attribute__((overloadable)) convert_float4(int4 c);
      7 
      8 extern int2 __attribute__((overloadable)) convert_int2(float2 c);
      9 extern int3 __attribute__((overloadable)) convert_int3(float3 c);
     10 extern int4 __attribute__((overloadable)) convert_int4(float4 c);
     11 
     12 
     13 extern float __attribute__((overloadable)) fmin(float v, float v2);
     14 extern float2 __attribute__((overloadable)) fmin(float2 v, float v2);
     15 extern float3 __attribute__((overloadable)) fmin(float3 v, float v2);
     16 extern float4 __attribute__((overloadable)) fmin(float4 v, float v2);
     17 
     18 extern float __attribute__((overloadable)) fmax(float v, float v2);
     19 extern float2 __attribute__((overloadable)) fmax(float2 v, float v2);
     20 extern float3 __attribute__((overloadable)) fmax(float3 v, float v2);
     21 extern float4 __attribute__((overloadable)) fmax(float4 v, float v2);
     22 
     23 // Float ops, 6.11.2
     24 
     25 #define FN_FUNC_FN(fnc)                                         \
     26 extern float2 __attribute__((overloadable)) fnc(float2 v) { \
     27     float2 r;                                                   \
     28     r.x = fnc(v.x);                                             \
     29     r.y = fnc(v.y);                                             \
     30     return r;                                                   \
     31 }                                                               \
     32 extern float3 __attribute__((overloadable)) fnc(float3 v) { \
     33     float3 r;                                                   \
     34     r.x = fnc(v.x);                                             \
     35     r.y = fnc(v.y);                                             \
     36     r.z = fnc(v.z);                                             \
     37     return r;                                                   \
     38 }                                                               \
     39 extern float4 __attribute__((overloadable)) fnc(float4 v) { \
     40     float4 r;                                                   \
     41     r.x = fnc(v.x);                                             \
     42     r.y = fnc(v.y);                                             \
     43     r.z = fnc(v.z);                                             \
     44     r.w = fnc(v.w);                                             \
     45     return r;                                                   \
     46 }
     47 
     48 #define IN_FUNC_FN(fnc)                                         \
     49 extern int2 __attribute__((overloadable)) fnc(float2 v) {   \
     50     int2 r;                                                     \
     51     r.x = fnc(v.x);                                             \
     52     r.y = fnc(v.y);                                             \
     53     return r;                                                   \
     54 }                                                               \
     55 extern int3 __attribute__((overloadable)) fnc(float3 v) {   \
     56     int3 r;                                                     \
     57     r.x = fnc(v.x);                                             \
     58     r.y = fnc(v.y);                                             \
     59     r.z = fnc(v.z);                                             \
     60     return r;                                                   \
     61 }                                                               \
     62 extern int4 __attribute__((overloadable)) fnc(float4 v) {   \
     63     int4 r;                                                     \
     64     r.x = fnc(v.x);                                             \
     65     r.y = fnc(v.y);                                             \
     66     r.z = fnc(v.z);                                             \
     67     r.w = fnc(v.w);                                             \
     68     return r;                                                   \
     69 }
     70 
     71 #define FN_FUNC_FN_FN(fnc)                                                  \
     72 extern float2 __attribute__((overloadable)) fnc(float2 v1, float2 v2) { \
     73     float2 r;                                                               \
     74     r.x = fnc(v1.x, v2.x);                                                  \
     75     r.y = fnc(v1.y, v2.y);                                                  \
     76     return r;                                                               \
     77 }                                                                           \
     78 extern float3 __attribute__((overloadable)) fnc(float3 v1, float3 v2) { \
     79     float3 r;                                                               \
     80     r.x = fnc(v1.x, v2.x);                                                  \
     81     r.y = fnc(v1.y, v2.y);                                                  \
     82     r.z = fnc(v1.z, v2.z);                                                  \
     83     return r;                                                               \
     84 }                                                                           \
     85 extern float4 __attribute__((overloadable)) fnc(float4 v1, float4 v2) { \
     86     float4 r;                                                               \
     87     r.x = fnc(v1.x, v2.x);                                                  \
     88     r.y = fnc(v1.y, v2.y);                                                  \
     89     r.z = fnc(v1.z, v2.z);                                                  \
     90     r.w = fnc(v1.w, v2.w);                                                  \
     91     return r;                                                               \
     92 }
     93 
     94 #define FN_FUNC_FN_F(fnc)                                                   \
     95 extern float2 __attribute__((overloadable)) fnc(float2 v1, float v2) {  \
     96     float2 r;                                                               \
     97     r.x = fnc(v1.x, v2);                                                    \
     98     r.y = fnc(v1.y, v2);                                                    \
     99     return r;                                                               \
    100 }                                                                           \
    101 extern float3 __attribute__((overloadable)) fnc(float3 v1, float v2) {  \
    102     float3 r;                                                               \
    103     r.x = fnc(v1.x, v2);                                                    \
    104     r.y = fnc(v1.y, v2);                                                    \
    105     r.z = fnc(v1.z, v2);                                                    \
    106     return r;                                                               \
    107 }                                                                           \
    108 extern float4 __attribute__((overloadable)) fnc(float4 v1, float v2) {  \
    109     float4 r;                                                               \
    110     r.x = fnc(v1.x, v2);                                                    \
    111     r.y = fnc(v1.y, v2);                                                    \
    112     r.z = fnc(v1.z, v2);                                                    \
    113     r.w = fnc(v1.w, v2);                                                    \
    114     return r;                                                               \
    115 }
    116 
    117 #define FN_FUNC_FN_IN(fnc)                                                  \
    118 extern float2 __attribute__((overloadable)) fnc(float2 v1, int2 v2) {   \
    119     float2 r;                                                               \
    120     r.x = fnc(v1.x, v2.x);                                                  \
    121     r.y = fnc(v1.y, v2.y);                                                  \
    122     return r;                                                               \
    123 }                                                                           \
    124 extern float3 __attribute__((overloadable)) fnc(float3 v1, int3 v2) {   \
    125     float3 r;                                                               \
    126     r.x = fnc(v1.x, v2.x);                                                  \
    127     r.y = fnc(v1.y, v2.y);                                                  \
    128     r.z = fnc(v1.z, v2.z);                                                  \
    129     return r;                                                               \
    130 }                                                                           \
    131 extern float4 __attribute__((overloadable)) fnc(float4 v1, int4 v2) {   \
    132     float4 r;                                                               \
    133     r.x = fnc(v1.x, v2.x);                                                  \
    134     r.y = fnc(v1.y, v2.y);                                                  \
    135     r.z = fnc(v1.z, v2.z);                                                  \
    136     r.w = fnc(v1.w, v2.w);                                                  \
    137     return r;                                                               \
    138 }
    139 
    140 #define FN_FUNC_FN_I(fnc)                                                   \
    141 extern float2 __attribute__((overloadable)) fnc(float2 v1, int v2) {    \
    142     float2 r;                                                               \
    143     r.x = fnc(v1.x, v2);                                                    \
    144     r.y = fnc(v1.y, v2);                                                    \
    145     return r;                                                               \
    146 }                                                                           \
    147 extern float3 __attribute__((overloadable)) fnc(float3 v1, int v2) {    \
    148     float3 r;                                                               \
    149     r.x = fnc(v1.x, v2);                                                    \
    150     r.y = fnc(v1.y, v2);                                                    \
    151     r.z = fnc(v1.z, v2);                                                    \
    152     return r;                                                               \
    153 }                                                                           \
    154 extern float4 __attribute__((overloadable)) fnc(float4 v1, int v2) {    \
    155     float4 r;                                                               \
    156     r.x = fnc(v1.x, v2);                                                    \
    157     r.y = fnc(v1.y, v2);                                                    \
    158     r.z = fnc(v1.z, v2);                                                    \
    159     r.w = fnc(v1.w, v2);                                                    \
    160     return r;                                                               \
    161 }
    162 
    163 #define FN_FUNC_FN_PFN(fnc)                     \
    164 extern float2 __attribute__((overloadable)) \
    165         fnc(float2 v1, float2 *v2) {            \
    166     float2 r;                                   \
    167     float t[2];                                 \
    168     r.x = fnc(v1.x, &t[0]);                     \
    169     r.y = fnc(v1.y, &t[1]);                     \
    170     v2->x = t[0];                               \
    171     v2->y = t[1];                               \
    172     return r;                                   \
    173 }                                               \
    174 extern float3 __attribute__((overloadable)) \
    175         fnc(float3 v1, float3 *v2) {            \
    176     float3 r;                                   \
    177     float t[3];                                 \
    178     r.x = fnc(v1.x, &t[0]);                     \
    179     r.y = fnc(v1.y, &t[1]);                     \
    180     r.z = fnc(v1.z, &t[2]);                     \
    181     v2->x = t[0];                               \
    182     v2->y = t[1];                               \
    183     v2->z = t[2];                               \
    184     return r;                                   \
    185 }                                               \
    186 extern float4 __attribute__((overloadable)) \
    187         fnc(float4 v1, float4 *v2) {            \
    188     float4 r;                                   \
    189     float t[4];                                 \
    190     r.x = fnc(v1.x, &t[0]);                     \
    191     r.y = fnc(v1.y, &t[1]);                     \
    192     r.z = fnc(v1.z, &t[2]);                     \
    193     r.w = fnc(v1.w, &t[3]);                     \
    194     v2->x = t[0];                               \
    195     v2->y = t[1];                               \
    196     v2->z = t[2];                               \
    197     v2->w = t[3];                               \
    198     return r;                                   \
    199 }
    200 
    201 #define FN_FUNC_FN_PIN(fnc)                                                 \
    202 extern float2 __attribute__((overloadable)) fnc(float2 v1, int2 *v2) {  \
    203     float2 r;                                                               \
    204     int t[2];                                                               \
    205     r.x = fnc(v1.x, &t[0]);                                                 \
    206     r.y = fnc(v1.y, &t[1]);                                                 \
    207     v2->x = t[0];                                                           \
    208     v2->y = t[1];                                                           \
    209     return r;                                                               \
    210 }                                                                           \
    211 extern float3 __attribute__((overloadable)) fnc(float3 v1, int3 *v2) {  \
    212     float3 r;                                                               \
    213     int t[3];                                                               \
    214     r.x = fnc(v1.x, &t[0]);                                                 \
    215     r.y = fnc(v1.y, &t[1]);                                                 \
    216     r.z = fnc(v1.z, &t[2]);                                                 \
    217     v2->x = t[0];                                                           \
    218     v2->y = t[1];                                                           \
    219     v2->z = t[2];                                                           \
    220     return r;                                                               \
    221 }                                                                           \
    222 extern float4 __attribute__((overloadable)) fnc(float4 v1, int4 *v2) {  \
    223     float4 r;                                                               \
    224     int t[4];                                                               \
    225     r.x = fnc(v1.x, &t[0]);                                                 \
    226     r.y = fnc(v1.y, &t[1]);                                                 \
    227     r.z = fnc(v1.z, &t[2]);                                                 \
    228     r.w = fnc(v1.w, &t[3]);                                                 \
    229     v2->x = t[0];                                                           \
    230     v2->y = t[1];                                                           \
    231     v2->z = t[2];                                                           \
    232     v2->w = t[3];                                                           \
    233     return r;                                                               \
    234 }
    235 
    236 #define FN_FUNC_FN_FN_FN(fnc)                   \
    237 extern float2 __attribute__((overloadable)) \
    238         fnc(float2 v1, float2 v2, float2 v3) {  \
    239     float2 r;                                   \
    240     r.x = fnc(v1.x, v2.x, v3.x);                \
    241     r.y = fnc(v1.y, v2.y, v3.y);                \
    242     return r;                                   \
    243 }                                               \
    244 extern float3 __attribute__((overloadable)) \
    245         fnc(float3 v1, float3 v2, float3 v3) {  \
    246     float3 r;                                   \
    247     r.x = fnc(v1.x, v2.x, v3.x);                \
    248     r.y = fnc(v1.y, v2.y, v3.y);                \
    249     r.z = fnc(v1.z, v2.z, v3.z);                \
    250     return r;                                   \
    251 }                                               \
    252 extern float4 __attribute__((overloadable)) \
    253         fnc(float4 v1, float4 v2, float4 v3) {  \
    254     float4 r;                                   \
    255     r.x = fnc(v1.x, v2.x, v3.x);                \
    256     r.y = fnc(v1.y, v2.y, v3.y);                \
    257     r.z = fnc(v1.z, v2.z, v3.z);                \
    258     r.w = fnc(v1.w, v2.w, v3.w);                \
    259     return r;                                   \
    260 }
    261 
    262 #define FN_FUNC_FN_FN_PIN(fnc)                  \
    263 extern float2 __attribute__((overloadable)) \
    264         fnc(float2 v1, float2 v2, int2 *v3) {   \
    265     float2 r;                                   \
    266     int t[2];                                   \
    267     r.x = fnc(v1.x, v2.x, &t[0]);               \
    268     r.y = fnc(v1.y, v2.y, &t[1]);               \
    269     v3->x = t[0];                               \
    270     v3->y = t[1];                               \
    271     return r;                                   \
    272 }                                               \
    273 extern float3 __attribute__((overloadable)) \
    274         fnc(float3 v1, float3 v2, int3 *v3) {   \
    275     float3 r;                                   \
    276     int t[3];                                   \
    277     r.x = fnc(v1.x, v2.x, &t[0]);               \
    278     r.y = fnc(v1.y, v2.y, &t[1]);               \
    279     r.z = fnc(v1.z, v2.z, &t[2]);               \
    280     v3->x = t[0];                               \
    281     v3->y = t[1];                               \
    282     v3->z = t[2];                               \
    283     return r;                                   \
    284 }                                               \
    285 extern float4 __attribute__((overloadable)) \
    286         fnc(float4 v1, float4 v2, int4 *v3) {   \
    287     float4 r;                                   \
    288     int t[4];                                   \
    289     r.x = fnc(v1.x, v2.x, &t[0]);               \
    290     r.y = fnc(v1.y, v2.y, &t[1]);               \
    291     r.z = fnc(v1.z, v2.z, &t[2]);               \
    292     r.w = fnc(v1.w, v2.w, &t[3]);               \
    293     v3->x = t[0];                               \
    294     v3->y = t[1];                               \
    295     v3->z = t[2];                               \
    296     v3->w = t[3];                               \
    297     return r;                                   \
    298 }
    299 
    300 static const int iposinf = 0x7f800000;
    301 static const int ineginf = 0xff800000;
    302 
    303 static const float posinf() {
    304     float f = *((float*)&iposinf);
    305     return f;
    306 }
    307 
    308 static const float neginf() {
    309     float f = *((float*)&ineginf);
    310     return f;
    311 }
    312 
    313 static bool isinf(float f) {
    314     int i = *((int*)(void*)&f);
    315     return (i == iposinf) || (i == ineginf);
    316 }
    317 
    318 static bool isnan(float f) {
    319     int i = *((int*)(void*)&f);
    320     return (((i & 0x7f800000) == 0x7f800000) && (i & 0x007fffff));
    321 }
    322 
    323 static bool isposzero(float f) {
    324     int i = *((int*)(void*)&f);
    325     return (i == 0x00000000);
    326 }
    327 
    328 static bool isnegzero(float f) {
    329     int i = *((int*)(void*)&f);
    330     return (i == 0x80000000);
    331 }
    332 
    333 static bool iszero(float f) {
    334     return isposzero(f) || isnegzero(f);
    335 }
    336 
    337 
    338 extern float __attribute__((overloadable)) SC_acosf(float);
    339 float __attribute__((overloadable)) acos(float v) {
    340     return SC_acosf(v);
    341 }
    342 FN_FUNC_FN(acos)
    343 
    344 extern float __attribute__((overloadable)) SC_acoshf(float);
    345 float __attribute__((overloadable)) acosh(float v) {
    346     return SC_acoshf(v);
    347 }
    348 FN_FUNC_FN(acosh)
    349 
    350 
    351 extern float __attribute__((overloadable)) acospi(float v) {
    352     return acos(v) / M_PI;
    353 }
    354 FN_FUNC_FN(acospi)
    355 
    356 extern float __attribute__((overloadable)) SC_asinf(float);
    357 float __attribute__((overloadable)) asin(float v) {
    358     return SC_asinf(v);
    359 }
    360 FN_FUNC_FN(asin)
    361 
    362 extern float __attribute__((overloadable)) SC_asinhf(float);
    363 float __attribute__((overloadable)) asinh(float v) {
    364     return SC_asinhf(v);
    365 }
    366 FN_FUNC_FN(asinh)
    367 
    368 extern float __attribute__((overloadable)) asinpi(float v) {
    369     return asin(v) / M_PI;
    370 }
    371 FN_FUNC_FN(asinpi)
    372 
    373 extern float __attribute__((overloadable)) SC_atanf(float);
    374 float __attribute__((overloadable)) atan(float v) {
    375     return SC_atanf(v);
    376 }
    377 FN_FUNC_FN(atan)
    378 
    379 extern float __attribute__((overloadable)) SC_atan2f(float, float);
    380 float __attribute__((overloadable)) atan2(float v1, float v2) {
    381     return SC_atan2f(v1, v2);
    382 }
    383 FN_FUNC_FN_FN(atan2)
    384 
    385 extern float __attribute__((overloadable)) SC_atanhf(float);
    386 float __attribute__((overloadable)) atanh(float v) {
    387     return SC_atanhf(v);
    388 }
    389 FN_FUNC_FN(atanh)
    390 
    391 extern float __attribute__((overloadable)) atanpi(float v) {
    392     return atan(v) / M_PI;
    393 }
    394 FN_FUNC_FN(atanpi)
    395 
    396 
    397 extern float __attribute__((overloadable)) atan2pi(float y, float x) {
    398     return atan2(y, x) / M_PI;
    399 }
    400 FN_FUNC_FN_FN(atan2pi)
    401 
    402 extern float __attribute__((overloadable)) SC_cbrtf(float);
    403 float __attribute__((overloadable)) cbrt(float v) {
    404     return SC_cbrtf(v);
    405 }
    406 FN_FUNC_FN(cbrt)
    407 
    408 extern float __attribute__((overloadable)) SC_ceilf(float);
    409 float __attribute__((overloadable)) ceil(float v) {
    410     return SC_ceilf(v);
    411 }
    412 FN_FUNC_FN(ceil)
    413 
    414 extern float __attribute__((overloadable)) SC_copysignf(float, float);
    415 float __attribute__((overloadable)) copysign(float v1, float v2) {
    416     return SC_copysignf(v1, v2);
    417 }
    418 FN_FUNC_FN_FN(copysign)
    419 
    420 extern float __attribute__((overloadable)) SC_cosf(float);
    421 float __attribute__((overloadable)) cos(float v) {
    422     return SC_cosf(v);
    423 }
    424 FN_FUNC_FN(cos)
    425 
    426 extern float __attribute__((overloadable)) SC_coshf(float);
    427 float __attribute__((overloadable)) cosh(float v) {
    428     return SC_coshf(v);
    429 }
    430 FN_FUNC_FN(cosh)
    431 
    432 extern float __attribute__((overloadable)) cospi(float v) {
    433     return cos(v * M_PI);
    434 }
    435 FN_FUNC_FN(cospi)
    436 
    437 extern float __attribute__((overloadable)) SC_erfcf(float);
    438 float __attribute__((overloadable)) erfc(float v) {
    439     return SC_erfcf(v);
    440 }
    441 FN_FUNC_FN(erfc)
    442 
    443 extern float __attribute__((overloadable)) SC_erff(float);
    444 float __attribute__((overloadable)) erf(float v) {
    445     return SC_erff(v);
    446 }
    447 FN_FUNC_FN(erf)
    448 
    449 extern float __attribute__((overloadable)) SC_expf(float);
    450 float __attribute__((overloadable)) exp(float v) {
    451     return SC_expf(v);
    452 }
    453 FN_FUNC_FN(exp)
    454 
    455 extern float __attribute__((overloadable)) SC_exp2f(float);
    456 float __attribute__((overloadable)) exp2(float v) {
    457     return SC_exp2f(v);
    458 }
    459 FN_FUNC_FN(exp2)
    460 
    461 extern float __attribute__((overloadable)) pow(float, float);
    462 
    463 extern float __attribute__((overloadable)) exp10(float v) {
    464     return exp2(v * 3.321928095f);
    465 }
    466 FN_FUNC_FN(exp10)
    467 
    468 extern float __attribute__((overloadable)) SC_expm1f(float);
    469 float __attribute__((overloadable)) expm1(float v) {
    470     return SC_expm1f(v);
    471 }
    472 FN_FUNC_FN(expm1)
    473 
    474 extern float __attribute__((overloadable)) fabs(float v) {
    475     int i = *((int*)(void*)&v) & 0x7fffffff;
    476     return  *((float*)(void*)&i);
    477 }
    478 FN_FUNC_FN(fabs)
    479 
    480 extern float __attribute__((overloadable)) SC_fdimf(float, float);
    481 float __attribute__((overloadable)) fdim(float v1, float v2) {
    482     return SC_fdimf(v1, v2);
    483 }
    484 FN_FUNC_FN_FN(fdim)
    485 
    486 extern float __attribute__((overloadable)) SC_floorf(float);
    487 float __attribute__((overloadable)) floor(float v) {
    488     return SC_floorf(v);
    489 }
    490 FN_FUNC_FN(floor)
    491 
    492 extern float __attribute__((overloadable)) SC_fmaf(float, float, float);
    493 float __attribute__((overloadable)) fma(float v1, float v2, float v3) {
    494     return SC_fmaf(v1, v2, v3);
    495 }
    496 FN_FUNC_FN_FN_FN(fma)
    497 
    498 extern float __attribute__((overloadable)) SC_fminf(float, float);
    499 
    500 extern float __attribute__((overloadable)) SC_fmodf(float, float);
    501 float __attribute__((overloadable)) fmod(float v1, float v2) {
    502     return SC_fmodf(v1, v2);
    503 }
    504 FN_FUNC_FN_FN(fmod)
    505 
    506 extern float __attribute__((overloadable)) fract(float v, float *iptr) {
    507     int i = (int)floor(v);
    508     if (iptr) {
    509         iptr[0] = i;
    510     }
    511     return fmin(v - i, 0x1.fffffep-1f);
    512 }
    513 FN_FUNC_FN_PFN(fract)
    514 
    515 extern float __attribute__((const, overloadable)) fract(float v) {
    516     float unused;
    517     return fract(v, &unused);
    518 }
    519 FN_FUNC_FN(fract)
    520 
    521 extern float __attribute__((overloadable)) SC_frexpf(float, int *);
    522 float __attribute__((overloadable)) frexp(float v1, int* v2) {
    523     return SC_frexpf(v1, v2);
    524 }
    525 FN_FUNC_FN_PIN(frexp)
    526 
    527 extern float __attribute__((overloadable)) SC_hypotf(float, float);
    528 float __attribute__((overloadable)) hypot(float v1, float v2) {
    529     return SC_hypotf(v1, v2);
    530 }
    531 FN_FUNC_FN_FN(hypot)
    532 
    533 extern int __attribute__((overloadable)) SC_ilogbf(float);
    534 int __attribute__((overloadable)) ilogb(float v) {
    535     return SC_ilogbf(v);
    536 }
    537 IN_FUNC_FN(ilogb)
    538 
    539 extern float __attribute__((overloadable)) SC_ldexpf(float, int);
    540 float __attribute__((overloadable)) ldexp(float v1, int v2) {
    541     return SC_ldexpf(v1, v2);
    542 }
    543 FN_FUNC_FN_IN(ldexp)
    544 FN_FUNC_FN_I(ldexp)
    545 
    546 extern float __attribute__((overloadable)) SC_lgammaf(float);
    547 float __attribute__((overloadable)) lgamma(float v) {
    548     return SC_lgammaf(v);
    549 }
    550 FN_FUNC_FN(lgamma)
    551 extern float __attribute__((overloadable)) SC_lgammaf_r(float, int*);
    552 float __attribute__((overloadable)) lgamma(float v, int* ptr) {
    553     return SC_lgammaf_r(v, ptr);
    554 }
    555 FN_FUNC_FN_PIN(lgamma)
    556 
    557 extern float __attribute__((overloadable)) SC_logf(float);
    558 float __attribute__((overloadable)) log(float v) {
    559     return SC_logf(v);
    560 }
    561 FN_FUNC_FN(log)
    562 
    563 extern float __attribute__((overloadable)) SC_log10f(float);
    564 float __attribute__((overloadable)) log10(float v) {
    565     return SC_log10f(v);
    566 }
    567 FN_FUNC_FN(log10)
    568 
    569 
    570 extern float __attribute__((overloadable)) log2(float v) {
    571     return log10(v) * 3.321928095f;
    572 }
    573 FN_FUNC_FN(log2)
    574 
    575 extern float __attribute__((overloadable)) SC_log1pf(float);
    576 float __attribute__((overloadable)) log1p(float v) {
    577     return SC_log1pf(v);
    578 }
    579 FN_FUNC_FN(log1p)
    580 
    581 extern float __attribute__((overloadable)) SC_logbf(float);
    582 float __attribute__((overloadable)) logb(float v) {
    583     return SC_logbf(v);
    584 }
    585 FN_FUNC_FN(logb)
    586 
    587 extern float __attribute__((overloadable)) mad(float a, float b, float c) {
    588     return a * b + c;
    589 }
    590 extern float2 __attribute__((overloadable)) mad(float2 a, float2 b, float2 c) {
    591     return a * b + c;
    592 }
    593 extern float3 __attribute__((overloadable)) mad(float3 a, float3 b, float3 c) {
    594     return a * b + c;
    595 }
    596 extern float4 __attribute__((overloadable)) mad(float4 a, float4 b, float4 c) {
    597     return a * b + c;
    598 }
    599 
    600 extern float __attribute__((overloadable)) SC_modff(float, float *);
    601 float __attribute__((overloadable)) modf(float v1, float *v2) {
    602     return SC_modff(v1, v2);
    603 }
    604 FN_FUNC_FN_PFN(modf);
    605 
    606 extern float __attribute__((overloadable)) nan(uint v) {
    607     float f[1];
    608     uint32_t *ip = (uint32_t *)f;
    609     *ip = v | 0x7fc00000;
    610     return f[0];
    611 }
    612 
    613 extern float __attribute__((overloadable)) SC_nextafterf(float, float);
    614 float __attribute__((overloadable)) nextafter(float v1, float v2) {
    615     return SC_nextafterf(v1, v2);
    616 }
    617 FN_FUNC_FN_FN(nextafter)
    618 
    619 // This function must be defined here if we're compiling with debug info
    620 // (libclcore_g.bc), because we need a C source to get debug information.
    621 // Otherwise the implementation can be found in IR.
    622 #if defined(RS_G_RUNTIME)
    623 extern float __attribute__((overloadable)) SC_powf(float, float);
    624 float __attribute__((overloadable)) pow(float v1, float v2) {
    625     return SC_powf(v1, v2);
    626 }
    627 #endif // defined(RS_G_RUNTIME)
    628 FN_FUNC_FN_FN(pow)
    629 
    630 extern float __attribute__((overloadable)) pown(float v, int p) {
    631     /* The mantissa of a float has fewer bits than an int (24 effective vs. 31).
    632      * For very large ints, we'll lose whether the exponent is even or odd, making
    633      * the selection of a correct sign incorrect.  We correct this.  Use copysign
    634      * to handle the negative zero case.
    635      */
    636     float sign = (p & 0x1) ? copysign(1.f, v) : 1.f;
    637     float f = pow(v, (float)p);
    638     return copysign(f, sign);
    639 }
    640 FN_FUNC_FN_IN(pown)
    641 
    642 extern float __attribute__((overloadable)) powr(float v, float p) {
    643     return pow(v, p);
    644 }
    645 extern float2 __attribute__((overloadable)) powr(float2 v, float2 p) {
    646     return pow(v, p);
    647 }
    648 extern float3 __attribute__((overloadable)) powr(float3 v, float3 p) {
    649     return pow(v, p);
    650 }
    651 extern float4 __attribute__((overloadable)) powr(float4 v, float4 p) {
    652     return pow(v, p);
    653 }
    654 
    655 extern float __attribute__((overloadable)) SC_remainderf(float, float);
    656 float __attribute__((overloadable)) remainder(float v1, float v2) {
    657     return SC_remainderf(v1, v2);
    658 }
    659 FN_FUNC_FN_FN(remainder)
    660 
    661 extern float __attribute__((overloadable)) SC_remquof(float, float, int *);
    662 float __attribute__((overloadable)) remquo(float v1, float v2, int *v3) {
    663     return SC_remquof(v1, v2, v3);
    664 }
    665 FN_FUNC_FN_FN_PIN(remquo)
    666 
    667 extern float __attribute__((overloadable)) SC_rintf(float);
    668 float __attribute__((overloadable)) rint(float v) {
    669     return SC_rintf(v);
    670 }
    671 FN_FUNC_FN(rint)
    672 
    /*
     * Computes the r-th root of v as pow(v, 1 / r), with these special cases
     * handled before the general formula:
     *   - r == 0                     -> posinf()
     *   - v is zero, r < 0, r odd    -> infinity carrying the sign of v
     *   - v is zero, r < 0, r even   -> posinf()
     *   - v is zero, r > 0, r odd    -> zero carrying the sign of v
     *   - v is zero, r > 0, r even   -> 0
     *   - v finite negative, r odd   -> -pow(-v, 1 / r)
     *   - v finite negative, r even  -> nan(0)
     */
    extern float __attribute__((overloadable)) rootn(float v, int r) {
        if (r == 0) {
            return posinf();
        }

        if (iszero(v)) {
            if (r < 0) {
                if (r & 1) {
                    return copysign(posinf(), v);
                }
                return posinf();
            }
            if (r & 1) {
                return copysign(0.f, v);
            }
            return 0.f;
        }

        if (!isinf(v) && !isnan(v) && (v < 0.f)) {
            if (!(r & 1)) {
                /* Even root of a negative number. */
                return nan(0);
            }
            /* Odd root of a negative number: take the root of -v and negate. */
            return (-1.f * pow(-1.f * v, 1.f / r));
        }

        return pow(v, 1.f / r);
    }
    /* Expand FN_FUNC_FN_IN (defined elsewhere in this file) for rootn. */
    FN_FUNC_FN_IN(rootn);
    705 
    706 extern float __attribute__((overloadable)) SC_roundf(float);
    707 float __attribute__((overloadable)) round(float v) {
    708     return SC_roundf(v);
    709 }
    710 FN_FUNC_FN(round)
    711 
    /* Externally provided routine that rsRand(min, max) forwards to. */
    extern float __attribute__((overloadable)) SC_randf2(float, float);

    /* Two-argument rsRand: returns SC_randf2(min, max) unchanged. */
    float __attribute__((overloadable)) rsRand(float min, float max) {
        const float value = SC_randf2(min, max);
        return value;
    }
    716 
    717 
    /* Scalar reciprocal square root, computed as 1 / sqrt(v). */
    extern float __attribute__((overloadable)) rsqrt(float v) {
        const float root = sqrt(v);
        return 1.f / root;
    }
    721 
    722 #if !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) || defined(RS_G_RUNTIME)
    723 // These functions must be defined here if we are not using the SSE
    724 // implementation, which includes when we are built as part of the
    725 // debug runtime (libclcore_debug.bc) or compiling with debug info.
    726 #if defined(RS_G_RUNTIME)
    727 extern float __attribute__((overloadable)) SC_sqrtf(float);
    728 float __attribute__((overloadable)) sqrt(float v) {
    729     return SC_sqrtf(v);
    730 }
    731 #endif // defined(RS_G_RUNTIME)
    732 
    733 FN_FUNC_FN(sqrt)
    734 #else
    735 extern float2 __attribute__((overloadable)) sqrt(float2);
    736 extern float3 __attribute__((overloadable)) sqrt(float3);
    737 extern float4 __attribute__((overloadable)) sqrt(float4);
    738 #endif // !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) || defined(RS_G_RUNTIME)
    739 
    740 FN_FUNC_FN(rsqrt)
    741 
    742 extern float __attribute__((overloadable)) SC_sinf(float);
    743 float __attribute__((overloadable)) sin(float v) {
    744     return SC_sinf(v);
    745 }
    746 FN_FUNC_FN(sin)
    747 
    748 extern float __attribute__((overloadable)) sincos(float v, float *cosptr) {
    749     *cosptr = cos(v);
    750     return sin(v);
    751 }
    752 extern float2 __attribute__((overloadable)) sincos(float2 v, float2 *cosptr) {
    753     *cosptr = cos(v);
    754     return sin(v);
    755 }
    756 extern float3 __attribute__((overloadable)) sincos(float3 v, float3 *cosptr) {
    757     *cosptr = cos(v);
    758     return sin(v);
    759 }
    760 extern float4 __attribute__((overloadable)) sincos(float4 v, float4 *cosptr) {
    761     *cosptr = cos(v);
    762     return sin(v);
    763 }
    764 
    765 extern float __attribute__((overloadable)) SC_sinhf(float);
    766 float __attribute__((overloadable)) sinh(float v) {
    767     return SC_sinhf(v);
    768 }
    769 FN_FUNC_FN(sinh)
    770 
    771 extern float __attribute__((overloadable)) sinpi(float v) {
    772     return sin(v * M_PI);
    773 }
    774 FN_FUNC_FN(sinpi)
    775 
    776 extern float __attribute__((overloadable)) SC_tanf(float);
    777 float __attribute__((overloadable)) tan(float v) {
    778     return SC_tanf(v);
    779 }
    780 FN_FUNC_FN(tan)
    781 
    782 extern float __attribute__((overloadable)) SC_tanhf(float);
    783 float __attribute__((overloadable)) tanh(float v) {
    784     return SC_tanhf(v);
    785 }
    786 FN_FUNC_FN(tanh)
    787 
    788 extern float __attribute__((overloadable)) tanpi(float v) {
    789     return tan(v * M_PI);
    790 }
    791 FN_FUNC_FN(tanpi)
    792 
    793 
    794 extern float __attribute__((overloadable)) SC_tgammaf(float);
    795 float __attribute__((overloadable)) tgamma(float v) {
    796     return SC_tgammaf(v);
    797 }
    798 FN_FUNC_FN(tgamma)
    799 
    800 extern float __attribute__((overloadable)) SC_truncf(float);
    801 float __attribute__((overloadable)) trunc(float v) {
    802     return SC_truncf(v);
    803 }
    804 FN_FUNC_FN(trunc)
    805 
    806 // Int ops (partial), 6.11.3
    807 
    /*
     * XN_FUNC_YN(typeout, fnc, typein)
     * Declares the scalar overload `typeout fnc(typein)` and defines the 2-, 3-
     * and 4-component vector overloads. Each vector overload calls the scalar
     * overload once per component and collects the results.
     */
    #define XN_FUNC_YN(typeout, fnc, typein)                                \
    extern typeout __attribute__((overloadable)) fnc(typein);               \
    extern typeout##2 __attribute__((overloadable)) fnc(typein##2 v) {      \
        typeout##2 out;                                                     \
        out.x = fnc(v.x);                                                   \
        out.y = fnc(v.y);                                                   \
        return out;                                                         \
    }                                                                       \
    extern typeout##3 __attribute__((overloadable)) fnc(typein##3 v) {      \
        typeout##3 out;                                                     \
        out.x = fnc(v.x);                                                   \
        out.y = fnc(v.y);                                                   \
        out.z = fnc(v.z);                                                   \
        return out;                                                         \
    }                                                                       \
    extern typeout##4 __attribute__((overloadable)) fnc(typein##4 v) {      \
        typeout##4 out;                                                     \
        out.x = fnc(v.x);                                                   \
        out.y = fnc(v.y);                                                   \
        out.z = fnc(v.z);                                                   \
        out.w = fnc(v.w);                                                   \
        return out;                                                         \
    }
    831 
    832 
    /*
     * UIN_FUNC_IN(fn): expands XN_FUNC_YN for the signed 8-, 16- and 32-bit
     * input types, pairing each with the unsigned type of the same width as the
     * return type.
     */
    #define UIN_FUNC_IN(fn)           \
    XN_FUNC_YN(uchar, fn, char)       \
    XN_FUNC_YN(ushort, fn, short)     \
    XN_FUNC_YN(uint, fn, int)
    837 
    /*
     * IN_FUNC_IN(fn): expands XN_FUNC_YN for the six 8-, 16- and 32-bit integer
     * types, with the return type equal to the input type.
     */
    #define IN_FUNC_IN(fn)            \
    XN_FUNC_YN(uchar, fn, uchar)      \
    XN_FUNC_YN(char, fn, char)        \
    XN_FUNC_YN(ushort, fn, ushort)    \
    XN_FUNC_YN(short, fn, short)      \
    XN_FUNC_YN(uint, fn, uint)        \
    XN_FUNC_YN(int, fn, int)
    845 
    846 
    /*
     * XN_FUNC_XN_XN_BODY(type, fnc, body)
     * Defines a two-argument function `fnc` for `type` and for its 2-, 3- and
     * 4-component vector forms. The scalar overload returns `body`, which may
     * refer to the parameters v1 and v2. The vector overloads apply the scalar
     * overload to each pair of components.
     */
    #define XN_FUNC_XN_XN_BODY(type, fnc, body)         \
    extern type __attribute__((overloadable))           \
            fnc(type v1, type v2) {                     \
        return body;                                    \
    }                                                   \
    extern type##2 __attribute__((overloadable))        \
            fnc(type##2 v1, type##2 v2) {               \
        type##2 out;                                    \
        out.x = fnc(v1.x, v2.x);                        \
        out.y = fnc(v1.y, v2.y);                        \
        return out;                                     \
    }                                                   \
    extern type##3 __attribute__((overloadable))        \
            fnc(type##3 v1, type##3 v2) {               \
        type##3 out;                                    \
        out.x = fnc(v1.x, v2.x);                        \
        out.y = fnc(v1.y, v2.y);                        \
        out.z = fnc(v1.z, v2.z);                        \
        return out;                                     \
    }                                                   \
    extern type##4 __attribute__((overloadable))        \
            fnc(type##4 v1, type##4 v2) {               \
        type##4 out;                                    \
        out.x = fnc(v1.x, v2.x);                        \
        out.y = fnc(v1.y, v2.y);                        \
        out.z = fnc(v1.z, v2.z);                        \
        out.w = fnc(v1.w, v2.w);                        \
        return out;                                     \
    }
    876 
    /*
     * IN_FUNC_IN_IN_BODY(fn, expr): expands XN_FUNC_XN_XN_BODY with the same
     * function name and body for the six 8-, 16- and 32-bit integer types and
     * for float.
     */
    #define IN_FUNC_IN_IN_BODY(fn, expr)  \
    XN_FUNC_XN_XN_BODY(uchar, fn, expr)   \
    XN_FUNC_XN_XN_BODY(char, fn, expr)    \
    XN_FUNC_XN_XN_BODY(ushort, fn, expr)  \
    XN_FUNC_XN_XN_BODY(short, fn, expr)   \
    XN_FUNC_XN_XN_BODY(uint, fn, expr)    \
    XN_FUNC_XN_XN_BODY(int, fn, expr)     \
    XN_FUNC_XN_XN_BODY(float, fn, expr)
    885 
    886 
    887 /**
    888  * abs
    889  */
    890 extern uint32_t __attribute__((overloadable)) abs(int32_t v) {
    891     if (v < 0)
    892         return -v;
    893     return v;
    894 }
    /* Absolute value of a signed 16-bit integer, returned as unsigned. */
    extern uint16_t __attribute__((overloadable)) abs(int16_t v) {
        return (v < 0) ? -v : v;
    }
    /* Absolute value of a signed 8-bit integer, returned as unsigned. */
    extern uint8_t __attribute__((overloadable)) abs(int8_t v) {
        return (v < 0) ? -v : v;
    }
    905 
    /**
     * clz
     * __builtin_clz only accepts a 32-bit unsigned int, so every input is
     * expanded to 32 bits. For the smaller data types we need to subtract off
     * the unused top bits (which are always zero).
     *
     * The result of __builtin_clz(0) is undefined, so a zero input is handled
     * explicitly and reports the full bit width of the type.
     */
    extern uint32_t __attribute__((overloadable)) clz(uint32_t v) {
        return (v != 0) ? (uint32_t)__builtin_clz(v) : 32;
    }
    /*
     * Leading-zero count of a 16-bit unsigned value. The value is widened to 32
     * bits for __builtin_clz, so the 16 extra top bits are subtracted. Zero is
     * special-cased (the builtin is undefined for 0) and yields 16.
     */
    extern uint16_t __attribute__((overloadable)) clz(uint16_t v) {
        return (v != 0) ? __builtin_clz(v) - 16 : 16;
    }
    /*
     * Leading-zero count of an 8-bit unsigned value. The value is widened to 32
     * bits for __builtin_clz, so the 24 extra top bits are subtracted. Zero is
     * special-cased (the builtin is undefined for 0) and yields 8.
     */
    extern uint8_t __attribute__((overloadable)) clz(uint8_t v) {
        return (v != 0) ? __builtin_clz(v) - 24 : 8;
    }
    /*
     * Leading-zero count of a 32-bit signed value, counted on its bit pattern.
     * Zero is special-cased (__builtin_clz is undefined for 0) and yields 32.
     */
    extern int32_t __attribute__((overloadable)) clz(int32_t v) {
        return (v != 0) ? __builtin_clz(v) : 32;
    }
    /*
     * Leading-zero count of a 16-bit signed value, counted on its bit pattern.
     * The mask removes the sign-extension bits before the 32-bit builtin is
     * applied, and 16 is subtracted for the unused top half. Zero is
     * special-cased (the builtin is undefined for 0) and yields 16.
     */
    extern int16_t __attribute__((overloadable)) clz(int16_t v) {
        const uint32_t low = ((uint32_t)v) & 0x0000ffff;
        return (low != 0) ? __builtin_clz(low) - 16 : 16;
    }
    /*
     * Leading-zero count of an 8-bit signed value, counted on its bit pattern.
     * The mask removes the sign-extension bits before the 32-bit builtin is
     * applied, and 24 is subtracted for the unused top bits. Zero is
     * special-cased (the builtin is undefined for 0) and yields 8.
     */
    extern int8_t __attribute__((overloadable)) clz(int8_t v) {
        const uint32_t low = ((uint32_t)v) & 0x000000ff;
        return (low != 0) ? __builtin_clz(low) - 24 : 8;
    }
    930 
    931 
    932 UIN_FUNC_IN(abs)
    933 IN_FUNC_IN(clz)
    934 
    935 
    936 // 6.11.4
    937 
    938 
    939 extern float __attribute__((overloadable)) degrees(float radians) {
    940     return radians * (180.f / M_PI);
    941 }
    942 extern float2 __attribute__((overloadable)) degrees(float2 radians) {
    943     return radians * (180.f / M_PI);
    944 }
    945 extern float3 __attribute__((overloadable)) degrees(float3 radians) {
    946     return radians * (180.f / M_PI);
    947 }
    948 extern float4 __attribute__((overloadable)) degrees(float4 radians) {
    949     return radians * (180.f / M_PI);
    950 }
    951 
    /*
     * Scalar linear blend: returns start + (stop - start) * amount, so an
     * amount of 0 gives start and an amount of 1 gives stop.
     */
    extern float __attribute__((overloadable)) mix(float start, float stop, float amount) {
        const float span = stop - start;
        return start + span * amount;
    }
    955 extern float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float2 amount) {
    956     return start + (stop - start) * amount;
    957 }
    958 extern float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float3 amount) {
    959     return start + (stop - start) * amount;
    960 }
    961 extern float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float4 amount) {
    962     return start + (stop - start) * amount;
    963 }
    964 extern float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float amount) {
    965     return start + (stop - start) * amount;
    966 }
    967 extern float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float amount) {
    968     return start + (stop - start) * amount;
    969 }
    970 extern float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float amount) {
    971     return start + (stop - start) * amount;
    972 }
    973 
    974 extern float __attribute__((overloadable)) radians(float degrees) {
    975     return degrees * (M_PI / 180.f);
    976 }
    977 extern float2 __attribute__((overloadable)) radians(float2 degrees) {
    978     return degrees * (M_PI / 180.f);
    979 }
    980 extern float3 __attribute__((overloadable)) radians(float3 degrees) {
    981     return degrees * (M_PI / 180.f);
    982 }
    983 extern float4 __attribute__((overloadable)) radians(float4 degrees) {
    984     return degrees * (M_PI / 180.f);
    985 }
    986 
    /* Scalar step: 0 when v is below edge, 1 in every other case. */
    extern float __attribute__((overloadable)) step(float edge, float v) {
        if (v < edge) {
            return 0.f;
        }
        return 1.f;
    }
    990 extern float2 __attribute__((overloadable)) step(float2 edge, float2 v) {
    991     float2 r;
    992     r.x = (v.x < edge.x) ? 0.f : 1.f;
    993     r.y = (v.y < edge.y) ? 0.f : 1.f;
    994     return r;
    995 }
    996 extern float3 __attribute__((overloadable)) step(float3 edge, float3 v) {
    997     float3 r;
    998     r.x = (v.x < edge.x) ? 0.f : 1.f;
    999     r.y = (v.y < edge.y) ? 0.f : 1.f;
   1000     r.z = (v.z < edge.z) ? 0.f : 1.f;
   1001     return r;
   1002 }
   1003 extern float4 __attribute__((overloadable)) step(float4 edge, float4 v) {
   1004     float4 r;
   1005     r.x = (v.x < edge.x) ? 0.f : 1.f;
   1006     r.y = (v.y < edge.y) ? 0.f : 1.f;
   1007     r.z = (v.z < edge.z) ? 0.f : 1.f;
   1008     r.w = (v.w < edge.w) ? 0.f : 1.f;
   1009     return r;
   1010 }
   1011 extern float2 __attribute__((overloadable)) step(float2 edge, float v) {
   1012     float2 r;
   1013     r.x = (v < edge.x) ? 0.f : 1.f;
   1014     r.y = (v < edge.y) ? 0.f : 1.f;
   1015     return r;
   1016 }
   1017 extern float3 __attribute__((overloadable)) step(float3 edge, float v) {
   1018     float3 r;
   1019     r.x = (v < edge.x) ? 0.f : 1.f;
   1020     r.y = (v < edge.y) ? 0.f : 1.f;
   1021     r.z = (v < edge.z) ? 0.f : 1.f;
   1022     return r;
   1023 }
   1024 extern float4 __attribute__((overloadable)) step(float4 edge, float v) {
   1025     float4 r;
   1026     r.x = (v < edge.x) ? 0.f : 1.f;
   1027     r.y = (v < edge.y) ? 0.f : 1.f;
   1028     r.z = (v < edge.z) ? 0.f : 1.f;
   1029     r.w = (v < edge.w) ? 0.f : 1.f;
   1030     return r;
   1031 }
   1032 extern float2 __attribute__((overloadable)) step(float edge, float2 v) {
   1033     float2 r;
   1034     r.x = (v.x < edge) ? 0.f : 1.f;
   1035     r.y = (v.y < edge) ? 0.f : 1.f;
   1036     return r;
   1037 }
   1038 extern float3 __attribute__((overloadable)) step(float edge, float3 v) {
   1039     float3 r;
   1040     r.x = (v.x < edge) ? 0.f : 1.f;
   1041     r.y = (v.y < edge) ? 0.f : 1.f;
   1042     r.z = (v.z < edge) ? 0.f : 1.f;
   1043     return r;
   1044 }
   1045 extern float4 __attribute__((overloadable)) step(float edge, float4 v) {
   1046     float4 r;
   1047     r.x = (v.x < edge) ? 0.f : 1.f;
   1048     r.y = (v.y < edge) ? 0.f : 1.f;
   1049     r.z = (v.z < edge) ? 0.f : 1.f;
   1050     r.w = (v.w < edge) ? 0.f : 1.f;
   1051     return r;
   1052 }
   1053 
   /*
    * Scalar sign: 1 for values above zero, -1 for values below zero, and the
    * input itself when neither comparison holds.
    */
   extern float __attribute__((overloadable)) sign(float v) {
       return (v > 0) ? 1.f : ((v < 0) ? -1.f : v);
   }
   1059 FN_FUNC_FN(sign)
   1060 
   1061 
   1062 // 6.11.5
   1063 extern float3 __attribute__((overloadable)) cross(float3 lhs, float3 rhs) {
   1064     float3 r;
   1065     r.x = lhs.y * rhs.z  - lhs.z * rhs.y;
   1066     r.y = lhs.z * rhs.x  - lhs.x * rhs.z;
   1067     r.z = lhs.x * rhs.y  - lhs.y * rhs.x;
   1068     return r;
   1069 }
   1070 
   1071 extern float4 __attribute__((overloadable)) cross(float4 lhs, float4 rhs) {
   1072     float4 r;
   1073     r.x = lhs.y * rhs.z  - lhs.z * rhs.y;
   1074     r.y = lhs.z * rhs.x  - lhs.x * rhs.z;
   1075     r.z = lhs.x * rhs.y  - lhs.y * rhs.x;
   1076     r.w = 0.f;
   1077     return r;
   1078 }
   1079 
   1080 #if !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) || defined(RS_G_RUNTIME)
   1081 // These functions must be defined here if we are not using the SSE
   1082 // implementation, which includes when we are built as part of the
   1083 // debug runtime (libclcore_debug.bc) or compiling with debug info.
   1084 
   1085 extern float __attribute__((overloadable)) dot(float lhs, float rhs) {
   1086     return lhs * rhs;
   1087 }
   1088 extern float __attribute__((overloadable)) dot(float2 lhs, float2 rhs) {
   1089     return lhs.x*rhs.x + lhs.y*rhs.y;
   1090 }
   1091 extern float __attribute__((overloadable)) dot(float3 lhs, float3 rhs) {
   1092     return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z;
   1093 }
   1094 extern float __attribute__((overloadable)) dot(float4 lhs, float4 rhs) {
   1095     return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z + lhs.w*rhs.w;
   1096 }
   1097 
   1098 extern float __attribute__((overloadable)) length(float v) {
   1099     return fabs(v);
   1100 }
   1101 extern float __attribute__((overloadable)) length(float2 v) {
   1102     return sqrt(v.x*v.x + v.y*v.y);
   1103 }
   1104 extern float __attribute__((overloadable)) length(float3 v) {
   1105     return sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
   1106 }
   1107 extern float __attribute__((overloadable)) length(float4 v) {
   1108     return sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
   1109 }
   1110 
   1111 #else
   1112 
   1113 extern float __attribute__((overloadable)) length(float v);
   1114 extern float __attribute__((overloadable)) length(float2 v);
   1115 extern float __attribute__((overloadable)) length(float3 v);
   1116 extern float __attribute__((overloadable)) length(float4 v);
   1117 
   1118 #endif // !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) || defined(RS_G_RUNTIME)
   1119 
   1120 extern float __attribute__((overloadable)) distance(float lhs, float rhs) {
   1121     return length(lhs - rhs);
   1122 }
   1123 extern float __attribute__((overloadable)) distance(float2 lhs, float2 rhs) {
   1124     return length(lhs - rhs);
   1125 }
   1126 extern float __attribute__((overloadable)) distance(float3 lhs, float3 rhs) {
   1127     return length(lhs - rhs);
   1128 }
   1129 extern float __attribute__((overloadable)) distance(float4 lhs, float4 rhs) {
   1130     return length(lhs - rhs);
   1131 }
   1132 
   /* The normalization functions return a zero-length vector unchanged, i.e.
    * with every component still 0.
    *
    * Scalar form: 0 for a zero input, -1 for an input below zero, 1 otherwise.
    */
   extern float __attribute__((overloadable)) normalize(float v) {
       if (v == 0.0f) {
           return 0.0f;
       }
       return (v < 0.0f) ? -1.0f : 1.0f;
   }
   1145 extern float2 __attribute__((overloadable)) normalize(float2 v) {
   1146     float l = length(v);
   1147     return l == 0.0f ? v : v / l;
   1148 }
   1149 extern float3 __attribute__((overloadable)) normalize(float3 v) {
   1150     float l = length(v);
   1151     return l == 0.0f ? v : v / l;
   1152 }
   1153 extern float4 __attribute__((overloadable)) normalize(float4 v) {
   1154     float l = length(v);
   1155     return l == 0.0f ? v : v / l;
   1156 }
   1157 
   1158 extern float __attribute__((overloadable)) half_sqrt(float v) {
   1159     return sqrt(v);
   1160 }
   1161 FN_FUNC_FN(half_sqrt)
   1162 
   1163 extern float __attribute__((overloadable)) fast_length(float v) {
   1164     return fabs(v);
   1165 }
   1166 extern float __attribute__((overloadable)) fast_length(float2 v) {
   1167     return half_sqrt(v.x*v.x + v.y*v.y);
   1168 }
   1169 extern float __attribute__((overloadable)) fast_length(float3 v) {
   1170     return half_sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
   1171 }
   1172 extern float __attribute__((overloadable)) fast_length(float4 v) {
   1173     return half_sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
   1174 }
   1175 
   1176 extern float __attribute__((overloadable)) fast_distance(float lhs, float rhs) {
   1177     return fast_length(lhs - rhs);
   1178 }
   1179 extern float __attribute__((overloadable)) fast_distance(float2 lhs, float2 rhs) {
   1180     return fast_length(lhs - rhs);
   1181 }
   1182 extern float __attribute__((overloadable)) fast_distance(float3 lhs, float3 rhs) {
   1183     return fast_length(lhs - rhs);
   1184 }
   1185 extern float __attribute__((overloadable)) fast_distance(float4 lhs, float4 rhs) {
   1186     return fast_length(lhs - rhs);
   1187 }
   1188 
   1189 extern float __attribute__((overloadable)) half_rsqrt(float);
   1190 
   /* The normalization functions return a zero-length vector unchanged, i.e.
    * with every component still 0.
    *
    * Scalar form: 0 for a zero input, -1 for an input below zero, 1 otherwise.
    */
   extern float __attribute__((overloadable)) fast_normalize(float v) {
       if (v == 0.0f) {
           return 0.0f;
       }
       return (v < 0.0f) ? -1.0f : 1.0f;
   }
   1203 // If the length is 0, then rlength should be NaN.
   1204 extern float2 __attribute__((overloadable)) fast_normalize(float2 v) {
   1205     float rlength = half_rsqrt(v.x*v.x + v.y*v.y);
   1206     return (rlength == rlength) ? v * rlength : v;
   1207 }
   1208 extern float3 __attribute__((overloadable)) fast_normalize(float3 v) {
   1209     float rlength = half_rsqrt(v.x*v.x + v.y*v.y + v.z*v.z);
   1210     return (rlength == rlength) ? v * rlength : v;
   1211 }
   1212 extern float4 __attribute__((overloadable)) fast_normalize(float4 v) {
   1213     float rlength = half_rsqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
   1214     return (rlength == rlength) ? v * rlength : v;
   1215 }
   1216 
   /* Scalar reciprocal: returns 1 divided by v. */
   extern float __attribute__((overloadable)) half_recip(float v) {
       const float reciprocal = 1.f / v;
       return reciprocal;
   }
   1220 
   1221 /*
   1222 extern float __attribute__((overloadable)) approx_atan(float x) {
   1223     if (x == 0.f)
   1224         return 0.f;
   1225     if (x < 0.f)
   1226         return -1.f * approx_atan(-1.f * x);
   1227     if (x > 1.f)
   1228         return M_PI_2 - approx_atan(approx_recip(x));
   1229     return x * approx_recip(1.f + 0.28f * x*x);
   1230 }
   1231 FN_FUNC_FN(approx_atan)
   1232 */
   1233 
   /* Overlays a float and a 32-bit integer so one can be read as the other. */
   typedef union {
       float fv;
       int32_t iv;
   } ieee_float_shape_type;

   /* GET_FLOAT_WORD(i, d): store the 32-bit pattern of the float d into i. */
   #define GET_FLOAT_WORD(i,d)                     \
   do {                                            \
       ieee_float_shape_type get_word_u;           \
       get_word_u.fv = (d);                        \
       (i) = get_word_u.iv;                        \
   } while (0)

   /* SET_FLOAT_WORD(d, i): store into d the float whose 32-bit pattern is i. */
   #define SET_FLOAT_WORD(d,i)                     \
   do {                                            \
       ieee_float_shape_type set_word_u;           \
       set_word_u.iv = (i);                        \
       (d) = set_word_u.fv;                        \
   } while (0)
   1257 
   1258 
   1259 
   1260 // Valid -125 to 125
   1261 extern float __attribute__((overloadable)) native_exp2(float v) {
   1262     int32_t iv = (int)v;
   1263     int32_t x = iv + (iv >> 31); // ~floor(v)
   1264     float r = (v - x);
   1265 
   1266     float fo;
   1267     SET_FLOAT_WORD(fo, (x + 127) << 23);
   1268 
   1269     r *= 0.694f; // ~ log(e) / log(2)
   1270     float r2 = r*r;
   1271     float adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
   1272     return fo * adj;
   1273 }
   1274 
   1275 extern float2 __attribute__((overloadable)) native_exp2(float2 v) {
   1276     int2 iv = convert_int2(v);
   1277     int2 x = iv + (iv >> (int2)31);//floor(v);
   1278     float2 r = (v - convert_float2(x));
   1279 
   1280     x += 127;
   1281 
   1282     float2 fo = (float2)(x << (int2)23);
   1283 
   1284     r *= 0.694f; // ~ log(e) / log(2)
   1285     float2 r2 = r*r;
   1286     float2 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
   1287     return fo * adj;
   1288 }
   1289 
   1290 extern float4 __attribute__((overloadable)) native_exp2(float4 v) {
   1291     int4 iv = convert_int4(v);
   1292     int4 x = iv + (iv >> (int4)31);//floor(v);
   1293     float4 r = (v - convert_float4(x));
   1294 
   1295     x += 127;
   1296 
   1297     float4 fo = (float4)(x << (int4)23);
   1298 
   1299     r *= 0.694f; // ~ log(e) / log(2)
   1300     float4 r2 = r*r;
   1301     float4 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
   1302     return fo * adj;
   1303 }
   1304 
   1305 extern float3 __attribute__((overloadable)) native_exp2(float3 v) {
   1306     float4 t = 1.f;
   1307     t.xyz = v;
   1308     return native_exp2(t).xyz;
   1309 }
   1310 
   1311 
   1312 extern float __attribute__((overloadable)) native_exp(float v) {
   1313     return native_exp2(v * 1.442695041f);
   1314 }
   1315 extern float2 __attribute__((overloadable)) native_exp(float2 v) {
   1316     return native_exp2(v * 1.442695041f);
   1317 }
   1318 extern float3 __attribute__((overloadable)) native_exp(float3 v) {
   1319     return native_exp2(v * 1.442695041f);
   1320 }
   1321 extern float4 __attribute__((overloadable)) native_exp(float4 v) {
   1322     return native_exp2(v * 1.442695041f);
   1323 }
   1324 
   1325 extern float __attribute__((overloadable)) native_exp10(float v) {
   1326     return native_exp2(v * 3.321928095f);
   1327 }
   1328 extern float2 __attribute__((overloadable)) native_exp10(float2 v) {
   1329     return native_exp2(v * 3.321928095f);
   1330 }
   1331 extern float3 __attribute__((overloadable)) native_exp10(float3 v) {
   1332     return native_exp2(v * 3.321928095f);
   1333 }
   1334 extern float4 __attribute__((overloadable)) native_exp10(float4 v) {
   1335     return native_exp2(v * 3.321928095f);
   1336 }
   1337 
   1338 extern float __attribute__((overloadable)) native_log2(float v) {
   1339     int32_t ibits;
   1340     GET_FLOAT_WORD(ibits, v);
   1341 
   1342     int32_t e = (ibits >> 23) & 0xff;
   1343 
   1344     ibits &= 0x7fffff;
   1345     ibits |= 127 << 23;
   1346 
   1347     float ir;
   1348     SET_FLOAT_WORD(ir, ibits);
   1349     ir -= 1.5f;
   1350     float ir2 = ir*ir;
   1351     float adj2 = (0.405465108f / 0.693147181f) +
   1352                  ((0.666666667f / 0.693147181f) * ir) -
   1353                  ((0.222222222f / 0.693147181f) * ir2) +
   1354                  ((0.098765432f / 0.693147181f) * ir*ir2) -
   1355                  ((0.049382716f / 0.693147181f) * ir2*ir2) +
   1356                  ((0.026337449f / 0.693147181f) * ir*ir2*ir2) -
   1357                  ((0.014631916f / 0.693147181f) * ir2*ir2*ir2);
   1358     return (float)(e - 127) + adj2;
   1359 }
   1360 extern float2 __attribute__((overloadable)) native_log2(float2 v) {
   1361     float2 v2 = {native_log2(v.x), native_log2(v.y)};
   1362     return v2;
   1363 }
   1364 extern float3 __attribute__((overloadable)) native_log2(float3 v) {
   1365     float3 v2 = {native_log2(v.x), native_log2(v.y), native_log2(v.z)};
   1366     return v2;
   1367 }
   1368 extern float4 __attribute__((overloadable)) native_log2(float4 v) {
   1369     float4 v2 = {native_log2(v.x), native_log2(v.y), native_log2(v.z), native_log2(v.w)};
   1370     return v2;
   1371 }
   1372 
   1373 extern float __attribute__((overloadable)) native_log(float v) {
   1374     return native_log2(v) * (1.f / 1.442695041f);
   1375 }
   1376 extern float2 __attribute__((overloadable)) native_log(float2 v) {
   1377     return native_log2(v) * (1.f / 1.442695041f);
   1378 }
   1379 extern float3 __attribute__((overloadable)) native_log(float3 v) {
   1380     return native_log2(v) * (1.f / 1.442695041f);
   1381 }
   1382 extern float4 __attribute__((overloadable)) native_log(float4 v) {
   1383     return native_log2(v) * (1.f / 1.442695041f);
   1384 }
   1385 
   1386 extern float __attribute__((overloadable)) native_log10(float v) {
   1387     return native_log2(v) * (1.f / 3.321928095f);
   1388 }
   1389 extern float2 __attribute__((overloadable)) native_log10(float2 v) {
   1390     return native_log2(v) * (1.f / 3.321928095f);
   1391 }
   1392 extern float3 __attribute__((overloadable)) native_log10(float3 v) {
   1393     return native_log2(v) * (1.f / 3.321928095f);
   1394 }
   1395 extern float4 __attribute__((overloadable)) native_log10(float4 v) {
   1396     return native_log2(v) * (1.f / 3.321928095f);
   1397 }
   1398 
   1399 
   1400 extern float __attribute__((overloadable)) native_powr(float v, float y) {
   1401     float v2 = native_log2(v);
   1402     v2 = fmax(v2 * y, -125.f);
   1403     return native_exp2(v2);
   1404 }
   1405 extern float2 __attribute__((overloadable)) native_powr(float2 v, float2 y) {
   1406     float2 v2 = native_log2(v);
   1407     v2 = fmax(v2 * y, -125.f);
   1408     return native_exp2(v2);
   1409 }
   1410 extern float3 __attribute__((overloadable)) native_powr(float3 v, float3 y) {
   1411     float3 v2 = native_log2(v);
   1412     v2 = fmax(v2 * y, -125.f);
   1413     return native_exp2(v2);
   1414 }
   1415 extern float4 __attribute__((overloadable)) native_powr(float4 v, float4 y) {
   1416     float4 v2 = native_log2(v);
   1417     v2 = fmax(v2 * y, -125.f);
   1418     return native_exp2(v2);
   1419 }
   1420 
   /* Smaller of two doubles; v2 is returned unless v1 compares below it. */
   extern double __attribute__((overloadable)) min(double v1, double v2) {
       if (v1 < v2) {
           return v1;
       }
       return v2;
   }
   1424 
   1425 extern double2 __attribute__((overloadable)) min(double2 v1, double2 v2) {
   1426     double2 r;
   1427     r.x = v1.x < v2.x ? v1.x : v2.x;
   1428     r.y = v1.y < v2.y ? v1.y : v2.y;
   1429     return r;
   1430 }
   1431 
   1432 extern double3 __attribute__((overloadable)) min(double3 v1, double3 v2) {
   1433     double3 r;
   1434     r.x = v1.x < v2.x ? v1.x : v2.x;
   1435     r.y = v1.y < v2.y ? v1.y : v2.y;
   1436     r.z = v1.z < v2.z ? v1.z : v2.z;
   1437     return r;
   1438 }
   1439 
   1440 extern double4 __attribute__((overloadable)) min(double4 v1, double4 v2) {
   1441     double4 r;
   1442     r.x = v1.x < v2.x ? v1.x : v2.x;
   1443     r.y = v1.y < v2.y ? v1.y : v2.y;
   1444     r.z = v1.z < v2.z ? v1.z : v2.z;
   1445     r.w = v1.w < v2.w ? v1.w : v2.w;
   1446     return r;
   1447 }
   1448 
   /* Returns the smaller of two long values (v2 on ties). */
   extern long __attribute__((overloadable)) min(long v1, long v2) {
       if (v1 < v2) {
           return v1;
       }
       return v2;
   }
   1452 extern long2 __attribute__((overloadable)) min(long2 v1, long2 v2) {
   1453     long2 r;
   1454     r.x = v1.x < v2.x ? v1.x : v2.x;
   1455     r.y = v1.y < v2.y ? v1.y : v2.y;
   1456     return r;
   1457 }
   1458 extern long3 __attribute__((overloadable)) min(long3 v1, long3 v2) {
   1459     long3 r;
   1460     r.x = v1.x < v2.x ? v1.x : v2.x;
   1461     r.y = v1.y < v2.y ? v1.y : v2.y;
   1462     r.z = v1.z < v2.z ? v1.z : v2.z;
   1463     return r;
   1464 }
   1465 extern long4 __attribute__((overloadable)) min(long4 v1, long4 v2) {
   1466     long4 r;
   1467     r.x = v1.x < v2.x ? v1.x : v2.x;
   1468     r.y = v1.y < v2.y ? v1.y : v2.y;
   1469     r.z = v1.z < v2.z ? v1.z : v2.z;
   1470     r.w = v1.w < v2.w ? v1.w : v2.w;
   1471     return r;
   1472 }
   1473 
   1474 extern ulong __attribute__((overloadable)) min(ulong v1, ulong v2) {
   1475     return v1 < v2 ? v1 : v2;
   1476 }
   1477 extern ulong2 __attribute__((overloadable)) min(ulong2 v1, ulong2 v2) {
   1478     ulong2 r;
   1479     r.x = v1.x < v2.x ? v1.x : v2.x;
   1480     r.y = v1.y < v2.y ? v1.y : v2.y;
   1481     return r;
   1482 }
   1483 extern ulong3 __attribute__((overloadable)) min(ulong3 v1, ulong3 v2) {
   1484     ulong3 r;
   1485     r.x = v1.x < v2.x ? v1.x : v2.x;
   1486     r.y = v1.y < v2.y ? v1.y : v2.y;
   1487     r.z = v1.z < v2.z ? v1.z : v2.z;
   1488     return r;
   1489 }
   1490 extern ulong4 __attribute__((overloadable)) min(ulong4 v1, ulong4 v2) {
   1491     ulong4 r;
   1492     r.x = v1.x < v2.x ? v1.x : v2.x;
   1493     r.y = v1.y < v2.y ? v1.y : v2.y;
   1494     r.z = v1.z < v2.z ? v1.z : v2.z;
   1495     r.w = v1.w < v2.w ? v1.w : v2.w;
   1496     return r;
   1497 }
   1498 
   /* Returns v1 when v1 > v2, otherwise v2 (so v2 is returned on ties). */
   extern double __attribute__((overloadable)) max(double v1, double v2) {
       if (v1 > v2) {
           return v1;
       }
       return v2;
   }
   1502 
   1503 extern double2 __attribute__((overloadable)) max(double2 v1, double2 v2) {
   1504     double2 r;
   1505     r.x = v1.x > v2.x ? v1.x : v2.x;
   1506     r.y = v1.y > v2.y ? v1.y : v2.y;
   1507     return r;
   1508 }
   1509 
   1510 extern double3 __attribute__((overloadable)) max(double3 v1, double3 v2) {
   1511     double3 r;
   1512     r.x = v1.x > v2.x ? v1.x : v2.x;
   1513     r.y = v1.y > v2.y ? v1.y : v2.y;
   1514     r.z = v1.z > v2.z ? v1.z : v2.z;
   1515     return r;
   1516 }
   1517 
   1518 extern double4 __attribute__((overloadable)) max(double4 v1, double4 v2) {
   1519     double4 r;
   1520     r.x = v1.x > v2.x ? v1.x : v2.x;
   1521     r.y = v1.y > v2.y ? v1.y : v2.y;
   1522     r.z = v1.z > v2.z ? v1.z : v2.z;
   1523     r.w = v1.w > v2.w ? v1.w : v2.w;
   1524     return r;
   1525 }
   1526 
   /* Returns the larger of two long values (v2 on ties). */
   extern long __attribute__((overloadable)) max(long v1, long v2) {
       if (v1 > v2) {
           return v1;
       }
       return v2;
   }
   1530 extern long2 __attribute__((overloadable)) max(long2 v1, long2 v2) {
   1531     long2 r;
   1532     r.x = v1.x > v2.x ? v1.x : v2.x;
   1533     r.y = v1.y > v2.y ? v1.y : v2.y;
   1534     return r;
   1535 }
   1536 extern long3 __attribute__((overloadable)) max(long3 v1, long3 v2) {
   1537     long3 r;
   1538     r.x = v1.x > v2.x ? v1.x : v2.x;
   1539     r.y = v1.y > v2.y ? v1.y : v2.y;
   1540     r.z = v1.z > v2.z ? v1.z : v2.z;
   1541     return r;
   1542 }
   1543 extern long4 __attribute__((overloadable)) max(long4 v1, long4 v2) {
   1544     long4 r;
   1545     r.x = v1.x > v2.x ? v1.x : v2.x;
   1546     r.y = v1.y > v2.y ? v1.y : v2.y;
   1547     r.z = v1.z > v2.z ? v1.z : v2.z;
   1548     r.w = v1.w > v2.w ? v1.w : v2.w;
   1549     return r;
   1550 }
   1551 
   1552 extern ulong __attribute__((overloadable)) max(ulong v1, ulong v2) {
   1553     return v1 > v2 ? v1 : v2;
   1554 }
   1555 extern ulong2 __attribute__((overloadable)) max(ulong2 v1, ulong2 v2) {
   1556     ulong2 r;
   1557     r.x = v1.x > v2.x ? v1.x : v2.x;
   1558     r.y = v1.y > v2.y ? v1.y : v2.y;
   1559     return r;
   1560 }
   1561 extern ulong3 __attribute__((overloadable)) max(ulong3 v1, ulong3 v2) {
   1562     ulong3 r;
   1563     r.x = v1.x > v2.x ? v1.x : v2.x;
   1564     r.y = v1.y > v2.y ? v1.y : v2.y;
   1565     r.z = v1.z > v2.z ? v1.z : v2.z;
   1566     return r;
   1567 }
   1568 extern ulong4 __attribute__((overloadable)) max(ulong4 v1, ulong4 v2) {
   1569     ulong4 r;
   1570     r.x = v1.x > v2.x ? v1.x : v2.x;
   1571     r.y = v1.y > v2.y ? v1.y : v2.y;
   1572     r.z = v1.z > v2.z ? v1.z : v2.z;
   1573     r.w = v1.w > v2.w ? v1.w : v2.w;
   1574     return r;
   1575 }
   1576 
   /*
    * THUNK_NATIVE_* helpers: each macro defines native_<fn> for float, float2,
    * float3 and float4 as a plain forwarder to <fn> with the same arguments.
    * The suffix names the argument list being forwarded:
    *   _F    (floatN)
    *   _F_F  (floatN, floatN)
    *   _F_FP (floatN, floatN *)
    *   _F_I  (floatN, intN)
    */
   #define THUNK_NATIVE_F(fn)                                                  \
       float __attribute__((overloadable)) native_##fn(float v) {              \
           return fn(v);                                                       \
       }                                                                       \
       float2 __attribute__((overloadable)) native_##fn(float2 v) {            \
           return fn(v);                                                       \
       }                                                                       \
       float3 __attribute__((overloadable)) native_##fn(float3 v) {            \
           return fn(v);                                                       \
       }                                                                       \
       float4 __attribute__((overloadable)) native_##fn(float4 v) {            \
           return fn(v);                                                       \
       }

   #define THUNK_NATIVE_F_F(fn)                                                \
       float __attribute__((overloadable)) native_##fn(float v1, float v2) {   \
           return fn(v1, v2);                                                  \
       }                                                                       \
       float2 __attribute__((overloadable)) native_##fn(float2 v1, float2 v2) { \
           return fn(v1, v2);                                                  \
       }                                                                       \
       float3 __attribute__((overloadable)) native_##fn(float3 v1, float3 v2) { \
           return fn(v1, v2);                                                  \
       }                                                                       \
       float4 __attribute__((overloadable)) native_##fn(float4 v1, float4 v2) { \
           return fn(v1, v2);                                                  \
       }

   #define THUNK_NATIVE_F_FP(fn)                                               \
       float __attribute__((overloadable)) native_##fn(float v1, float *v2) {  \
           return fn(v1, v2);                                                  \
       }                                                                       \
       float2 __attribute__((overloadable)) native_##fn(float2 v1, float2 *v2) { \
           return fn(v1, v2);                                                  \
       }                                                                       \
       float3 __attribute__((overloadable)) native_##fn(float3 v1, float3 *v2) { \
           return fn(v1, v2);                                                  \
       }                                                                       \
       float4 __attribute__((overloadable)) native_##fn(float4 v1, float4 *v2) { \
           return fn(v1, v2);                                                  \
       }

   #define THUNK_NATIVE_F_I(fn)                                                \
       float __attribute__((overloadable)) native_##fn(float v1, int v2) {     \
           return fn(v1, v2);                                                  \
       }                                                                       \
       float2 __attribute__((overloadable)) native_##fn(float2 v1, int2 v2) {  \
           return fn(v1, v2);                                                  \
       }                                                                       \
       float3 __attribute__((overloadable)) native_##fn(float3 v1, int3 v2) {  \
           return fn(v1, v2);                                                  \
       }                                                                       \
       float4 __attribute__((overloadable)) native_##fn(float4 v1, int4 v2) {  \
           return fn(v1, v2);                                                  \
       }
   1600 
   1601 THUNK_NATIVE_F(acos)
   1602 THUNK_NATIVE_F(acosh)
   1603 THUNK_NATIVE_F(acospi)
   1604 THUNK_NATIVE_F(asin)
   1605 THUNK_NATIVE_F(asinh)
   1606 THUNK_NATIVE_F(asinpi)
   1607 THUNK_NATIVE_F(atan)
   1608 THUNK_NATIVE_F_F(atan2)
   1609 THUNK_NATIVE_F(atanh)
   1610 THUNK_NATIVE_F(atanpi)
   1611 THUNK_NATIVE_F_F(atan2pi)
   1612 THUNK_NATIVE_F(cbrt)
   1613 THUNK_NATIVE_F(cos)
   1614 THUNK_NATIVE_F(cosh)
   1615 THUNK_NATIVE_F(cospi)
   1616 THUNK_NATIVE_F(expm1)
   1617 THUNK_NATIVE_F_F(hypot)
   1618 THUNK_NATIVE_F(log1p)
   1619 THUNK_NATIVE_F_I(rootn)
   1620 THUNK_NATIVE_F(rsqrt)
   1621 THUNK_NATIVE_F(sqrt)
   1622 THUNK_NATIVE_F(sin)
   1623 THUNK_NATIVE_F_FP(sincos)
   1624 THUNK_NATIVE_F(sinh)
   1625 THUNK_NATIVE_F(sinpi)
   1626 THUNK_NATIVE_F(tan)
   1627 THUNK_NATIVE_F(tanh)
   1628 THUNK_NATIVE_F(tanpi)
   1629 
   1630 #undef THUNK_NATIVE_F
   1631 #undef THUNK_NATIVE_F_F
   1632 #undef THUNK_NATIVE_F_I
   1633 #undef THUNK_NATIVE_F_FP
   1634 
   1635 float __attribute__((overloadable)) native_normalize(float v) { return fast_normalize(v);}
   1636 float2 __attribute__((overloadable)) native_normalize(float2 v) { return fast_normalize(v);}
   1637 float3 __attribute__((overloadable)) native_normalize(float3 v) { return fast_normalize(v);}
   1638 float4 __attribute__((overloadable)) native_normalize(float4 v) { return fast_normalize(v);}
   1639 
   1640 float __attribute__((overloadable)) native_distance(float v1, float v2) { return fast_distance(v1, v2);}
   1641 float __attribute__((overloadable)) native_distance(float2 v1, float2 v2) { return fast_distance(v1, v2);}
   1642 float __attribute__((overloadable)) native_distance(float3 v1, float3 v2) { return fast_distance(v1, v2);}
   1643 float __attribute__((overloadable)) native_distance(float4 v1, float4 v2) { return fast_distance(v1, v2);}
   1644 
   1645 float __attribute__((overloadable)) native_length(float v) { return fast_length(v);}
   1646 float __attribute__((overloadable)) native_length(float2 v) { return fast_length(v);}
   1647 float __attribute__((overloadable)) native_length(float3 v) { return fast_length(v);}
   1648 float __attribute__((overloadable)) native_length(float4 v) { return fast_length(v);}
   1649 
   /* native_divide (scalar): plain float division v1 / v2. */
   float __attribute__((overloadable)) native_divide(float v1, float v2) {
       const float quotient = v1 / v2;
       return quotient;
   }
   1651 float2 __attribute__((overloadable)) native_divide(float2 v1, float2 v2) { return v1 / v2;}
   1652 float3 __attribute__((overloadable)) native_divide(float3 v1, float3 v2) { return v1 / v2;}
   1653 float4 __attribute__((overloadable)) native_divide(float4 v1, float4 v2) { return v1 / v2;}
   1654 
   /* native_recip (scalar): returns 1 / v. */
   float __attribute__((overloadable)) native_recip(float v) {
       const float one = 1.f;
       return one / v;
   }
   1656 float2 __attribute__((overloadable)) native_recip(float2 v) { return ((float2)1.f) / v;}
   1657 float3 __attribute__((overloadable)) native_recip(float3 v) { return ((float3)1.f) / v;}
   1658 float4 __attribute__((overloadable)) native_recip(float4 v) { return ((float4)1.f) / v;}
   1659 
   1660 
   1661 
   1662 
   1663 
   1664 #undef FN_FUNC_FN
   1665 #undef IN_FUNC_FN
   1666 #undef FN_FUNC_FN_FN
   1667 #undef FN_FUNC_FN_F
   1668 #undef FN_FUNC_FN_IN
   1669 #undef FN_FUNC_FN_I
   1670 #undef FN_FUNC_FN_PFN
   1671 #undef FN_FUNC_FN_PIN
   1672 #undef FN_FUNC_FN_FN_FN
   1673 #undef FN_FUNC_FN_FN_PIN
   1674 #undef XN_FUNC_YN
   1675 #undef UIN_FUNC_IN
   1676 #undef IN_FUNC_IN
   1677 #undef XN_FUNC_XN_XN_BODY
   1678 #undef IN_FUNC_IN_IN_BODY
   1679 
   /*
    * Bit pattern 0x7c00, used below as the base of the half-precision NaN
    * pattern (exponent bits set, mantissa clear).
    */
   static const unsigned short kHalfPositiveInfinity = 0x7C00;
   1681 
   /*
    * HN_FUNC_HN(fn): define the f16 overloads
    *     HN fn(HN input)
    * for HN in { half, half2, half3, half4 }.  Each overload widens its argument
    * to float (a cast for the scalar, convert_floatN for vectors), calls fn on
    * the widened value, and narrows the result back to half.
    */
   #define HN_FUNC_HN(fn)                                                    \
   extern half __attribute__((overloadable)) fn(half h) {                    \
       const float wide = (float) h;                                         \
       return (half) fn(wide);                                               \
   }                                                                         \
   extern half2 __attribute__((overloadable)) fn(half2 v) {                  \
       const float2 wide = convert_float2(v);                                \
       return convert_half2(fn(wide));                                       \
   }                                                                         \
   extern half3 __attribute__((overloadable)) fn(half3 v) {                  \
       const float3 wide = convert_float3(v);                                \
       return convert_half3(fn(wide));                                       \
   }                                                                         \
   extern half4 __attribute__((overloadable)) fn(half4 v) {                  \
       const float4 wide = convert_float4(v);                                \
       return convert_half4(fn(wide));                                       \
   }
   1699 
   /*
    * HN_FUNC_HN_HN(fn): define the f16 overloads
    *     HN fn(HN input1, HN input2)
    * for HN in { half, half2, half3, half4 }.  Both arguments are widened to
    * float, fn is called on the widened values, and the result is narrowed
    * back to half.
    */
   #define HN_FUNC_HN_HN(fn)                                                 \
   extern half __attribute__((overloadable)) fn(half h1, half h2) {          \
       const float a = (float) h1;                                           \
       const float b = (float) h2;                                           \
       return (half) fn(a, b);                                               \
   }                                                                         \
   extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2) {       \
       const float2 a = convert_float2(v1);                                  \
       const float2 b = convert_float2(v2);                                  \
       return convert_half2(fn(a, b));                                       \
   }                                                                         \
   extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2) {       \
       const float3 a = convert_float3(v1);                                  \
       const float3 b = convert_float3(v2);                                  \
       return convert_half3(fn(a, b));                                       \
   }                                                                         \
   extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2) {       \
       const float4 a = convert_float4(v1);                                  \
       const float4 b = convert_float4(v2);                                  \
       return convert_half4(fn(a, b));                                       \
   }
   1720 
   /*
    * HN_FUNC_HN_H(fn): define the f16 overloads
    *     HN fn(HN input1, half input2)
    * for the vector types half2, half3 and half4 only (no scalar overload is
    * emitted).  The vector is widened with convert_floatN and the scalar with a
    * cast to float; the result is narrowed back to the half vector type.
    */
   #define HN_FUNC_HN_H(fn)                                                  \
   extern half2 __attribute__((overloadable)) fn(half2 v1, half v2) {        \
       const float2 wide = convert_float2(v1);                               \
       return convert_half2(fn(wide, (float) v2));                           \
   }                                                                         \
   extern half3 __attribute__((overloadable)) fn(half3 v1, half v2) {        \
       const float3 wide = convert_float3(v1);                               \
       return convert_half3(fn(wide, (float) v2));                           \
   }                                                                         \
   extern half4 __attribute__((overloadable)) fn(half4 v1, half v2) {        \
       const float4 wide = convert_float4(v1);                               \
       return convert_half4(fn(wide, (float) v2));                           \
   }
   1735 
   /*
    * HN_FUNC_HN_HN_HN(fn): define the f16 overloads
    *     HN fn(HN input1, HN input2, HN input3)
    * for HN in { half, half2, half3, half4 }.  All three arguments are widened
    * to float, fn is called on the widened values, and the result is narrowed
    * back to half.
    */
   #define HN_FUNC_HN_HN_HN(fn)                                                   \
   extern half __attribute__((overloadable)) fn(half h1, half h2, half h3) {      \
       const float a = (float) h1;                                                \
       const float b = (float) h2;                                                \
       const float c = (float) h3;                                                \
       return (half) fn(a, b, c);                                                 \
   }                                                                              \
   extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2, half2 v3) {  \
       const float2 a = convert_float2(v1);                                       \
       const float2 b = convert_float2(v2);                                       \
       const float2 c = convert_float2(v3);                                       \
       return convert_half2(fn(a, b, c));                                         \
   }                                                                              \
   extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2, half3 v3) {  \
       const float3 a = convert_float3(v1);                                       \
       const float3 b = convert_float3(v2);                                       \
       const float3 c = convert_float3(v3);                                       \
       return convert_half3(fn(a, b, c));                                         \
   }                                                                              \
   extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2, half4 v3) {  \
       const float4 a = convert_float4(v1);                                       \
       const float4 b = convert_float4(v2);                                       \
       const float4 c = convert_float4(v3);                                       \
       return convert_half4(fn(a, b, c));                                         \
   }
   1759 
   /*
    * HN_FUNC_HN_IN(fn): define the f16 overloads
    *     HN fn(HN input1, IN input2)
    * where HN is half/half2/half3/half4 and IN is the int type of the same
    * vector length.  Only the half argument is widened to float; the integer
    * argument is passed through unchanged.
    */
   #define HN_FUNC_HN_IN(fn)                                                 \
   extern half __attribute__((overloadable)) fn(half h1, int v) {            \
       const float wide = (float) h1;                                        \
       return (half) fn(wide, v);                                            \
   }                                                                         \
   extern half2 __attribute__((overloadable)) fn(half2 v1, int2 v2) {        \
       const float2 wide = convert_float2(v1);                               \
       return convert_half2(fn(wide, v2));                                   \
   }                                                                         \
   extern half3 __attribute__((overloadable)) fn(half3 v1, int3 v2) {        \
       const float3 wide = convert_float3(v1);                               \
       return convert_half3(fn(wide, v2));                                   \
   }                                                                         \
   extern half4 __attribute__((overloadable)) fn(half4 v1, int4 v2) {        \
       const float4 wide = convert_float4(v1);                               \
       return convert_half4(fn(wide, v2));                                   \
   }
   1778 
   /*
    * H_FUNC_HN(fn): define the f16 overloads
    *     half fn(HN input1)
    * for HN in { half, half2, half3, half4 }.  The argument is widened to float
    * and the value returned by fn is converted to a scalar half.
    */
   #define H_FUNC_HN(fn)                                                     \
   extern half __attribute__((overloadable)) fn(half h) {                    \
       const float wide = (float) h;                                         \
       return (half) fn(wide);                                               \
   }                                                                         \
   extern half __attribute__((overloadable)) fn(half2 v) {                   \
       const float2 wide = convert_float2(v);                                \
       return (half) fn(wide);                                               \
   }                                                                         \
   extern half __attribute__((overloadable)) fn(half3 v) {                   \
       const float3 wide = convert_float3(v);                                \
       return (half) fn(wide);                                               \
   }                                                                         \
   extern half __attribute__((overloadable)) fn(half4 v) {                   \
       const float4 wide = convert_float4(v);                                \
       return (half) fn(wide);                                               \
   }
   1796 
   /*
    * H_FUNC_HN_HN(fn): define the f16 overloads
    *     half fn(HN input1, HN input2)
    * for HN in { half, half2, half3, half4 }.  Both arguments are widened to
    * float and the value returned by fn is converted to a scalar half.
    */
   #define H_FUNC_HN_HN(fn)                                                  \
   extern half __attribute__((overloadable)) fn(half h1, half h2) {          \
       const float a = (float) h1;                                           \
       const float b = (float) h2;                                           \
       return (half) fn(a, b);                                               \
   }                                                                         \
   extern half __attribute__((overloadable)) fn(half2 v1, half2 v2) {        \
       const float2 a = convert_float2(v1);                                  \
       const float2 b = convert_float2(v2);                                  \
       return (half) fn(a, b);                                               \
   }                                                                         \
   extern half __attribute__((overloadable)) fn(half3 v1, half3 v2) {        \
       const float3 a = convert_float3(v1);                                  \
       const float3 b = convert_float3(v2);                                  \
       return (half) fn(a, b);                                               \
   }                                                                         \
   extern half __attribute__((overloadable)) fn(half4 v1, half4 v2) {        \
       const float4 a = convert_float4(v1);                                  \
       const float4 b = convert_float4(v2);                                  \
       return (half) fn(a, b);                                               \
   }
   1814 
   /*
    * SCALARIZE_HN_FUNC_HN_PHN(fnc): define the vector overloads
    *     HN fnc(HN v1, HN *v2)
    * for half2, half3 and half4 by calling the scalar fnc(half, half *) once
    * per component.  The per-component pointer outputs are collected in scalar
    * temporaries and then stored into the matching lanes of *v2.
    */
   #define SCALARIZE_HN_FUNC_HN_PHN(fnc)                                 \
   extern half2 __attribute__((overloadable)) fnc(half2 v1, half2 *v2) { \
       half outX, outY;                                                  \
       half2 ret;                                                        \
       ret.x = fnc(v1.x, &outX);                                         \
       ret.y = fnc(v1.y, &outY);                                         \
       v2->x = outX;                                                     \
       v2->y = outY;                                                     \
       return ret;                                                       \
   }                                                                     \
   extern half3 __attribute__((overloadable)) fnc(half3 v1, half3 *v2) { \
       half outX, outY, outZ;                                            \
       half3 ret;                                                        \
       ret.x = fnc(v1.x, &outX);                                         \
       ret.y = fnc(v1.y, &outY);                                         \
       ret.z = fnc(v1.z, &outZ);                                         \
       v2->x = outX;                                                     \
       v2->y = outY;                                                     \
       v2->z = outZ;                                                     \
       return ret;                                                       \
   }                                                                     \
   extern half4 __attribute__((overloadable)) fnc(half4 v1, half4 *v2) { \
       half outX, outY, outZ, outW;                                      \
       half4 ret;                                                        \
       ret.x = fnc(v1.x, &outX);                                         \
       ret.y = fnc(v1.y, &outY);                                         \
       ret.z = fnc(v1.z, &outZ);                                         \
       ret.w = fnc(v1.w, &outW);                                         \
       v2->x = outX;                                                     \
       v2->y = outY;                                                     \
       v2->z = outZ;                                                     \
       v2->w = outW;                                                     \
       return ret;                                                       \
   }
   1849 
   /*
    * SCALARIZE_HN_FUNC_HN_HN(fn): define the f16 vector overloads
    *     HN fn(HN input1, HN input2)
    * for half2, half3 and half4.  Each overload calls the scalar function of
    * the same name once per component and assembles the results.
    *
    * The final line of the macro deliberately carries no line-continuation
    * backslash, so the definition ends at the closing brace and cannot absorb
    * whatever line follows it.
    */
   #define SCALARIZE_HN_FUNC_HN_HN(fn)                                       \
   extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2) {       \
     half2 ret;                                                              \
     ret.x = fn(v1.x, v2.x);                                                 \
     ret.y = fn(v1.y, v2.y);                                                 \
     return ret;                                                             \
   }                                                                         \
   extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2) {       \
     half3 ret;                                                              \
     ret.x = fn(v1.x, v2.x);                                                 \
     ret.y = fn(v1.y, v2.y);                                                 \
     ret.z = fn(v1.z, v2.z);                                                 \
     return ret;                                                             \
   }                                                                         \
   extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2) {       \
     half4 ret;                                                              \
     ret.x = fn(v1.x, v2.x);                                                 \
     ret.y = fn(v1.y, v2.y);                                                 \
     ret.z = fn(v1.z, v2.z);                                                 \
     ret.w = fn(v1.w, v2.w);                                                 \
     return ret;                                                             \
   }
   1877 
   1878 HN_FUNC_HN(acos);
   1879 HN_FUNC_HN(acosh);
   1880 HN_FUNC_HN(acospi);
   1881 HN_FUNC_HN(asin);
   1882 HN_FUNC_HN(asinh);
   1883 HN_FUNC_HN(asinpi);
   1884 HN_FUNC_HN(atan);
   1885 HN_FUNC_HN(atanh);
   1886 HN_FUNC_HN(atanpi);
   1887 HN_FUNC_HN_HN(atan2);
   1888 HN_FUNC_HN_HN(atan2pi);
   1889 
   1890 HN_FUNC_HN(cbrt);
   1891 HN_FUNC_HN(ceil);
   1892 
   1893 extern half __attribute__((overloadable)) copysign(half x, half y);
   1894 SCALARIZE_HN_FUNC_HN_HN(copysign);
   1895 
   1896 HN_FUNC_HN(cos);
   1897 HN_FUNC_HN(cosh);
   1898 HN_FUNC_HN(cospi);
   1899 
   1900 extern half3 __attribute__((overloadable)) cross(half3 lhs, half3 rhs) {
   1901     half3 r;
   1902     r.x = lhs.y * rhs.z  - lhs.z * rhs.y;
   1903     r.y = lhs.z * rhs.x  - lhs.x * rhs.z;
   1904     r.z = lhs.x * rhs.y  - lhs.y * rhs.x;
   1905     return r;
   1906 }
   1907 
   1908 extern half4 __attribute__((overloadable)) cross(half4 lhs, half4 rhs) {
   1909     half4 r;
   1910     r.x = lhs.y * rhs.z  - lhs.z * rhs.y;
   1911     r.y = lhs.z * rhs.x  - lhs.x * rhs.z;
   1912     r.z = lhs.x * rhs.y  - lhs.y * rhs.x;
   1913     r.w = 0.f;
   1914     return r;
   1915 }
   1916 
   /* HN fn(HN) */
   HN_FUNC_HN(degrees);
   HN_FUNC_HN(erf);
   HN_FUNC_HN(erfc);
   HN_FUNC_HN(exp);
   HN_FUNC_HN(exp10);
   HN_FUNC_HN(exp2);
   HN_FUNC_HN(expm1);
   HN_FUNC_HN(fabs);
   HN_FUNC_HN(floor);

   /* half fn(HN, HN) */
   H_FUNC_HN_HN(distance);
   H_FUNC_HN_HN(dot);

   /* HN fn(HN, HN) */
   HN_FUNC_HN_HN(fdim);
   HN_FUNC_HN_HN(fmax);
   HN_FUNC_HN_HN(fmin);
   HN_FUNC_HN_HN(fmod);

   /* HN fn(HN, half), vector HN only */
   HN_FUNC_HN_H(fmax);
   HN_FUNC_HN_H(fmin);

   /* HN fn(HN, HN, HN) */
   HN_FUNC_HN_HN_HN(fma);
   1937 
   1938 extern half __attribute__((overloadable)) fract(half v, half *iptr) {
   1939     // maxLessThanOne = 0.99951171875, the largest value < 1.0
   1940     half maxLessThanOne;
   1941     SET_HALF_WORD(maxLessThanOne, 0x3bff);
   1942 
   1943     int i = (int) floor(v);
   1944     if (iptr) {
   1945         *iptr = i;
   1946     }
   1947     // return v - floor(v), if strictly less than one
   1948     return fmin(v - i, maxLessThanOne);
   1949 }
   1950 
   1951 SCALARIZE_HN_FUNC_HN_PHN(fract);
   1952 
   1953 extern half __attribute__((const, overloadable)) fract(half v) {
   1954     half unused;
   1955     return fract(v, &unused);
   1956 }
   1957 
   1958 extern half2 __attribute__((const, overloadable)) fract(half2 v) {
   1959     half2 unused;
   1960     return fract(v, &unused);
   1961 }
   1962 
   1963 extern half3 __attribute__((const, overloadable)) fract(half3 v) {
   1964     half3 unused;
   1965     return fract(v, &unused);
   1966 }
   1967 
   1968 extern half4 __attribute__((const, overloadable)) fract(half4 v) {
   1969     half4 unused;
   1970     return fract(v, &unused);
   1971 }
   1972 
   1973 extern half __attribute__((overloadable)) frexp(half x, int *eptr);
   1974 
   1975 extern half2 __attribute__((overloadable)) frexp(half2 v1, int2 *eptr) {
   1976     half2 ret;
   1977     int e[2];
   1978     ret.x = frexp(v1.x, &e[0]);
   1979     ret.y = frexp(v1.y, &e[1]);
   1980     eptr->x = e[0];
   1981     eptr->y = e[1];
   1982     return ret;
   1983 }
   1984 
   1985 extern half3 __attribute__((overloadable)) frexp(half3 v1, int3 *eptr) {
   1986     half3 ret;
   1987     int e[3];
   1988     ret.x = frexp(v1.x, &e[0]);
   1989     ret.y = frexp(v1.y, &e[1]);
   1990     ret.z = frexp(v1.z, &e[2]);
   1991     eptr->x = e[0];
   1992     eptr->y = e[1];
   1993     eptr->z = e[2];
   1994     return ret;
   1995 }
   1996 
   1997 extern half4 __attribute__((overloadable)) frexp(half4 v1, int4 *eptr) {
   1998     half4 ret;
   1999     int e[4];
   2000     ret.x = frexp(v1.x, &e[0]);
   2001     ret.y = frexp(v1.y, &e[1]);
   2002     ret.z = frexp(v1.z, &e[2]);
   2003     ret.w = frexp(v1.w, &e[3]);
   2004     eptr->x = e[0];
   2005     eptr->y = e[1];
   2006     eptr->z = e[2];
   2007     eptr->w = e[3];
   2008     return ret;
   2009 }
   2010 
   2011 HN_FUNC_HN_HN(hypot);
   2012 
   2013 extern int __attribute__((overloadable)) ilogb(half x);
   2014 
   2015 extern int2 __attribute__((overloadable)) ilogb(half2 v) {
   2016     int2 ret;
   2017     ret.x = ilogb(v.x);
   2018     ret.y = ilogb(v.y);
   2019     return ret;
   2020 }
   2021 extern int3 __attribute__((overloadable)) ilogb(half3 v) {
   2022     int3 ret;
   2023     ret.x = ilogb(v.x);
   2024     ret.y = ilogb(v.y);
   2025     ret.z = ilogb(v.z);
   2026     return ret;
   2027 }
   2028 extern int4 __attribute__((overloadable)) ilogb(half4 v) {
   2029     int4 ret;
   2030     ret.x = ilogb(v.x);
   2031     ret.y = ilogb(v.y);
   2032     ret.z = ilogb(v.z);
   2033     ret.w = ilogb(v.w);
   2034     return ret;
   2035 }
   2036 
   2037 HN_FUNC_HN_IN(ldexp);
   2038 extern half2 __attribute__((overloadable)) ldexp(half2 v, int exponent) {
   2039     return convert_half2(ldexp(convert_float2(v), exponent));
   2040 }
   2041 extern half3 __attribute__((overloadable)) ldexp(half3 v, int exponent) {
   2042     return convert_half3(ldexp(convert_float3(v), exponent));
   2043 }
   2044 extern half4 __attribute__((overloadable)) ldexp(half4 v, int exponent) {
   2045     return convert_half4(ldexp(convert_float4(v), exponent));
   2046 }
   2047 
   2048 H_FUNC_HN(length);
   2049 HN_FUNC_HN(lgamma);
   2050 
   2051 extern half __attribute__((overloadable)) lgamma(half h, int *signp) {
   2052     return (half) lgamma((float) h, signp);
   2053 }
   2054 extern half2 __attribute__((overloadable)) lgamma(half2 v, int2 *signp) {
   2055     return convert_half2(lgamma(convert_float2(v), signp));
   2056 }
   2057 extern half3 __attribute__((overloadable)) lgamma(half3 v, int3 *signp) {
   2058     return convert_half3(lgamma(convert_float3(v), signp));
   2059 }
   2060 extern half4 __attribute__((overloadable)) lgamma(half4 v, int4 *signp) {
   2061     return convert_half4(lgamma(convert_float4(v), signp));
   2062 }
   2063 
   /* HN fn(HN) */
   HN_FUNC_HN(log);
   HN_FUNC_HN(log10);
   HN_FUNC_HN(log1p);
   HN_FUNC_HN(log2);
   HN_FUNC_HN(logb);

   /* HN fn(HN, HN, HN) */
   HN_FUNC_HN_HN_HN(mad);

   /* HN fn(HN, HN) */
   HN_FUNC_HN_HN(max);
   HN_FUNC_HN_HN(min);

   /* HN fn(HN, half), vector HN only */
   HN_FUNC_HN_H(max); // TODO can this be arch-specific similar to _Z3maxDv2_ff?
   HN_FUNC_HN_H(min); // TODO can this be arch-specific similar to _Z3minDv2_ff?
   2075 
   2076 extern half __attribute__((overloadable)) mix(half start, half stop, half amount) {
   2077     return start + (stop - start) * amount;
   2078 }
   2079 extern half2 __attribute__((overloadable)) mix(half2 start, half2 stop, half2 amount) {
   2080     return start + (stop - start) * amount;
   2081 }
   2082 extern half3 __attribute__((overloadable)) mix(half3 start, half3 stop, half3 amount) {
   2083     return start + (stop - start) * amount;
   2084 }
   2085 extern half4 __attribute__((overloadable)) mix(half4 start, half4 stop, half4 amount) {
   2086     return start + (stop - start) * amount;
   2087 }
   2088 extern half2 __attribute__((overloadable)) mix(half2 start, half2 stop, half amount) {
   2089     return start + (stop - start) * amount;
   2090 }
   2091 extern half3 __attribute__((overloadable)) mix(half3 start, half3 stop, half amount) {
   2092     return start + (stop - start) * amount;
   2093 }
   2094 extern half4 __attribute__((overloadable)) mix(half4 start, half4 stop, half amount) {
   2095     return start + (stop - start) * amount;
   2096 }
   2097 
   2098 extern half __attribute__((overloadable)) modf(half x, half *iptr);
   2099 SCALARIZE_HN_FUNC_HN_PHN(modf);
   2100 
   2101 half __attribute__((overloadable)) nan_half() {
   2102   unsigned short nan_short = kHalfPositiveInfinity | 0x0200;
   2103   half nan;
   2104   SET_HALF_WORD(nan, nan_short);
   2105   return nan;
   2106 }
   2107 
        // normalize is instantiated through the one-argument generator
        // (definition outside this excerpt).
   2108 HN_FUNC_HN(normalize);
   2109 
        // Scalar half nextafter is declared only; the vector overloads are
        // generated from it by SCALARIZE_HN_FUNC_HN_HN (presumably one scalar
        // call per component -- confirm against the macro definition).
   2110 extern half __attribute__((overloadable)) nextafter(half x, half y);
   2111 SCALARIZE_HN_FUNC_HN_HN(nextafter);
   2112 
        // Power and remainder overloads. The _IN generator is used for pown,
        // presumably because its second operand is an integer (vector).
   2113 HN_FUNC_HN_HN(pow);
   2114 HN_FUNC_HN_IN(pown);
   2115 HN_FUNC_HN_HN(powr);
   2116 HN_FUNC_HN(radians);
   2117 HN_FUNC_HN_HN(remainder);
   2118 
   2119 extern half __attribute__((overloadable)) remquo(half n, half d, int *quo) {
   2120     return (float) remquo((float) n, (float) d, quo);
   2121 }
   2122 extern half2 __attribute__((overloadable)) remquo(half2 n, half2 d, int2 *quo) {
   2123     return convert_half2(remquo(convert_float2(d), convert_float2(n), quo));
   2124 }
   2125 extern half3 __attribute__((overloadable)) remquo(half3 n, half3 d, int3 *quo) {
   2126     return convert_half3(remquo(convert_float3(d), convert_float3(n), quo));
   2127 }
   2128 extern half4 __attribute__((overloadable)) remquo(half4 n, half4 d, int4 *quo) {
   2129     return convert_half4(remquo(convert_float4(d), convert_float4(n), quo));
   2130 }
   2131 
        // Rounding and root overloads produced by the generator macros
        // (defined outside this excerpt). rootn goes through the _IN variant,
        // presumably because its second operand is an integer (vector).
   2132 HN_FUNC_HN(rint);
   2133 HN_FUNC_HN_IN(rootn);
   2134 HN_FUNC_HN(round);
   2135 HN_FUNC_HN(rsqrt);
   2136 
   2137 extern half __attribute__((overloadable)) sign(half h) {
   2138     if (h > 0) return (half) 1.f;
   2139     if (h < 0) return (half) -1.f;
   2140     return h;
   2141 }
   2142 extern half2 __attribute__((overloadable)) sign(half2 v) {
   2143     half2 ret;
   2144     ret.x = sign(v.x);
   2145     ret.y = sign(v.y);
   2146     return ret;
   2147 }
   2148 extern half3 __attribute__((overloadable)) sign(half3 v) {
   2149     half3 ret;
   2150     ret.x = sign(v.x);
   2151     ret.y = sign(v.y);
   2152     ret.z = sign(v.z);
   2153     return ret;
   2154 }
   2155 extern half4 __attribute__((overloadable)) sign(half4 v) {
   2156     half4 ret;
   2157     ret.x = sign(v.x);
   2158     ret.y = sign(v.y);
   2159     ret.z = sign(v.z);
   2160     ret.w = sign(v.w);
   2161     return ret;
   2162 }
   2163 
        // Half sin overloads via the one-argument generator (macro defined
        // outside this excerpt); the sincos overloads below call sin on half
        // operands.
   2164 HN_FUNC_HN(sin);
   2165 
   2166 extern half __attribute__((overloadable)) sincos(half v, half *cosptr) {
   2167     *cosptr = cos(v);
   2168     return sin(v);
   2169 }
   2170 // TODO verify if LLVM eliminates the duplicate convert_float2
   2171 extern half2 __attribute__((overloadable)) sincos(half2 v, half2 *cosptr) {
   2172     *cosptr = cos(v);
   2173     return sin(v);
   2174 }
   2175 extern half3 __attribute__((overloadable)) sincos(half3 v, half3 *cosptr) {
   2176     *cosptr = cos(v);
   2177     return sin(v);
   2178 }
   2179 extern half4 __attribute__((overloadable)) sincos(half4 v, half4 *cosptr) {
   2180     *cosptr = cos(v);
   2181     return sin(v);
   2182 }
   2183 
        // Half overloads of sinh, sinpi and sqrt via the one-argument generator
        // macro (defined outside this excerpt).
   2184 HN_FUNC_HN(sinh);
   2185 HN_FUNC_HN(sinpi);
   2186 HN_FUNC_HN(sqrt);
   2187 
   2188 extern half __attribute__((overloadable)) step(half edge, half v) {
   2189     return (v < edge) ? 0.f : 1.f;
   2190 }
   2191 extern half2 __attribute__((overloadable)) step(half2 edge, half2 v) {
   2192     half2 r;
   2193     r.x = (v.x < edge.x) ? 0.f : 1.f;
   2194     r.y = (v.y < edge.y) ? 0.f : 1.f;
   2195     return r;
   2196 }
   2197 extern half3 __attribute__((overloadable)) step(half3 edge, half3 v) {
   2198     half3 r;
   2199     r.x = (v.x < edge.x) ? 0.f : 1.f;
   2200     r.y = (v.y < edge.y) ? 0.f : 1.f;
   2201     r.z = (v.z < edge.z) ? 0.f : 1.f;
   2202     return r;
   2203 }
   2204 extern half4 __attribute__((overloadable)) step(half4 edge, half4 v) {
   2205     half4 r;
   2206     r.x = (v.x < edge.x) ? 0.f : 1.f;
   2207     r.y = (v.y < edge.y) ? 0.f : 1.f;
   2208     r.z = (v.z < edge.z) ? 0.f : 1.f;
   2209     r.w = (v.w < edge.w) ? 0.f : 1.f;
   2210     return r;
   2211 }
   2212 extern half2 __attribute__((overloadable)) step(half2 edge, half v) {
   2213     half2 r;
   2214     r.x = (v < edge.x) ? 0.f : 1.f;
   2215     r.y = (v < edge.y) ? 0.f : 1.f;
   2216     return r;
   2217 }
   2218 extern half3 __attribute__((overloadable)) step(half3 edge, half v) {
   2219     half3 r;
   2220     r.x = (v < edge.x) ? 0.f : 1.f;
   2221     r.y = (v < edge.y) ? 0.f : 1.f;
   2222     r.z = (v < edge.z) ? 0.f : 1.f;
   2223     return r;
   2224 }
   2225 extern half4 __attribute__((overloadable)) step(half4 edge, half v) {
   2226     half4 r;
   2227     r.x = (v < edge.x) ? 0.f : 1.f;
   2228     r.y = (v < edge.y) ? 0.f : 1.f;
   2229     r.z = (v < edge.z) ? 0.f : 1.f;
   2230     r.w = (v < edge.w) ? 0.f : 1.f;
   2231     return r;
   2232 }
   2233 extern half2 __attribute__((overloadable)) step(half edge, half2 v) {
   2234     half2 r;
   2235     r.x = (v.x < edge) ? 0.f : 1.f;
   2236     r.y = (v.y < edge) ? 0.f : 1.f;
   2237     return r;
   2238 }
   2239 extern half3 __attribute__((overloadable)) step(half edge, half3 v) {
   2240     half3 r;
   2241     r.x = (v.x < edge) ? 0.f : 1.f;
   2242     r.y = (v.y < edge) ? 0.f : 1.f;
   2243     r.z = (v.z < edge) ? 0.f : 1.f;
   2244     return r;
   2245 }
   2246 extern half4 __attribute__((overloadable)) step(half edge, half4 v) {
   2247     half4 r;
   2248     r.x = (v.x < edge) ? 0.f : 1.f;
   2249     r.y = (v.y < edge) ? 0.f : 1.f;
   2250     r.z = (v.z < edge) ? 0.f : 1.f;
   2251     r.w = (v.w < edge) ? 0.f : 1.f;
   2252     return r;
   2253 }
   2254 
        // Half overloads of tan, tanh, tanpi, tgamma and trunc via the
        // one-argument generator macro (defined outside this excerpt).
   2255 HN_FUNC_HN(tan);
   2256 HN_FUNC_HN(tanh);
   2257 HN_FUNC_HN(tanpi);
   2258 HN_FUNC_HN(tgamma);
   2259 HN_FUNC_HN(trunc); // TODO: rethink: needs half-specific implementation?
   2260 
        // Half overloads of the native_* function family, instantiated with the
        // same generator macros as the non-native functions above (macro
        // definitions are outside this excerpt).
        // NOTE(review): whether these differ in precision from the non-native
        // half versions depends on the underlying float native_* functions,
        // which are not visible here.

        // Inverse trigonometric / hyperbolic functions.
   2261 HN_FUNC_HN(native_acos);
   2262 HN_FUNC_HN(native_acosh);
   2263 HN_FUNC_HN(native_acospi);
   2264 HN_FUNC_HN(native_asin);
   2265 HN_FUNC_HN(native_asinh);
   2266 HN_FUNC_HN(native_asinpi);
   2267 HN_FUNC_HN(native_atan);
   2268 HN_FUNC_HN(native_atanh);
   2269 HN_FUNC_HN(native_atanpi);
   2270 HN_FUNC_HN_HN(native_atan2);
   2271 HN_FUNC_HN_HN(native_atan2pi);
   2272 
   2273 HN_FUNC_HN(native_cbrt);
   2274 HN_FUNC_HN(native_cos);
   2275 HN_FUNC_HN(native_cosh);
   2276 HN_FUNC_HN(native_cospi);
   2277 
        // native_distance uses the H_ generator, which presumably returns a
        // scalar half from vector operands -- confirm against the macro.
   2278 H_FUNC_HN_HN(native_distance);
   2279 HN_FUNC_HN_HN(native_divide);
   2280 
   2281 HN_FUNC_HN(native_exp);
   2282 HN_FUNC_HN(native_exp10);
   2283 HN_FUNC_HN(native_exp2);
   2284 HN_FUNC_HN(native_expm1);
   2285 
        // native_length likewise uses an H_ generator (presumably scalar result).
   2286 HN_FUNC_HN_HN(native_hypot);
   2287 H_FUNC_HN(native_length);
   2288 
   2289 HN_FUNC_HN(native_log);
   2290 HN_FUNC_HN(native_log10);
   2291 HN_FUNC_HN(native_log1p);
   2292 HN_FUNC_HN(native_log2);
   2293 
   2294 HN_FUNC_HN(native_normalize);
   2295 
   2296 HN_FUNC_HN_HN(native_powr); // TODO are parameter limits different for half?
   2297 
   2298 HN_FUNC_HN(native_recip);
   2299 HN_FUNC_HN_IN(native_rootn);
   2300 HN_FUNC_HN(native_rsqrt);
   2301 
   2302 HN_FUNC_HN(native_sin);
   2303 
   2304 extern half __attribute__((overloadable)) native_sincos(half v, half *cosptr) {
   2305     return sincos(v, cosptr);
   2306 }
   2307 extern half2 __attribute__((overloadable)) native_sincos(half2 v, half2 *cosptr) {
   2308     return sincos(v, cosptr);
   2309 }
   2310 extern half3 __attribute__((overloadable)) native_sincos(half3 v, half3 *cosptr) {
   2311     return sincos(v, cosptr);
   2312 }
   2313 extern half4 __attribute__((overloadable)) native_sincos(half4 v, half4 *cosptr) {
   2314     return sincos(v, cosptr);
   2315 }
   2316 
        // Remaining native_* half overloads (hyperbolic sine, sinpi, sqrt and
        // the tangent family) via the one-argument generator macro.
   2317 HN_FUNC_HN(native_sinh);
   2318 HN_FUNC_HN(native_sinpi);
   2319 HN_FUNC_HN(native_sqrt);
   2320 
   2321 HN_FUNC_HN(native_tan);
   2322 HN_FUNC_HN(native_tanh);
   2323 HN_FUNC_HN(native_tanpi);
   2324 
   2325 #undef HN_FUNC_HN
   2326 #undef HN_FUNC_HN_HN
   2327 #undef HN_FUNC_HN_H
   2328 #undef HN_FUNC_HN_HN_HN
   2329 #undef HN_FUNC_HN_IN
   2330 #undef H_FUNC_HN
   2331 #undef H_FUNC_HN_HN
   2332 #undef SCALARIZE_HN_FUNC_HN_HN
   2333 
   2334 // exports unavailable mathlib functions to compat lib
   2335 
   2336 #ifdef RS_COMPATIBILITY_LIB
   2337 
   2338 // !!! DANGER !!!
   2339 // These functions are potentially missing on older Android versions.
   2340 // Work around the issue by supplying our own variants.
   2341 // !!! DANGER !!!
   2342 
   2343 // The logbl() implementation is taken from the latest bionic/, since
   2344 // double == long double on Android.
   2345 extern "C" long double logbl(long double x) { return logb(x); }
   2346 
   2347 // __aeabi_idiv0 is a missing function in libcompiler_rt.so, so we just
   2348 // pick the simplest implementation based on the ARM EABI doc.
   2349 extern "C" int __aeabi_idiv0(int v) { return v; }
   2350 
   2351 #endif // compatibility lib
   2352