Home | History | Annotate | Download | only in runtime
      1 #include "rs_core.rsh"
      2 
      3 extern float2 __attribute__((overloadable)) convert_float2(int2 c);
      4 extern float3 __attribute__((overloadable)) convert_float3(int3 c);
      5 extern float4 __attribute__((overloadable)) convert_float4(int4 c);
      6 
      7 extern int2 __attribute__((overloadable)) convert_int2(float2 c);
      8 extern int3 __attribute__((overloadable)) convert_int3(float3 c);
      9 extern int4 __attribute__((overloadable)) convert_int4(float4 c);
     10 
     11 
     12 extern float __attribute__((overloadable)) fmin(float v, float v2);
     13 extern float2 __attribute__((overloadable)) fmin(float2 v, float v2);
     14 extern float3 __attribute__((overloadable)) fmin(float3 v, float v2);
     15 extern float4 __attribute__((overloadable)) fmin(float4 v, float v2);
     16 
     17 extern float __attribute__((overloadable)) fmax(float v, float v2);
     18 extern float2 __attribute__((overloadable)) fmax(float2 v, float v2);
     19 extern float3 __attribute__((overloadable)) fmax(float3 v, float v2);
     20 extern float4 __attribute__((overloadable)) fmax(float4 v, float v2);
     21 
     22 // Float ops, 6.11.2
     23 
     24 #define FN_FUNC_FN(fnc)                                         \
     25 extern float2 __attribute__((overloadable)) fnc(float2 v) { \
     26     float2 r;                                                   \
     27     r.x = fnc(v.x);                                             \
     28     r.y = fnc(v.y);                                             \
     29     return r;                                                   \
     30 }                                                               \
     31 extern float3 __attribute__((overloadable)) fnc(float3 v) { \
     32     float3 r;                                                   \
     33     r.x = fnc(v.x);                                             \
     34     r.y = fnc(v.y);                                             \
     35     r.z = fnc(v.z);                                             \
     36     return r;                                                   \
     37 }                                                               \
     38 extern float4 __attribute__((overloadable)) fnc(float4 v) { \
     39     float4 r;                                                   \
     40     r.x = fnc(v.x);                                             \
     41     r.y = fnc(v.y);                                             \
     42     r.z = fnc(v.z);                                             \
     43     r.w = fnc(v.w);                                             \
     44     return r;                                                   \
     45 }
     46 
     47 #define IN_FUNC_FN(fnc)                                         \
     48 extern int2 __attribute__((overloadable)) fnc(float2 v) {   \
     49     int2 r;                                                     \
     50     r.x = fnc(v.x);                                             \
     51     r.y = fnc(v.y);                                             \
     52     return r;                                                   \
     53 }                                                               \
     54 extern int3 __attribute__((overloadable)) fnc(float3 v) {   \
     55     int3 r;                                                     \
     56     r.x = fnc(v.x);                                             \
     57     r.y = fnc(v.y);                                             \
     58     r.z = fnc(v.z);                                             \
     59     return r;                                                   \
     60 }                                                               \
     61 extern int4 __attribute__((overloadable)) fnc(float4 v) {   \
     62     int4 r;                                                     \
     63     r.x = fnc(v.x);                                             \
     64     r.y = fnc(v.y);                                             \
     65     r.z = fnc(v.z);                                             \
     66     r.w = fnc(v.w);                                             \
     67     return r;                                                   \
     68 }
     69 
     70 #define FN_FUNC_FN_FN(fnc)                                                  \
     71 extern float2 __attribute__((overloadable)) fnc(float2 v1, float2 v2) { \
     72     float2 r;                                                               \
     73     r.x = fnc(v1.x, v2.x);                                                  \
     74     r.y = fnc(v1.y, v2.y);                                                  \
     75     return r;                                                               \
     76 }                                                                           \
     77 extern float3 __attribute__((overloadable)) fnc(float3 v1, float3 v2) { \
     78     float3 r;                                                               \
     79     r.x = fnc(v1.x, v2.x);                                                  \
     80     r.y = fnc(v1.y, v2.y);                                                  \
     81     r.z = fnc(v1.z, v2.z);                                                  \
     82     return r;                                                               \
     83 }                                                                           \
     84 extern float4 __attribute__((overloadable)) fnc(float4 v1, float4 v2) { \
     85     float4 r;                                                               \
     86     r.x = fnc(v1.x, v2.x);                                                  \
     87     r.y = fnc(v1.y, v2.y);                                                  \
     88     r.z = fnc(v1.z, v2.z);                                                  \
     89     r.w = fnc(v1.w, v2.w);                                                  \
     90     return r;                                                               \
     91 }
     92 
     93 #define FN_FUNC_FN_F(fnc)                                                   \
     94 extern float2 __attribute__((overloadable)) fnc(float2 v1, float v2) {  \
     95     float2 r;                                                               \
     96     r.x = fnc(v1.x, v2);                                                    \
     97     r.y = fnc(v1.y, v2);                                                    \
     98     return r;                                                               \
     99 }                                                                           \
    100 extern float3 __attribute__((overloadable)) fnc(float3 v1, float v2) {  \
    101     float3 r;                                                               \
    102     r.x = fnc(v1.x, v2);                                                    \
    103     r.y = fnc(v1.y, v2);                                                    \
    104     r.z = fnc(v1.z, v2);                                                    \
    105     return r;                                                               \
    106 }                                                                           \
    107 extern float4 __attribute__((overloadable)) fnc(float4 v1, float v2) {  \
    108     float4 r;                                                               \
    109     r.x = fnc(v1.x, v2);                                                    \
    110     r.y = fnc(v1.y, v2);                                                    \
    111     r.z = fnc(v1.z, v2);                                                    \
    112     r.w = fnc(v1.w, v2);                                                    \
    113     return r;                                                               \
    114 }
    115 
    116 #define FN_FUNC_FN_IN(fnc)                                                  \
    117 extern float2 __attribute__((overloadable)) fnc(float2 v1, int2 v2) {   \
    118     float2 r;                                                               \
    119     r.x = fnc(v1.x, v2.x);                                                  \
    120     r.y = fnc(v1.y, v2.y);                                                  \
    121     return r;                                                               \
    122 }                                                                           \
    123 extern float3 __attribute__((overloadable)) fnc(float3 v1, int3 v2) {   \
    124     float3 r;                                                               \
    125     r.x = fnc(v1.x, v2.x);                                                  \
    126     r.y = fnc(v1.y, v2.y);                                                  \
    127     r.z = fnc(v1.z, v2.z);                                                  \
    128     return r;                                                               \
    129 }                                                                           \
    130 extern float4 __attribute__((overloadable)) fnc(float4 v1, int4 v2) {   \
    131     float4 r;                                                               \
    132     r.x = fnc(v1.x, v2.x);                                                  \
    133     r.y = fnc(v1.y, v2.y);                                                  \
    134     r.z = fnc(v1.z, v2.z);                                                  \
    135     r.w = fnc(v1.w, v2.w);                                                  \
    136     return r;                                                               \
    137 }
    138 
    139 #define FN_FUNC_FN_I(fnc)                                                   \
    140 extern float2 __attribute__((overloadable)) fnc(float2 v1, int v2) {    \
    141     float2 r;                                                               \
    142     r.x = fnc(v1.x, v2);                                                    \
    143     r.y = fnc(v1.y, v2);                                                    \
    144     return r;                                                               \
    145 }                                                                           \
    146 extern float3 __attribute__((overloadable)) fnc(float3 v1, int v2) {    \
    147     float3 r;                                                               \
    148     r.x = fnc(v1.x, v2);                                                    \
    149     r.y = fnc(v1.y, v2);                                                    \
    150     r.z = fnc(v1.z, v2);                                                    \
    151     return r;                                                               \
    152 }                                                                           \
    153 extern float4 __attribute__((overloadable)) fnc(float4 v1, int v2) {    \
    154     float4 r;                                                               \
    155     r.x = fnc(v1.x, v2);                                                    \
    156     r.y = fnc(v1.y, v2);                                                    \
    157     r.z = fnc(v1.z, v2);                                                    \
    158     r.w = fnc(v1.w, v2);                                                    \
    159     return r;                                                               \
    160 }
    161 
    162 #define FN_FUNC_FN_PFN(fnc)                     \
    163 extern float2 __attribute__((overloadable)) \
    164         fnc(float2 v1, float2 *v2) {            \
    165     float2 r;                                   \
    166     float t[2];                                 \
    167     r.x = fnc(v1.x, &t[0]);                     \
    168     r.y = fnc(v1.y, &t[1]);                     \
    169     v2->x = t[0];                               \
    170     v2->y = t[1];                               \
    171     return r;                                   \
    172 }                                               \
    173 extern float3 __attribute__((overloadable)) \
    174         fnc(float3 v1, float3 *v2) {            \
    175     float3 r;                                   \
    176     float t[3];                                 \
    177     r.x = fnc(v1.x, &t[0]);                     \
    178     r.y = fnc(v1.y, &t[1]);                     \
    179     r.z = fnc(v1.z, &t[2]);                     \
    180     v2->x = t[0];                               \
    181     v2->y = t[1];                               \
    182     v2->z = t[2];                               \
    183     return r;                                   \
    184 }                                               \
    185 extern float4 __attribute__((overloadable)) \
    186         fnc(float4 v1, float4 *v2) {            \
    187     float4 r;                                   \
    188     float t[4];                                 \
    189     r.x = fnc(v1.x, &t[0]);                     \
    190     r.y = fnc(v1.y, &t[1]);                     \
    191     r.z = fnc(v1.z, &t[2]);                     \
    192     r.w = fnc(v1.w, &t[3]);                     \
    193     v2->x = t[0];                               \
    194     v2->y = t[1];                               \
    195     v2->z = t[2];                               \
    196     v2->w = t[3];                               \
    197     return r;                                   \
    198 }
    199 
    200 #define FN_FUNC_FN_PIN(fnc)                                                 \
    201 extern float2 __attribute__((overloadable)) fnc(float2 v1, int2 *v2) {  \
    202     float2 r;                                                               \
    203     int t[2];                                                               \
    204     r.x = fnc(v1.x, &t[0]);                                                 \
    205     r.y = fnc(v1.y, &t[1]);                                                 \
    206     v2->x = t[0];                                                           \
    207     v2->y = t[1];                                                           \
    208     return r;                                                               \
    209 }                                                                           \
    210 extern float3 __attribute__((overloadable)) fnc(float3 v1, int3 *v2) {  \
    211     float3 r;                                                               \
    212     int t[3];                                                               \
    213     r.x = fnc(v1.x, &t[0]);                                                 \
    214     r.y = fnc(v1.y, &t[1]);                                                 \
    215     r.z = fnc(v1.z, &t[2]);                                                 \
    216     v2->x = t[0];                                                           \
    217     v2->y = t[1];                                                           \
    218     v2->z = t[2];                                                           \
    219     return r;                                                               \
    220 }                                                                           \
    221 extern float4 __attribute__((overloadable)) fnc(float4 v1, int4 *v2) {  \
    222     float4 r;                                                               \
    223     int t[4];                                                               \
    224     r.x = fnc(v1.x, &t[0]);                                                 \
    225     r.y = fnc(v1.y, &t[1]);                                                 \
    226     r.z = fnc(v1.z, &t[2]);                                                 \
    227     r.w = fnc(v1.w, &t[3]);                                                 \
    228     v2->x = t[0];                                                           \
    229     v2->y = t[1];                                                           \
    230     v2->z = t[2];                                                           \
    231     v2->w = t[3];                                                           \
    232     return r;                                                               \
    233 }
    234 
    235 #define FN_FUNC_FN_FN_FN(fnc)                   \
    236 extern float2 __attribute__((overloadable)) \
    237         fnc(float2 v1, float2 v2, float2 v3) {  \
    238     float2 r;                                   \
    239     r.x = fnc(v1.x, v2.x, v3.x);                \
    240     r.y = fnc(v1.y, v2.y, v3.y);                \
    241     return r;                                   \
    242 }                                               \
    243 extern float3 __attribute__((overloadable)) \
    244         fnc(float3 v1, float3 v2, float3 v3) {  \
    245     float3 r;                                   \
    246     r.x = fnc(v1.x, v2.x, v3.x);                \
    247     r.y = fnc(v1.y, v2.y, v3.y);                \
    248     r.z = fnc(v1.z, v2.z, v3.z);                \
    249     return r;                                   \
    250 }                                               \
    251 extern float4 __attribute__((overloadable)) \
    252         fnc(float4 v1, float4 v2, float4 v3) {  \
    253     float4 r;                                   \
    254     r.x = fnc(v1.x, v2.x, v3.x);                \
    255     r.y = fnc(v1.y, v2.y, v3.y);                \
    256     r.z = fnc(v1.z, v2.z, v3.z);                \
    257     r.w = fnc(v1.w, v2.w, v3.w);                \
    258     return r;                                   \
    259 }
    260 
    261 #define FN_FUNC_FN_FN_PIN(fnc)                  \
    262 extern float2 __attribute__((overloadable)) \
    263         fnc(float2 v1, float2 v2, int2 *v3) {   \
    264     float2 r;                                   \
    265     int t[2];                                   \
    266     r.x = fnc(v1.x, v2.x, &t[0]);               \
    267     r.y = fnc(v1.y, v2.y, &t[1]);               \
    268     v3->x = t[0];                               \
    269     v3->y = t[1];                               \
    270     return r;                                   \
    271 }                                               \
    272 extern float3 __attribute__((overloadable)) \
    273         fnc(float3 v1, float3 v2, int3 *v3) {   \
    274     float3 r;                                   \
    275     int t[3];                                   \
    276     r.x = fnc(v1.x, v2.x, &t[0]);               \
    277     r.y = fnc(v1.y, v2.y, &t[1]);               \
    278     r.z = fnc(v1.z, v2.z, &t[2]);               \
    279     v3->x = t[0];                               \
    280     v3->y = t[1];                               \
    281     v3->z = t[2];                               \
    282     return r;                                   \
    283 }                                               \
    284 extern float4 __attribute__((overloadable)) \
    285         fnc(float4 v1, float4 v2, int4 *v3) {   \
    286     float4 r;                                   \
    287     int t[4];                                   \
    288     r.x = fnc(v1.x, v2.x, &t[0]);               \
    289     r.y = fnc(v1.y, v2.y, &t[1]);               \
    290     r.z = fnc(v1.z, v2.z, &t[2]);               \
    291     r.w = fnc(v1.w, v2.w, &t[3]);               \
    292     v3->x = t[0];                               \
    293     v3->y = t[1];                               \
    294     v3->z = t[2];                               \
    295     v3->w = t[3];                               \
    296     return r;                                   \
    297 }
    298 
    299 static const int iposinf = 0x7f800000;
    300 static const int ineginf = 0xff800000;
    301 
    302 static const float posinf() {
    303     float f = *((float*)&iposinf);
    304     return f;
    305 }
    306 
    307 static const float neginf() {
    308     float f = *((float*)&ineginf);
    309     return f;
    310 }
    311 
    312 static bool isinf(float f) {
    313     int i = *((int*)(void*)&f);
    314     return (i == iposinf) || (i == ineginf);
    315 }
    316 
    317 static bool isnan(float f) {
    318     int i = *((int*)(void*)&f);
    319     return (((i & 0x7f800000) == 0x7f800000) && (i & 0x007fffff));
    320 }
    321 
    322 static bool isposzero(float f) {
    323     int i = *((int*)(void*)&f);
    324     return (i == 0x00000000);
    325 }
    326 
    327 static bool isnegzero(float f) {
    328     int i = *((int*)(void*)&f);
    329     return (i == 0x80000000);
    330 }
    331 
    332 static bool iszero(float f) {
    333     return isposzero(f) || isnegzero(f);
    334 }
    335 
    336 
    337 extern float __attribute__((overloadable)) acos(float);
    338 FN_FUNC_FN(acos)
    339 
    340 extern float __attribute__((overloadable)) acosh(float);
    341 FN_FUNC_FN(acosh)
    342 
    343 
    344 extern float __attribute__((overloadable)) acospi(float v) {
    345     return acos(v) / M_PI;
    346 }
    347 FN_FUNC_FN(acospi)
    348 
    349 extern float __attribute__((overloadable)) asin(float);
    350 FN_FUNC_FN(asin)
    351 
    352 extern float __attribute__((overloadable)) asinh(float);
    353 FN_FUNC_FN(asinh)
    354 
    355 extern float __attribute__((overloadable)) asinpi(float v) {
    356     return asin(v) / M_PI;
    357 }
    358 FN_FUNC_FN(asinpi)
    359 
    360 extern float __attribute__((overloadable)) atan(float);
    361 FN_FUNC_FN(atan)
    362 
    363 extern float __attribute__((overloadable)) atan2(float, float);
    364 FN_FUNC_FN_FN(atan2)
    365 
    366 extern float __attribute__((overloadable)) atanh(float);
    367 FN_FUNC_FN(atanh)
    368 
    369 extern float __attribute__((overloadable)) atanpi(float v) {
    370     return atan(v) / M_PI;
    371 }
    372 FN_FUNC_FN(atanpi)
    373 
    374 
    375 extern float __attribute__((overloadable)) atan2pi(float y, float x) {
    376     return atan2(y, x) / M_PI;
    377 }
    378 FN_FUNC_FN_FN(atan2pi)
    379 
    380 extern float __attribute__((overloadable)) cbrt(float);
    381 FN_FUNC_FN(cbrt)
    382 
    383 extern float __attribute__((overloadable)) ceil(float);
    384 FN_FUNC_FN(ceil)
    385 
    386 extern float __attribute__((overloadable)) copysign(float, float);
    387 FN_FUNC_FN_FN(copysign)
    388 
    389 extern float __attribute__((overloadable)) cos(float);
    390 FN_FUNC_FN(cos)
    391 
    392 extern float __attribute__((overloadable)) cosh(float);
    393 FN_FUNC_FN(cosh)
    394 
    395 extern float __attribute__((overloadable)) cospi(float v) {
    396     return cos(v * M_PI);
    397 }
    398 FN_FUNC_FN(cospi)
    399 
    400 extern float __attribute__((overloadable)) erfc(float);
    401 FN_FUNC_FN(erfc)
    402 
    403 extern float __attribute__((overloadable)) erf(float);
    404 FN_FUNC_FN(erf)
    405 
    406 extern float __attribute__((overloadable)) exp(float);
    407 FN_FUNC_FN(exp)
    408 
    409 extern float __attribute__((overloadable)) exp2(float);
    410 FN_FUNC_FN(exp2)
    411 
    412 extern float __attribute__((overloadable)) pow(float, float);
    413 
    414 extern float __attribute__((overloadable)) exp10(float v) {
    415     return exp2(v * 3.321928095f);
    416 }
    417 FN_FUNC_FN(exp10)
    418 
    419 extern float __attribute__((overloadable)) expm1(float);
    420 FN_FUNC_FN(expm1)
    421 
    422 extern float __attribute__((overloadable)) fabs(float v) {
    423     int i = *((int*)(void*)&v) & 0x7fffffff;
    424     return  *((float*)(void*)&i);
    425 }
    426 FN_FUNC_FN(fabs)
    427 
    428 extern float __attribute__((overloadable)) fdim(float, float);
    429 FN_FUNC_FN_FN(fdim)
    430 
    431 extern float __attribute__((overloadable)) floor(float);
    432 FN_FUNC_FN(floor)
    433 
    434 extern float __attribute__((overloadable)) fma(float, float, float);
    435 FN_FUNC_FN_FN_FN(fma)
    436 
    437 extern float __attribute__((overloadable)) fmin(float, float);
    438 
    439 extern float __attribute__((overloadable)) fmod(float, float);
    440 FN_FUNC_FN_FN(fmod)
    441 
    442 extern float __attribute__((overloadable)) fract(float v, float *iptr) {
    443     int i = (int)floor(v);
    444     if (iptr) {
    445         iptr[0] = i;
    446     }
    447     return fmin(v - i, 0x1.fffffep-1f);
    448 }
    449 FN_FUNC_FN_PFN(fract)
    450 
    451 extern float __attribute__((overloadable)) frexp(float, int *);
    452 FN_FUNC_FN_PIN(frexp)
    453 
    454 extern float __attribute__((overloadable)) hypot(float, float);
    455 FN_FUNC_FN_FN(hypot)
    456 
    457 extern int __attribute__((overloadable)) ilogb(float);
    458 IN_FUNC_FN(ilogb)
    459 
    460 extern float __attribute__((overloadable)) ldexp(float, int);
    461 FN_FUNC_FN_IN(ldexp)
    462 FN_FUNC_FN_I(ldexp)
    463 
    464 extern float __attribute__((overloadable)) lgamma(float);
    465 FN_FUNC_FN(lgamma)
    466 extern float __attribute__((overloadable)) lgamma(float, int*);
    467 FN_FUNC_FN_PIN(lgamma)
    468 
    469 extern float __attribute__((overloadable)) log(float);
    470 FN_FUNC_FN(log)
    471 
    472 extern float __attribute__((overloadable)) log10(float);
    473 FN_FUNC_FN(log10)
    474 
    475 
    476 extern float __attribute__((overloadable)) log2(float v) {
    477     return log10(v) * 3.321928095f;
    478 }
    479 FN_FUNC_FN(log2)
    480 
    481 extern float __attribute__((overloadable)) log1p(float);
    482 FN_FUNC_FN(log1p)
    483 
    484 extern float __attribute__((overloadable)) logb(float);
    485 FN_FUNC_FN(logb)
    486 
    487 extern float __attribute__((overloadable)) mad(float a, float b, float c) {
    488     return a * b + c;
    489 }
    490 extern float2 __attribute__((overloadable)) mad(float2 a, float2 b, float2 c) {
    491     return a * b + c;
    492 }
    493 extern float3 __attribute__((overloadable)) mad(float3 a, float3 b, float3 c) {
    494     return a * b + c;
    495 }
    496 extern float4 __attribute__((overloadable)) mad(float4 a, float4 b, float4 c) {
    497     return a * b + c;
    498 }
    499 
    500 extern float __attribute__((overloadable)) modf(float, float *);
    501 FN_FUNC_FN_PFN(modf);
    502 
    503 extern float __attribute__((overloadable)) nan(uint v) {
    504     float f[1];
    505     uint32_t *ip = (uint32_t *)f;
    506     *ip = v | 0x7fc00000;
    507     return f[0];
    508 }
    509 
    510 extern float __attribute__((overloadable)) nextafter(float, float);
    511 FN_FUNC_FN_FN(nextafter)
    512 
    513 FN_FUNC_FN_FN(pow)
    514 
    515 extern float __attribute__((overloadable)) pown(float v, int p) {
    516     /* The mantissa of a float has fewer bits than an int (24 effective vs. 31).
    517      * For very large ints, we'll lose whether the exponent is even or odd, making
    518      * the selection of a correct sign incorrect.  We correct this.  Use copysign
    519      * to handle the negative zero case.
    520      */
    521     float sign = (p & 0x1) ? copysign(1.f, v) : 1.f;
    522     float f = pow(v, (float)p);
    523     return copysign(f, sign);
    524 }
    525 FN_FUNC_FN_IN(pown)
    526 
    527 extern float __attribute__((overloadable)) powr(float v, float p) {
    528     return pow(v, p);
    529 }
    530 extern float2 __attribute__((overloadable)) powr(float2 v, float2 p) {
    531     return pow(v, p);
    532 }
    533 extern float3 __attribute__((overloadable)) powr(float3 v, float3 p) {
    534     return pow(v, p);
    535 }
    536 extern float4 __attribute__((overloadable)) powr(float4 v, float4 p) {
    537     return pow(v, p);
    538 }
    539 
    540 extern float __attribute__((overloadable)) remainder(float, float);
    541 FN_FUNC_FN_FN(remainder)
    542 
    543 extern float __attribute__((overloadable)) remquo(float, float, int *);
    544 FN_FUNC_FN_FN_PIN(remquo)
    545 
    546 extern float __attribute__((overloadable)) rint(float);
    547 FN_FUNC_FN(rint)
    548 
    549 extern float __attribute__((overloadable)) rootn(float v, int r) {
    550     if (r == 0) {
    551         return posinf();
    552     }
    553 
    554     if (iszero(v)) {
    555         if (r < 0) {
    556             if (r & 1) {
    557                 return copysign(posinf(), v);
    558             } else {
    559                 return posinf();
    560             }
    561         } else {
    562             if (r & 1) {
    563                 return copysign(0.f, v);
    564             } else {
    565                 return 0.f;
    566             }
    567         }
    568     }
    569 
    570     if (!isinf(v) && !isnan(v) && (v < 0.f)) {
    571         if (r & 1) {
    572             return (-1.f * pow(-1.f * v, 1.f / r));
    573         } else {
    574             return nan(0);
    575         }
    576     }
    577 
    578     return pow(v, 1.f / r);
    579 }
    580 FN_FUNC_FN_IN(rootn);
    581 
    582 extern float __attribute__((overloadable)) round(float);
    583 FN_FUNC_FN(round)
    584 
    585 
    586 extern float __attribute__((overloadable)) sqrt(float);
    587 extern float __attribute__((overloadable)) rsqrt(float v) {
    588     return 1.f / sqrt(v);
    589 }
    590 
    591 #if (!defined(__i386__) && !defined(__x86_64__)) || defined(RS_DEBUG_RUNTIME)
    592 // These functions must be defined here if we are not using the SSE
    593 // implementation, which includes when we are built as part of the
    594 // debug runtime (libclcore_debug.bc).
    595 FN_FUNC_FN(sqrt)
    596 #else
    597 extern float2 __attribute__((overloadable)) sqrt(float2);
    598 extern float3 __attribute__((overloadable)) sqrt(float3);
    599 extern float4 __attribute__((overloadable)) sqrt(float4);
    600 #endif // (!defined(__i386__) && !defined(__x86_64__)) || defined(RS_DEBUG_RUNTIME)
    601 
    602 FN_FUNC_FN(rsqrt)
    603 
    604 extern float __attribute__((overloadable)) sin(float);
    605 FN_FUNC_FN(sin)
    606 
    607 extern float __attribute__((overloadable)) sincos(float v, float *cosptr) {
    608     *cosptr = cos(v);
    609     return sin(v);
    610 }
    611 extern float2 __attribute__((overloadable)) sincos(float2 v, float2 *cosptr) {
    612     *cosptr = cos(v);
    613     return sin(v);
    614 }
    615 extern float3 __attribute__((overloadable)) sincos(float3 v, float3 *cosptr) {
    616     *cosptr = cos(v);
    617     return sin(v);
    618 }
    619 extern float4 __attribute__((overloadable)) sincos(float4 v, float4 *cosptr) {
    620     *cosptr = cos(v);
    621     return sin(v);
    622 }
    623 
    624 extern float __attribute__((overloadable)) sinh(float);
    625 FN_FUNC_FN(sinh)
    626 
    627 extern float __attribute__((overloadable)) sinpi(float v) {
    628     return sin(v * M_PI);
    629 }
    630 FN_FUNC_FN(sinpi)
    631 
    632 extern float __attribute__((overloadable)) tan(float);
    633 FN_FUNC_FN(tan)
    634 
    635 extern float __attribute__((overloadable)) tanh(float);
    636 FN_FUNC_FN(tanh)
    637 
    638 extern float __attribute__((overloadable)) tanpi(float v) {
    639     return tan(v * M_PI);
    640 }
    641 FN_FUNC_FN(tanpi)
    642 
    643 
    644 extern float __attribute__((overloadable)) tgamma(float);
    645 FN_FUNC_FN(tgamma)
    646 
    647 extern float __attribute__((overloadable)) trunc(float);
    648 FN_FUNC_FN(trunc)
    649 
    650 // Int ops (partial), 6.11.3
    651 
// XN_FUNC_YN(typeout, fnc, typein)
// Declares the scalar overload `typeout fnc(typein)` (prototype only) and
// defines the 2-, 3- and 4-component vector overloads, each of which calls
// fnc on every component of v and collects the results in a typeout vector.
#define XN_FUNC_YN(typeout, fnc, typein)                                \
extern typeout __attribute__((overloadable)) fnc(typein);               \
extern typeout##2 __attribute__((overloadable)) fnc(typein##2 v) {  \
    typeout##2 r;                                                       \
    r.x = fnc(v.x);                                                     \
    r.y = fnc(v.y);                                                     \
    return r;                                                           \
}                                                                       \
extern typeout##3 __attribute__((overloadable)) fnc(typein##3 v) {  \
    typeout##3 r;                                                       \
    r.x = fnc(v.x);                                                     \
    r.y = fnc(v.y);                                                     \
    r.z = fnc(v.z);                                                     \
    return r;                                                           \
}                                                                       \
extern typeout##4 __attribute__((overloadable)) fnc(typein##4 v) {  \
    typeout##4 r;                                                       \
    r.x = fnc(v.x);                                                     \
    r.y = fnc(v.y);                                                     \
    r.z = fnc(v.z);                                                     \
    r.w = fnc(v.w);                                                     \
    return r;                                                           \
}
    675 
    676 
// UIN_FUNC_IN(fnc): instantiates XN_FUNC_YN for functions that take a signed
// integer (char, short, int) and return the unsigned type of the same width.
#define UIN_FUNC_IN(fnc)          \
XN_FUNC_YN(uchar, fnc, char)      \
XN_FUNC_YN(ushort, fnc, short)    \
XN_FUNC_YN(uint, fnc, int)
    681 
// IN_FUNC_IN(fnc): instantiates XN_FUNC_YN for functions whose result type
// equals their argument type, for all signed and unsigned 8/16/32-bit
// integer types.
#define IN_FUNC_IN(fnc)           \
XN_FUNC_YN(uchar, fnc, uchar)     \
XN_FUNC_YN(char, fnc, char)       \
XN_FUNC_YN(ushort, fnc, ushort)   \
XN_FUNC_YN(short, fnc, short)     \
XN_FUNC_YN(uint, fnc, uint)       \
XN_FUNC_YN(int, fnc, int)
    689 
    690 
// XN_FUNC_XN_XN_BODY(type, fnc, body)
// Defines a two-argument function for one element type: the scalar overload
// returns the expression `body` (which may refer to v1 and v2), and the 2-,
// 3- and 4-component overloads call the scalar fnc on each pair of
// components.
#define XN_FUNC_XN_XN_BODY(type, fnc, body)         \
extern type __attribute__((overloadable))       \
        fnc(type v1, type v2) {                     \
    return body;                                    \
}                                                   \
extern type##2 __attribute__((overloadable))    \
        fnc(type##2 v1, type##2 v2) {               \
    type##2 r;                                      \
    r.x = fnc(v1.x, v2.x);                          \
    r.y = fnc(v1.y, v2.y);                          \
    return r;                                       \
}                                                   \
extern type##3 __attribute__((overloadable))    \
        fnc(type##3 v1, type##3 v2) {               \
    type##3 r;                                      \
    r.x = fnc(v1.x, v2.x);                          \
    r.y = fnc(v1.y, v2.y);                          \
    r.z = fnc(v1.z, v2.z);                          \
    return r;                                       \
}                                                   \
extern type##4 __attribute__((overloadable))    \
        fnc(type##4 v1, type##4 v2) {               \
    type##4 r;                                      \
    r.x = fnc(v1.x, v2.x);                          \
    r.y = fnc(v1.y, v2.y);                          \
    r.z = fnc(v1.z, v2.z);                          \
    r.w = fnc(v1.w, v2.w);                          \
    return r;                                       \
}
    720 
// IN_FUNC_IN_IN_BODY(fnc, body): instantiates XN_FUNC_XN_XN_BODY for every
// 8/16/32-bit integer type. Note that, despite the IN_ prefix, the list also
// includes float.
#define IN_FUNC_IN_IN_BODY(fnc, body) \
XN_FUNC_XN_XN_BODY(uchar, fnc, body)  \
XN_FUNC_XN_XN_BODY(char, fnc, body)   \
XN_FUNC_XN_XN_BODY(ushort, fnc, body) \
XN_FUNC_XN_XN_BODY(short, fnc, body)  \
XN_FUNC_XN_XN_BODY(uint, fnc, body)   \
XN_FUNC_XN_XN_BODY(int, fnc, body)    \
XN_FUNC_XN_XN_BODY(float, fnc, body)
    729 
    730 
    731 /**
    732  * abs
    733  */
    734 extern uint32_t __attribute__((overloadable)) abs(int32_t v) {
    735     if (v < 0)
    736         return -v;
    737     return v;
    738 }
    739 extern uint16_t __attribute__((overloadable)) abs(int16_t v) {
    740     if (v < 0)
    741         return -v;
    742     return v;
    743 }
    744 extern uint8_t __attribute__((overloadable)) abs(int8_t v) {
    745     if (v < 0)
    746         return -v;
    747     return v;
    748 }
    749 
/**
 * clz
 * __builtin_clz only accepts a 32-bit unsigned int, so every input will be
 * expanded to 32 bits. For our smaller data types, we need to subtract off
 * these unused top bits (that will always be composed of zeros).
 */
    756 extern uint32_t __attribute__((overloadable)) clz(uint32_t v) {
    757     return __builtin_clz(v);
    758 }
    759 extern uint16_t __attribute__((overloadable)) clz(uint16_t v) {
    760     return __builtin_clz(v) - 16;
    761 }
    762 extern uint8_t __attribute__((overloadable)) clz(uint8_t v) {
    763     return __builtin_clz(v) - 24;
    764 }
    765 extern int32_t __attribute__((overloadable)) clz(int32_t v) {
    766     return __builtin_clz(v);
    767 }
    768 extern int16_t __attribute__((overloadable)) clz(int16_t v) {
    769     return __builtin_clz(((uint32_t)v) & 0x0000ffff) - 16;
    770 }
    771 extern int8_t __attribute__((overloadable)) clz(int8_t v) {
    772     return __builtin_clz(((uint32_t)v) & 0x000000ff) - 24;
    773 }
    774 
    775 
// Generate the 2/3/4-component vector overloads by applying the scalar
// function per component: abs maps signed inputs to the unsigned type of the
// same width, clz returns the same type it is given.
UIN_FUNC_IN(abs)
IN_FUNC_IN(clz)
    778 
    779 
    780 // 6.11.4
    781 
    782 
    783 extern float __attribute__((overloadable)) degrees(float radians) {
    784     return radians * (180.f / M_PI);
    785 }
    786 extern float2 __attribute__((overloadable)) degrees(float2 radians) {
    787     return radians * (180.f / M_PI);
    788 }
    789 extern float3 __attribute__((overloadable)) degrees(float3 radians) {
    790     return radians * (180.f / M_PI);
    791 }
    792 extern float4 __attribute__((overloadable)) degrees(float4 radians) {
    793     return radians * (180.f / M_PI);
    794 }
    795 
    796 extern float __attribute__((overloadable)) mix(float start, float stop, float amount) {
    797     return start + (stop - start) * amount;
    798 }
    799 extern float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float2 amount) {
    800     return start + (stop - start) * amount;
    801 }
    802 extern float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float3 amount) {
    803     return start + (stop - start) * amount;
    804 }
    805 extern float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float4 amount) {
    806     return start + (stop - start) * amount;
    807 }
    808 extern float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float amount) {
    809     return start + (stop - start) * amount;
    810 }
    811 extern float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float amount) {
    812     return start + (stop - start) * amount;
    813 }
    814 extern float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float amount) {
    815     return start + (stop - start) * amount;
    816 }
    817 
    818 extern float __attribute__((overloadable)) radians(float degrees) {
    819     return degrees * (M_PI / 180.f);
    820 }
    821 extern float2 __attribute__((overloadable)) radians(float2 degrees) {
    822     return degrees * (M_PI / 180.f);
    823 }
    824 extern float3 __attribute__((overloadable)) radians(float3 degrees) {
    825     return degrees * (M_PI / 180.f);
    826 }
    827 extern float4 __attribute__((overloadable)) radians(float4 degrees) {
    828     return degrees * (M_PI / 180.f);
    829 }
    830 
    831 extern float __attribute__((overloadable)) step(float edge, float v) {
    832     return (v < edge) ? 0.f : 1.f;
    833 }
    834 extern float2 __attribute__((overloadable)) step(float2 edge, float2 v) {
    835     float2 r;
    836     r.x = (v.x < edge.x) ? 0.f : 1.f;
    837     r.y = (v.y < edge.y) ? 0.f : 1.f;
    838     return r;
    839 }
    840 extern float3 __attribute__((overloadable)) step(float3 edge, float3 v) {
    841     float3 r;
    842     r.x = (v.x < edge.x) ? 0.f : 1.f;
    843     r.y = (v.y < edge.y) ? 0.f : 1.f;
    844     r.z = (v.z < edge.z) ? 0.f : 1.f;
    845     return r;
    846 }
    847 extern float4 __attribute__((overloadable)) step(float4 edge, float4 v) {
    848     float4 r;
    849     r.x = (v.x < edge.x) ? 0.f : 1.f;
    850     r.y = (v.y < edge.y) ? 0.f : 1.f;
    851     r.z = (v.z < edge.z) ? 0.f : 1.f;
    852     r.w = (v.w < edge.w) ? 0.f : 1.f;
    853     return r;
    854 }
    855 extern float2 __attribute__((overloadable)) step(float2 edge, float v) {
    856     float2 r;
    857     r.x = (v < edge.x) ? 0.f : 1.f;
    858     r.y = (v < edge.y) ? 0.f : 1.f;
    859     return r;
    860 }
    861 extern float3 __attribute__((overloadable)) step(float3 edge, float v) {
    862     float3 r;
    863     r.x = (v < edge.x) ? 0.f : 1.f;
    864     r.y = (v < edge.y) ? 0.f : 1.f;
    865     r.z = (v < edge.z) ? 0.f : 1.f;
    866     return r;
    867 }
    868 extern float4 __attribute__((overloadable)) step(float4 edge, float v) {
    869     float4 r;
    870     r.x = (v < edge.x) ? 0.f : 1.f;
    871     r.y = (v < edge.y) ? 0.f : 1.f;
    872     r.z = (v < edge.z) ? 0.f : 1.f;
    873     r.w = (v < edge.w) ? 0.f : 1.f;
    874     return r;
    875 }
    876 extern float2 __attribute__((overloadable)) step(float edge, float2 v) {
    877     float2 r;
    878     r.x = (v.x < edge) ? 0.f : 1.f;
    879     r.y = (v.y < edge) ? 0.f : 1.f;
    880     return r;
    881 }
    882 extern float3 __attribute__((overloadable)) step(float edge, float3 v) {
    883     float3 r;
    884     r.x = (v.x < edge) ? 0.f : 1.f;
    885     r.y = (v.y < edge) ? 0.f : 1.f;
    886     r.z = (v.z < edge) ? 0.f : 1.f;
    887     return r;
    888 }
    889 extern float4 __attribute__((overloadable)) step(float edge, float4 v) {
    890     float4 r;
    891     r.x = (v.x < edge) ? 0.f : 1.f;
    892     r.y = (v.y < edge) ? 0.f : 1.f;
    893     r.z = (v.z < edge) ? 0.f : 1.f;
    894     r.w = (v.w < edge) ? 0.f : 1.f;
    895     return r;
    896 }
    897 
// smoothstep prototypes only; no bodies are given here. The first four take
// all three arguments at the same width, the last three take two scalar
// floats followed by a vector.
extern float __attribute__((overloadable)) smoothstep(float, float, float);
extern float2 __attribute__((overloadable)) smoothstep(float2, float2, float2);
extern float3 __attribute__((overloadable)) smoothstep(float3, float3, float3);
extern float4 __attribute__((overloadable)) smoothstep(float4, float4, float4);
extern float2 __attribute__((overloadable)) smoothstep(float, float, float2);
extern float3 __attribute__((overloadable)) smoothstep(float, float, float3);
extern float4 __attribute__((overloadable)) smoothstep(float, float, float4);
    905 
    906 extern float __attribute__((overloadable)) sign(float v) {
    907     if (v > 0) return 1.f;
    908     if (v < 0) return -1.f;
    909     return v;
    910 }
    911 FN_FUNC_FN(sign)
    912 
    913 
    914 // 6.11.5
    915 extern float3 __attribute__((overloadable)) cross(float3 lhs, float3 rhs) {
    916     float3 r;
    917     r.x = lhs.y * rhs.z  - lhs.z * rhs.y;
    918     r.y = lhs.z * rhs.x  - lhs.x * rhs.z;
    919     r.z = lhs.x * rhs.y  - lhs.y * rhs.x;
    920     return r;
    921 }
    922 
    923 extern float4 __attribute__((overloadable)) cross(float4 lhs, float4 rhs) {
    924     float4 r;
    925     r.x = lhs.y * rhs.z  - lhs.z * rhs.y;
    926     r.y = lhs.z * rhs.x  - lhs.x * rhs.z;
    927     r.z = lhs.x * rhs.y  - lhs.y * rhs.x;
    928     r.w = 0.f;
    929     return r;
    930 }
    931 
    932 #if (!defined(__i386__) && !defined(__x86_64__)) || defined(RS_DEBUG_RUNTIME)
    933 // These functions must be defined here if we are not using the SSE
    934 // implementation, which includes when we are built as part of the
    935 // debug runtime (libclcore_debug.bc).
    936 
    937 extern float __attribute__((overloadable)) dot(float lhs, float rhs) {
    938     return lhs * rhs;
    939 }
    940 extern float __attribute__((overloadable)) dot(float2 lhs, float2 rhs) {
    941     return lhs.x*rhs.x + lhs.y*rhs.y;
    942 }
    943 extern float __attribute__((overloadable)) dot(float3 lhs, float3 rhs) {
    944     return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z;
    945 }
    946 extern float __attribute__((overloadable)) dot(float4 lhs, float4 rhs) {
    947     return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z + lhs.w*rhs.w;
    948 }
    949 
    950 extern float __attribute__((overloadable)) length(float v) {
    951     return fabs(v);
    952 }
    953 extern float __attribute__((overloadable)) length(float2 v) {
    954     return sqrt(v.x*v.x + v.y*v.y);
    955 }
    956 extern float __attribute__((overloadable)) length(float3 v) {
    957     return sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
    958 }
    959 extern float __attribute__((overloadable)) length(float4 v) {
    960     return sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
    961 }
    962 
    963 #else
    964 
// Prototypes only: in this branch (x86 builds that are not the debug
// runtime) the bodies are not defined in this file; per the comment on the
// #if above they are expected to come from the SSE implementation.
extern float __attribute__((overloadable)) length(float v);
extern float __attribute__((overloadable)) length(float2 v);
extern float __attribute__((overloadable)) length(float3 v);
extern float __attribute__((overloadable)) length(float4 v);
    969 
    970 #endif // (!defined(__i386__) && !defined(__x86_64__)) || defined(RS_DEBUG_RUNTIME)
    971 
    972 extern float __attribute__((overloadable)) distance(float lhs, float rhs) {
    973     return length(lhs - rhs);
    974 }
    975 extern float __attribute__((overloadable)) distance(float2 lhs, float2 rhs) {
    976     return length(lhs - rhs);
    977 }
    978 extern float __attribute__((overloadable)) distance(float3 lhs, float3 rhs) {
    979     return length(lhs - rhs);
    980 }
    981 extern float __attribute__((overloadable)) distance(float4 lhs, float4 rhs) {
    982     return length(lhs - rhs);
    983 }
    984 
    985 /* For the normalization functions, vectors of length 0 should simply be
    986  * returned (i.e. all the components of that vector are 0).
    987  */
    988 extern float __attribute__((overloadable)) normalize(float v) {
    989     if (v == 0.0f) {
    990         return 0.0f;
    991     } else if (v < 0.0f) {
    992         return -1.0f;
    993     } else {
    994         return 1.0f;
    995     }
    996 }
    997 extern float2 __attribute__((overloadable)) normalize(float2 v) {
    998     float l = length(v);
    999     return l == 0.0f ? v : v / l;
   1000 }
   1001 extern float3 __attribute__((overloadable)) normalize(float3 v) {
   1002     float l = length(v);
   1003     return l == 0.0f ? v : v / l;
   1004 }
   1005 extern float4 __attribute__((overloadable)) normalize(float4 v) {
   1006     float l = length(v);
   1007     return l == 0.0f ? v : v / l;
   1008 }
   1009 
   1010 extern float __attribute__((overloadable)) half_sqrt(float v) {
   1011     return sqrt(v);
   1012 }
   1013 FN_FUNC_FN(half_sqrt)
   1014 
   1015 extern float __attribute__((overloadable)) fast_length(float v) {
   1016     return fabs(v);
   1017 }
   1018 extern float __attribute__((overloadable)) fast_length(float2 v) {
   1019     return half_sqrt(v.x*v.x + v.y*v.y);
   1020 }
   1021 extern float __attribute__((overloadable)) fast_length(float3 v) {
   1022     return half_sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
   1023 }
   1024 extern float __attribute__((overloadable)) fast_length(float4 v) {
   1025     return half_sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
   1026 }
   1027 
   1028 extern float __attribute__((overloadable)) fast_distance(float lhs, float rhs) {
   1029     return fast_length(lhs - rhs);
   1030 }
   1031 extern float __attribute__((overloadable)) fast_distance(float2 lhs, float2 rhs) {
   1032     return fast_length(lhs - rhs);
   1033 }
   1034 extern float __attribute__((overloadable)) fast_distance(float3 lhs, float3 rhs) {
   1035     return fast_length(lhs - rhs);
   1036 }
   1037 extern float __attribute__((overloadable)) fast_distance(float4 lhs, float4 rhs) {
   1038     return fast_length(lhs - rhs);
   1039 }
   1040 
// Prototype only; no body is given here. Used below by the vector
// fast_normalize overloads.
extern float __attribute__((overloadable)) half_rsqrt(float);
   1042 
   1043 /* For the normalization functions, vectors of length 0 should simply be
   1044  * returned (i.e. all the components of that vector are 0).
   1045  */
   1046 extern float __attribute__((overloadable)) fast_normalize(float v) {
   1047     if (v == 0.0f) {
   1048         return 0.0f;
   1049     } else if (v < 0.0f) {
   1050         return -1.0f;
   1051     } else {
   1052         return 1.0f;
   1053     }
   1054 }
   1055 // If the length is 0, then rlength should be NaN.
   1056 extern float2 __attribute__((overloadable)) fast_normalize(float2 v) {
   1057     float rlength = half_rsqrt(v.x*v.x + v.y*v.y);
   1058     return (rlength == rlength) ? v * rlength : v;
   1059 }
   1060 extern float3 __attribute__((overloadable)) fast_normalize(float3 v) {
   1061     float rlength = half_rsqrt(v.x*v.x + v.y*v.y + v.z*v.z);
   1062     return (rlength == rlength) ? v * rlength : v;
   1063 }
   1064 extern float4 __attribute__((overloadable)) fast_normalize(float4 v) {
   1065     float rlength = half_rsqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
   1066     return (rlength == rlength) ? v * rlength : v;
   1067 }
   1068 
   1069 extern float __attribute__((overloadable)) half_recip(float v) {
   1070     return 1.f / v;
   1071 }
   1072 
   1073 /*
   1074 extern float __attribute__((overloadable)) approx_atan(float x) {
   1075     if (x == 0.f)
   1076         return 0.f;
   1077     if (x < 0.f)
   1078         return -1.f * approx_atan(-1.f * x);
   1079     if (x > 1.f)
   1080         return M_PI_2 - approx_atan(approx_recip(x));
   1081     return x * approx_recip(1.f + 0.28f * x*x);
   1082 }
   1083 FN_FUNC_FN(approx_atan)
   1084 */
   1085 
   1086 typedef union
   1087 {
   1088   float fv;
   1089   int32_t iv;
   1090 } ieee_float_shape_type;
   1091 
/* Get a 32 bit int from a float: stores d into the fv member of a temporary
 * ieee_float_shape_type and assigns the iv member of that same union to i. */

#define GET_FLOAT_WORD(i,d)                 \
do {                                \
  ieee_float_shape_type gf_u;                   \
  gf_u.fv = (d);                     \
  (i) = gf_u.iv;                      \
} while (0)
   1100 
/* Set a float from a 32 bit int: stores i into the iv member of a temporary
 * ieee_float_shape_type and assigns the fv member of that same union to d. */

#define SET_FLOAT_WORD(d,i)                 \
do {                                \
  ieee_float_shape_type sf_u;                   \
  sf_u.iv = (i);                      \
  (d) = sf_u.fv;                     \
} while (0)
   1109 
   1110 
   1111 
// Approximate 2^v. Valid -125 to 125.
// v is split into an integer part x and a remainder r = v - x. 2^x is built
// directly by writing (x + 127) << 23 as the bit pattern of a float, and the
// remainder is handled by the polynomial 1 + t + t^2/2 + t^3/6 + t^4/24 with
// t = r * 0.694; the two factors are multiplied together.
extern float __attribute__((overloadable)) native_exp2(float v) {
    int32_t iv = (int)v;
    // Subtracts 1 when iv is negative.
    // NOTE(review): assumes >> on a negative int is an arithmetic shift — confirm for the target.
    int32_t x = iv + (iv >> 31); // ~floor(v)
    float r = (v - x);

    float fo;
    SET_FLOAT_WORD(fo, (x + 127) << 23);

    r *= 0.694f; // ~ ln(2), i.e. log(2) / log(e)
    float r2 = r*r;
    float adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
    return fo * adj;
}
   1126 
// Two-component version of the scalar native_exp2: same split into an
// integer part x and a remainder r, same polynomial, applied to both
// components at once.
extern float2 __attribute__((overloadable)) native_exp2(float2 v) {
    int2 iv = convert_int2(v);
    int2 x = iv + (iv >> (int2)31);//floor(v);
    float2 r = (v - convert_float2(x));

    x += 127;

    // NOTE(review): the scalar overload reinterprets these integer bits as a
    // float via SET_FLOAT_WORD; this presumably relies on the vector cast
    // doing the same (a bit reinterpretation, not a value conversion) — confirm.
    float2 fo = (float2)(x << (int2)23);

    r *= 0.694f; // ~ ln(2), i.e. log(2) / log(e)
    float2 r2 = r*r;
    float2 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
    return fo * adj;
}
   1141 
// Four-component version of the scalar native_exp2: same split into an
// integer part x and a remainder r, same polynomial, applied to all four
// components at once.
extern float4 __attribute__((overloadable)) native_exp2(float4 v) {
    int4 iv = convert_int4(v);
    int4 x = iv + (iv >> (int4)31);//floor(v);
    float4 r = (v - convert_float4(x));

    x += 127;

    // NOTE(review): the scalar overload reinterprets these integer bits as a
    // float via SET_FLOAT_WORD; this presumably relies on the vector cast
    // doing the same (a bit reinterpretation, not a value conversion) — confirm.
    float4 fo = (float4)(x << (int4)23);

    r *= 0.694f; // ~ ln(2), i.e. log(2) / log(e)
    float4 r2 = r*r;
    float4 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
    return fo * adj;
}
   1156 
   1157 extern float3 __attribute__((overloadable)) native_exp2(float3 v) {
   1158     float4 t = 1.f;
   1159     t.xyz = v;
   1160     return native_exp2(t).xyz;
   1161 }
   1162 
   1163 
   1164 extern float __attribute__((overloadable)) native_exp(float v) {
   1165     return native_exp2(v * 1.442695041f);
   1166 }
   1167 extern float2 __attribute__((overloadable)) native_exp(float2 v) {
   1168     return native_exp2(v * 1.442695041f);
   1169 }
   1170 extern float3 __attribute__((overloadable)) native_exp(float3 v) {
   1171     return native_exp2(v * 1.442695041f);
   1172 }
   1173 extern float4 __attribute__((overloadable)) native_exp(float4 v) {
   1174     return native_exp2(v * 1.442695041f);
   1175 }
   1176 
   1177 extern float __attribute__((overloadable)) native_exp10(float v) {
   1178     return native_exp2(v * 3.321928095f);
   1179 }
   1180 extern float2 __attribute__((overloadable)) native_exp10(float2 v) {
   1181     return native_exp2(v * 3.321928095f);
   1182 }
   1183 extern float3 __attribute__((overloadable)) native_exp10(float3 v) {
   1184     return native_exp2(v * 3.321928095f);
   1185 }
   1186 extern float4 __attribute__((overloadable)) native_exp10(float4 v) {
   1187     return native_exp2(v * 3.321928095f);
   1188 }
   1189 
// Approximate log2(v) by working on the bit pattern of v.
// The 8-bit field at bits 23..30 is taken as the biased exponent e. The low
// 23 bits are kept and the exponent field is overwritten with 127, giving a
// float ir; after subtracting 1.5 a 6th-order polynomial in ir is evaluated.
// Its coefficients are the Taylor terms of ln(x) around x = 1.5
// (0.405465108 ~ ln(1.5)), each divided by 0.693147181 ~ ln(2) to convert to
// base 2. The result is (e - 127) plus that polynomial.
extern float __attribute__((overloadable)) native_log2(float v) {
    int32_t ibits;
    GET_FLOAT_WORD(ibits, v);

    // Biased exponent field.
    int32_t e = (ibits >> 23) & 0xff;

    // Keep the low 23 bits and force the exponent field to 127.
    ibits &= 0x7fffff;
    ibits |= 127 << 23;

    float ir;
    SET_FLOAT_WORD(ir, ibits);
    ir -= 1.5f;
    float ir2 = ir*ir;
    float adj2 = (0.405465108f / 0.693147181f) +
                 ((0.666666667f / 0.693147181f) * ir) -
                 ((0.222222222f / 0.693147181f) * ir2) +
                 ((0.098765432f / 0.693147181f) * ir*ir2) -
                 ((0.049382716f / 0.693147181f) * ir2*ir2) +
                 ((0.026337449f / 0.693147181f) * ir*ir2*ir2) -
                 ((0.014631916f / 0.693147181f) * ir2*ir2*ir2);
    return (float)(e - 127) + adj2;
}
   1212 extern float2 __attribute__((overloadable)) native_log2(float2 v) {
   1213     float2 v2 = {native_log2(v.x), native_log2(v.y)};
   1214     return v2;
   1215 }
   1216 extern float3 __attribute__((overloadable)) native_log2(float3 v) {
   1217     float3 v2 = {native_log2(v.x), native_log2(v.y), native_log2(v.z)};
   1218     return v2;
   1219 }
   1220 extern float4 __attribute__((overloadable)) native_log2(float4 v) {
   1221     float4 v2 = {native_log2(v.x), native_log2(v.y), native_log2(v.z), native_log2(v.w)};
   1222     return v2;
   1223 }
   1224 
   1225 extern float __attribute__((overloadable)) native_log(float v) {
   1226     return native_log2(v) * (1.f / 1.442695041f);
   1227 }
   1228 extern float2 __attribute__((overloadable)) native_log(float2 v) {
   1229     return native_log2(v) * (1.f / 1.442695041f);
   1230 }
   1231 extern float3 __attribute__((overloadable)) native_log(float3 v) {
   1232     return native_log2(v) * (1.f / 1.442695041f);
   1233 }
   1234 extern float4 __attribute__((overloadable)) native_log(float4 v) {
   1235     return native_log2(v) * (1.f / 1.442695041f);
   1236 }
   1237 
   1238 extern float __attribute__((overloadable)) native_log10(float v) {
   1239     return native_log2(v) * (1.f / 3.321928095f);
   1240 }
   1241 extern float2 __attribute__((overloadable)) native_log10(float2 v) {
   1242     return native_log2(v) * (1.f / 3.321928095f);
   1243 }
   1244 extern float3 __attribute__((overloadable)) native_log10(float3 v) {
   1245     return native_log2(v) * (1.f / 3.321928095f);
   1246 }
   1247 extern float4 __attribute__((overloadable)) native_log10(float4 v) {
   1248     return native_log2(v) * (1.f / 3.321928095f);
   1249 }
   1250 
   1251 
   1252 extern float __attribute__((overloadable)) native_powr(float v, float y) {
   1253     float v2 = native_log2(v);
   1254     v2 = fmax(v2 * y, -125.f);
   1255     return native_exp2(v2);
   1256 }
   1257 extern float2 __attribute__((overloadable)) native_powr(float2 v, float2 y) {
   1258     float2 v2 = native_log2(v);
   1259     v2 = fmax(v2 * y, -125.f);
   1260     return native_exp2(v2);
   1261 }
   1262 extern float3 __attribute__((overloadable)) native_powr(float3 v, float3 y) {
   1263     float3 v2 = native_log2(v);
   1264     v2 = fmax(v2 * y, -125.f);
   1265     return native_exp2(v2);
   1266 }
   1267 extern float4 __attribute__((overloadable)) native_powr(float4 v, float4 y) {
   1268     float4 v2 = native_log2(v);
   1269     v2 = fmax(v2 * y, -125.f);
   1270     return native_exp2(v2);
   1271 }
   1272 
   1273 extern double __attribute__((overloadable)) min(double v1, double v2) {
   1274     return v1 < v2 ? v1 : v2;
   1275 }
   1276 
   1277 extern double2 __attribute__((overloadable)) min(double2 v1, double2 v2) {
   1278     double2 r;
   1279     r.x = v1.x < v2.x ? v1.x : v2.x;
   1280     r.y = v1.y < v2.y ? v1.y : v2.y;
   1281     return r;
   1282 }
   1283 
   1284 extern double3 __attribute__((overloadable)) min(double3 v1, double3 v2) {
   1285     double3 r;
   1286     r.x = v1.x < v2.x ? v1.x : v2.x;
   1287     r.y = v1.y < v2.y ? v1.y : v2.y;
   1288     r.z = v1.z < v2.z ? v1.z : v2.z;
   1289     return r;
   1290 }
   1291 
   1292 extern double4 __attribute__((overloadable)) min(double4 v1, double4 v2) {
   1293     double4 r;
   1294     r.x = v1.x < v2.x ? v1.x : v2.x;
   1295     r.y = v1.y < v2.y ? v1.y : v2.y;
   1296     r.z = v1.z < v2.z ? v1.z : v2.z;
   1297     r.w = v1.w < v2.w ? v1.w : v2.w;
   1298     return r;
   1299 }
   1300 
   1301 extern long __attribute__((overloadable)) min(long v1, long v2) {
   1302     return v1 < v2 ? v1 : v2;
   1303 }
   1304 extern long2 __attribute__((overloadable)) min(long2 v1, long2 v2) {
   1305     long2 r;
   1306     r.x = v1.x < v2.x ? v1.x : v2.x;
   1307     r.y = v1.y < v2.y ? v1.y : v2.y;
   1308     return r;
   1309 }
   1310 extern long3 __attribute__((overloadable)) min(long3 v1, long3 v2) {
   1311     long3 r;
   1312     r.x = v1.x < v2.x ? v1.x : v2.x;
   1313     r.y = v1.y < v2.y ? v1.y : v2.y;
   1314     r.z = v1.z < v2.z ? v1.z : v2.z;
   1315     return r;
   1316 }
   1317 extern long4 __attribute__((overloadable)) min(long4 v1, long4 v2) {
   1318     long4 r;
   1319     r.x = v1.x < v2.x ? v1.x : v2.x;
   1320     r.y = v1.y < v2.y ? v1.y : v2.y;
   1321     r.z = v1.z < v2.z ? v1.z : v2.z;
   1322     r.w = v1.w < v2.w ? v1.w : v2.w;
   1323     return r;
   1324 }
   1325 
   1326 extern ulong __attribute__((overloadable)) min(ulong v1, ulong v2) {
   1327     return v1 < v2 ? v1 : v2;
   1328 }
   1329 extern ulong2 __attribute__((overloadable)) min(ulong2 v1, ulong2 v2) {
   1330     ulong2 r;
   1331     r.x = v1.x < v2.x ? v1.x : v2.x;
   1332     r.y = v1.y < v2.y ? v1.y : v2.y;
   1333     return r;
   1334 }
   1335 extern ulong3 __attribute__((overloadable)) min(ulong3 v1, ulong3 v2) {
   1336     ulong3 r;
   1337     r.x = v1.x < v2.x ? v1.x : v2.x;
   1338     r.y = v1.y < v2.y ? v1.y : v2.y;
   1339     r.z = v1.z < v2.z ? v1.z : v2.z;
   1340     return r;
   1341 }
   1342 extern ulong4 __attribute__((overloadable)) min(ulong4 v1, ulong4 v2) {
   1343     ulong4 r;
   1344     r.x = v1.x < v2.x ? v1.x : v2.x;
   1345     r.y = v1.y < v2.y ? v1.y : v2.y;
   1346     r.z = v1.z < v2.z ? v1.z : v2.z;
   1347     r.w = v1.w < v2.w ? v1.w : v2.w;
   1348     return r;
   1349 }
   1350 
   1351 extern double __attribute__((overloadable)) max(double v1, double v2) {
   1352     return v1 > v2 ? v1 : v2;
   1353 }
   1354 
   1355 extern double2 __attribute__((overloadable)) max(double2 v1, double2 v2) {
   1356     double2 r;
   1357     r.x = v1.x > v2.x ? v1.x : v2.x;
   1358     r.y = v1.y > v2.y ? v1.y : v2.y;
   1359     return r;
   1360 }
   1361 
   1362 extern double3 __attribute__((overloadable)) max(double3 v1, double3 v2) {
   1363     double3 r;
   1364     r.x = v1.x > v2.x ? v1.x : v2.x;
   1365     r.y = v1.y > v2.y ? v1.y : v2.y;
   1366     r.z = v1.z > v2.z ? v1.z : v2.z;
   1367     return r;
   1368 }
   1369 
   1370 extern double4 __attribute__((overloadable)) max(double4 v1, double4 v2) {
   1371     double4 r;
   1372     r.x = v1.x > v2.x ? v1.x : v2.x;
   1373     r.y = v1.y > v2.y ? v1.y : v2.y;
   1374     r.z = v1.z > v2.z ? v1.z : v2.z;
   1375     r.w = v1.w > v2.w ? v1.w : v2.w;
   1376     return r;
   1377 }
   1378 
   1379 extern long __attribute__((overloadable)) max(long v1, long v2) {
   1380     return v1 > v2 ? v1 : v2;
   1381 }
   1382 extern long2 __attribute__((overloadable)) max(long2 v1, long2 v2) {
   1383     long2 r;
   1384     r.x = v1.x > v2.x ? v1.x : v2.x;
   1385     r.y = v1.y > v2.y ? v1.y : v2.y;
   1386     return r;
   1387 }
   1388 extern long3 __attribute__((overloadable)) max(long3 v1, long3 v2) {
   1389     long3 r;
   1390     r.x = v1.x > v2.x ? v1.x : v2.x;
   1391     r.y = v1.y > v2.y ? v1.y : v2.y;
   1392     r.z = v1.z > v2.z ? v1.z : v2.z;
   1393     return r;
   1394 }
   1395 extern long4 __attribute__((overloadable)) max(long4 v1, long4 v2) {
   1396     long4 r;
   1397     r.x = v1.x > v2.x ? v1.x : v2.x;
   1398     r.y = v1.y > v2.y ? v1.y : v2.y;
   1399     r.z = v1.z > v2.z ? v1.z : v2.z;
   1400     r.w = v1.w > v2.w ? v1.w : v2.w;
   1401     return r;
   1402 }
   1403 
   1404 extern ulong __attribute__((overloadable)) max(ulong v1, ulong v2) {
   1405     return v1 > v2 ? v1 : v2;
   1406 }
   1407 extern ulong2 __attribute__((overloadable)) max(ulong2 v1, ulong2 v2) {
   1408     ulong2 r;
   1409     r.x = v1.x > v2.x ? v1.x : v2.x;
   1410     r.y = v1.y > v2.y ? v1.y : v2.y;
   1411     return r;
   1412 }
   1413 extern ulong3 __attribute__((overloadable)) max(ulong3 v1, ulong3 v2) {
   1414     ulong3 r;
   1415     r.x = v1.x > v2.x ? v1.x : v2.x;
   1416     r.y = v1.y > v2.y ? v1.y : v2.y;
   1417     r.z = v1.z > v2.z ? v1.z : v2.z;
   1418     return r;
   1419 }
   1420 extern ulong4 __attribute__((overloadable)) max(ulong4 v1, ulong4 v2) {
   1421     ulong4 r;
   1422     r.x = v1.x > v2.x ? v1.x : v2.x;
   1423     r.y = v1.y > v2.y ? v1.y : v2.y;
   1424     r.z = v1.z > v2.z ? v1.z : v2.z;
   1425     r.w = v1.w > v2.w ? v1.w : v2.w;
   1426     return r;
   1427 }
   1428 
// THUNK_NATIVE_F(fn): defines native_<fn> for float, float2, float3 and
// float4; each overload simply forwards its single argument to fn.
#define THUNK_NATIVE_F(fn) \
    float __attribute__((overloadable)) native_##fn(float v) { return fn(v);} \
    float2 __attribute__((overloadable)) native_##fn(float2 v) { return fn(v);} \
    float3 __attribute__((overloadable)) native_##fn(float3 v) { return fn(v);} \
    float4 __attribute__((overloadable)) native_##fn(float4 v) { return fn(v);}
   1434 
// THUNK_NATIVE_F_F(fn): defines native_<fn> taking two float arguments of the
// same width (scalar through float4); each overload forwards both to fn.
#define THUNK_NATIVE_F_F(fn) \
    float __attribute__((overloadable)) native_##fn(float v1, float v2) { return fn(v1, v2);} \
    float2 __attribute__((overloadable)) native_##fn(float2 v1, float2 v2) { return fn(v1, v2);} \
    float3 __attribute__((overloadable)) native_##fn(float3 v1, float3 v2) { return fn(v1, v2);} \
    float4 __attribute__((overloadable)) native_##fn(float4 v1, float4 v2) { return fn(v1, v2);}
   1440 
   1441 #define THUNK_NATIVE_F_FP(fn) \
   1442     float __attribute__((overloadable)) native_##fn(float v1, float *v2) { return fn(v1, v2);} \
   1443     float2 __attribute__((overloadable)) native_##fn(float2 v1, float2 *v2) { return fn(v1, v2);} \
   1444     float3 __attribute__((overloadable)) native_##fn(float3 v1, float3 *v2) { return fn(v1, v2);} \
   1445     float4 __attribute__((overloadable)) native_##fn(float4 v1, float4 *v2) { return fn(v1, v2);}
   1446 
   1447 #define THUNK_NATIVE_F_I(fn) \
   1448     float __attribute__((overloadable)) native_##fn(float v1, int v2) { return fn(v1, v2);} \
   1449     float2 __attribute__((overloadable)) native_##fn(float2 v1, int2 v2) { return fn(v1, v2);} \
   1450     float3 __attribute__((overloadable)) native_##fn(float3 v1, int3 v2) { return fn(v1, v2);} \
   1451     float4 __attribute__((overloadable)) native_##fn(float4 v1, int4 v2) { return fn(v1, v2);}
   1452 
   1453 THUNK_NATIVE_F(acos)
   1454 THUNK_NATIVE_F(acosh)
   1455 THUNK_NATIVE_F(acospi)
   1456 THUNK_NATIVE_F(asin)
   1457 THUNK_NATIVE_F(asinh)
   1458 THUNK_NATIVE_F(asinpi)
   1459 THUNK_NATIVE_F(atan)
   1460 THUNK_NATIVE_F_F(atan2)
   1461 THUNK_NATIVE_F(atanh)
   1462 THUNK_NATIVE_F(atanpi)
   1463 THUNK_NATIVE_F_F(atan2pi)
   1464 THUNK_NATIVE_F(cbrt)
   1465 THUNK_NATIVE_F(cos)
   1466 THUNK_NATIVE_F(cosh)
   1467 THUNK_NATIVE_F(cospi)
   1468 THUNK_NATIVE_F(expm1)
   1469 THUNK_NATIVE_F_F(hypot)
   1470 THUNK_NATIVE_F(log1p)
   1471 THUNK_NATIVE_F_I(rootn)
   1472 THUNK_NATIVE_F(rsqrt)
   1473 THUNK_NATIVE_F(sqrt)
   1474 THUNK_NATIVE_F(sin)
   1475 THUNK_NATIVE_F_FP(sincos)
   1476 THUNK_NATIVE_F(sinh)
   1477 THUNK_NATIVE_F(sinpi)
   1478 THUNK_NATIVE_F(tan)
   1479 THUNK_NATIVE_F(tanh)
   1480 THUNK_NATIVE_F(tanpi)
   1481 
   1482 #undef THUNK_NATIVE_F
   1483 #undef THUNK_NATIVE_F_F
   1484 #undef THUNK_NATIVE_F_I
   1485 #undef THUNK_NATIVE_F_FP
   1486 
   1487 float __attribute__((overloadable)) native_normalize(float v) { return fast_normalize(v);}
   1488 float2 __attribute__((overloadable)) native_normalize(float2 v) { return fast_normalize(v);}
   1489 float3 __attribute__((overloadable)) native_normalize(float3 v) { return fast_normalize(v);}
   1490 float4 __attribute__((overloadable)) native_normalize(float4 v) { return fast_normalize(v);}
   1491 
   1492 float __attribute__((overloadable)) native_distance(float v1, float v2) { return fast_distance(v1, v2);}
   1493 float __attribute__((overloadable)) native_distance(float2 v1, float2 v2) { return fast_distance(v1, v2);}
   1494 float __attribute__((overloadable)) native_distance(float3 v1, float3 v2) { return fast_distance(v1, v2);}
   1495 float __attribute__((overloadable)) native_distance(float4 v1, float4 v2) { return fast_distance(v1, v2);}
   1496 
   1497 float __attribute__((overloadable)) native_length(float v) { return fast_length(v);}
   1498 float __attribute__((overloadable)) native_length(float2 v) { return fast_length(v);}
   1499 float __attribute__((overloadable)) native_length(float3 v) { return fast_length(v);}
   1500 float __attribute__((overloadable)) native_length(float4 v) { return fast_length(v);}
   1501 
   1502 float __attribute__((overloadable)) native_divide(float v1, float v2) { return v1 / v2;}
   1503 float2 __attribute__((overloadable)) native_divide(float2 v1, float2 v2) { return v1 / v2;}
   1504 float3 __attribute__((overloadable)) native_divide(float3 v1, float3 v2) { return v1 / v2;}
   1505 float4 __attribute__((overloadable)) native_divide(float4 v1, float4 v2) { return v1 / v2;}
   1506 
   1507 float __attribute__((overloadable)) native_recip(float v) { return 1.f / v;}
   1508 float2 __attribute__((overloadable)) native_recip(float2 v) { return ((float2)1.f) / v;}
   1509 float3 __attribute__((overloadable)) native_recip(float3 v) { return ((float3)1.f) / v;}
   1510 float4 __attribute__((overloadable)) native_recip(float4 v) { return ((float4)1.f) / v;}
   1511 
   1512 
   1513 
   1514 
   1515 
   1516 #undef FN_FUNC_FN
   1517 #undef IN_FUNC_FN
   1518 #undef FN_FUNC_FN_FN
   1519 #undef FN_FUNC_FN_F
   1520 #undef FN_FUNC_FN_IN
   1521 #undef FN_FUNC_FN_I
   1522 #undef FN_FUNC_FN_PFN
   1523 #undef FN_FUNC_FN_PIN
   1524 #undef FN_FUNC_FN_FN_FN
   1525 #undef FN_FUNC_FN_FN_PIN
   1526 #undef XN_FUNC_YN
   1527 #undef UIN_FUNC_IN
   1528 #undef IN_FUNC_IN
   1529 #undef XN_FUNC_XN_XN_BODY
   1530 #undef IN_FUNC_IN_IN_BODY
   1531