Home | History | Annotate | Download | only in arm
      1 /*
      2  * Copyright (C) 2011 University of Szeged
      3  * Copyright (C) 2011 Zoltan Herczeg
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  *
     14  * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
     15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
     18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25  */
     26 
     27 #include "config.h"
     28 #include "FELightingNEON.h"
     29 
     30 #if CPU(ARM_NEON) && COMPILER(GCC)
     31 
     32 #include <wtf/Vector.h>
     33 
     34 namespace WebCore {
     35 
     36 // These constants are copied to the following SIMD registers:
     37 //   ALPHAX_Q ALPHAY_Q REMAPX_D REMAPY_D
        // The table is laid out in that order: two rows of eight 16-bit kernel
        // weights (one Q register each) followed by two rows of four 16-bit
        // remapping values (one D register each).
        // NOTE(review): presumably reaches the assembly through the pointer stored
        // at DRAWING_CONSTANTS_OFFSET of the parameter block - confirm at the caller.
     38 
     39 WTF_ALIGNED(short, s_FELightingConstantsForNeon[], 16) = {
     40     // Alpha coefficients.
            // The first row is multiplied lane-wise with the eight gathered alpha
            // values and summed to give the X component of the normal vector, the
            // second row gives the Y component.
     41     -2, 1, 0, -1, 2, 1, 0, -1,
     42     0, -1, -2, -1, 0, 1, 2, 1,
     43     // Remapping indices.
            // Each 16-bit entry packs two consecutive byte indices (low byte first)
            // that vtbl.8 uses to pick one 16-bit alpha lane out of the 24 bytes
            // formed by the three row registers.
     44     0x0f0e, 0x0302, 0x0504, 0x0706,
     45     0x0b0a, 0x1312, 0x1514, 0x1716,
     46 };
     47 
        // Hands out the NEON constant table (kernel weights followed by the
        // remapping indices) as a pointer to its first element.
     48 short* feLightingConstantsForNeon()
     49 {
     50     return &s_FELightingConstantsForNeon[0];
     51 }
     52 
        // Two-level stringification: TOSTRING lets its argument be macro-expanded
        // before ASSTRING turns it into a string literal, so a macro argument ends
        // up in the assembly text as its expansion rather than its name.
     53 #define ASSTRING(str) #str
     54 #define TOSTRING(value) ASSTRING(value)
     55 
        // Byte offsets (as string literals) of the fields that the assembly reads
        // from the parameter block whose address arrives in r0.
        // NOTE(review): the block's layout is declared outside this file
        // (presumably in FELightingNEON.h); these values must stay in sync with it.
     56 #define PIXELS_OFFSET TOSTRING(0)
     57 #define WIDTH_OFFSET TOSTRING(4)
     58 #define HEIGHT_OFFSET TOSTRING(8)
     59 #define FLAGS_OFFSET TOSTRING(12)
     60 #define SPECULAR_EXPONENT_OFFSET TOSTRING(16)
     61 #define CONE_EXPONENT_OFFSET TOSTRING(20)
     62 #define FLOAT_ARGUMENTS_OFFSET TOSTRING(24)
     63 #define DRAWING_CONSTANTS_OFFSET TOSTRING(28)
        // Statement terminator appended to every line of assembly text.
     64 #define NL "\n"
     65 
     66 // Register allocation
        // Every macro below expands to a register name as a string literal so that
        // it can be concatenated into the assembly text.
        //
        // Core registers. RESET_WIDTH_R aliases PAINTING_DATA_R: once the arguments
        // have been loaded the assembly overwrites that register with the row width
        // and later uses it to reload WIDTH_R at the start of each row.
     67 #define PAINTING_DATA_R       "r11"
     68 #define RESET_WIDTH_R         PAINTING_DATA_R
     69 #define PIXELS_R              "r4"
     70 #define WIDTH_R               "r5"
     71 #define HEIGHT_R              "r6"
     72 #define FLAGS_R               "r7"
     73 #define SPECULAR_EXPONENT_R   "r8"
     74 #define CONE_EXPONENT_R       "r10"
     75 #define SCANLINE_R            "r12"
     76 
        // Scratch vectors. Each TMPn_Q is listed together with the D and S
        // registers that alias it, so the helper macros can address a whole
        // vector, one half of it, or a single lane via token pasting (TMPn##_S0).
     77 #define TMP1_Q                "q0"
     78 #define TMP1_D0               "d0"
     79 #define TMP1_S0               "s0"
     80 #define TMP1_S1               "s1"
     81 #define TMP1_D1               "d1"
     82 #define TMP1_S2               "s2"
     83 #define TMP1_S3               "s3"
     84 #define TMP2_Q                "q1"
     85 #define TMP2_D0               "d2"
     86 #define TMP2_S0               "s4"
     87 #define TMP2_S1               "s5"
     88 #define TMP2_D1               "d3"
     89 #define TMP2_S2               "s6"
     90 #define TMP2_S3               "s7"
     91 #define TMP3_Q                "q2"
     92 #define TMP3_D0               "d4"
     93 #define TMP3_S0               "s8"
     94 #define TMP3_S1               "s9"
     95 #define TMP3_D1               "d5"
     96 #define TMP3_S2               "s10"
     97 #define TMP3_S3               "s11"
     98 
        // Scalars used by the spot light test and by the POWF macros, and the
        // vector that keeps the unscaled light colour for spot lights.
     99 #define COSINE_OF_ANGLE       "s12"
    100 #define POWF_INT_S            "s13"
    101 #define POWF_FRAC_S           "s14"
    102 #define SPOT_COLOR_Q          "q4"
    103 
    104 // VMIN and VMAX operate on a whole D register, so CONST_ZERO_S and
    105 // CONST_ONE_S must occupy the same half of their D register as the value
        // that gets clamped. Both are the high lane (s23 in d11, s31 in d15),
        // matching the clamped lanes TMP2_S1 and TMP3_S1.
    106 
    107 // Current pixel position
        // POSITION_Q is q5 (s20-s23): x, y and z in the first three lanes; the
        // fourth lane doubles as the constant zero.
    108 #define POSITION_Q            "q5"
    109 #define POSITION_X_S          "s20"
    110 #define POSITION_Y_S          "s21"
    111 #define POSITION_Z_S          "s22"
    112 #define CONST_ZERO_HI_D       "d11"
    113 #define CONST_ZERO_S          "s23"
    114 
    115 // -------------------------------
    116 //     Variable arguments
    117 // Misc arguments
        // The three ranges are filled by consecutive post-incrementing vld1.f32
        // loads from the float-argument pointer: READ1 covers s24-s31, READ2
        // covers q8-q9 and READ3 covers q10.
    118 #define READ1_RANGE           "d12-d15"
    119 #define READ2_RANGE           "d16-d19"
    120 #define READ3_RANGE           "d20-d21"
    121 
        // Named lanes inside READ1_RANGE (s27 has no name here).
    122 #define SCALE_S               "s24"
    123 #define SCALE_DIV4_S          "s25"
    124 #define DIFFUSE_CONST_S       "s26"
    125 
    126 // Spot light cone parameters, followed by the constant one
        // CONST_ONE_S is part of READ1_RANGE, so its value comes from the loaded
        // float arguments rather than being generated by the assembly.
    127 #define CONE_CUT_OFF_S        "s28"
    128 #define CONE_FULL_LIGHT_S     "s29"
    129 #define CONE_CUT_OFF_RANGE_S  "s30"
    130 #define CONST_ONE_HI_D        "d15"
    131 #define CONST_ONE_S           "s31"
    132 
        // Vectors inside READ2_RANGE (q8, q9) and READ3_RANGE (q10).
    133 #define LIGHT_Q               "q8"
    134 #define DIRECTION_Q           "q9"
    135 #define COLOR_Q               "q10"
    136 // -------------------------------
    137 //    Constant coefficients
        // Filled by two vld1.s16 loads from the drawing-constants pointer:
        // READ4 covers q11-q12 (the kernel weights), READ5 covers d26-d27 (the
        // vtbl remapping indices).
    138 #define READ4_RANGE           "d22-d25"
    139 #define READ5_RANGE           "d26-d27"
    140 
    141 #define ALPHAX_Q              "q11"
    142 #define ALPHAY_Q              "q12"
    143 #define REMAPX_D              "d26"
    144 #define REMAPY_D              "d27"
    145 // -------------------------------
    146 
        // Sliding window over the alpha values of three neighbouring rows. The
        // three D registers are consecutive so they can serve as one vtbl table.
    147 #define ALL_ROWS_D            "{d28,d29,d30}"
    148 #define TOP_ROW_D             "d28"
    149 #define MIDDLE_ROW_D          "d29"
    150 #define BOTTOM_ROW_D          "d30"
    151 
        // Emits code that stores the Euclidean length of the first three lanes of
        // source##_Q into its fourth lane (source##_S3).
        // temp##_Q is clobbered: it receives the lane-wise squares of source##_Q.
        // Both arguments are register-group prefixes such as TMP1 or TMP2.
    152 #define GET_LENGTH(source, temp) \
    153     "vmul.f32 " temp##_Q ", " source##_Q ", " source##_Q NL \
    154     "vadd.f32 " source##_S3 ", " temp##_S0 ", " temp##_S1 NL \
    155     "vadd.f32 " source##_S3 ", " source##_S3 ", " temp##_S2 NL \
    156     "vsqrt.f32 " source##_S3 ", " source##_S3 NL
    157 
    158 // Emits code that leaves the three-component dot product of source1 and
        // source2 in destination##_S0. The whole vector is multiplied lane-wise
        // first, so destination##_S3 ends up holding source1##_S3 * source2##_S3,
        // i.e. the product of the two lengths when both sources carry their length
        // in the fourth lane (see GET_LENGTH). destination##_S1 and _S2 keep the
        // individual lane products.
    159 #define DOT_PRODUCT(destination, source1, source2) \
    160     "vmul.f32 " destination##_Q ", " source1##_Q ", " source2##_Q NL \
    161     "vadd.f32 " destination##_S0 ", " destination##_S0 ", " destination##_S1 NL \
    162     "vadd.f32 " destination##_S0 ", " destination##_S0 ", " destination##_S2 NL
    163 
        // Emits code that writes the light strength to TMP2_S1:
        //   flag clear: TMP2_S1 = DIFFUSE_CONST_S * first / second
        //   flag set:   TMP2_S1 = first / second
        // where the flag is FLAG_DIFFUSE_CONST_IS_1 in FLAGS_R. Despite their
        // names, the first argument acts as the numerator and the second as the
        // denominator; both are single-precision register names.
        // The condition flags are overwritten by the tst.
    164 #define MULTIPLY_BY_DIFFUSE_CONST(normalVectorLength, dotProductLength) \
    165     "tst " FLAGS_R ", #" TOSTRING(FLAG_DIFFUSE_CONST_IS_1) NL \
    166     "vmuleq.f32 " TMP2_S1 ", " DIFFUSE_CONST_S ", " normalVectorLength NL \
    167     "vdiveq.f32 " TMP2_S1 ", " TMP2_S1 ", " dotProductLength NL \
    168     "vdivne.f32 " TMP2_S1 ", " normalVectorLength ", " dotProductLength NL
    169 
        // One square-and-multiply step for the integer part of the exponent:
        // if the bit `current` is set in `exponent`, value is multiplied by
        // POWF_INT_S; if any bit of the mask `remaining` is still set, POWF_INT_S
        // is squared in preparation for the next, more significant bit.
        // `current` and `remaining` must be literal immediates (they are
        // stringified without macro expansion). Clobbers the condition flags.
    170 #define POWF_SQR(value, exponent, current, remaining) \
    171     "tst " exponent ", #" ASSTRING(current) NL \
    172     "vmulne.f32 " value ", " value ", " POWF_INT_S NL \
    173     "tst " exponent ", #" ASSTRING(remaining) NL \
    174     "vmulne.f32 " POWF_INT_S ", " POWF_INT_S ", " POWF_INT_S NL
    175 
        // One step for the fractional part of the exponent: if any bit of the
        // mask `remaining` (the current bit and all less significant ones) is set,
        // POWF_FRAC_S is replaced by its square root; if the bit `current` is set,
        // value is then multiplied by that root.
        // `current` and `remaining` must be literal immediates (they are
        // stringified without macro expansion). Clobbers the condition flags.
    176 #define POWF_SQRT(value, exponent, current, remaining) \
    177     "tst " exponent ", #" ASSTRING(remaining) NL \
    178     "vsqrtne.f32 " POWF_FRAC_S ", " POWF_FRAC_S NL \
    179     "tst " exponent ", #" ASSTRING(current) NL \
    180     "vmulne.f32 " value ", " value ", " POWF_FRAC_S NL
    181 
    182 // This simplified powf function is sufficiently accurate.
        // Emits code that replaces `value` (a single-precision register) with
        // value raised to the power held in `exponent` (a core register).
        // The bit masks imply a fixed-point exponent: bits 6-11 (0xfc0) are the
        // integer part, handled by repeated squaring, and bits 0-5 (0x03f) are
        // the fraction in steps of 1/64, handled by repeated square roots.
        // Exponent bits above bit 11 are ignored.
        // POWF_INT_S / POWF_FRAC_S are only initialised when the corresponding
        // part of the exponent is non-zero, which is sufficient because they are
        // only read under tests of those same bits.
        // Clobbers POWF_INT_S, POWF_FRAC_S and the condition flags; reads
        // CONST_ONE_S as the starting value of the product.
    183 #define POWF(value, exponent) \
    184     "tst " exponent ", #0xfc0" NL \
    185     "vmovne.f32 " POWF_INT_S ", " value NL \
    186     "tst " exponent ", #0x03f" NL \
    187     "vmovne.f32 " POWF_FRAC_S ", " value NL \
    188     "vmov.f32 " value ", " CONST_ONE_S NL \
    189     \
    190     POWF_SQR(value, exponent, 0x040, 0xf80) \
    191     POWF_SQR(value, exponent, 0x080, 0xf00) \
    192     POWF_SQR(value, exponent, 0x100, 0xe00) \
    193     POWF_SQR(value, exponent, 0x200, 0xc00) \
    194     POWF_SQR(value, exponent, 0x400, 0x800) \
    195     "tst " exponent ", #0x800" NL \
    196     "vmulne.f32 " value ", " value ", " POWF_INT_S NL \
    197     \
    198     POWF_SQRT(value, exponent, 0x20, 0x3f) \
    199     POWF_SQRT(value, exponent, 0x10, 0x1f) \
    200     POWF_SQRT(value, exponent, 0x08, 0x0f) \
    201     POWF_SQRT(value, exponent, 0x04, 0x07) \
    202     POWF_SQRT(value, exponent, 0x02, 0x03) \
    203     POWF_SQRT(value, exponent, 0x01, 0x01)
    204 
    205 // The following algorithm is an ARM-NEON optimized version of
    206 // the main loop found in FELighting.cpp. Since the whole code
    207 // is redesigned to be as effective as possible (ARM specific
    208 // thinking), it is four times faster than its C++ counterpart.
    209 
        // Defines the global symbol neonDrawLighting directly in assembly.
        //
        // Input: r0 holds the address of a parameter block. The pixel pointer,
        // width, height, flags, the two exponents and two further pointers (float
        // arguments and drawing constants) are read from it at the *_OFFSET byte
        // offsets.
        // Output: nothing is returned in registers; three bytes of every processed
        // pixel are overwritten in place.
        // NOTE(review): the C-side prototype, the FLAG_* values and the parameter
        // block layout are declared outside this file - confirm they match.
    210 asm ( // NOLINT
    211 ".globl " TOSTRING(neonDrawLighting) NL
    212 TOSTRING(neonDrawLighting) ":" NL
    213     // Because of the clever register allocation, nothing is stored on the stack
    214     // except the saved registers.
    215     // Stack must be aligned to 8 bytes.
            // Eight core registers (32 bytes) and d8-d15 (64 bytes) are pushed, so the
            // stack pointer moves by a multiple of 8. d8-d15 are saved because the code
            // writes q4-q7 (SPOT_COLOR_Q, POSITION_Q and READ1_RANGE).
    216     "stmdb sp!, {r4-r8, r10, r11, lr}" NL
    217     "vstmdb sp!, {d8-d15}" NL
    218     "mov " PAINTING_DATA_R ", r0" NL
    219 
    220     // The following two arguments are loaded to SIMD registers.
    221     "ldr r0, [" PAINTING_DATA_R ", #" FLOAT_ARGUMENTS_OFFSET "]" NL
    222     "ldr r1, [" PAINTING_DATA_R ", #" DRAWING_CONSTANTS_OFFSET "]" NL
    223     "ldr " PIXELS_R ", [" PAINTING_DATA_R ", #" PIXELS_OFFSET "]" NL
    224     "ldr " WIDTH_R ", [" PAINTING_DATA_R ", #" WIDTH_OFFSET "]" NL
    225     "ldr " HEIGHT_R ", [" PAINTING_DATA_R ", #" HEIGHT_OFFSET "]" NL
    226     "ldr " FLAGS_R ", [" PAINTING_DATA_R ", #" FLAGS_OFFSET "]" NL
    227     "ldr " SPECULAR_EXPONENT_R ", [" PAINTING_DATA_R ", #" SPECULAR_EXPONENT_OFFSET "]" NL
    228     "ldr " CONE_EXPONENT_R ", [" PAINTING_DATA_R ", #" CONE_EXPONENT_OFFSET "]" NL
    229 
    230     // Load all data to the SIMD registers with the least number of instructions.
            // r0 and r1 are post-incremented, so each load continues where the
            // previous one stopped: 80 bytes of floats, then 48 bytes of shorts.
    231     "vld1.f32 { " READ1_RANGE " }, [r0]!" NL
    232     "vld1.f32 { " READ2_RANGE " }, [r0]!" NL
    233     "vld1.f32 { " READ3_RANGE " }, [r0]!" NL
    234     "vld1.s16 {" READ4_RANGE "}, [r1]!" NL
    235     "vld1.s16 {" READ5_RANGE "}, [r1]!" NL
    236 
    237     // Initializing local variables.
            // SCANLINE_R = width * 4 + 8 bytes, i.e. one row of `width` four-byte
            // pixels plus two more pixels (the border pixels mentioned below).
            // PIXELS_R then skips one such row and 3 bytes, so the byte loads in
            // the scanline loop start at byte 3 of the first pixel of the second row.
    238     "mov " SCANLINE_R ", " WIDTH_R ", lsl #2" NL
    239     "add " SCANLINE_R ", " SCANLINE_R ", #8" NL
    240     "add " PIXELS_R ", " PIXELS_R ", " SCANLINE_R NL
    241     "add " PIXELS_R ", " PIXELS_R ", #3" NL
            // The all-zero bit pattern is 0.0f, so the integer zero can be moved as is.
    242     "mov r0, #0" NL
    243     "vmov.f32 " CONST_ZERO_S ", r0" NL
    244     "vmov.f32 " POSITION_Y_S ", " CONST_ONE_S NL
            // NOTE(review): the flags set by this tst do not appear to be consumed -
            // every instruction up to the next flag-setting `subs` is unconditional.
    245     "tst " FLAGS_R ", #" TOSTRING(FLAG_SPOT_LIGHT) NL
    246     "vmov.f32 " SPOT_COLOR_Q ", " COLOR_Q NL
            // From here on the parameter block pointer is gone: the register it
            // lived in (RESET_WIDTH_R is the same register) now keeps the row width.
    247     "mov " RESET_WIDTH_R ", " WIDTH_R NL
    248 
            // Start of a row: r3 = 3 makes the scanline loop load three columns
            // before the first pixel is processed, and x restarts at one.
    249 ".mainloop:" NL
    250     "mov r3, #3" NL
    251     "vmov.f32 " POSITION_X_S ", " CONST_ONE_S NL
    252 
    253 ".scanline:" NL
    254     // The ROW registers are storing the alpha channel of the last three pixels.
    255     // The alpha channel is stored as signed short (sint16) values. The fourth value
    256     // is garbage. The following instructions are shifting out the unnecessary alpha
    257     // values and load the next ones.
            // r0 / r2 / r1 receive the byte from the row above, the current row and
            // the row below; PIXELS_R is post-incremented by one pixel (4 bytes).
            // vext #3 moves lane 1 to lane 2 and lane 2 to lane 3; the new value is
            // then written to lane 1, so lane 2 always holds the centre column.
    258     "ldrb r0, [" PIXELS_R ", -" SCANLINE_R "]" NL
    259     "ldrb r1, [" PIXELS_R ", +" SCANLINE_R "]" NL
    260     "ldrb r2, [" PIXELS_R "], #4" NL
    261     "vext.s16 " TOP_ROW_D ", " TOP_ROW_D ", " TOP_ROW_D ", #3" NL
    262     "vext.s16 " MIDDLE_ROW_D ", " MIDDLE_ROW_D ", " MIDDLE_ROW_D ", #3" NL
    263     "vext.s16 " BOTTOM_ROW_D ", " BOTTOM_ROW_D ", " BOTTOM_ROW_D ", #3" NL
    264     "vmov.s16 " TOP_ROW_D "[1], r0" NL
    265     "vmov.s16 " MIDDLE_ROW_D "[1], r2" NL
    266     "vmov.s16 " BOTTOM_ROW_D "[1], r1" NL
    267 
    268     // The two border pixels (rightmost and leftmost) are skipped when
    269     // the next scanline is reached. It also jumps, when the algorithm
    270     // is started, and the first free alpha values are loaded to each row.
    271     "subs r3, r3, #1" NL
    272     "bne .scanline" NL
    273 
    274     // The light vector goes to TMP1_Q. It is constant in case of distant light.
    275     // The fourth value contains the length of the light vector.
    276     "tst " FLAGS_R ", #" TOSTRING(FLAG_POINT_LIGHT | FLAG_SPOT_LIGHT) NL
    277     "beq .distantLight" NL
    278 
            // Point and spot lights: z of the current position is the centre alpha
            // value (lane 2 of the middle row) scaled by SCALE_S, and the light
            // vector is LIGHT_Q - POSITION_Q with its length placed in TMP1_S3.
    279     "vmov.s16 r3, " MIDDLE_ROW_D "[2]" NL
    280     "vmov.f32 " POSITION_Z_S ", r3" NL
    281     "vcvt.f32.s32 " POSITION_Z_S ", " POSITION_Z_S NL
    282     "vmul.f32 " POSITION_Z_S ", " POSITION_Z_S ", " SCALE_S NL
    283 
    284     "vsub.f32 " TMP1_Q ", " LIGHT_Q ", " POSITION_Q NL
    285     GET_LENGTH(TMP1, TMP2)
    286 
    287     "tst " FLAGS_R ", #" TOSTRING(FLAG_SPOT_LIGHT) NL
    288     "bne .cosineOfAngle" NL
    289 ".visiblePixel:" NL
    290 
    291     //     | -1  0  1 |      | -1 -2 -1 |
    292     // X = | -2  0  2 |  Y = |  0  0  0 |
    293     //     | -1  0  1 |      |  1  2  1 |
    294 
    295     // Multiply the alpha values by the X and Y matrices.
    296 
    297     // Moving the 8 alpha value to TMP3.
    298     "vtbl.8 " TMP3_D0 ", " ALL_ROWS_D ", " REMAPX_D NL
    299     "vtbl.8 " TMP3_D1 ", " ALL_ROWS_D ", " REMAPY_D NL
    300 
            // Lane-wise products followed by three pairwise additions leave the sum
            // of all eight products in lane 0.
    301     "vmul.s16 " TMP2_Q ", " TMP3_Q ", " ALPHAX_Q NL
    302     "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D1 NL
    303     "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
    304     "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
    305     "vmov.s16 r0, " TMP2_D0 "[0]" NL
    306 
    307     "vmul.s16 " TMP2_Q ", " TMP3_Q ", " ALPHAY_Q NL
    308     "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D1 NL
    309     "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
    310     "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
    311     "vmov.s16 r1, " TMP2_D0 "[0]" NL
    312 
    313     // r0 and r1 contains the X and Y coordinates of the
    314     // normal vector, respectively.
    315 
    316     // Calculating the spot light strength.
    317     "tst " FLAGS_R ", #" TOSTRING(FLAG_SPOT_LIGHT) NL
    318     "beq .endLight" NL
    319 
            // TMP3_S1 starts as the negated cosine and is raised to the cone exponent
            // unless FLAG_CONE_EXPONENT_IS_1 is set.
    320     "vneg.f32 " TMP3_S1 ", " COSINE_OF_ANGLE NL
    321     "tst " FLAGS_R ", #" TOSTRING(FLAG_CONE_EXPONENT_IS_1) NL
    322     "beq .coneExpPowf" NL
    323 ".coneExpPowfFinished:" NL
    324 
    325     // Smoothing the cone edge if necessary.
    326     "vcmp.f32 " COSINE_OF_ANGLE ", " CONE_FULL_LIGHT_S NL
    327     "fmstat" NL
    328     "bhi .cutOff" NL
    329 ".cutOffFinished:" NL
    330 
            // Cap the strength in lane 1 (TMP3_S1) at one, then scale the saved spot
            // colour by it to obtain the light colour for this pixel.
    331     "vmin.f32 " TMP3_D0 ", " TMP3_D0 ", " CONST_ONE_HI_D NL
    332     "vmul.f32 " COLOR_Q ", " SPOT_COLOR_Q ", " TMP3_D0 "[1]" NL
    333 
    334 ".endLight:" NL
    335     // Summarize:
    336     // r0 and r1 contains the normalVector.
    337     // TMP1_Q contains the light vector and its length.
    338     // COLOR_Q contains the color of the light vector.
    339 
    340     // Test whether both r0 and r1 are zero (Normal vector is (0, 0, 1)).
    341     "orrs r2, r0, r1" NL
    342     "bne .normalVectorIsNonZero" NL
    343 
    344     "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_LIGHT) NL
    345     "bne .specularLight1" NL
    346 
    347     // Calculate diffuse light strength.
            // With the normal (0, 0, 1) the dot product reduces to the z component of
            // the light vector, which is divided by the light vector's length.
    348     MULTIPLY_BY_DIFFUSE_CONST(TMP1_S2, TMP1_S3)
    349     "b .lightStrengthCalculated" NL
    350 
    351 ".specularLight1:" NL
    352     // Calculating specular light strength.
            // The length of the light vector is added to its z component and the
            // length is recomputed (presumably forming the halfway vector towards
            // a viewer at (0, 0, 1) - confirm against FELighting.cpp).
    353     "vadd.f32 " TMP1_S2 ", " TMP1_S2 ", " TMP1_S3 NL
    354     GET_LENGTH(TMP1, TMP2)
    355 
    356     // When the exponent is 1, we don't need to call an expensive powf function.
            // The conditional divide prepares the powf base in TMP2_S1 only on the
            // path that actually branches to .specularExpPowf.
    357     "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_EXPONENT_IS_1) NL
    358     "vdiveq.f32 " TMP2_S1 ", " TMP1_S2 ", " TMP1_S3 NL
    359     "beq .specularExpPowf" NL
    360 
    361     MULTIPLY_BY_DIFFUSE_CONST(TMP1_S2, TMP1_S3)
    362     "b .lightStrengthCalculated" NL
    363 
    364 ".normalVectorIsNonZero:" NL
    365     // Normal vector goes to TMP2, and its length is calculated as well.
            // x and y are the integer kernel sums scaled by SCALE_DIV4_S; z is one.
    366     "vmov.s32 " TMP2_S0 ", r0" NL
    367     "vcvt.f32.s32 " TMP2_S0 ", " TMP2_S0 NL
    368     "vmul.f32 " TMP2_S0 ", " TMP2_S0 ", " SCALE_DIV4_S NL
    369     "vmov.s32 " TMP2_S1 ", r1" NL
    370     "vcvt.f32.s32 " TMP2_S1 ", " TMP2_S1 NL
    371     "vmul.f32 " TMP2_S1 ", " TMP2_S1 ", " SCALE_DIV4_S NL
    372     "vmov.f32 " TMP2_S2 ", " CONST_ONE_S NL
    373     GET_LENGTH(TMP2, TMP3)
    374 
    375     "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_LIGHT) NL
    376     "bne .specularLight2" NL
    377 
    378     // Calculating diffuse light strength.
            // TMP3_S0 receives the dot product and TMP3_S3 the product of the two
            // lengths, so their quotient is the cosine between normal and light.
    379     DOT_PRODUCT(TMP3, TMP2, TMP1)
    380     MULTIPLY_BY_DIFFUSE_CONST(TMP3_S0, TMP3_S3)
    381     "b .lightStrengthCalculated" NL
    382 
    383 ".specularLight2:" NL
    384     // Calculating specular light strength.
    385     "vadd.f32 " TMP1_S2 ", " TMP1_S2 ", " TMP1_S3 NL
    386     GET_LENGTH(TMP1, TMP3)
    387     DOT_PRODUCT(TMP3, TMP2, TMP1)
    388 
    389     // When the exponent is 1, we don't need to call an expensive powf function.
    390     "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_EXPONENT_IS_1) NL
    391     "vdiveq.f32 " TMP2_S1 ", " TMP3_S0 ", " TMP3_S3 NL
    392     "beq .specularExpPowf" NL
    393     MULTIPLY_BY_DIFFUSE_CONST(TMP3_S0, TMP3_S3)
    394 
    395 ".lightStrengthCalculated:" NL
    396     // TMP2_S1 contains the light strength. Clamp it to [0, 1]
            // The clamp works on the whole D register; only lane 1 is used afterwards.
    397     "vmax.f32 " TMP2_D0 ", " TMP2_D0 ", " CONST_ZERO_HI_D NL
    398     "vmin.f32 " TMP2_D0 ", " TMP2_D0 ", " CONST_ONE_HI_D NL
    399     "vmul.f32 " TMP3_Q ", " COLOR_Q ", " TMP2_D0 "[1]" NL
    400     "vcvt.u32.f32 " TMP3_Q ", " TMP3_Q NL
    401     "vmov.u32 r2, r3, " TMP3_S0 ", " TMP3_S1 NL
    402     // The color values are stored in-place.
            // PIXELS_R already points one pixel past the newest loaded column, so the
            // centre pixel's loaded byte is at PIXELS_R - 8 and offsets -11..-9 are
            // the three bytes in front of it.
    403     "strb r2, [" PIXELS_R ", #-11]" NL
    404     "strb r3, [" PIXELS_R ", #-10]" NL
    405     "vmov.u32 r2, " TMP3_S2 NL
    406     "strb r2, [" PIXELS_R ", #-9]" NL
    407 
    408     // Continue to the next pixel.
            // r3 = 1 makes the scanline loop load exactly one new column.
    409 ".blackPixel:" NL
    410     "vadd.f32 " POSITION_X_S ", " CONST_ONE_S NL
    411     "mov r3, #1" NL
    412     "subs " WIDTH_R ", " WIDTH_R ", #1" NL
    413     "bne .scanline" NL
    414 
    415     // If the end of the scanline is reached, we continue
    416     // to the next scanline.
    417     "vadd.f32 " POSITION_Y_S ", " CONST_ONE_S NL
    418     "mov " WIDTH_R ", " RESET_WIDTH_R NL
    419     "subs " HEIGHT_R ", " HEIGHT_R ", #1" NL
    420     "bne .mainloop" NL
    421 
    422     // Return.
            // Restores the saved registers in reverse order; loading pc returns.
    423     "vldmia sp!, {d8-d15}" NL
    424     "ldmia sp!, {r4-r8, r10, r11, pc}" NL
    425 
            // The remaining labels are out-of-line continuations of the loop above;
            // each one ends with a branch back into it.
    426 ".distantLight:" NL
    427     // In case of distant light, the light vector is constant,
    428     // we simply copy it.
    429     "vmov.f32 " TMP1_Q ", " LIGHT_Q NL
    430     "b .visiblePixel" NL
    431 
    432 ".cosineOfAngle:" NL
    433     // If the pixel is outside of the cone angle, it is simply a black pixel.
            // The cosine is the dot product of the light vector and DIRECTION_Q
            // divided by the light vector's length (TMP1_S3). Pixels that fail the
            // test get the same three bytes zeroed that a lit pixel would overwrite.
    434     DOT_PRODUCT(TMP3, TMP1, DIRECTION)
    435     "vdiv.f32 " COSINE_OF_ANGLE ", " TMP3_S0 ", " TMP1_S3 NL
    436     "vcmp.f32 " COSINE_OF_ANGLE ", " CONE_CUT_OFF_S NL
    437     "fmstat" NL
    438     "bls .visiblePixel" NL
    439     "mov r0, #0" NL
    440     "strh r0, [" PIXELS_R ", #-11]" NL
    441     "strb r0, [" PIXELS_R ", #-9]" NL
    442     "b .blackPixel" NL
    443 
    444 ".cutOff:" NL
    445     // Smoothing the light strength on the cone edge.
            // Strength is scaled by (CONE_CUT_OFF_S - cosine) / CONE_CUT_OFF_RANGE_S.
    446     "vsub.f32 " TMP3_S0 ", " CONE_CUT_OFF_S ", " COSINE_OF_ANGLE NL
    447     "vdiv.f32 " TMP3_S0 ", " TMP3_S0 ", " CONE_CUT_OFF_RANGE_S NL
    448     "vmul.f32 " TMP3_S1 ", " TMP3_S1 ", " TMP3_S0 NL
    449     "b .cutOffFinished" NL
    450 
    451 ".coneExpPowf:" NL
    452     POWF(TMP3_S1, CONE_EXPONENT_R)
    453     "b .coneExpPowfFinished" NL
    454 
    455 ".specularExpPowf:" NL
            // After the power is taken, the result is multiplied by DIFFUSE_CONST_S
            // unless FLAG_DIFFUSE_CONST_IS_1 is set.
    456     POWF(TMP2_S1, SPECULAR_EXPONENT_R)
    457     "tst " FLAGS_R ", #" TOSTRING(FLAG_DIFFUSE_CONST_IS_1) NL
    458     "vmuleq.f32 " TMP2_S1 ", " TMP2_S1 ", " DIFFUSE_CONST_S NL
    459     "b .lightStrengthCalculated" NL
    460 ); // NOLINT
    461 
    462 } // namespace WebCore
    463 
    464 #endif // CPU(ARM_NEON) && COMPILER(GCC)
    465