@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/

@**
@******************************************************************************
@*
@*
@* @brief
@*  This file contains definitions of routines that compute distortion
@*  between two macro/sub blocks of identical dimensions
@*
@* @author
@*  Ittiam
@*
@* @par List of Functions:
@*  - ime_compute_sad_16x16_a9q()
@*  - ime_compute_sad_16x16_fast_a9q()
@*  - ime_compute_sad_16x8_a9q()
@*  - ime_compute_sad_16x16_ea8_a9q()
@*  - ime_calculate_sad2_prog_a9q()
@*  - ime_calculate_sad3_prog_a9q()
@*  - ime_calculate_sad4_prog_a9q()
@*  - ime_sub_pel_compute_sad_16x16_a9q()
@*  - ime_compute_satqd_16x16_lumainter_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@


@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode)
@*
@* @par   Description
@*   This function computes the SAD between 2 16x16 blocks. There is a
@*   provision for early exit if the partially computed SAD exceeds the
@*   maximum allowed SAD. To compute the distortion of the entire block,
@*   set i4_max_sad to USHRT_MAX.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[in] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] i4_max_sad
@*  integer maximum allowed distortion
@*
@* @param[out] pi4_mb_distortion
@*  pointer to the evaluated SAD
@*
@* @remarks
@*
@******************************************************************************
@*
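@ For reference, a minimal C sketch of what this routine computes. This is an
@ assumption inferred from the parameter list above and from the shifts in the
@ code below (doubled strides, final result << 1), not the project's actual C
@ model: fast mode visits alternate rows only and doubles the partial SAD, and
@ i4_max_sad is not consulted on this path.
@
@   #include <stdint.h>
@   #include <stdlib.h>
@
@   static void ime_compute_sad_16x16_fast_ref(const uint8_t *pu1_src,
@                                              const uint8_t *pu1_dst,
@                                              int32_t src_strd, int32_t dst_strd,
@                                              int32_t i4_max_sad,
@                                              int32_t *pi4_mb_distortion)
@   {
@       int32_t sad = 0;
@       (void)i4_max_sad;                       /* no early exit on this path */
@       for (int row = 0; row < 16; row += 2)   /* alternate rows only */
@       {
@           for (int col = 0; col < 16; col++)
@               sad += abs(pu1_src[col] - pu1_dst[col]);
@           pu1_src += 2 * src_strd;            /* strides are doubled */
@           pu1_dst += 2 * dst_strd;
@       }
@       *pi4_mb_distortion = sad << 1;          /* compensate the skipped rows */
@   }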
.text
.p2align 2

    .global ime_compute_sad_16x16_fast_a9q

ime_compute_sad_16x16_fast_a9q:

    stmfd         sp!, {r12, lr}
    vpush         {d8-d15}
    lsl           r2, r2, #1
    lsl           r3, r3, #1

    @ dummy load instruction to bring buffer2 into the cache
    @LDR         r12,[r1]

    vld1.8        {d4, d5}, [r0], r2
    vld1.8        {d6, d7}, [r1], r3
    mov           r12, #6
    vld1.8        {d8, d9}, [r0], r2
    vabdl.u8      q0, d6, d4
    vabdl.u8      q1, d7, d5
    vld1.8        {d10, d11}, [r1], r3

loop_sad_16x16_fast:

    vld1.8        {d4, d5}, [r0], r2
    vabal.u8      q0, d10, d8
    vabal.u8      q1, d11, d9
    vld1.8        {d6, d7}, [r1], r3
    subs          r12, #2
    vld1.8        {d8, d9}, [r0], r2
    vabal.u8      q0, d6, d4
    vabal.u8      q1, d7, d5
    vld1.8        {d10, d11}, [r1], r3

    bne           loop_sad_16x16_fast

    vabal.u8      q0, d10, d8
    vabal.u8      q1, d11, d9

    vadd.i16      q0, q0, q1
    vadd.i16      d0, d1, d0
    vpop          {d8-d15}
    ldr           r12, [sp, #12]
    vpaddl.u16    d0, d0
    vpaddl.u32    d0, d0
    vshl.u32      d0, d0, #1
    vst1.32       {d0[0]}, [r12]

    ldmfd         sp!, {r12, pc}



@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x8 blocks
@*
@*
@* @par   Description
@*   This function computes the SAD between 2 16x8 blocks. There is a
@*   provision for early exit if the partially computed SAD exceeds the
@*   maximum allowed SAD. To compute the distortion of the entire block,
@*   set u4_max_sad to USHRT_MAX.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[in] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] u4_max_sad
@*  integer maximum allowed distortion
@*
@* @param[out] pi4_mb_distortion
@*  pointer to the evaluated SAD
@*
@* @remarks
@*
@******************************************************************************
@*
@
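@ For reference, a minimal C sketch of the plain 16x8 SAD this routine computes
@ (an assumption from the parameter list above, not the project's actual C
@ model; u4_max_sad is not consulted in this implementation; includes as in
@ the sketch further above):
@
@   static void ime_compute_sad_16x8_ref(const uint8_t *pu1_src,
@                                        const uint8_t *pu1_dst,
@                                        int32_t src_strd, int32_t dst_strd,
@                                        uint32_t u4_max_sad,
@                                        int32_t *pi4_mb_distortion)
@   {
@       int32_t sad = 0;
@       (void)u4_max_sad;
@       for (int row = 0; row < 8; row++)       /* block is 16 wide x 8 high */
@       {
@           for (int col = 0; col < 16; col++)
@               sad += abs(pu1_src[col] - pu1_dst[col]);
@           pu1_src += src_strd;
@           pu1_dst += dst_strd;
@       }
@       *pi4_mb_distortion = sad;
@   }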
    .global ime_compute_sad_16x8_a9q

ime_compute_sad_16x8_a9q:

    stmfd         sp!, {r12, lr}

    @ dummy load instruction to bring buffer2 into the cache
    @LDR      r12,[r1]

    vld1.8        {d4, d5}, [r0], r2
    vld1.8        {d6, d7}, [r1], r3
    mov           r12, #6
    vpush         {d8-d15}
    vld1.8        {d8, d9}, [r0], r2
    vabdl.u8      q0, d6, d4
    vabdl.u8      q1, d7, d5
    vld1.8        {d10, d11}, [r1], r3

loop_sad_16x8:

    vld1.8        {d4, d5}, [r0], r2
    vabal.u8      q0, d10, d8
    vabal.u8      q1, d11, d9
    vld1.8        {d6, d7}, [r1], r3
    subs          r12, #2
    vld1.8        {d8, d9}, [r0], r2
    vabal.u8      q0, d6, d4
    vabal.u8      q1, d7, d5
    vld1.8        {d10, d11}, [r1], r3

    bne           loop_sad_16x8

    vabal.u8      q0, d10, d8
    vabal.u8      q1, d11, d9

    vadd.i16      q0, q0, q1
    vadd.i16      d0, d1, d0
    vpop          {d8-d15}
    ldr           r12, [sp, #12]
    vpaddl.u16    d0, d0
    vpaddl.u32    d0, d0

    vst1.32       {d0[0]}, [r12]

    ldmfd         sp!, {r12, pc}


@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x16 blocks with early exit
@*
@* @par   Description
@*   This function computes the SAD between 2 16x16 blocks. There is a
@*   provision for early exit if the partially computed SAD exceeds the
@*   maximum allowed SAD. To compute the distortion of the entire block,
@*   set i4_max_sad to USHRT_MAX.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[in] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] i4_max_sad
@*  integer maximum allowed distortion
@*
@* @param[out] pi4_mb_distortion
@*  pointer to the evaluated SAD
@*
@* @remarks
@*
@******************************************************************************
@*

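@ A minimal C sketch of the early-exit schedule used below (assumption:
@ derived from the stride doubling and the mid-point compare in the code, not
@ the project's actual C model). The even rows are summed first; the partial
@ SAD is stored and checked once against i4_max_sad before the odd rows are
@ added in:
@
@   static void ime_compute_sad_16x16_ea8_ref(const uint8_t *pu1_src,
@                                             const uint8_t *pu1_dst,
@                                             int32_t src_strd, int32_t dst_strd,
@                                             int32_t i4_max_sad,
@                                             int32_t *pi4_mb_distortion)
@   {
@       int32_t sad = 0;
@       for (int row = 0; row < 16; row += 2)   /* pass 1: even rows */
@           for (int col = 0; col < 16; col++)
@               sad += abs(pu1_src[row * src_strd + col] -
@                          pu1_dst[row * dst_strd + col]);
@       *pi4_mb_distortion = sad;
@       if (sad > i4_max_sad)                   /* early exit after 8 rows */
@           return;
@       for (int row = 1; row < 16; row += 2)   /* pass 2: odd rows */
@           for (int col = 0; col < 16; col++)
@               sad += abs(pu1_src[row * src_strd + col] -
@                          pu1_dst[row * dst_strd + col]);
@       *pi4_mb_distortion = sad;
@   }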
    .global ime_compute_sad_16x16_ea8_a9q

ime_compute_sad_16x16_ea8_a9q:

    stmfd         sp!, {r5-r7, lr}
    lsl           r2, r2, #1
    lsl           r3, r3, #1

    @ dummy load instruction to bring buffer2 into the cache
    @LDR         r12,[r1]

    vld1.8        {d4, d5}, [r0], r2
    vld1.8        {d6, d7}, [r1], r3
    mov           r5, #6
    ldrd          r6, r7, [sp, #16]
    vpush         {d8-d15}
    vld1.8        {d8, d9}, [r0], r2
    vabdl.u8      q0, d6, d4
    vabdl.u8      q1, d7, d5
    vld1.8        {d10, d11}, [r1], r3

    @r6 = i4_max_sad, r7 = pi4_mb_distortion

loop_sad_16x16_ea8_1:

    vld1.8        {d4, d5}, [r0], r2
    vabal.u8      q0, d10, d8
    vabal.u8      q1, d11, d9
    vld1.8        {d6, d7}, [r1], r3
    subs          r5, #2
    vld1.8        {d8, d9}, [r0], r2
    vabal.u8      q0, d6, d4
    vabal.u8      q1, d7, d5
    vld1.8        {d10, d11}, [r1], r3

    bne           loop_sad_16x16_ea8_1

    vabal.u8      q0, d10, d8
    sub           r0, r0, r2, lsl #3
    vabal.u8      q1, d11, d9
    sub           r1, r1, r3, lsl #3

    vadd.i16      q6, q0, q1
    add           r0, r0, r2, asr #1
    vadd.i16      d12, d12, d13
    add           r1, r1, r3, asr #1

    vpaddl.u16    d12, d12
    vld1.8        {d4, d5}, [r0], r2
    vld1.8        {d6, d7}, [r1], r3
    vpaddl.u32    d12, d12
    vld1.8        {d8, d9}, [r0], r2
    vabal.u8      q0, d6, d4
    vabal.u8      q1, d7, d5

    vst1.32       {d12[0]}, [r7]
    ldr           r5, [r7]
    cmp           r5, r6
    bgt           end_func_16x16_ea8

    vld1.8        {d10, d11}, [r1], r3
    mov           r5, #6

loop_sad_16x16_ea8_2:

    vld1.8        {d4, d5}, [r0], r2
    vabal.u8      q0, d10, d8
    vabal.u8      q1, d11, d9
    vld1.8        {d6, d7}, [r1], r3
    subs          r5, #2
    vld1.8        {d8, d9}, [r0], r2
    vabal.u8      q0, d6, d4
    vabal.u8      q1, d7, d5
    vld1.8        {d10, d11}, [r1], r3

    bne           loop_sad_16x16_ea8_2

    vabal.u8      q0, d10, d8
    vabal.u8      q1, d11, d9

    vadd.i16      q0, q0, q1
    vadd.i16      d0, d1, d0

    vpaddl.u16    d0, d0
    vpaddl.u32    d0, d0

    vst1.32       {d0[0]}, [r7]

end_func_16x16_ea8:
    vpop          {d8-d15}
    ldmfd         sp!, {r5-r7, pc}


@*
@//---------------------------------------------------------------------------
@// Function Name      : ime_calculate_sad2_prog_a9q()
@//
@// Detail Description : This function computes the SAD values of one source
@//                      MB against 2 progressive reference MBs in one shot
@//
@// Platform           : Cortex-A8/NEON
@//
@//-----------------------------------------------------------------------------
@*

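@ A minimal C sketch of the two-reference SAD below (assumption: prototype
@ inferred from the register comments in the function, not the project's
@ actual C model). One source MB is compared against two reference MBs and
@ both SADs are written out:
@
@   static void ime_calculate_sad2_prog_ref(const uint8_t *pu1_ref1,
@                                           const uint8_t *pu1_ref2,
@                                           const uint8_t *pu1_src,
@                                           int32_t ref_strd, int32_t src_strd,
@                                           uint32_t *psad)
@   {
@       uint32_t sad1 = 0, sad2 = 0;
@       for (int row = 0; row < 16; row++)
@       {
@           for (int col = 0; col < 16; col++)
@           {
@               sad1 += abs(pu1_src[col] - pu1_ref1[col]);
@               sad2 += abs(pu1_src[col] - pu1_ref2[col]);
@           }
@           pu1_src  += src_strd;
@           pu1_ref1 += ref_strd;
@           pu1_ref2 += ref_strd;
@       }
@       psad[0] = sad1;
@       psad[1] = sad2;
@   }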
    .global ime_calculate_sad2_prog_a9q

ime_calculate_sad2_prog_a9q:

    @ r0    = ref1     <UWORD8 *>
    @ r1    = ref2     <UWORD8 *>
    @ r2    = src      <UWORD8 *>
    @ r3    = RefBufferWidth <UWORD32>
    @ stack = CurBufferWidth <UWORD32>, psad <UWORD32 *>

    stmfd         sp!, {r4-r5, lr}

    ldr           r4, [sp, #12]         @ load src stride to r4 (stmfd above pushed 12 bytes)
    mov           r5, #14
    vpush         {d8-d15}
    @Row 1
    vld1.8        {d0, d1}, [r2], r4    @ load src Row 1
    vld1.8        {d2, d3}, [r0], r3    @ load ref1 Row 1
    vld1.8        {d4, d5}, [r1], r3    @ load ref2 Row 1

    @Row 2
    vld1.8        {d6, d7}, [r2], r4    @ load src Row 2
    vabdl.u8      q6, d2, d0
    vabdl.u8      q7, d3, d1
    vld1.8        {d8, d9}, [r0], r3    @ load ref1 Row 2
    vabdl.u8      q8, d4, d0
    vabdl.u8      q9, d5, d1
    vld1.8        {d10, d11}, [r1], r3  @ load ref2 Row 2

loop_sad2_prog:

    subs          r5, #2
    @Row 1
    vld1.8        {d0, d1}, [r2], r4    @ load src Row 1
    vabal.u8      q6, d8, d6
    vabal.u8      q7, d9, d7
    vld1.8        {d2, d3}, [r0], r3    @ load ref1 Row 1
    vabal.u8      q8, d10, d6
    vabal.u8      q9, d11, d7
    vld1.8        {d4, d5}, [r1], r3    @ load ref2 Row 1

    @Row 2
    vld1.8        {d6, d7}, [r2], r4    @ load src Row 2
    vabal.u8      q6, d2, d0
    vabal.u8      q7, d3, d1
    vld1.8        {d8, d9}, [r0], r3    @ load ref1 Row 2
    vabal.u8      q8, d4, d0
    vabal.u8      q9, d5, d1
    vld1.8        {d10, d11}, [r1], r3  @ load ref2 Row 2

    bne           loop_sad2_prog

    vabal.u8      q6, d8, d6
    vabal.u8      q7, d9, d7
    vabal.u8      q8, d10, d6
    vabal.u8      q9, d11, d7

    @ Compute SAD

    vadd.u16      q6, q6, q7            @ Q6  : sad_ref1
    vadd.u16      q8, q8, q9            @ Q8  : sad_ref2

    vadd.u16      d12, d12, d13
    ldr           r5, [sp, #80]         @ load pi4_sad to r5 (12-byte stmfd + 64-byte vpush above)
    vadd.u16      d16, d16, d17

    vpadd.u16     d12, d12, d16
    vpaddl.u16    d12, d12

    vst1.64       {d12}, [r5]!
    vpop          {d8-d15}
    ldmfd         sp!, {r4-r5, pc}


@*
@//---------------------------------------------------------------------------
@// Function Name      : ime_calculate_sad3_prog_a9q()
@//
@// Detail Description : This function computes the SAD values of one source
@//                      MB against 3 progressive reference MBs in one shot
@//
@// Platform           : Cortex-A8/NEON
@//
@//-----------------------------------------------------------------------------
@*

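@ A minimal C sketch of the three-reference SAD below (assumption: prototype
@ inferred from the register comments in the function, not the project's
@ actual C model):
@
@   static void ime_calculate_sad3_prog_ref(const uint8_t *pu1_ref1,
@                                           const uint8_t *pu1_ref2,
@                                           const uint8_t *pu1_ref3,
@                                           const uint8_t *pu1_src,
@                                           int32_t ref_strd, int32_t src_strd,
@                                           uint32_t *psad)
@   {
@       uint32_t sad1 = 0, sad2 = 0, sad3 = 0;
@       for (int row = 0; row < 16; row++)
@       {
@           for (int col = 0; col < 16; col++)
@           {
@               sad1 += abs(pu1_src[col] - pu1_ref1[col]);
@               sad2 += abs(pu1_src[col] - pu1_ref2[col]);
@               sad3 += abs(pu1_src[col] - pu1_ref3[col]);
@           }
@           pu1_src  += src_strd;
@           pu1_ref1 += ref_strd;
@           pu1_ref2 += ref_strd;
@           pu1_ref3 += ref_strd;
@       }
@       psad[0] = sad1;
@       psad[1] = sad2;
@       psad[2] = sad3;
@   }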
    .global ime_calculate_sad3_prog_a9q

ime_calculate_sad3_prog_a9q:

    @ r0    = ref1     <UWORD8 *>
    @ r1    = ref2     <UWORD8 *>
    @ r2    = ref3     <UWORD8 *>
    @ r3    = src      <UWORD8 *>
    @ stack = RefBufferWidth <UWORD32>, CurBufferWidth <UWORD32>, psad <UWORD32 *>


    stmfd         sp!, {r4-r6, lr}

    ldrd          r4, r5, [sp, #16]     @ load ref stride to r4, src stride to r5
    mov           r6, #14
    vpush         {d8-d15}
    @Row 1
    vld1.8        {d0, d1}, [r3], r5    @ load src Row 1
    vld1.8        {d2, d3}, [r0], r4    @ load ref1 Row 1
    vld1.8        {d4, d5}, [r1], r4    @ load ref2 Row 1
    vabdl.u8      q8, d2, d0
    vabdl.u8      q9, d3, d1
    vld1.8        {d6, d7}, [r2], r4    @ load ref3 Row 1
    vabdl.u8      q10, d4, d0
    vabdl.u8      q11, d5, d1

    @Row 2
    vld1.8        {d8, d9}, [r3], r5    @ load src Row 2
    vabdl.u8      q12, d6, d0
    vabdl.u8      q13, d7, d1
    vld1.8        {d10, d11}, [r0], r4  @ load ref1 Row 2
    vld1.8        {d12, d13}, [r1], r4  @ load ref2 Row 2
    vabal.u8      q8, d10, d8
    vabal.u8      q9, d11, d9
    vld1.8        {d14, d15}, [r2], r4  @ load ref3 Row 2
    vabal.u8      q10, d12, d8
    vabal.u8      q11, d13, d9

loop_sad3_prog:

    @Row 1
    vld1.8        {d0, d1}, [r3], r5    @ load src Row 1
    vabal.u8      q12, d14, d8
    vabal.u8      q13, d15, d9
    vld1.8        {d2, d3}, [r0], r4    @ load ref1 Row 1
    vld1.8        {d4, d5}, [r1], r4    @ load ref2 Row 1
    vabal.u8      q8, d2, d0
    vabal.u8      q9, d3, d1
    vld1.8        {d6, d7}, [r2], r4    @ load ref3 Row 1
    vabal.u8      q10, d4, d0
    vabal.u8      q11, d5, d1

    @Row 2
    vld1.8        {d8, d9}, [r3], r5    @ load src Row 2
    vabal.u8      q12, d6, d0
    vabal.u8      q13, d7, d1
    vld1.8        {d10, d11}, [r0], r4  @ load ref1 Row 2
    subs          r6, #2
    vld1.8        {d12, d13}, [r1], r4  @ load ref2 Row 2
    vabal.u8      q8, d10, d8
    vabal.u8      q9, d11, d9
    vld1.8        {d14, d15}, [r2], r4  @ load ref3 Row 2
    vabal.u8      q10, d12, d8
    vabal.u8      q11, d13, d9

    bne           loop_sad3_prog

    vabal.u8      q12, d14, d8
    vabal.u8      q13, d15, d9

    @ Compute SAD

    vadd.u16      q8, q8, q9            @ Q8  : sad_ref1
    vadd.u16      q10, q10, q11         @ Q10 : sad_ref2
    vadd.u16      q12, q12, q13         @ Q12 : sad_ref3

    vadd.u16      d16, d16, d17
    vadd.u16      d20, d20, d21
    vadd.u16      d24, d24, d25

    vpadd.u16     d16, d16, d20
    vpadd.u16     d24, d24, d24

    ldr           r6, [sp, #88]         @ load pi4_sad to r6 (16-byte stmfd + 64-byte vpush above)
    vpaddl.u16    d16, d16
    vpaddl.u16    d24, d24

    vst1.64       {d16}, [r6]!
    vst1.32       {d24[0]}, [r6]
    vpop          {d8-d15}
    ldmfd         sp!, {r4-r6, pc}


@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) for sub-pel motion estimation
@*
@* @par   Description
@*   This function computes the SAD for all 8 half-pel points
@*
@* @param[out] pi4_sad
@*  integer evaluated sad
@*  pi4_sad[0] - half x
@*  pi4_sad[1] - half x - 1
@*  pi4_sad[2] - half y
@*  pi4_sad[3] - half y - strd
@*  pi4_sad[4] - half xy
@*  pi4_sad[5] - half xy - 1
@*  pi4_sad[6] - half xy - strd
@*  pi4_sad[7] - half xy - 1 - strd
@*
@* @remarks
@*
@******************************************************************************
@*

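@ A minimal C sketch of the eight half-pel SADs this routine evaluates
@ (assumption: pointer roles inferred from the register comments in the code
@ below; pu1_half_x/y/xy are taken to point into interpolated half-pel planes
@ that extend at least one column left and one row above the candidate):
@
@   static void ime_sub_pel_compute_sad_16x16_ref(const uint8_t *pu1_src,
@                                                 const uint8_t *pu1_half_x,
@                                                 const uint8_t *pu1_half_y,
@                                                 const uint8_t *pu1_half_xy,
@                                                 int32_t src_strd, int32_t ref_strd,
@                                                 int32_t *pi4_sad)
@   {
@       const uint8_t *cand[8] = {
@           pu1_half_x,                          /* [0] half x             */
@           pu1_half_x - 1,                      /* [1] half x - 1         */
@           pu1_half_y,                          /* [2] half y             */
@           pu1_half_y - ref_strd,               /* [3] half y - strd      */
@           pu1_half_xy,                         /* [4] half xy            */
@           pu1_half_xy - 1,                     /* [5] half xy - 1        */
@           pu1_half_xy - ref_strd,              /* [6] half xy - strd     */
@           pu1_half_xy - 1 - ref_strd,          /* [7] half xy - 1 - strd */
@       };
@       for (int i = 0; i < 8; i++)
@       {
@           int32_t sad = 0;
@           for (int row = 0; row < 16; row++)
@               for (int col = 0; col < 16; col++)
@                   sad += abs(pu1_src[row * src_strd + col] -
@                              cand[i][row * ref_strd + col]);
@           pi4_sad[i] = sad;
@       }
@   }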
.text
.p2align 2

    .global ime_sub_pel_compute_sad_16x16_a9q

ime_sub_pel_compute_sad_16x16_a9q:

    stmfd         sp!, {r4-r11, lr}     @store register values to stack

    ldr           r9, [sp, #36]
    ldr           r10, [sp, #40]
    vpush         {d8-d15}
    sub           r4, r1, #1            @ x left
    sub           r5, r2, r10           @ y top

    sub           r6, r3, #1            @ xy left
    sub           r7, r3, r10           @ xy top

    sub           r8, r7, #1            @ xy top-left
    mov           r11, #15

    @ dummy load instructions to bring buffer2 into the cache
    @ LDR         r12,[r1]
    @ LDR         r12,[sp,#12]

    vld1.8        {d0, d1}, [r0], r9    @ src
    vld1.8        {d2, d3}, [r5], r10   @ y top LOAD
    vld1.8        {d4, d5}, [r7], r10   @ xy top LOAD
    vld1.8        {d6, d7}, [r8], r10   @ xy top-left LOAD

    vabdl.u8      q6, d2, d0            @ y top ABS1
    vabdl.u8      q7, d4, d0            @ xy top ABS1
    vld1.8        {d8, d9}, [r1], r10   @ x LOAD
    vabdl.u8      q8, d6, d0            @ xy top-left ABS1
    vabdl.u8      q9, d8, d0            @ x ABS1
    vld1.8        {d10, d11}, [r4], r10 @ x left LOAD

    vabal.u8      q6, d3, d1            @ y top ABS2
    vabal.u8      q7, d5, d1            @ xy top ABS2
    vld1.8        {d2, d3}, [r2], r10   @ y LOAD
    vabal.u8      q8, d7, d1            @ xy top-left ABS2
    vabal.u8      q9, d9, d1            @ x ABS2
    vld1.8        {d4, d5}, [r3], r10   @ xy LOAD

    vabdl.u8      q10, d10, d0          @ x left ABS1
    vabdl.u8      q11, d2, d0           @ y ABS1
    vld1.8        {d6, d7}, [r6], r10   @ xy left LOAD
    vabdl.u8      q12, d4, d0           @ xy ABS1
    vabdl.u8      q13, d6, d0           @ xy left ABS1

loop_sub_pel_16x16:

    vabal.u8      q10, d11, d1          @ x left ABS2
    vabal.u8      q11, d3, d1           @ y ABS2
    subs          r11, #1
    vabal.u8      q12, d5, d1           @ xy ABS2
    vabal.u8      q13, d7, d1           @ xy left ABS2

    vld1.8        {d0, d1}, [r0], r9    @ src
    vabal.u8      q6, d2, d0            @ y top ABS1 (y row of the previous iteration)
    vabal.u8      q7, d4, d0            @ xy top ABS1 (xy row of the previous iteration)
    vld1.8        {d8, d9}, [r1], r10   @ x LOAD
    vabal.u8      q8, d6, d0            @ xy top-left ABS1 (xy-left row of the previous iteration)
    vabal.u8      q9, d8, d0            @ x ABS1
    vld1.8        {d10, d11}, [r4], r10 @ x left LOAD

    vabal.u8      q6, d3, d1            @ y top ABS2
    vabal.u8      q7, d5, d1            @ xy top ABS2
    vld1.8        {d2, d3}, [r2], r10   @ y LOAD
    vabal.u8      q8, d7, d1            @ xy top-left ABS2
    vabal.u8      q9, d9, d1            @ x ABS2
    vld1.8        {d4, d5}, [r3], r10   @ xy LOAD

    vabal.u8      q10, d10, d0          @ x left ABS1
    vabal.u8      q11, d2, d0           @ y ABS1
    vld1.8        {d6, d7}, [r6], r10   @ xy left LOAD
    vabal.u8      q12, d4, d0           @ xy ABS1
    vabal.u8      q13, d6, d0           @ xy left ABS1

    bne           loop_sub_pel_16x16

    vabal.u8      q10, d11, d1          @ x left ABS2
    vabal.u8      q11, d3, d1           @ y ABS2
    vabal.u8      q12, d5, d1           @ xy ABS2
    vabal.u8      q13, d7, d1           @ xy left ABS2

    vadd.i16      d0, d18, d19          @ x
    vadd.i16      d3, d12, d13          @ y top
    vadd.i16      d6, d14, d15          @ xy top
    vadd.i16      d5, d26, d27          @ xy left
    vadd.i16      d1, d20, d21          @ x left
    vadd.i16      d2, d22, d23          @ y
    vadd.i16      d4, d24, d25          @ xy
    vadd.i16      d7, d16, d17          @ xy top left

    vpadd.i16     d0, d0, d1
    vpadd.i16     d2, d2, d3
    vpadd.i16     d4, d4, d5
    vpadd.i16     d6, d6, d7

    vpaddl.u16    d0, d0
    vpaddl.u16    d2, d2
    vpop          {d8-d15}
    ldr           r11, [sp, #44]
    vpaddl.u16    d4, d4
    vpaddl.u16    d6, d6

    vst1.32       {d0}, [r11]!
    vst1.32       {d2}, [r11]!
    vst1.32       {d4}, [r11]!
    vst1.32       {d6}, [r11]!

    ldmfd         sp!, {r4-r11, pc}     @Restoring registers from stack


@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x16 blocks
@*
@* @par   Description
@*   This function computes the SAD between 2 16x16 blocks. There is a
@*   provision for early exit if the partially computed SAD exceeds the
@*   maximum allowed SAD. To compute the distortion of the entire block,
@*   set i4_max_sad to USHRT_MAX.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[in] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] i4_max_sad
@*  integer maximum allowed distortion
@*
@* @param[out] pi4_mb_distortion
@*  pointer to the evaluated SAD
@*
@* @remarks
@*
@******************************************************************************
@*

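@ A minimal C sketch of the plain 16x16 SAD below (an assumption from the
@ parameter list above, not the project's actual C model; i4_max_sad is not
@ consulted in this implementation):
@
@   static void ime_compute_sad_16x16_ref(const uint8_t *pu1_src,
@                                         const uint8_t *pu1_dst,
@                                         int32_t src_strd, int32_t dst_strd,
@                                         int32_t i4_max_sad,
@                                         int32_t *pi4_mb_distortion)
@   {
@       int32_t sad = 0;
@       (void)i4_max_sad;
@       for (int row = 0; row < 16; row++)
@       {
@           for (int col = 0; col < 16; col++)
@               sad += abs(pu1_src[col] - pu1_dst[col]);
@           pu1_src += src_strd;
@           pu1_dst += dst_strd;
@       }
@       *pi4_mb_distortion = sad;
@   }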
.text
.p2align 2

    .global ime_compute_sad_16x16_a9q

ime_compute_sad_16x16_a9q:


    stmfd         sp!, {r12, r14}       @store register values to stack

    @ dummy load instructions to bring buffer2 into the cache
    @ LDR         r12,[r1]
    @ LDR         r12,[sp,#12]

    vld1.8        {d4, d5}, [r0], r2
    vld1.8        {d6, d7}, [r1], r3
    vpush         {d8-d15}
    mov           r12, #14
    vld1.8        {d8, d9}, [r0], r2
    vabdl.u8      q0, d4, d6
    vld1.8        {d10, d11}, [r1], r3
    vabdl.u8      q1, d5, d7

loop_sad_16x16:

    vld1.8        {d4, d5}, [r0], r2
    vabal.u8      q0, d8, d10
    vld1.8        {d6, d7}, [r1], r3
    vabal.u8      q1, d9, d11

    vld1.8        {d8, d9}, [r0], r2
    vabal.u8      q0, d4, d6
    subs          r12, #2
    vld1.8        {d10, d11}, [r1], r3
    vabal.u8      q1, d5, d7

    bne           loop_sad_16x16

    vabal.u8      q0, d8, d10
    vabal.u8      q1, d9, d11

    vadd.i16      q0, q0, q1
    vadd.i16      d0, d1, d0
    vpop          {d8-d15}
    ldr           r12, [sp, #12]

    vpaddl.u16    d0, d0
    vpaddl.u32    d0, d0
    vst1.32       {d0[0]}, [r12]

    ldmfd         sp!, {r12, pc}        @Restoring registers from stack


@*
@//---------------------------------------------------------------------------
@// Function Name      : ime_calculate_sad4_prog_a9q()
@//
@// Detail Description : This function computes the SAD values of 4 progressive
@//                      MB candidates (the left/right/top/bottom neighbours of
@//                      temp_frame) in one shot
@//
@// Platform           : Cortex-A8/NEON
@//
@//-----------------------------------------------------------------------------
@*

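@ A minimal C sketch of the four-neighbour SAD below (assumption: pointer
@ roles follow the left/right/top/bottom pointer setup in the code, and the
@ output order is inferred from the two vst1.64 writebacks at the end, which
@ appear to store left, right, top, bottom; not the project's actual C model):
@
@   static void ime_calculate_sad4_prog_ref(const uint8_t *pu1_ref,   /* temp_frame */
@                                           const uint8_t *pu1_src,   /* buffer_ptr */
@                                           int32_t ref_strd, int32_t src_strd,
@                                           uint32_t *psad)
@   {
@       const uint8_t *top   = pu1_ref - ref_strd;
@       const uint8_t *bot   = pu1_ref + ref_strd;
@       const uint8_t *left  = pu1_ref - 1;
@       const uint8_t *right = pu1_ref + 1;
@       uint32_t s_top = 0, s_bot = 0, s_left = 0, s_right = 0;
@       for (int row = 0; row < 16; row++)
@           for (int col = 0; col < 16; col++)
@           {
@               int s = pu1_src[row * src_strd + col];
@               s_top   += abs(s - top  [row * ref_strd + col]);
@               s_bot   += abs(s - bot  [row * ref_strd + col]);
@               s_left  += abs(s - left [row * ref_strd + col]);
@               s_right += abs(s - right[row * ref_strd + col]);
@           }
@       psad[0] = s_left;
@       psad[1] = s_right;
@       psad[2] = s_top;
@       psad[3] = s_bot;
@   }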
    .global ime_calculate_sad4_prog_a9q

ime_calculate_sad4_prog_a9q:
    @ r0    = temp_frame     <UWORD8 *>
    @ r1    = buffer_ptr     <UWORD8 *>
    @ r2    = RefBufferWidth <UWORD32>
    @ r3    = CurBufferWidth <UWORD32>
    @ stack = psad           <UWORD32 *> {at sp + 0x54 after the stmfd and vpush below}

    stmfd         sp!, {r4-r7, lr}

    @UWORD8 *left_ptr       = temp_frame - 1;
    @UWORD8 *right_ptr      = temp_frame + 1;
    @UWORD8 *top_ptr        = temp_frame - RefBufferWidth;
    @UWORD8 *bot_ptr        = temp_frame + RefBufferWidth;

    mov           r7, #14
    sub           r4, r0, #0x01         @r4 = left_ptr
    add           r5, r0, #0x1          @r5 = right_ptr
    sub           r6, r0, r2            @r6 = top_ptr
    add           r0, r0, r2            @r0 = bot_ptr
                                        @r1 = buffer_ptr
    vpush         {d8-d15}
    @D0:D1  : buffer
    @D2:D3  : top
    @D4:D5  : left
    @D6:D7  : right
    @D8:D9  : bottom

    @Row 1
    vld1.8        {d0, d1}, [r1], r3    @ load src Row 1
    vld1.8        {d2, d3}, [r6], r2    @ load top Row 1
    vld1.8        {d4, d5}, [r4], r2    @ load left Row 1

    vabdl.u8      q5, d2, d0
    vld1.8        {d6, d7}, [r5], r2    @ load right Row 1
    vabdl.u8      q6, d3, d1

    vabdl.u8      q7, d0, d4
    vld1.8        {d8, d9}, [r0], r2    @ load bottom Row 1
    vabdl.u8      q8, d1, d5

    @Row 2
    vabdl.u8      q9, d0, d6
    vld1.8        {d26, d27}, [r1], r3  @ load src Row 2
    vabdl.u8      q10, d1, d7

    vabdl.u8      q11, d0, d8
    vld1.8        {d2, d3}, [r6], r2    @ load top Row 2
    vabdl.u8      q12, d1, d9

loop_sad4_prog:

    vabal.u8      q5, d26, d2
    vld1.8        {d4, d5}, [r4], r2    @ load left Row 2
    vabal.u8      q6, d27, d3

    vabal.u8      q7, d26, d4
    vld1.8        {d6, d7}, [r5], r2    @ load right Row 2
    vabal.u8      q8, d27, d5

    vabal.u8      q9, d26, d6
    vld1.8        {d8, d9}, [r0], r2    @ load bottom Row 2
    vabal.u8      q10, d27, d7

    @Row 1
    vabal.u8      q11, d26, d8
    vld1.8        {d0, d1}, [r1], r3    @ load src Row 1
    vabal.u8      q12, d27, d9

    vld1.8        {d2, d3}, [r6], r2    @ load top Row 1
    subs          r7, #2
    vld1.8        {d4, d5}, [r4], r2    @ load left Row 1

    vabal.u8      q5, d0, d2
    vld1.8        {d6, d7}, [r5], r2    @ load right Row 1
    vabal.u8      q6, d1, d3

    vabal.u8      q7, d0, d4
    vld1.8        {d8, d9}, [r0], r2    @ load bottom Row 1
    vabal.u8      q8, d1, d5

    @Row 2
    vabal.u8      q9, d0, d6
    vld1.8        {d26, d27}, [r1], r3  @ load src Row 2
    vabal.u8      q10, d1, d7

    vabal.u8      q11, d0, d8
    vld1.8        {d2, d3}, [r6], r2    @ load top Row 2
    vabal.u8      q12, d1, d9

    bne           loop_sad4_prog

    vabal.u8      q5, d26, d2
    vld1.8        {d4, d5}, [r4], r2    @ load left Row 2
    vabal.u8      q6, d27, d3

    vabal.u8      q7, d26, d4
    vld1.8        {d6, d7}, [r5], r2    @ load right Row 2
    vabal.u8      q8, d27, d5

    vabal.u8      q9, d26, d6
    vld1.8        {d8, d9}, [r0], r2    @ load bottom Row 2
    vabal.u8      q10, d27, d7

    vabal.u8      q11, d26, d8
    vabal.u8      q12, d27, d9

    @;Q5:Q6   : sad_top
    @;Q7:Q8   : sad_left
    @;Q9:Q10  : sad_right
    @;Q11:Q12 : sad_bot

    vadd.u16      q5, q5, q6
    vadd.u16      q7, q7, q8
    vadd.u16      q9, q9, q10
    vadd.u16      q11, q11, q12

    @; Free :-
    @; Q6,Q8,Q10,Q12

    @;Q5  -> D10:D11
    @;Q7  -> D14:D15
    @;Q9  -> D18:D19
    @;Q11 -> D22:D23

    vadd.u16      d10, d10, d11
    vadd.u16      d14, d14, d15
    vadd.u16      d18, d18, d19
    vadd.u16      d22, d22, d23

    @;D10  : sad_top
    @;D14  : sad_left
    @;D18  : sad_right
    @;D22  : sad_bot


    vpaddl.u16    d11, d10
    vpaddl.u16    d15, d14
    vpaddl.u16    d19, d18
    vpaddl.u16    d23, d22

    @;D11  : sad_top
    @;D15  : sad_left
    @;D19  : sad_right
    @;D23  : sad_bot

    vpaddl.u32    d10, d11
    vpaddl.u32    d22, d23
    vpaddl.u32    d14, d15
    vpaddl.u32    d18, d19

    @;D10  : sad_top
    @;D14  : sad_left
    @;D18  : sad_right
    @;D22  : sad_bot

    ldr           r4, [sp, #84]         @;Can be rearranged

    vsli.64       d10, d22, #32
    vsli.64       d14, d18, #32

    vst1.64       {d14}, [r4]!
    vst1.64       {d10}, [r4]!
    vpop          {d8-d15}
    ldmfd         sp!, {r4-r7, pc}



@*****************************************************************************
@*
@* Function Name        : ime_compute_satqd_16x16_lumainter_a9q
@* Description          : This function computes SAD for a 16x16 block.
@                       : It also computes whether any 4x4 block will have a
@                       : nonzero coefficient after transform and quantization.
@
@  Arguments            :   R0 :pointer to src buffer
@                           R1 :pointer to est buffer
@                           R2 :source stride
@                           R3 :est stride
@                           STACK :threshold, distortion, is_nonzero
@*
@* Values Returned   : NONE
@*
@* Register Usage    : R0-R11
@* Stack Usage       :
@* Cycles            : Around
@* Interruptibility  : Interruptible
@*
@* Known Limitations
@*   \Assumptions    :
@*
@* Revision History  :
@*         DD MM YYYY    Author(s)          Changes
@*         14 04 2014    Harinarayanan K K  First version
@*
@*****************************************************************************
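@ A simplified C sketch of the contract (an assumption, not the project's
@ actual C model: the real SATQD test below compares threshold entries
@ against per-4x4 sum terms; this stand-in only illustrates the outputs,
@ using the DC threshold at pu2_thrsh[8], which the code below loads with
@ "ldrh r11, [r4, #16]"):
@
@   static void ime_compute_satqd_16x16_lumainter_ref(const uint8_t *pu1_src,
@                                                     const uint8_t *pu1_est,
@                                                     int32_t src_strd, int32_t est_strd,
@                                                     const uint16_t *pu2_thrsh, /* 9 entries */
@                                                     int32_t *pi4_distortion,
@                                                     uint32_t *pu4_is_nonzero)
@   {
@       uint32_t sad_total = 0, nonzero = 0;
@       for (int by = 0; by < 4; by++)
@           for (int bx = 0; bx < 4; bx++)
@           {
@               uint32_t sad4x4 = 0;
@               for (int row = 0; row < 4; row++)
@                   for (int col = 0; col < 4; col++)
@                   {
@                       int s = pu1_src[(4 * by + row) * src_strd + 4 * bx + col];
@                       int e = pu1_est[(4 * by + row) * est_strd + 4 * bx + col];
@                       sad4x4 += abs(s - e);
@                   }
@               sad_total += sad4x4;
@               /* simplified stand-in: a 4x4 block whose SAD reaches the DC
@                * threshold is assumed to produce nonzero coefficients */
@               if (sad4x4 >= pu2_thrsh[8])
@                   nonzero = 1;
@           }
@       *pi4_distortion = sad_total;
@       *pu4_is_nonzero = nonzero;
@   }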
    .global ime_compute_satqd_16x16_lumainter_a9q
ime_compute_satqd_16x16_lumainter_a9q:
    @R0 :pointer to src buffer
    @R1 :pointer to est buffer
    @R2 :source stride
    @R3 :pred stride
    @R4 :threshold pointer
    @R5 :distortion, i.e. SAD
    @R6 :is nonzero

    push          {r4-r12, lr}          @push all the variables first
    @ADD      SP,SP,#40         ;decrement stack pointer, to accommodate two variables
    ldr           r4, [sp, #40]         @load the threshold address
    vpush         {d8-d15}
    mov           r8, #8                @Number of 4x8 blocks to be processed
    mov           r10, #0               @Sad
    mov           r7, #0                @Nonzero info
    @----------------------------------------------------

    vld1.u8       d30, [r0], r2         @I  load 8 pix src row 1

    vld1.u8       d31, [r1], r3         @I  load 8 pix pred row 1

    vld1.u8       d28, [r0], r2         @I  load 8 pix src row 2

    vld1.u8       d29, [r1], r3         @I  load 8 pix pred row 2

    vld1.u8       d26, [r0], r2         @I  load 8 pix src row 3
    vabdl.u8      q0, d30, d31          @I  Abs diff row 1, blk 12

    vld1.u8       d27, [r1], r3         @I  load 8 pix pred row 3

    vld1.u8       d24, [r0], r2         @I  load 8 pix src row 4

    vld1.u8       d25, [r1], r3         @I  load 8 pix pred row 4
    vabdl.u8      q1, d28, d29          @I  Abs diff row 2, blk 12

    vld1.u16      {q11}, [r4]           @I  load the threshold
    vabdl.u8      q2, d26, d27          @I  Abs diff row 3, blk 12

    vabdl.u8      q3, d24, d25          @I  Abs diff row 4, blk 12


core_loop:
                                        @S1  S2  S3  S4     A1  A2  A3  A4
                                        @S5  S6  S7  S8     A5  A6  A7  A8
                                        @S9  S10 S11 S12    A9  A10 A11 A12
                                        @S13 S14 S15 S16    A13 A14 A15 A16
    ands          r11, r8, #1           @II See if we are at an even or odd block
    vadd.u16      q4 , q0, q3           @I  Add r1 r4
    lsl           r11, r2, #2           @II Move back src 4 rows

    subeq         r0, r0, r11           @II Move back src 4 rows if we are at an even block
    vadd.u16      q5 , q1, q2           @I  Add r2 r3
    addeq         r0, r0, #8            @II Move src 8 cols forward if we are at an even block

    lsl           r11, r3, #2           @II Move back pred 4 rows
    vtrn.16       d8 , d10              @I  transpose 1
    subeq         r1, r1, r11           @II Move back pred 4 rows if we are at an even block

    addeq         r1, r1, #8            @II Move pred 8 cols forward if we are at an even block
    vtrn.16       d9 , d11              @I  transpose 2
    subne         r0, r0, #8            @II Move src 8 cols back for odd blocks

    subne         r1, r1, #8            @II Move pred 8 cols back for odd blocks
    vtrn.32       d10, d11              @I  transpose 4


    vtrn.32       d8 , d9               @I  transpose 3
    vswp          d10, d11              @I  rearrange so that q4 and q5 add properly
                                        @D8     S1 S4 A1 A4
                                        @D9     S2 S3 A2 A3
                                        @D11    S1 S4 A1 A4
                                        @D10    S2 S3 A2 A3

    vadd.s16      q6, q4, q5            @I  Get s1 s4
    vld1.u8       d30, [r0], r2         @II load first 8 pix src row 1

    vtrn.s16      d12, d13              @I  Get s2 s3
                                        @D12 S1 S4 A1 A4
                                        @D13 S2 S3 A2 A3

    vshl.s16      q7, q6 , #1           @I  si  = si<<1
    vld1.u8       d31, [r1], r3         @II load first 8 pix pred row 1

    vpadd.s16     d16, d12, d13         @I  (s1 + s4) (s2 + s3)
    vld1.u8       d28, [r0], r2         @II load first 8 pix src row 2
                                        @   D16  S14 A14 S23 A23
    vrev32.16     d0, d16               @I
    vuzp.s16      d16, d0               @I
                                        @D16  S14 S23 A14 A23
    vadd.s16      d17, d12, d13         @I  (s1 + s2) (s3 + s4)
    vld1.u8       d29, [r1], r3         @II load first 8 pix pred row 2
                                        @D17  S12 S34 A12 A34

    vrev32.16     q9, q7                @I  Rearrange si's
                                        @Q9  Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2

                                        @D12    S1 S4 A1 A4
                                        @D19    Z3 Z2 Y3 Y2
    vsub.s16      d8, d12, d19          @I  (s1 - (s3<<1)) (s4 - (s2<<1))
    vld1.u8       d26, [r0], r2         @II load first 8 pix src row 3
                                        @D13    S2 S3 A2 A3
                                        @D18    Z4 Z1 Y4 Y1
    vsub.s16      d9, d13, d18          @I  (s2 - (s4<<1)) (s3 - (s1<<1))
    vld1.u8       d27, [r1], r3         @II load first 8 pix pred row 3
                                        @Q10    S8 S5 A8 A5 S7 S4 A7 A4

                                        @D16  S14 S23 A14 A23
    vpadd.s16     d10, d16, d17         @I  Get sad by adding s1 s2 s3 s4
    vld1.u8       d24, [r0], r2         @II load first 8 pix src row 4
                                        @D22 SAD1 SAD2 junk junk


                                        @Q8     S2 S1 A2 A1 S6 S3 A6 A3
                                        @Q10    S8 S5 A8 A5 S7 S4 A7 A4
    vtrn.32       q8, q4                @I  Rearrange to group the ls values of each block together
                                        @Q8     S2 S1 S8 S5 S6 S3 S7 S4
                                        @Q10    A2 A1 A8 A5 A6 A3 A7 A4


    ldrh          r11, [r4, #16]        @I  Load the threshold for the DC val, blk 1
    vdup.s16      q6, d10[0]            @I  Get the sad, blk 1
    vabdl.u8      q0, d30, d31          @II Abs diff row 1, blk 12

    vshl.s16      q7, q6, #1            @I  sad_2 = sad_1<<1
    vmov.s16      r9, d10[0]            @I  Get the sad for block 1

    vsub.s16      q9, q7, q8            @I  (sad<<1) - ls
    vmov.s16      r5, d10[1]            @I  Get the sad for block 2

    vcle.s16      q7, q11, q9           @I  threshold <= (sad<<1) - ls ?
    vld1.u8       d25, [r1], r3         @II load first 8 pix pred row 4

    vdup.s16      q15, d10[1]           @I  Get the sad, blk 2
    vabdl.u8      q1, d28, d29          @II Abs diff row 2, blk 12


    vshl.s16      q14, q15, #1          @I  sad_2 = sad_1<<1
    vsub.s16      q3, q14, q4           @I  (sad<<1) - ls
    vcle.s16      q15, q11, q3          @I  threshold <= (sad<<1) - ls ?

    ADD           R10, R10, R9          @I  Add to the global sad, blk 1
    vtrn.u8       q15, q7               @I  get all comparison bits into one reg
    vabdl.u8      q2, d26, d27          @II Abs diff row 3, blk 12

    ADD           R10, R10, R5          @I  Add to the global sad, blk 2
    vshr.u8       q14, q15, #7          @I  Shift the bits so that no overflow occurs
    cmp           r11, r9               @I  Compare with threshold, blk 1

    movle         r7, #0xf              @I  If not met, mark it by moving a nonzero val to r7, blk 1
    vadd.u8       d28, d28, d29         @I  Add the bits
    cmp           r11, r5               @I  Compare with threshold, blk 2

    movle         r7, #0xf              @I  If not met, mark it by moving a nonzero val to r7, blk 2
    vpadd.u8      d28, d28, d29         @I  Add the bits

    vmov.u32      r11, d28[0]           @I  A set bit now represents an unsatisfied condition; store it in r11
    vabdl.u8      q3, d24, d25          @II Abs diff row 4, blk 12

    orr           r7, r7, r11           @I  OR the comparison bits into r7


    sub           r8, r8, #1            @I  Decrement block count

    cmp           r7, #0                @I  If we have at least one nonzero block
    bne           compute_sad_only      @I  if a nonzero block is there, compute sad only from now on

    cmp           r8, #1                @I  See if we are at the last block
    bne           core_loop             @I  If the blocks are zero so far, continue the SATQD loop


    @Epilogue for the core loop
                                        @S1  S2  S3  S4     A1  A2  A3  A4
                                        @S5  S6  S7  S8     A5  A6  A7  A8
                                        @S9  S10 S11 S12    A9  A10 A11 A12
                                        @S13 S14 S15 S16    A13 A14 A15 A16
    vadd.u16      q4 , q0, q3           @Add r1 r4
    vadd.u16      q5 , q1, q2           @Add r2 r3
                                        @D8     S1 S2 S2 S1
                                        @D10    S4 S3 S3 S4
                                        @D9     A1 A2 A2 A1
                                        @D11    A4 A3 A3 A4
    vtrn.16       d8 , d10              @transpose 1
    vtrn.16       d9 , d11              @transpose 2
    vtrn.32       d8 , d9               @transpose 3
    vtrn.32       d10, d11              @transpose 4

    vswp          d10, d11              @rearrange so that q4 and q5 add properly
                                        @D8     S1 S4 A1 A4
                                        @D9     S2 S3 A2 A3
                                        @D11    S1 S4 A1 A4
                                        @D10    S2 S3 A2 A3
    vadd.s16      q6, q4, q5            @Get s1 s4
    vtrn.s16      d12, d13              @Get s2 s3
                                        @D12 S1 S4 A1 A4
                                        @D13 S2 S3 A2 A3

    vshl.s16      q7, q6 , #1           @si  = si<<1
    vmov.s16      r9, d10[0]            @Get the sad for block 1

    vpadd.s16     d16, d12, d13         @(s1 + s4) (s2 + s3)
    vmov.s16      r5, d10[1]            @Get the sad for block 2
                                        @D16  S14 A14 S23 A23
    vrev32.16     d30, d16              @
    vuzp.s16      d16, d30              @
                                        @D16  S14 S23 A14 A23
    vadd.s16      d17, d12, d13         @(s1 + s2) (s3 + s4)
                                        @D17  S12 S34 A12 A34

    vrev32.16     q9, q7                @Rearrange si's
                                        @Q9  Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2

                                        @D12    S1 S4 A1 A4
                                        @D19    Z3 Z2 Y3 Y2
    vsub.s16      d8, d12, d19          @(s1 - (s3<<1)) (s4 - (s2<<1))
                                        @D13    S2 S3 A2 A3
                                        @D18    Z4 Z1 Y4 Y1
    vsub.s16      d9, d13, d18          @(s2 - (s4<<1)) (s3 - (s1<<1))
                                        @Q10    S8 S5 A8 A5 S7 S4 A7 A4

                                        @D16  S14 S23 A14 A23
    vpadd.s16     d10, d16, d17         @Get sad by adding s1 s2 s3 s4
                                        @D22 SAD1 SAD2 junk junk
    vmov.u16      r9, d10[0]            @Get the sad for block 1
    vmov.u16      r5, d10[1]            @Get the sad for block 2

                                        @Q8     S2 S1 A2 A1 S6 S3 A6 A3
                                        @Q10    S8 S5 A8 A5 S7 S4 A7 A4
    ldrh          r11, [r4, #16]        @Load the threshold for the DC val, blk 1
    vtrn.32       q8, q4                @Rearrange to group the ls values of each block together
    ADD           R10, R10, R9          @Add to the global sad, blk 1

                                        @Q8     S2 S1 S8 S5 S6 S3 S7 S4
                                        @Q10    A2 A1 A8 A5 A6 A3 A7 A4

    vld1.u16      {q11}, [r4]           @load the threshold
    ADD           R10, R10, R5          @Add to the global sad, blk 2

    vdup.u16      q6, d10[0]            @Get the sad, blk 1

    cmp           r11, r9               @Compare with threshold, blk 1
    vshl.u16      q7, q6, #1            @sad_2 = sad_1<<1

    vsub.s16      q9, q7, q8            @(sad<<1) - ls

    vcle.s16      q15, q11, q9          @threshold <= (sad<<1) - ls ?
    movle         r7, #0xf              @If not met, mark it by moving a nonzero val to r7, blk 1

    cmp           r11, r5               @Compare with threshold, blk 2
    vdup.u16      q14, d10[1]           @Get the sad, blk 2

    vshl.u16      q13, q14, #1          @sad_2 = sad_1<<1
    vsub.s16      q12, q13, q4          @(sad<<1) - ls
    vcle.s16      q14, q11, q12         @threshold <= (sad<<1) - ls ?
    movle         r7, #0xf              @If not met, mark it by moving a nonzero val to r7, blk 2

    vtrn.u8       q14, q15              @get all comparison bits into one reg
    vshr.u8       q14, q14, #7          @Shift the bits so that no overflow occurs
    vadd.u8       d28, d28, d29         @Add the bits
    vpadd.u8      d28, d28, d29         @Add the bits
    vmov.u32      r11, d28[0]           @A set bit now represents an unsatisfied condition; store it in r11
    orr           r7, r7, r11           @OR the comparison bits into r7

    b             funcend_sad_16x16     @All blocks are processed now, go to the end

compute_sad_only:                       @This block computes SAD only, so it is lighter
                                        @It starts processing at an odd block:
                                        @it computes the sad for the odd block,
                                        @and then for two blocks at a time
                                        @The counter is r8, hence r8 blocks will be processed

    and           r11, r8, #1           @Get the last bit of the counter
    cmp           r11, #0               @See if we are at an even or odd block
                                        @if the blk is even we just have to set the pointers to the
                                        @start of the current row

    lsleq         r11, r2, #2           @I  Move back src 4 rows
    subeq         r0, r0, r11           @I  Move back src 4 rows if we are at an even block

    lsleq         r11, r3, #2           @I  Move back pred 4 rows
    subeq         r1, r1, r11           @I  Move back pred 4 rows if we are at an even block
    @ADDEQ R8,R8,#2         ;Inc counter
    beq           skip_odd_blk          @If at an even block, skip the lone odd-block SAD


    vadd.u16      q4, q0, q1            @Add SAD of row1 and row2
    vadd.u16      q5, q2, q3            @Add SAD of row3 and row4
    vadd.u16      q6, q4, q5            @Add SAD of rows 1-4
    vadd.u16      d14, d12, d13         @Add blk1 and blk2
    vpadd.u16     d16, d14, d15         @Add cols 1-2 and 3-4
    vpadd.u16     d18, d16, d17         @Add cols 12-34

    vmov.u16      r9, d18[0]            @Move sad to ARM
    ADD           R10, R10, R9          @Add to the global sad

    sub           r8, r8, #1            @Dec counter
    cmp           r8, #0                @See if we processed the last block
    beq           funcend_sad_16x16     @if we processed the last block, go to the end of the function

    sub           r0, r0, #8            @Since we processed an odd block, move src back by 8 cols
    sub           r1, r1, #8            @Since we processed an odd block, move pred back by 8 cols

   1280 
   1281     vmov.s16      q0, #0                @Initialize the accumulator
   1282     vmov.s16      q1, #0                @Initialize the accumulator
   1283 
   1284     vld1.u8       {q15}, [r0], r2       @load src r1
   1285     vld1.u8       {q14}, [r1], r3       @load pred r1
   1286 
   1287     vld1.u8       {q13}, [r0], r2       @load src r2
   1288     vld1.u8       {q12}, [r1], r3       @load pred r2
   1289 
   1290     vld1.u8       {q11}, [r0], r2       @load src r3
   1291     vld1.u8       {q10}, [r1], r3       @load pred r2
   1292 
   1293     vld1.u8       {q9}, [r0], r2        @load src r4
   1294     vld1.u8       {q8}, [r1], r3        @load pred r4
   1295 
   1296     cmp           r8, #2
   1297     beq           sad_epilouge
   1298 
   1299 sad_loop:
   1300 
   1301     vabal.u8      q0, d30, d28          @I  accumulate Abs diff R1
   1302     vabal.u8      q1, d31, d29          @I  accumulate Abs diff R1
   1303 
   1304     vld1.u8       {q15}, [r0], r2       @II load r1 src
   1305     vabal.u8      q0, d26, d24          @I  accumulate Abs diff R2
   1306 
   1307     vld1.u8       {q14}, [r1], r3       @II load r1 pred
   1308     vabal.u8      q1, d27, d25          @I  accumulate Abs diff R2
   1309 
   1310     vld1.u8       {q13}, [r0], r2       @II load r3 src
   1311     vabal.u8      q0, d22, d20          @I  accumulate Abs diff R3
   1312 
   1313     vld1.u8       {q12}, [r1], r3       @II load r2 pred
   1314     vabal.u8      q1, d23, d21          @I  accumulate Abs diff R3
   1315 
   1316     vld1.u8       {q11}, [r0], r2       @II load r3 src
   1317     vabal.u8      q0, d18, d16          @I  accumulate Abs diff R4
   1318 
   1319 
   1320     sub           r8, r8, #2            @Since we processe 16 pix @a time, dec by 2
   1321     vld1.u8       {q10}, [r1], r3       @II load r3 pred
   1322     vabal.u8      q1, d19, d17          @I  accumulate Abs diff R4
   1323 
   1324     cmp           r8, #2                @Check if last loop
   1325     vld1.u8       {q9}, [r0], r2        @II load r4 src
   1326     vld1.u8       {q8}, [r1], r3        @II load r4 pred
   1327 
   1328     bne           sad_loop              @Go back to SAD computation
   1329 
sad_epilouge:
    vabal.u8      q0, d30, d28          @Accumulate Abs diff R1
    vabal.u8      q1, d31, d29          @Accumulate Abs diff R1

    vabal.u8      q0, d26, d24          @Accumulate Abs diff R2
    vabal.u8      q1, d27, d25          @Accumulate Abs diff R2

    vabal.u8      q0, d22, d20          @Accumulate Abs diff R3
    vabal.u8      q1, d23, d21          @Accumulate Abs diff R3

    vabal.u8      q0, d18, d16          @Accumulate Abs diff R4
    vabal.u8      q1, d19, d17          @Accumulate Abs diff R4

    vadd.u16      q2, q0, q1            @Add the two accumulators
    vadd.u16      d6, d4, d5            @Add the two blk sads
    vpadd.u16     d8, d6, d7            @Add cols 1-2 and 3-4 sad
    vpadd.u16     d10, d8, d9           @Add cols 12-34 sad

    vmov.u16      r9, d10[0]            @move SAD to ARM
    ADD           R10, R10, R9          @Add to the global sad

funcend_sad_16x16:                      @End of function processing

    vpop          {d8-d15}
    ldr           r5, [sp, #44]
    ldr           r6, [sp, #48]

    str           r7, [r6]              @Store the is_nonzero flag
    str           r10, [r5]             @Store the sad

    @SUB SP,SP,#40
    pop           {r4-r12, pc}