//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

///**
//******************************************************************************
//*
//*
//* @brief
//*  This file contains definitions of routines that compute distortion
//*  between two macro/sub blocks of identical dimensions
//*
//* @author
//*  Ittiam
//*
//* @par List of Functions:
//*  - ime_compute_sad_16x16_fast_av8()
//*  - ime_compute_sad_16x8_av8()
//*  - ime_compute_sad_16x16_ea8_av8()
//*  - ime_calculate_sad2_prog_av8()
//*  - ime_calculate_sad3_prog_av8()
//*  - ime_sub_pel_compute_sad_16x16_av8()
//*  - ime_compute_sad_16x16_av8()
//*  - ime_calculate_sad4_prog_av8()
//*  - ime_compute_satqd_16x16_lumainter_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
     45 
     46 
///**
//******************************************************************************
//*
//* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode)
//*
//* @par   Description
//*   This function computes an approximate SAD between two 16x16 blocks: only
//*   every alternate row (8 of the 16 rows) is accumulated and the sum is then
//*   doubled. There is no early exit; i4_max_sad is accepted but not read.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] pu1_dst
//*  UWORD8 pointer to the block the source is compared against (only read)
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer stride of the block at pu1_dst
//*
//* @param[in] i4_max_sad
//*  integer maximum allowed distortion (unused by this routine)
//*
//* @param[out] pi4_mb_distortion
//*  integer evaluated sad
//*
//* @remarks
//*
//******************************************************************************
//*/
     79 .text
     80 .p2align 2
     81 
     82 .macro push_v_regs
     83     stp       d8, d9, [sp, #-16]!
     84     stp       d10, d11, [sp, #-16]!
     85     stp       d12, d13, [sp, #-16]!
     86     stp       d14, d15, [sp, #-16]!
     87 .endm
     88 .macro pop_v_regs
     89     ldp       d14, d15, [sp], #16
     90     ldp       d12, d13, [sp], #16
     91     ldp       d10, d11, [sp], #16
     92     ldp       d8, d9, [sp], #16
     93 .endm
     94 
     95     .global ime_compute_sad_16x16_fast_av8
     96 ime_compute_sad_16x16_fast_av8:
     97     push_v_regs
     98     sxtw      x2, w2
     99     sxtw      x3, w3
    100     lsl       x2, x2, #1
    101     lsl       x3, x3, #1
    102 
    103     mov       x6, #2
    104     movi      v30.8h, #0
    105 
    106 core_loop_ime_compute_sad_16x16_fast_av8:
    107 
    108     ld1       {v0.16b}, [x0], x2
    109     ld1       {v1.16b}, [x1], x3
    110     ld1       {v2.16b}, [x0], x2
    111     ld1       {v3.16b}, [x1], x3
    112 
    113     uabal     v30.8h, v0.8b, v1.8b
    114     uabal2    v30.8h, v0.16b, v1.16b
    115 
    116     uabal     v30.8h, v2.8b, v3.8b
    117     uabal2    v30.8h, v2.16b, v3.16b
    118 
    119     ld1       {v4.16b}, [x0], x2
    120     ld1       {v5.16b}, [x1], x3
    121     ld1       {v6.16b}, [x0], x2
    122     ld1       {v7.16b}, [x1], x3
    123 
    124     uabal     v30.8h, v4.8b, v5.8b
    125     uabal2    v30.8h, v4.16b, v5.16b
    126 
    127     uabal     v30.8h, v6.8b, v7.8b
    128     uabal2    v30.8h, v6.16b, v7.16b
    129 
    130     subs      x6, x6, #1
    131     bne       core_loop_ime_compute_sad_16x16_fast_av8
    132 
    133 
    134     addp      v30.8h, v30.8h, v30.8h
    135     uaddlp    v30.4s, v30.8h
    136     addp      v30.2s, v30.2s, v30.2s
    137     shl       v30.2s, v30.2s, #1
    138 
    139     st1       {v30.s}[0], [x5]
    140     pop_v_regs
    141     ret
    142 
    143 
    144 ///**
    145 //******************************************************************************
    146 //*
    147 //*  @brief computes distortion (SAD) between 2 16x8  blocks
    148 //*
    149 //*
    150 //*  @par   Description
    151 //*   This functions computes SAD between 2 16x8 blocks. There is a provision
    152 //*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
    153 //*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
    154 //*
    155 //* @param[in] pu1_src
    156 //*  UWORD8 pointer to the source
    157 //*
    158 //* @param[out] pu1_dst
    159 //*  UWORD8 pointer to the destination
    160 //*
    161 //* @param[in] src_strd
    162 //*  integer source stride
    163 //*
    164 //* @param[in] dst_strd
    165 //*  integer destination stride
    166 //*
    167 //* @param[in] u4_max_sad
    168 //*  integer maximum allowed distortion
    169 //*
    170 //* @param[in] pi4_mb_distortion
    171 //*  integer evaluated sad
    172 //*
    173 //* @remarks
    174 //*
    175 //******************************************************************************
    176 //*/
    177 //
    178     .global ime_compute_sad_16x8_av8
    179 ime_compute_sad_16x8_av8:
    180 
    181     //chheck what stride incremtn to use
    182     //earlier code did not have this lsl
    183     push_v_regs
    184     sxtw      x2, w2
    185     sxtw      x3, w3
    186     mov       x6, #2
    187     movi      v30.8h, #0
    188 
    189 core_loop_ime_compute_sad_16x8_av8:
    190 
    191     ld1       {v0.16b}, [x0], x2
    192     ld1       {v1.16b}, [x1], x3
    193     ld1       {v2.16b}, [x0], x2
    194     ld1       {v3.16b}, [x1], x3
    195 
    196     uabal     v30.8h, v0.8b, v1.8b
    197     uabal2    v30.8h, v0.16b, v1.16b
    198 
    199     uabal     v30.8h, v2.8b, v3.8b
    200     uabal2    v30.8h, v2.16b, v3.16b
    201 
    202     ld1       {v4.16b}, [x0], x2
    203     ld1       {v5.16b}, [x1], x3
    204     ld1       {v6.16b}, [x0], x2
    205     ld1       {v7.16b}, [x1], x3
    206 
    207     uabal     v30.8h, v4.8b, v5.8b
    208     uabal2    v30.8h, v4.16b, v5.16b
    209 
    210     uabal     v30.8h, v6.8b, v7.8b
    211     uabal2    v30.8h, v6.16b, v7.16b
    212 
    213     subs      x6, x6, #1
    214     bne       core_loop_ime_compute_sad_16x8_av8
    215 
    216 
    217     addp      v30.8h, v30.8h, v30.8h
    218     uaddlp    v30.4s, v30.8h
    219     addp      v30.2s, v30.2s, v30.2s
    220 
    221     st1       {v30.s}[0], [x5]
    222     pop_v_regs
    223     ret
    224 
    225 ///**
    226 //******************************************************************************
    227 //*
    228 //* @brief computes distortion (SAD) between 2 16x16 blocks with early exit
    229 //*
    230 //* @par   Description
    231 //*   This functions computes SAD between 2 16x16 blocks. There is a provision
    232 //*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
    233 //*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
    234 //*
    235 //* @param[in] pu1_src
    236 //*  UWORD8 pointer to the source
    237 //*
    238 //* @param[out] pu1_dst
    239 //*  UWORD8 pointer to the destination
    240 //*
    241 //* @param[in] src_strd
    242 //*  integer source stride
    243 //*
    244 //* @param[in] dst_strd
    245 //*  integer destination stride
    246 //*
    247 //* @param[in] i4_max_sad
    248 //*  integer maximum allowed distortion
    249 //*
    250 //* @param[in] pi4_mb_distortion
    251 //*  integer evaluated sad
    252 //*
    253 //* @remarks
    254 //*
    255 //******************************************************************************
    256 //*/
    257 
    258     .global ime_compute_sad_16x16_ea8_av8
    259 ime_compute_sad_16x16_ea8_av8:
    260 
    261     push_v_regs
    262     sxtw      x2, w2
    263     sxtw      x3, w3
    264     movi      v30.8h, #0
    265 
    266     add       x7, x0, x2
    267     add       x8, x1, x3
    268 
    269     lsl       x2, x2, #1
    270     lsl       x3, x3, #1
    271 
    272     ld1       {v0.16b}, [x0], x2
    273     ld1       {v1.16b}, [x1], x3
    274     ld1       {v2.16b}, [x0], x2
    275     ld1       {v3.16b}, [x1], x3
    276     ld1       {v8.16b}, [x0], x2
    277     ld1       {v9.16b}, [x1], x3
    278     ld1       {v10.16b}, [x0], x2
    279     ld1       {v11.16b}, [x1], x3
    280     ld1       {v12.16b}, [x0], x2
    281     ld1       {v13.16b}, [x1], x3
    282     ld1       {v14.16b}, [x0], x2
    283     ld1       {v15.16b}, [x1], x3
    284     ld1       {v16.16b}, [x0], x2
    285     ld1       {v17.16b}, [x1], x3
    286     ld1       {v18.16b}, [x0], x2
    287     ld1       {v19.16b}, [x1], x3
    288 
    289     uabal     v30.8h, v0.8b, v1.8b
    290     uabal2    v30.8h, v0.16b, v1.16b
    291 
    292     uabal     v30.8h, v2.8b, v3.8b
    293     uabal2    v30.8h, v2.16b, v3.16b
    294 
    295     uabal     v30.8h, v8.8b, v9.8b
    296     uabal2    v30.8h, v8.16b, v9.16b
    297 
    298     uabal     v30.8h, v10.8b, v11.8b
    299     uabal2    v30.8h, v10.16b, v11.16b
    300 
    301     uabal     v30.8h, v12.8b, v13.8b
    302     uabal2    v30.8h, v12.16b, v13.16b
    303 
    304     uabal     v30.8h, v14.8b, v15.8b
    305     uabal2    v30.8h, v14.16b, v15.16b
    306 
    307     uabal     v30.8h, v16.8b, v17.8b
    308     uabal2    v30.8h, v16.16b, v17.16b
    309 
    310     uabal     v30.8h, v18.8b, v19.8b
    311     uabal2    v30.8h, v18.16b, v19.16b
    312 
    313     addp      v31.8h, v30.8h, v30.8h
    314     uaddlp    v31.4s, v31.8h
    315     addp      v31.2s, v31.2s, v31.2s
    316     mov       w6, v31.s[0]
    317     cmp       w6, w4
    318     bgt       end_func_16x16
    319 
    320     //do the stuff again
    321     ld1       {v0.16b}, [x7], x2
    322     ld1       {v1.16b}, [x8], x3
    323     ld1       {v2.16b}, [x7], x2
    324     ld1       {v3.16b}, [x8], x3
    325     ld1       {v8.16b}, [x7], x2
    326     ld1       {v9.16b}, [x8], x3
    327     ld1       {v10.16b}, [x7], x2
    328     ld1       {v11.16b}, [x8], x3
    329     ld1       {v12.16b}, [x7], x2
    330     ld1       {v13.16b}, [x8], x3
    331     ld1       {v14.16b}, [x7], x2
    332     ld1       {v15.16b}, [x8], x3
    333     ld1       {v16.16b}, [x7], x2
    334     ld1       {v17.16b}, [x8], x3
    335     ld1       {v18.16b}, [x7], x2
    336     ld1       {v19.16b}, [x8], x3
    337 
    338     uabal     v30.8h, v0.8b, v1.8b
    339     uabal2    v30.8h, v0.16b, v1.16b
    340 
    341     uabal     v30.8h, v2.8b, v3.8b
    342     uabal2    v30.8h, v2.16b, v3.16b
    343 
    344     uabal     v30.8h, v8.8b, v9.8b
    345     uabal2    v30.8h, v8.16b, v9.16b
    346 
    347     uabal     v30.8h, v10.8b, v11.8b
    348     uabal2    v30.8h, v10.16b, v11.16b
    349 
    350     uabal     v30.8h, v12.8b, v13.8b
    351     uabal2    v30.8h, v12.16b, v13.16b
    352 
    353     uabal     v30.8h, v14.8b, v15.8b
    354     uabal2    v30.8h, v14.16b, v15.16b
    355 
    356     uabal     v30.8h, v16.8b, v17.8b
    357     uabal2    v30.8h, v16.16b, v17.16b
    358 
    359     uabal     v30.8h, v18.8b, v19.8b
    360     uabal2    v30.8h, v18.16b, v19.16b
    361 
    362     addp      v31.8h, v30.8h, v30.8h
    363     uaddlp    v31.4s, v31.8h
    364     addp      v31.2s, v31.2s, v31.2s
    365 
    366 end_func_16x16:
    367     st1       {v31.s}[0], [x5]
    368     pop_v_regs
    369     ret
    370 
    371 
    372 ///*
    373 ////---------------------------------------------------------------------------
    374 //// Function Name      : ime_calculate_sad2_prog_av8()
    375 ////
    376 //// Detail Description : This function find the sad values of 4 Progressive MBs
    377 ////                        at one shot
    378 ////
    379 //// Platform           : CortexAv8/NEON            .
    380 ////
    381 ////-----------------------------------------------------------------------------
    382 //*/
    383 
    384     .global ime_calculate_sad2_prog_av8
    385 ime_calculate_sad2_prog_av8:
    386 
    387     // x0    = ref1     <UWORD8 *>
    388     // x1    = ref2     <UWORD8 *>
    389     // x2    = src     <UWORD8 *>
    390     // w3    = RefBufferWidth <UWORD32>
    391     // w4    = CurBufferWidth <UWORD32>
    392     // x5    = psad <UWORD32 *>
    393     push_v_regs
    394     sxtw      x3, w3
    395     sxtw      x4, w4
    396     mov       x6, #8
    397     movi      v30.8h, #0
    398     movi      v31.8h, #0
    399 
    400 core_loop_ime_calculate_sad2_prog_av8:
    401 
    402     ld1       {v0.16b}, [x0], x3
    403     ld1       {v1.16b}, [x1], x3
    404     ld1       {v2.16b}, [x3], x4
    405 
    406     ld1       {v3.16b}, [x0], x3
    407     ld1       {v4.16b}, [x1], x3
    408     ld1       {v5.16b}, [x3], x4
    409 
    410 
    411     uabal     v30.8h, v0.8b, v2.8b
    412     uabal2    v30.8h, v0.16b, v2.16b
    413     uabal     v31.8h, v1.8b, v2.8b
    414     uabal2    v31.8h, v1.16b, v2.16b
    415 
    416     uabal     v30.8h, v3.8b, v5.8b
    417     uabal2    v30.8h, v3.16b, v5.16b
    418     uabal     v31.8h, v4.8b, v5.8b
    419     uabal2    v31.8h, v4.16b, v5.16b
    420 
    421 
    422     ld1       {v6.16b}, [x0], x3
    423     ld1       {v7.16b}, [x1], x3
    424     ld1       {v8.16b}, [x3], x4
    425 
    426     ld1       {v9.16b}, [x0], x3
    427     ld1       {v10.16b}, [x1], x3
    428     ld1       {v11.16b}, [x3], x4
    429 
    430     uabal     v30.8h, v6.8b, v8.8b
    431     uabal2    v30.8h, v6.16b, v8.16b
    432     uabal     v31.8h, v7.8b, v8.8b
    433     uabal2    v31.8h, v7.16b, v8.16b
    434 
    435     uabal     v30.8h, v9.8b, v11.8b
    436     uabal2    v30.8h, v9.16b, v11.16b
    437     uabal     v31.8h, v10.8b, v11.8b
    438     uabal2    v31.8h, v0.16b, v11.16b
    439 
    440     subs      x6, x6, #1
    441     bne       core_loop_ime_calculate_sad2_prog_av8
    442 
    443     addp      v30.8h, v30.8h, v31.8h
    444     uaddlp    v30.4s, v30.8h
    445     addp      v30.2s, v30.2s, v30.2s
    446     shl       v30.2s, v30.2s, #1
    447 
    448     st1       {v30.2s}, [x5]
    449     pop_v_regs
    450     ret
    451 
    452 ///*
    453 ////---------------------------------------------------------------------------
    454 //// Function Name      : Calculate_Mad3_prog()
    455 ////
    456 //// Detail Description : This function find the sad values of 4 Progressive MBs
    457 ////                        at one shot
    458 ////
    459 //// Platform           : CortexA8/NEON            .
    460 ////
    461 ////-----------------------------------------------------------------------------
    462 //*/
    463 
    464     .global ime_calculate_sad3_prog_av8
    465 ime_calculate_sad3_prog_av8:
    466 
    467     // x0    = ref1     <UWORD8 *>
    468     // x1    = ref2     <UWORD8 *>
    469     // x2    = ref3     <UWORD8 *>
    470     // x3    = src     <UWORD8 *>
    471     // w4    = RefBufferWidth <UWORD32>
    472     // w5    = CurBufferWidth <UWORD32>
    473     // x6    = psad <UWORD32 *>
    474 
    475 
    476     push_v_regs
    477     sxtw      x4, w4
    478     sxtw      x5, w5
    479     mov       x7, #16
    480     movi      v29.8h, #0
    481     movi      v30.8h, #0
    482     movi      v31.8h, #0
    483 
    484 core_loop_ime_calculate_sad3_prog_av8:
    485 
    486     ld1       {v0.16b}, [x0], x4
    487     ld1       {v1.16b}, [x1], x4
    488     ld1       {v2.16b}, [x2], x4
    489     ld1       {v3.16b}, [x3], x5
    490 
    491     uabal     v29.8h, v0.8b, v3.8b
    492     uabal2    v29.8h, v0.16b, v3.16b
    493     uabal     v30.8h, v1.8b, v3.8b
    494     uabal2    v30.8h, v1.16b, v3.16b
    495     uabal     v31.8h, v2.8b, v3.8b
    496     uabal2    v31.8h, v2.16b, v3.16b
    497 
    498     ld1       {v4.16b}, [x0], x4
    499     ld1       {v5.16b}, [x1], x4
    500     ld1       {v6.16b}, [x2], x4
    501     ld1       {v7.16b}, [x3], x5
    502 
    503     uabal     v29.8h, v4.8b, v7.8b
    504     uabal2    v29.8h, v4.16b, v7.16b
    505     uabal     v30.8h, v5.8b, v7.8b
    506     uabal2    v30.8h, v5.16b, v7.16b
    507     uabal     v31.8h, v6.8b, v7.8b
    508     uabal2    v31.8h, v6.16b, v7.16b
    509 
    510     subs      x7, x7, #1
    511     bne       core_loop_ime_calculate_sad3_prog_av8
    512 
    513     addp      v30.8h, v30.8h, v31.8h
    514     uaddlp    v30.4s, v30.8h
    515     addp      v30.2s, v30.2s, v30.2s
    516     shl       v30.2s, v30.2s, #1
    517 
    518     st1       {v30.2s}, [x6]
    519     pop_v_regs
    520     ret
    521 
    522 
    523 
    524 
    525 ///**
    526 //******************************************************************************
    527 //*
    528 //* @brief computes distortion (SAD) for sub-pel motion estimation
    529 //*
    530 //* @par   Description
    531 //*   This functions computes SAD for all the 8 half pel points
    532 //*
    533 //* @param[out] pi4_sad
    534 //*  integer evaluated sad
    535 //*  pi4_sad[0] - half x
    536 //*  pi4_sad[1] - half x - 1
    537 //*  pi4_sad[2] - half y
    538 //*  pi4_sad[3] - half y - 1
    539 //*  pi4_sad[4] - half xy
    540 //*  pi4_sad[5] - half xy - 1
    541 //*  pi4_sad[6] - half xy - strd
    542 //*  pi4_sad[7] - half xy - 1 - strd
    543 //*
    544 //* @remarks
    545 //*
    546 //******************************************************************************
    547 //*/
    548 
    549 .text
    550 .p2align 2
    551 
    552     .global ime_sub_pel_compute_sad_16x16_av8
    553 ime_sub_pel_compute_sad_16x16_av8:
    554     push_v_regs
    555     sxtw      x4, w4
    556     sxtw      x5, w5
    557     sub       x7, x1, #1                //x left
    558     sub       x8, x2, x5                //y top
    559     sub       x9, x3, #1                //xy  left
    560     sub       x10, x3, x5               //xy top
    561     sub       x11, x10, #1              //xy top left
    562 
    563     movi      v24.8h, #0
    564     movi      v25.8h, #0
    565     movi      v26.8h, #0
    566     movi      v27.8h, #0
    567     movi      v28.8h, #0
    568     movi      v29.8h, #0
    569     movi      v30.8h, #0
    570     movi      v31.8h, #0
    571 
    572     mov       x12, #16
    573 core_loop_ime_sub_pel_compute_sad_16x16_av8:
    574 
    575     ld1       {v0.16b}, [x0], x4        //src
    576     ld1       {v1.16b}, [x1], x5        //x
    577     ld1       {v2.16b}, [x7], x5        //x left
    578     ld1       {v3.16b}, [x2], x5        //y
    579     ld1       {v9.16b}, [x8], x5        //y top
    580     ld1       {v10.16b}, [x3], x5       //xy
    581     ld1       {v11.16b}, [x9], x5       //xy left
    582     ld1       {v12.16b}, [x10], x5      //xy top
    583     ld1       {v13.16b}, [x11], x5      //xy top left
    584 
    585     uabal     v24.8h, v0.8b, v1.8b
    586     uabal2    v24.8h, v0.16b, v1.16b
    587     uabal     v25.8h, v0.8b, v2.8b
    588     uabal2    v25.8h, v0.16b, v2.16b
    589     uabal     v26.8h, v0.8b, v3.8b
    590     uabal2    v26.8h, v0.16b, v3.16b
    591     uabal     v27.8h, v0.8b, v9.8b
    592     uabal2    v27.8h, v0.16b, v9.16b
    593     uabal     v28.8h, v0.8b, v10.8b
    594     uabal2    v28.8h, v0.16b, v10.16b
    595     uabal     v29.8h, v0.8b, v11.8b
    596     uabal2    v29.8h, v0.16b, v11.16b
    597     uabal     v30.8h, v0.8b, v12.8b
    598     uabal2    v30.8h, v0.16b, v12.16b
    599     uabal     v31.8h, v0.8b, v13.8b
    600     uabal2    v31.8h, v0.16b, v13.16b
    601 
    602     subs      x12, x12, #1
    603     bne       core_loop_ime_sub_pel_compute_sad_16x16_av8
    604 
    605     addp      v24.8h, v24.8h, v25.8h
    606     addp      v26.8h, v26.8h, v27.8h
    607     addp      v28.8h, v28.8h, v29.8h
    608     addp      v30.8h, v30.8h, v31.8h
    609 
    610     uaddlp    v24.4s, v24.8h
    611     uaddlp    v26.4s, v26.8h
    612     uaddlp    v28.4s, v28.8h
    613     uaddlp    v30.4s, v30.8h
    614 
    615     addp      v24.4s, v24.4s, v26.4s
    616     addp      v25.4s, v28.4s, v30.4s
    617 
    618     st1       {v24.4s-v25.4s}, [x6]
    619 
    620 
    621     pop_v_regs
    622     ret
    623 
    624 
    625 ///**
    626 //******************************************************************************
    627 //*
    628 //* @brief computes distortion (SAD) between 2 16x16 blocks
    629 //*
    630 //* @par   Description
    631 //*   This functions computes SAD between 2 16x16 blocks. There is a provision
    632 //*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
    633 //*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
    634 //*
    635 //* @param[in] pu1_src
    636 //*  UWORD8 pointer to the source
    637 //*
    638 //* @param[out] pu1_dst
    639 //*  UWORD8 pointer to the destination
    640 //*
    641 //* @param[in] src_strd
    642 //*  integer source stride
    643 //*
    644 //* @param[in] dst_strd
    645 //*  integer destination stride
    646 //*
    647 //* @param[in] i4_max_sad
    648 //*  integer maximum allowed distortion
    649 //*
    650 //* @param[in] pi4_mb_distortion
    651 //*  integer evaluated sad
    652 //*
    653 //* @remarks
    654 //*
    655 //******************************************************************************
    656 //*/
    657     .global ime_compute_sad_16x16_av8
    658 ime_compute_sad_16x16_av8:
    659     push_v_regs
    660     sxtw      x2, w2
    661     sxtw      x3, w3
    662     mov       x6, #4
    663     movi      v30.8h, #0
    664 
    665 core_loop_ime_compute_sad_16x16_av8:
    666 
    667     ld1       {v0.16b}, [x0], x2
    668     ld1       {v1.16b}, [x1], x3
    669     ld1       {v2.16b}, [x0], x2
    670     ld1       {v3.16b}, [x1], x3
    671 
    672     uabal     v30.8h, v0.8b, v1.8b
    673     uabal2    v30.8h, v0.16b, v1.16b
    674 
    675     uabal     v30.8h, v2.8b, v3.8b
    676     uabal2    v30.8h, v2.16b, v3.16b
    677 
    678     ld1       {v4.16b}, [x0], x2
    679     ld1       {v5.16b}, [x1], x3
    680     ld1       {v6.16b}, [x0], x2
    681     ld1       {v7.16b}, [x1], x3
    682 
    683     uabal     v30.8h, v4.8b, v5.8b
    684     uabal2    v30.8h, v4.16b, v5.16b
    685 
    686     uabal     v30.8h, v6.8b, v7.8b
    687     uabal2    v30.8h, v6.16b, v7.16b
    688 
    689     subs      x6, x6, #1
    690     bne       core_loop_ime_compute_sad_16x16_av8
    691 
    692 
    693     addp      v30.8h, v30.8h, v30.8h
    694     uaddlp    v30.4s, v30.8h
    695     addp      v30.2s, v30.2s, v30.2s
    696 
    697     st1       {v30.s}[0], [x5]
    698     pop_v_regs
    699     ret
    700 
    701 
    702 ///*
    703 ////---------------------------------------------------------------------------
    704 //// Function Name      : Calculate_Mad4_prog()
    705 ////
    706 //// Detail Description : This function find the sad values of 4 Progressive MBs
    707 ////                        at one shot
    708 ////
    709 //// Platform           : CortexA8/NEON            .
    710 ////
    711 ////-----------------------------------------------------------------------------
    712 //*/
    713 
    714     .global ime_calculate_sad4_prog_av8
    715 ime_calculate_sad4_prog_av8:
    716     push_v_regs
    717     sxtw      x2, w2
    718     sxtw      x3, w3
    719     sub       x5, x0, #1                //left
    720     add       x6, x0, #1                //right
    721     sub       x7, x0, x2                //top
    722     add       x8, x0, x2                //bottom
    723 
    724     movi      v28.8h, #0
    725     movi      v29.8h, #0
    726     movi      v30.8h, #0
    727     movi      v31.8h, #0
    728 
    729     mov       x9, #16
    730 core_loop_ime_calculate_sad4_prog_av8:
    731 
    732     ld1       {v0.16b}, [x1], x3
    733     ld1       {v1.16b}, [x5], x2
    734     ld1       {v2.16b}, [x6], x2
    735     ld1       {v3.16b}, [x7], x2
    736     ld1       {v9.16b}, [x8], x2
    737 
    738     uabal     v28.8h, v0.8b, v1.8b
    739     uabal2    v28.8h, v0.16b, v1.16b
    740     uabal     v29.8h, v0.8b, v2.8b
    741     uabal2    v29.8h, v0.16b, v2.16b
    742     uabal     v30.8h, v0.8b, v3.8b
    743     uabal2    v30.8h, v0.16b, v3.16b
    744     uabal     v31.8h, v0.8b, v9.8b
    745     uabal2    v31.8h, v0.16b, v9.16b
    746 
    747     subs      x9, x9, #1
    748     bne       core_loop_ime_calculate_sad4_prog_av8
    749 
    750     addp      v28.8h, v28.8h, v29.8h
    751     addp      v30.8h, v30.8h, v31.8h
    752 
    753     uaddlp    v28.4s, v28.8h
    754     uaddlp    v30.4s, v30.8h
    755 
    756     addp      v28.4s, v28.4s, v30.4s
    757     st1       {v28.4s}, [x4]
    758     pop_v_regs
    759     ret
    760 
    761 
    762 
    763 //*****************************************************************************
    764 //*
    765 //* Function Name         : ime_compute_satqd_16x16_lumainter_av8
    766 //* Description           : This fucntion computes SAD for a 16x16 block.
    767 //                        : It also computes if any 4x4 block will have a nonzero coefficent after transform and quant
    768 //
    769 //  Arguments             :   x0 :pointer to src buffer
    770 //                            x1 :pointer to est buffer
    771 //                            x2 :source stride
    772 //                            x3 :est stride
    773 //                            STACk :Threshold,distotion,is_nonzero
    774 //*
    775 //* Values Returned   : NONE
    776 //*
    777 //* Register Usage    : x0-x11
    778 //* Stack Usage       :
    779 //* Cycles            : Around
    780 //* Interruptiaility  : Interruptable
    781 //*
    782 //* Known Limitations
    783 //*   \Assumptions    :
    784 //*
    785 //* Revision History  :
    786 //*         DD MM YYYY    Author(s)           Changes
    787 //*         14 04 2014    Harinarayanan K K  First version
    788 //*
    789 //*****************************************************************************
    790     .global ime_compute_satqd_16x16_lumainter_av8
    791 ime_compute_satqd_16x16_lumainter_av8:
    792     //x0 :pointer to src buffer
    793     //x1 :pointer to est buffer
    794     //w2 :Source stride
    795     //w3 :Pred stride
    796     //x4 :Threshold pointer
    797     //x5 :Distortion,ie SAD
    798     //x6 :is nonzero
    799     //x7 :loop counter
    800     push_v_regs
    801     sxtw      x2, w2
    802     sxtw      x3, w3
    803     stp       d8, d9, [sp, #-16]!
    804     stp       d10, d11, [sp, #-16]!
    805     stp       d12, d13, [sp, #-16]!
    806     stp       d14, d15, [sp, #-16]!
    807 
    808     ld1       {v30.8h}, [x4]
    809 
    810     dup       v20.4h, v30.h[1]          //ls1
    811     dup       v24.4h, v30.h[0]          //ls2
    812     dup       v21.4h, v30.h[5]          //ls3
    813     dup       v25.4h, v30.h[7]          //ls4
    814     dup       v22.4h, v30.h[3]          //ls5
    815     dup       v26.4h, v30.h[4]          //ls6
    816     dup       v23.4h, v30.h[6]          //ls7
    817     dup       v27.4h, v30.h[2]          //ls8
    818 
    819     mov       v20.d[1], v24.d[0]
    820     mov       v21.d[1], v25.d[0]
    821     mov       v22.d[1], v26.d[0]
    822     mov       v23.d[1], v27.d[0]
    823 
    824     add       x4, x4, #16
    825     ld1       {v29.h}[0], [x4]
    826     dup       v29.4h, v29.h[0]
    827 
    828     movi      v31.8h, #0
    829 
    830     mov       x7, #4
    831 core_loop_satqd_ime_compute_satqd_16x16_lumainter:
    832     ld1       {v0.16b}, [x0], x2
    833     ld1       {v1.16b}, [x1], x3
    834     ld1       {v2.16b}, [x0], x2
    835     ld1       {v3.16b}, [x1], x3
    836     ld1       {v4.16b}, [x0], x2
    837     ld1       {v5.16b}, [x1], x3
    838     ld1       {v6.16b}, [x0], x2
    839     ld1       {v7.16b}, [x1], x3
    840 
    841     uabdl     v10.8h, v0.8b, v1.8b
    842     uabdl2    v15.8h, v0.16b, v1.16b
    843     uabdl     v11.8h, v2.8b, v3.8b
    844     uabdl2    v16.8h, v2.16b, v3.16b
    845     uabdl     v12.8h, v4.8b, v5.8b
    846     uabdl2    v17.8h, v4.16b, v5.16b
    847     uabdl     v13.8h, v6.8b, v7.8b
    848     uabdl2    v18.8h, v6.16b, v7.16b
    849 
    850     add       v0.8h, v10.8h, v13.8h
    851     add       v1.8h, v11.8h, v12.8h
    852     add       v2.8h, v15.8h, v18.8h
    853     add       v3.8h, v16.8h, v17.8h
    854 
    855     //v0 : S1     S4     S4     S1        A1    A4    A4    A1
    856     //v1 : S2     S3     S3     S2        A2    A3    A3    A2
    857     //v2 : B1     B4     B4     B1        X1    X4    X4    X1
    858     //v3 : B3     B2     B2     B3        X3    X2    X2    X3
    859 
    860     trn1      v4.8h, v0.8h, v1.8h
    861     trn2      v5.8h, v0.8h, v1.8h
    862     trn1      v6.8h, v2.8h, v3.8h
    863     trn2      v7.8h, v2.8h, v3.8h
    864 
    865     trn1      v0.4s, v4.4s, v6.4s
    866     trn2      v2.4s, v4.4s, v6.4s
    867     trn1      v1.4s, v5.4s, v7.4s
    868     trn2      v3.4s, v5.4s, v7.4s
    869 
    870     add       v4.8h, v0.8h, v3.8h
    871     add       v5.8h, v1.8h, v2.8h
    872     //v4 : S1     S2     B1     B2      A1    A2    X1    X2
    873     //v5 : S4     S3     B4     B3      A4    A3    X4    X3
    874 
    875     //compute sad for each 4x4 block
    876     add       v6.8h, v4.8h, v5.8h
    877     addp      v19.8h, v6.8h, v6.8h
    878     //duplicate the sad into 128 bit so that we can compare using 128bit
    879     add       v31.4h, v31.4h, v19.4h
    880 
    881     //sad_2 = sad_1<<1;
    882     shl       v28.8h, v19.8h, #1
    883 
    884     //sad_2 - pu2_thrsh
    885     sub       v24.8h, v28.8h, v20.8h
    886     sub       v25.8h, v28.8h, v21.8h
    887     sub       v26.8h, v28.8h, v22.8h
    888     sub       v27.8h, v28.8h, v23.8h
    889 
    890     trn1      v0.4s, v4.4s, v5.4s
    891     trn2      v1.4s, v4.4s, v5.4s
    892     //v0 : S1     S2     S4     S3      A1    A2    A4    A3
    893     //v1 : B1     B2     B4     B3      X1    X2    X4    X3
    894 
    895     trn1      v4.8h, v0.8h, v1.8h
    896     trn2      v5.8h, v0.8h, v1.8h
    897     //v4 : S1     B1     S4     B4      A1    X1    A4    X4
    898     //v5 : S2     B2     S3     B3      A2    X2    A3    X3
    899 
    900     mov       v7.s[0], v4.s[1]
    901     mov       v7.s[1], v4.s[3]
    902     mov       v6.s[0], v5.s[1]          // V4 //S1 B1 A1 X1
    903     mov       v6.s[1], v5.s[3]          // V5 //S2 B2 A2 X2
    904     mov       v4.s[1], v4.s[2]          // V6 //S3 B3 A3 X3
    905     mov       v5.s[1], v5.s[2]          // V7 //S4 B4 A4 X4
    906 
    907     shl       v0.4h, v4.4h, #1          //S1<<1
    908     shl       v1.4h, v5.4h, #1          //S2<<1
    909     shl       v2.4h, v6.4h, #1          //S3<<1
    910     shl       v3.4h, v7.4h, #1          //S4<<1
    911 
    912     add       v8.4h, v5.4h, v6.4h       //(s2[j] + s3[j]))
    913     add       v9.4h, v4.4h, v7.4h       //(s1[j] + s4[j]))
    914     add       v10.4h, v6.4h, v7.4h      //(s3[j] + s4[j]))
    915     sub       v11.4h, v6.4h, v0.4h      //(s3[j] - (s1[j]<<1))
    916     sub       v12.4h, v7.4h, v1.4h      //(s4[j] - (s2[j]<<1))
    917     add       v13.4h, v4.4h, v5.4h      //(s1[j] + s2[j]))
    918     sub       v14.4h, v5.4h, v3.4h      //(s2[j] - (s4[j]<<1)))
    919     sub       v15.4h, v4.4h, v2.4h      //(s1[j] - (s3[j]<<1)))
    920 
    921     mov       v8.d[1], v9.d[0]
    922     mov       v10.d[1], v11.d[0]
    923     mov       v12.d[1], v13.d[0]
    924     mov       v14.d[1], v15.d[0]
    925 
    926     cmge      v0.8h, v24.8h, v8.8h      //ls1 ls2
    927     cmge      v1.8h, v25.8h, v10.8h     //ls3 ls4
    928     cmge      v2.8h, v26.8h, v12.8h     //ls5 ls6
    929     cmge      v3.8h, v27.8h, v14.8h     //ls7 ls8
    930     cmge      v4.4h, v19.4h, v29.4h     //sad
    931 
    932     orr       v0.16b, v0.16b, v1.16b
    933     orr       v2.16b, v2.16b, v3.16b
    934     orr       v2.16b, v0.16b, v2.16b
    935     xtn       v2.8b, v2.8h
    936     orr       v2.8b, v2.8b, v4.8b
    937 
    938     //if the comparison is non zero, out
    939     mov       x4, v2.d[0]
    940     cmp       x4, #0
    941     bne       core_loop_compute_sad_pre
    942 
    943     subs      x7, x7, #1
    944     bne       core_loop_satqd_ime_compute_satqd_16x16_lumainter
    945     b         satdq_end_func
    946 
    947 
    948 core_loop_compute_sad:
    949     ld1       {v0.16b}, [x0], x2
    950     ld1       {v1.16b}, [x1], x3
    951     ld1       {v2.16b}, [x0], x2
    952     ld1       {v3.16b}, [x1], x3
    953 
    954     uabal     v31.8h, v0.8b, v1.8b
    955     uabal2    v31.8h, v0.16b, v1.16b
    956 
    957     uabal     v31.8h, v2.8b, v3.8b
    958     uabal2    v31.8h, v2.16b, v3.16b
    959 
    960     ld1       {v4.16b}, [x0], x2
    961     ld1       {v5.16b}, [x1], x3
    962     ld1       {v6.16b}, [x0], x2
    963     ld1       {v7.16b}, [x1], x3
    964 
    965     uabal     v31.8h, v4.8b, v5.8b
    966     uabal2    v31.8h, v4.16b, v5.16b
    967 
    968     uabal     v31.8h, v6.8b, v7.8b
    969     uabal2    v31.8h, v6.16b, v7.16b
    970 
    971 core_loop_compute_sad_pre:
    972     subs      x7, x7, #1
    973     bne       core_loop_compute_sad
    974 
    975 satdq_end_func:
    976 
    977     mov       x7, #1
    978     cmp       x4, #0
    979     csel      x7, x4, x7, eq
    980     str       w7, [x6]
    981 
    982     addp      v31.8h, v31.8h, v31.8h
    983     uaddlp    v31.4s, v31.8h
    984     addp      v31.2s, v31.2s, v31.2s
    985     st1       {v31.s}[0], [x5]
    986 
    987 
    988     ldp       d14, d15, [sp], #16
    989     ldp       d12, d13, [sp], #16
    990     ldp       d10, d11, [sp], #16
    991     ldp       d8, d9, [sp], #16
    992     pop_v_regs
    993     ret
    994