Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 
     21 ///*
     22 ////----------------------------------------------------------------------------
     23 //// File Name            : impeg2_inter_pred.s
     24 ////
//// Description          : This file has motion compensation related
////                        interpolation functions for ARMv8 (AArch64) NEON
     27 ////
     28 //// Reference Document   :
     29 ////
     30 //// Revision History     :
     31 ////      Date            Author                  Detail Description
     32 ////   ------------    ----------------    ----------------------------------
     33 ////   18 jun 2010      S Hamsalekha              Created
     34 ////
     35 ////-------------------------------------------------------------------------
     36 //*/
     37 
     38 ///*
     39 //// ----------------------------------------------------------------------------
     40 //// Include Files
     41 //// ----------------------------------------------------------------------------
     42 //*/
     43 //              PRESERVE8
     44 .text
     45 .include "impeg2_neon_macros.s"
     46 
     47 ///*
     48 //// ----------------------------------------------------------------------------
     49 //// Struct/Union Types and Define
     50 //// ----------------------------------------------------------------------------
     51 //*/
     52 
     53 
     54 ///*
     55 //// ----------------------------------------------------------------------------
     56 //// Static Global Data section variables
     57 //// ----------------------------------------------------------------------------
     58 //*/
     59 //// -------------------------- NONE --------------------------------------------
     60 
     61 
     62 ///*
     63 //// ----------------------------------------------------------------------------
     64 //// Static Prototype Functions
     65 //// ----------------------------------------------------------------------------
     66 //*/
     67 //// -------------------------- NONE --------------------------------------------
     68 
     69 ///*
     70 //// ----------------------------------------------------------------------------
     71 //// Exported functions
     72 //// ----------------------------------------------------------------------------
     73 //*/
     74 
     75 
     76 ///*
     77 ////---------------------------------------------------------------------------
     78 //// Function Name      :   impeg2_copy_mb_av8()
     79 ////
     80 //// Detail Description : Copies one MB worth of data from src to the dst
     81 ////
     82 //// Inputs             : x0 - pointer to src
     83 ////                      x1 - pointer to dst
     84 ////                      x2 - source width
     85 ////                      x3 - destination width
     86 //// Registers Used     : v0, v1
     87 ////
     88 //// Stack Usage        : 64 bytes
     89 ////
     90 //// Outputs            :
     91 ////
     92 //// Return Data        : None
     93 ////
     94 //// Programming Note   : <program limitation>
     95 ////-----------------------------------------------------------------------------
     96 //*/
     97 
     98 
     99 
    100 .global impeg2_copy_mb_av8
    101 
    102 
    103 impeg2_copy_mb_av8:
    104 
    105 //STMFD   x13!,{x4,x5,x12,x14}
    106     push_v_regs
    107 
    108 
    109     ldr             x4, [x0]            //src->y
    110     ldr             x5, [x1]            //dst->y
    111 
    112     //Read one row of data from the src
    113     ld1             {v0.8b, v1.8b}, [x4], x2 //Load and increment src
    114     st1             {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
    115 
    116     ////Repeat 15 times for y
    117     ld1             {v0.8b, v1.8b}, [x4], x2 //Load and increment src
    118     st1             {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
    119     ld1             {v0.8b, v1.8b}, [x4], x2 //Load and increment src
    120     st1             {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
    121     ld1             {v0.8b, v1.8b}, [x4], x2 //Load and increment src
    122     st1             {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
    123     ld1             {v0.8b, v1.8b}, [x4], x2 //Load and increment src
    124     st1             {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
    125     ld1             {v0.8b, v1.8b}, [x4], x2 //Load and increment src
    126     st1             {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
    127     ld1             {v0.8b, v1.8b}, [x4], x2 //Load and increment src
    128     st1             {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
    129     ld1             {v0.8b, v1.8b}, [x4], x2 //Load and increment src
    130     st1             {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
    131     ld1             {v0.8b, v1.8b}, [x4], x2 //Load and increment src
    132     st1             {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
    133     ld1             {v0.8b, v1.8b}, [x4], x2 //Load and increment src
    134     st1             {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
    135     ld1             {v0.8b, v1.8b}, [x4], x2 //Load and increment src
    136     st1             {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
    137     ld1             {v0.8b, v1.8b}, [x4], x2 //Load and increment src
    138     st1             {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
    139     ld1             {v0.8b, v1.8b}, [x4], x2 //Load and increment src
    140     st1             {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
    141     ld1             {v0.8b, v1.8b}, [x4], x2 //Load and increment src
    142     st1             {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
    143     ld1             {v0.8b, v1.8b}, [x4], x2 //Load and increment src
    144     st1             {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
    145     ld1             {v0.8b, v1.8b}, [x4], x2 //Load and increment src
    146     st1             {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
    147 
    148     lsr             x2, x2, #1          //src_offset /= 2
    149     lsr             x3, x3, #1          //dst_offset /= 2
    150 
    151     ldr             x4, [x0, #8]        //src->u
    152     ldr             x5, [x1, #8]        //dst->u
    153 
    154     //Read one row of data from the src
    155     ld1             {v0.8b}, [x4], x2   //Load and increment src
    156     st1             {v0.8b}, [x5], x3   //Store and increment dst
    157 
    158     ////Repeat 7 times for u
    159     ld1             {v0.8b}, [x4], x2   //Load and increment src
    160     st1             {v0.8b}, [x5], x3   //Store and increment dst
    161     ld1             {v0.8b}, [x4], x2   //Load and increment src
    162     st1             {v0.8b}, [x5], x3   //Store and increment dst
    163     ld1             {v0.8b}, [x4], x2   //Load and increment src
    164     st1             {v0.8b}, [x5], x3   //Store and increment dst
    165     ld1             {v0.8b}, [x4], x2   //Load and increment src
    166     st1             {v0.8b}, [x5], x3   //Store and increment dst
    167     ld1             {v0.8b}, [x4], x2   //Load and increment src
    168     st1             {v0.8b}, [x5], x3   //Store and increment dst
    169     ld1             {v0.8b}, [x4], x2   //Load and increment src
    170     st1             {v0.8b}, [x5], x3   //Store and increment dst
    171     ld1             {v0.8b}, [x4], x2   //Load and increment src
    172     st1             {v0.8b}, [x5], x3   //Store and increment dst
    173 
    174     ldr             x4, [x0, #16]       //src->v
    175     ldr             x5, [x1, #16]       //dst->v
    176 
    177     //Read one row of data from the src
    178     ld1             {v0.8b}, [x4], x2   //Load and increment src
    179     st1             {v0.8b}, [x5], x3   //Store and increment dst
    180 
    181     ////Repeat 7 times for v
    182     ld1             {v0.8b}, [x4], x2   //Load and increment src
    183     st1             {v0.8b}, [x5], x3   //Store and increment dst
    184     ld1             {v0.8b}, [x4], x2   //Load and increment src
    185     st1             {v0.8b}, [x5], x3   //Store and increment dst
    186     ld1             {v0.8b}, [x4], x2   //Load and increment src
    187     st1             {v0.8b}, [x5], x3   //Store and increment dst
    188     ld1             {v0.8b}, [x4], x2   //Load and increment src
    189     st1             {v0.8b}, [x5], x3   //Store and increment dst
    190     ld1             {v0.8b}, [x4], x2   //Load and increment src
    191     st1             {v0.8b}, [x5], x3   //Store and increment dst
    192     ld1             {v0.8b}, [x4], x2   //Load and increment src
    193     st1             {v0.8b}, [x5], x3   //Store and increment dst
    194     ld1             {v0.8b}, [x4], x2   //Load and increment src
    195     st1             {v0.8b}, [x5], x3   //Store and increment dst
    196 
    197 //LDMFD   x13!,{x4,x5,x12,PC}
    198     pop_v_regs
    199     ret
    200 
    201 
    202 ///*
    203 ////---------------------------------------------------------------------------
    204 //// Function Name      :   impeg2_mc_fullx_halfy_8x8_av8()
    205 ////
    206 //// Detail Description : This function pastes the reference block in the
    207 ////                      current frame buffer.This function is called for
    208 ////                      blocks that are not coded and have motion vectors
    209 ////                      with a half pel resolution.
    210 ////
    211 //// Inputs             : x0 - out    : Current Block Pointer
    212 ////                      x1 - ref     : Refernce Block Pointer
    213 ////                      x2 - ref_wid   : Refernce Block Width
    214 ////                      x3 - out_wid    @ Current Block Width
    215 ////
    216 //// Registers Used     : x14, D0-D9
    217 ////
    218 //// Stack Usage        : 64 bytes
    219 ////
    220 //// Outputs            : The Motion Compensated Block
    221 ////
    222 //// Return Data        : None
    223 ////
    224 //// Programming Note   : <program limitation>
    225 ////-----------------------------------------------------------------------------
    226 //*/
    227 
    228 .global impeg2_mc_fullx_halfy_8x8_av8
    229 
    230 impeg2_mc_fullx_halfy_8x8_av8:
    231 
    232 //STMFD       x13!,{x12,x14}
    233     push_v_regs
    234     add             x14, x1, x2
    235     lsl             x2, x2, #1
    236 
    237 ///* Load 8 + 1 rows from reference block */
    238 ///* Do the addition with out rounding off as rounding value is 1 */
    239     ld1             {v0.8b}, [x1], x2   //// first row hence x1 = D0
    240     ld1             {v2.8b}, [x14], x2  //// second row hence x2 = D2
    241     ld1             {v4.8b}, [x1], x2   //// third row hence x3 = D4
    242     ld1             {v6.8b}, [x14], x2  //// fourth row hence x4 = D6
    243     ld1             {v1.8b}, [x1], x2   //// fifth row hence x5 = D1
    244     ld1             {v3.8b}, [x14], x2  //// sixth row hence x6 = D3
    245     urhadd          v9.8b, v1.8b , v6.8b //// estimated row 4 = D9
    246     ld1             {v5.8b}, [x1], x2   //// seventh row hence x7 = D5
    247     urhadd          v0.16b, v0.16b , v2.16b //// estimated row 1 = D0, row 5 = D1
    248     urhadd          v1.16b, v1.16b , v3.16b //// estimated row 1 = D0, row 5 = D1
    249     ld1             {v7.8b}, [x14], x2  //// eighth row hence x8 = D7
    250     urhadd          v2.16b, v2.16b , v4.16b //// estimated row 2 = D2, row 6 = D3
    251     urhadd          v3.16b, v3.16b , v5.16b //// estimated row 2 = D2, row 6 = D3
    252     ld1             {v8.8b}, [x1], x2   //// ninth row hence x9 = D8
    253     urhadd          v4.16b, v4.16b , v6.16b //// estimated row 3 = D4, row 7 = D5
    254     urhadd          v5.16b, v5.16b , v7.16b //// estimated row 3 = D4, row 7 = D5
    255 
    256     add             x14, x0, x3
    257     lsl             x3, x3, #1
    258 
    259 ///* Store the eight rows calculated above */
    260     st1             {v2.8b}, [x14], x3  //// second row hence D2
    261     urhadd          v7.8b, v7.8b , v8.8b //// estimated row 8 = D7
    262     st1             {v0.8b}, [x0], x3   //// first row hence D0
    263     st1             {v9.8b}, [x14], x3  //// fourth row hence D9
    264     st1             {v4.8b}, [x0], x3   //// third row hence D4
    265     st1             {v3.8b}, [x14], x3  //// sixth row hence x6 = D3
    266     st1             {v1.8b}, [x0], x3   //// fifth row hence x5 = D1
    267     st1             {v7.8b}, [x14], x3  //// eighth row hence x8 = D7
    268     st1             {v5.8b}, [x0], x3   //// seventh row hence x7 = D5
    269 
    270 // LDMFD sp!,{x12,pc}
    271     pop_v_regs
    272     ret
    273 
    274 
    275 
    276 
    277 
    278 ///*
    279 ////---------------------------------------------------------------------------
    280 //// Function Name      :   impeg2_mc_halfx_fully_8x8_av8()
    281 ////
    282 //// Detail Description : This function pastes the reference block in the
    283 ////                      current frame buffer.This function is called for
    284 ////                      blocks that are not coded and have motion vectors
    285 ////                      with a half pel resolutionand VopRoundingType is 0 ..
    286 ////
    287 //// Inputs             : x0 - out    : Current Block Pointer
    288 ////                      x1 - ref     : Refernce Block Pointer
    289 ////                      x2 - ref_wid   : Refernce Block Width
    290 ////                      x3 - out_wid    @ Current Block Width
    291 ////
    292 //// Registers Used     : x12, x14, v0-v10, v12-v14, v16-v18, v20-v22
    293 
    294 ////
    295 //// Stack Usage        : 64 bytes
    296 ////
    297 //// Outputs            : The Motion Compensated Block
    298 ////
    299 //// Return Data        : None
    300 ////
    301 //// Programming Note   : <program limitation>
    302 ////-----------------------------------------------------------------------------
    303 //*/
    304 
    305 
    306 
    307 .global impeg2_mc_halfx_fully_8x8_av8
    308 
    309 
    310 
    311 impeg2_mc_halfx_fully_8x8_av8:
    312 
    313     // STMFD sp!,{x12,x14}
    314     push_v_regs
    315 
    316     add             x14, x1, x2, lsl #2
    317 
    318     add             x12, x0, x3, lsl#2
    319 
    320     ld1             {v0.8b, v1.8b}, [x1], x2 //load 16 pixels of  row1
    321 
    322     ld1             {v2.8b, v3.8b}, [x14], x2 // row5
    323 
    324 
    325     ld1             {v4.8b, v5.8b}, [x1], x2 //load 16 pixels row2
    326 
    327     ld1             {v6.8b, v7.8b}, [x14], x2 //row6
    328 
    329 
    330     ext             v8.8b, v0.8b , v1.8b , #1
    331 
    332     ext             v12.8b, v2.8b , v3.8b , #1
    333 
    334     ext             v16.8b, v4.8b , v5.8b , #1
    335 
    336     ext             v20.8b, v6.8b , v7.8b , #1
    337 
    338 
    339     ld1             {v9.8b, v10.8b}, [x1], x2 //load row3
    340 
    341     ld1             {v13.8b, v14.8b}, [x14], x2 //load row7
    342 
    343     ld1             {v17.8b, v18.8b}, [x1], x2 //load  row4
    344 
    345     ld1             {v21.8b, v22.8b}, [x14], x2 //load  row8
    346 
    347 
    348     ext             v1.8b, v9.8b , v10.8b , #1
    349 
    350     ext             v3.8b, v13.8b , v14.8b , #1
    351 
    352 
    353 
    354     ext             v5.8b, v17.8b , v18.8b , #1
    355 
    356     ext             v7.8b, v21.8b , v22.8b , #1
    357 
    358 
    359     urhadd          v0.16b, v0.16b , v8.16b //operate on row1 and row3
    360     urhadd          v1.16b, v1.16b , v9.16b //operate on row1 and row3
    361 
    362     urhadd          v2.16b, v2.16b , v12.16b //operate on row5 and row7
    363     urhadd          v3.16b, v3.16b , v13.16b //operate on row5 and row7
    364 
    365 
    366     urhadd          v4.16b, v4.16b , v16.16b //operate on row2 and row4
    367     urhadd          v5.16b, v5.16b , v17.16b //operate on row2 and row4
    368 
    369 
    370     urhadd          v6.16b, v6.16b , v20.16b //operate on row6 and row8
    371     urhadd          v7.16b, v7.16b , v21.16b //operate on row6 and row8
    372 
    373     st1             {v0.8b}, [x0], x3   //store row1
    374 
    375     st1             {v2.8b}, [x12], x3  //store row5
    376 
    377     st1             {v4.8b}, [x0], x3   //store row2
    378 
    379     st1             {v6.8b}, [x12], x3  //store row6
    380 
    381     st1             {v1.8b}, [x0], x3   //store row3
    382 
    383     st1             {v3.8b}, [x12], x3  //store row7
    384 
    385     st1             {v5.8b}, [x0], x3   //store row4
    386 
    387     st1             {v7.8b}, [x12], x3  //store row8
    388 
    389 
    390 
    391     // LDMFD sp!,{x12,pc}
    392     pop_v_regs
    393     ret
    394 
    395 
    396 
    397 
    398 
    399 
    400 
    401 ///*
    402 ////---------------------------------------------------------------------------
    403 //// Function Name      :   impeg2_mc_halfx_halfy_8x8_av8()
    404 ////
    405 //// Detail Description : This function pastes the reference block in the
    406 ////                      current frame buffer.This function is called for
    407 ////                      blocks that are not coded and have motion vectors
    408 ////                      with a half pel resolutionand VopRoundingType is 0 ..
    409 ////
    410 //// Inputs             : x0 - out    : Current Block Pointer
    411 ////                      x1 - ref     : Refernce Block Pointer
    412 ////                      x2 - ref_wid   : Refernce Block Width
    413 ////                      x3 - out_wid    @ Current Block Width
    414 ////
    415 //// Registers Used     : x14, v0-v18, v22, v24, v26, v28, v30
    416 
    417 ////
    418 //// Stack Usage        : 64 bytes
    419 ////
    420 //// Outputs            : The Motion Compensated Block
    421 ////
    422 //// Return Data        : None
    423 ////
    424 //// Programming Note   : <program limitation>
    425 ////-----------------------------------------------------------------------------
    426 //*/
    427 
    428 
    429 .global impeg2_mc_halfx_halfy_8x8_av8
    430 
    431 impeg2_mc_halfx_halfy_8x8_av8:
    432 
    433     // STMFD sp!,{x12,x14}
    434     push_v_regs
    435 
    436     add             x14, x1, x2, lsl #2
    437 
    438     ld1             {v0.8b, v1.8b}, [x1], x2 //load 16 pixels of  row1
    439 
    440     ld1             {v2.8b, v3.8b}, [x14], x2 // row5
    441 
    442     ld1             {v4.8b, v5.8b}, [x1], x2 //load 16 pixels row2
    443 
    444     ld1             {v6.8b, v7.8b}, [x14], x2 //row6
    445 
    446     ext             v1.8b, v0.8b , v1.8b , #1
    447 
    448 
    449 
    450     ext             v3.8b, v2.8b , v3.8b , #1
    451 
    452 
    453 
    454     ext             v5.8b, v4.8b , v5.8b , #1
    455 
    456     ext             v7.8b, v6.8b , v7.8b , #1
    457 
    458 
    459 
    460 
    461     ld1             {v8.8b, v9.8b}, [x1], x2 //load row3
    462 
    463 
    464 
    465     ld1             {v10.8b, v11.8b}, [x14], x2 //load row7
    466 
    467     ld1             {v12.8b, v13.8b}, [x1], x2 //load  row4
    468 
    469     ld1             {v14.8b, v15.8b}, [x14], x2 //load  row8
    470 
    471     ext             v9.8b, v8.8b , v9.8b , #1
    472 
    473     ld1             {v16.8b, v17.8b}, [x14], x2 //load  row9
    474 
    475 
    476 
    477 
    478 
    479     ext             v11.8b, v10.8b , v11.8b , #1
    480 
    481 
    482 
    483     ext             v13.8b, v12.8b , v13.8b , #1
    484 
    485 
    486 
    487     ext             v15.8b, v14.8b , v15.8b , #1
    488 
    489     ext             v17.8b, v16.8b , v17.8b , #1
    490 
    491 
    492     //interpolation in x direction
    493 
    494     uaddl           v0.8h, v0.8b, v1.8b //operate row1
    495 
    496     uaddl           v2.8h, v2.8b, v3.8b //operate row5
    497 
    498     uaddl           v4.8h, v4.8b, v5.8b //operate row2
    499 
    500     uaddl           v6.8h, v6.8b, v7.8b //operate row6
    501 
    502     uaddl           v8.8h, v8.8b, v9.8b //operate row3
    503 
    504     uaddl           v10.8h, v10.8b, v11.8b //operate row7
    505 
    506     uaddl           v12.8h, v12.8b, v13.8b //operate row4
    507 
    508     uaddl           v14.8h, v14.8b, v15.8b //operate row8
    509 
    510     uaddl           v16.8h, v16.8b, v17.8b //operate row9
    511 
    512     //interpolation in y direction
    513 
    514     add             x14, x0, x3, lsl #2
    515 
    516 
    517 
    518     add             v18.8h, v0.8h , v4.8h //operate row1 and row2
    519 
    520     add             v26.8h, v2.8h , v6.8h //operate row5 and row6
    521 
    522     add             v20.8h, v4.8h , v8.8h //operate row2 and row3
    523 
    524     add             v28.8h, v6.8h , v10.8h //operate row6 and row7
    525 
    526     rshrn           v18.8b, v18.8h, #2  //row1
    527 
    528     rshrn           v26.8b, v26.8h, #2  //row5
    529 
    530     rshrn           v20.8b, v20.8h, #2  //row2
    531 
    532     rshrn           v28.8b, v28.8h, #2  //row6
    533 
    534     add             v22.8h, v8.8h , v12.8h //operate row3 and row4
    535 
    536     st1             {v18.8b}, [x0], x3  //store row1
    537 
    538     add             v30.8h, v10.8h , v14.8h //operate row7 and row8
    539 
    540     st1             {v26.8b}, [x14], x3 //store row5
    541 
    542     add             v24.8h, v12.8h , v2.8h //operate row4 and row5
    543 
    544     st1             {v20.8b}, [x0], x3  //store row2
    545 
    546     add             v14.8h, v14.8h , v16.8h //operate row8 and row9
    547 
    548     st1             {v28.8b}, [x14], x3 //store row6
    549 
    550 
    551 
    552     rshrn           v22.8b, v22.8h, #2  //row3
    553 
    554     rshrn           v30.8b, v30.8h, #2  //row7
    555 
    556     rshrn           v24.8b, v24.8h, #2  //row4
    557 
    558     rshrn           v14.8b, v14.8h, #2  //row8
    559 
    560 
    561     st1             {v22.8b}, [x0], x3  //store row3
    562     st1             {v30.8b}, [x14], x3 //store row7
    563     st1             {v24.8b}, [x0], x3  //store row4
    564     st1             {v14.8b}, [x14], x3 //store row8
    565 
    566 
    567 
    568     // LDMFD sp!,{x12,pc}
    569     pop_v_regs
    570     ret
    571 
    572 
    573 
    574 
    575 ///*
    576 ////---------------------------------------------------------------------------
    577 //// Function Name      :   impeg2_mc_fullx_fully_8x8_av8()
    578 ////
    579 //// Detail Description : This function pastes the reference block in the
    580 ////                      current frame buffer.This function is called for
    581 ////                      blocks that are not coded and have motion vectors
    582 ////                      with a half pel resolutionand ..
    583 ////
    584 //// Inputs             : x0 - out    : Current Block Pointer
    585 ////                      x1 - ref     : Refernce Block Pointer
    586 ////                      x2 - ref_wid   : Refernce Block Width
    587 ////                      x3 - out_wid    @ Current Block Width
    588 ////
    589 //// Registers Used     : x12, x14, v0-v3
    590 
    591 ////
    592 //// Stack Usage        : 64 bytes
    593 ////
    594 //// Outputs            : The Motion Compensated Block
    595 ////
    596 //// Return Data        : None
    597 ////
    598 //// Programming Note   : <program limitation>
    599 ////-----------------------------------------------------------------------------
    600 //*/
    601 
    602 
    603 .global impeg2_mc_fullx_fully_8x8_av8
    604 impeg2_mc_fullx_fully_8x8_av8:
    605 
    606 
    607     // STMFD sp!,{x12,x14}
    608     push_v_regs
    609 
    610     add             x14, x1, x2, lsl #2
    611 
    612     add             x12, x0, x3, lsl #2
    613 
    614 
    615     ld1             {v0.8b}, [x1], x2   //load row1
    616 
    617     ld1             {v1.8b}, [x14], x2  //load row4
    618 
    619     ld1             {v2.8b}, [x1], x2   //load row2
    620 
    621     ld1             {v3.8b}, [x14], x2  //load row5
    622 
    623 
    624     st1             {v0.8b}, [x0], x3   //store row1
    625 
    626     st1             {v1.8b}, [x12], x3  //store row4
    627 
    628     st1             {v2.8b}, [x0], x3   //store row2
    629 
    630     st1             {v3.8b}, [x12], x3  //store row5
    631 
    632 
    633     ld1             {v0.8b}, [x1], x2   //load row3
    634 
    635     ld1             {v1.8b}, [x14], x2  //load row6
    636 
    637     ld1             {v2.8b}, [x1], x2   //load row4
    638 
    639     ld1             {v3.8b}, [x14], x2  //load row8
    640 
    641 
    642     st1             {v0.8b}, [x0], x3   //store row3
    643 
    644     st1             {v1.8b}, [x12], x3  //store row6
    645 
    646     st1             {v2.8b}, [x0], x3   //store row4
    647 
    648     st1             {v3.8b}, [x12], x3  //store row8
    649 
    650 
    651     // LDMFD sp!,{x12,pc}
    652     pop_v_regs
    653     ret
    654 
    655 
    656 
    657 
    658 ///*
    659 ////---------------------------------------------------------------------------
    660 //// Function Name      :   impeg2_interpolate_av8()
    661 ////
    662 //// Detail Description : interpolates two buffers and adds pred
    663 ////
    664 //// Inputs             : x0 - pointer to src1
    665 ////                      x1 - pointer to src2
    666 ////                      x2 - dest buf
    667 ////                         x3 - dst stride
    668 //// Registers Used     : x12, v0-v15
    669 ////
    670 //// Stack Usage        : 64 bytes
    671 ////
    672 //// Outputs            : The Motion Compensated Block
    673 ////
    674 //// Return Data        : None
    675 ////
    676 //// Programming Note   : <program limitation>
    677 ////-----------------------------------------------------------------------------
    678 //*/
    679 
    680 
    681 .global impeg2_interpolate_av8
    682 
    683 
    684 impeg2_interpolate_av8:
    685 
    686 //STMFD    x13!,{x4-x7,x12,x14}
    687     push_v_regs
    688 
    689     ldr             x4, [x0, #0]        //ptr_y src1
    690 
    691     ldr             x5, [x1, #0]        //ptr_y src2
    692 
    693     ldr             x7, [x2, #0]        //ptr_y dst buf
    694 
    695     mov             x12, #4             //counter for number of blocks
    696 
    697 
    698 interp_lumablocks_stride:
    699     ld1             {v0.16b}, [x4], #16 //row1 src1
    700 
    701     ld1             {v2.16b}, [x4], #16 //row2 src1
    702 
    703     ld1             {v4.16b}, [x4], #16 //row3 src1
    704 
    705     ld1             {v6.16b}, [x4], #16 //row4 src1
    706 
    707 
    708     ld1             {v8.16b}, [x5], #16 //row1 src2
    709 
    710     ld1             {v10.16b}, [x5], #16 //row2 src2
    711 
    712     ld1             {v12.16b}, [x5], #16 //row3 src2
    713 
    714     ld1             {v14.16b}, [x5], #16 //row4 src2
    715 
    716     urhadd          v0.16b, v0.16b , v8.16b //operate on row1
    717 
    718     urhadd          v2.16b, v2.16b , v10.16b //operate on row2
    719 
    720     urhadd          v4.16b, v4.16b , v12.16b //operate on row3
    721 
    722     urhadd          v6.16b, v6.16b , v14.16b //operate on row4
    723     st1             {v0.16b}, [x7], x3  //row1
    724 
    725     st1             {v2.16b}, [x7], x3  //row2
    726 
    727     st1             {v4.16b}, [x7], x3  //row3
    728 
    729     st1             {v6.16b}, [x7], x3  //row4
    730 
    731     subs            x12, x12, #1
    732 
    733     bne             interp_lumablocks_stride
    734 
    735 
    736     lsr             x3, x3, #1          //stride >> 1
    737 
    738     ldr             x4, [x0, #8]        //ptr_u src1
    739 
    740     ldr             x5, [x1, #8]        //ptr_u src2
    741 
    742     ldr             x7 , [x2, #8]       //ptr_u dst buf
    743 
    744     mov             x12, #2             //counter for number of blocks
    745 
    746 
    747 
    748 //chroma blocks
    749 
    750 interp_chromablocks_stride:
    751     ld1             {v0.8b, v1.8b}, [x4], #16 //row1 & 2 src1
    752 
    753     ld1             {v2.8b, v3.8b}, [x4], #16 //row3 & 4 src1
    754 
    755     ld1             {v4.8b, v5.8b}, [x4], #16 //row5 & 6 src1
    756 
    757     ld1             {v6.8b, v7.8b}, [x4], #16 //row7 & 8 src1
    758 
    759 
    760     ld1             {v8.8b, v9.8b}, [x5], #16 //row1 & 2 src2
    761 
    762     ld1             {v10.8b, v11.8b}, [x5], #16 //row3 & 4 src2
    763 
    764     ld1             {v12.8b, v13.8b}, [x5], #16 //row5 & 6 src2
    765 
    766     ld1             {v14.8b, v15.8b}, [x5], #16 //row7 & 8 src2
    767 
    768     urhadd          v0.16b, v0.16b , v8.16b //operate on row1 & 2
    769     urhadd          v1.16b, v1.16b , v9.16b //operate on row1 & 2
    770 
    771     urhadd          v2.16b, v2.16b , v10.16b //operate on row3 & 4
    772     urhadd          v3.16b, v3.16b , v11.16b //operate on row3 & 4
    773 
    774     urhadd          v4.16b, v4.16b , v12.16b //operate on row5 & 6
    775     urhadd          v5.16b, v5.16b , v13.16b //operate on row5 & 6
    776 
    777     urhadd          v6.16b, v6.16b , v14.16b //operate on row7 & 8
    778     urhadd          v7.16b, v7.16b , v15.16b //operate on row7 & 8
    779 
    780     st1             {v0.8b}, [x7], x3   //row1
    781 
    782     st1             {v1.8b}, [x7], x3   //row2
    783 
    784     st1             {v2.8b}, [x7], x3   //row3
    785 
    786     st1             {v3.8b}, [x7], x3   //row4
    787 
    788     st1             {v4.8b}, [x7], x3   //row5
    789 
    790     st1             {v5.8b}, [x7], x3   //row6
    791 
    792     st1             {v6.8b}, [x7], x3   //row7
    793 
    794     st1             {v7.8b}, [x7], x3   //row8
    795 
    796 
    797     ldr             x4, [x0, #16]       //ptr_v src1
    798 
    799     ldr             x5, [x1, #16]       //ptr_v src2
    800 
    801     ldr             x7, [x2, #16]       //ptr_v dst buf
    802 
    803     subs            x12, x12, #1
    804 
    805     bne             interp_chromablocks_stride
    806 
    807 
    808     //LDMFD  x13!,{x4-x7,x12,PC}
    809     pop_v_regs
    810     ret
    811 
    812 
    813 
    814 
    815