Home | History | Annotate | Download | only in arm
      1 @/******************************************************************************
      2 @ *
      3 @ * Copyright (C) 2015 The Android Open Source Project
      4 @ *
      5 @ * Licensed under the Apache License, Version 2.0 (the "License");
      6 @ * you may not use this file except in compliance with the License.
      7 @ * You may obtain a copy of the License at:
      8 @ *
      9 @ * http://www.apache.org/licenses/LICENSE-2.0
     10 @ *
     11 @ * Unless required by applicable law or agreed to in writing, software
     12 @ * distributed under the License is distributed on an "AS IS" BASIS,
     13 @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @ * See the License for the specific language governing permissions and
     15 @ * limitations under the License.
     16 @ *
     17 @ *****************************************************************************
     18 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 @*/
     20 
     21 @/*
     22 @//----------------------------------------------------------------------------
     23 @// File Name            : impeg2_inter_pred.s
     24 @//
     25 @// Description          : This file has motion compensation related
     26 @//                        interpolation functions on Neon + CortexA-8 platform
     27 @//
     28 @// Reference Document   :
     29 @//
     30 @// Revision History     :
     31 @//      Date            Author                  Detail Description
     32 @//   ------------    ----------------    ----------------------------------
     33 @//   18 jun 2010     S Hamsalekha              Created
     34 @//
     35 @//-------------------------------------------------------------------------
     36 @*/
     37 
     38 @/*
     39 @// ----------------------------------------------------------------------------
     40 @// Include Files
     41 @// ----------------------------------------------------------------------------
     42 @*/
     43 .text
     44 .p2align 2
     45 
     46 
     47 @/*
     48 @// ----------------------------------------------------------------------------
     49 @// Struct/Union Types and Define
     50 @// ----------------------------------------------------------------------------
     51 @*/
     52 
     53 
     54 @/*
     55 @// ----------------------------------------------------------------------------
     56 @// Static Global Data section variables
     57 @// ----------------------------------------------------------------------------
     58 @*/
     59 @// -------------------------- NONE --------------------------------------------
     60 
     61 
     62 @/*
     63 @// ----------------------------------------------------------------------------
     64 @// Static Prototype Functions
     65 @// ----------------------------------------------------------------------------
     66 @*/
     67 @// -------------------------- NONE --------------------------------------------
     68 
     69 @/*
     70 @// ----------------------------------------------------------------------------
     71 @// Exported functions
     72 @// ----------------------------------------------------------------------------
     73 @*/
     74 
     75 @//---------------------------------------------------------------------------
     76 @// Function Name      :   impeg2_copy_mb_a9q()
     77 @//
     78 @// Detail Description : Copies one MB worth of data from src to the dst
     79 @//
     80 @// Inputs             : r0 - pointer to src
     81 @//                      r1 - pointer to dst
     82 @//                      r2 - source width
     83 @//                      r3 - destination width
     84 @// Registers Used     : r4, r5, d0, d1
     85 @//
     86 @// Stack Usage        : 12 bytes
     87 @//
     88 @// Outputs            :
     89 @//
     90 @// Return Data        : None
     91 @//
     92 @// Programming Note   : <program limitation>
     93 @//-----------------------------------------------------------------------------
     94 @*/
     95 
     96 
     97 
     98         .global impeg2_copy_mb_a9q
     99 
    100 
    101 impeg2_copy_mb_a9q:
    102 
    103     stmfd           r13!, {r4, r5, r14}
    104 
    105 
    106     ldr             r4, [r0]            @src->y
    107     ldr             r5, [r1]            @dst->y
    108     @Read one row of data from the src
    109     vld1.8          {d0, d1}, [r4], r2  @Load and increment src
    110     vst1.8          {d0, d1}, [r5], r3  @Store and increment dst
    111 
    112     @//Repeat 15 times for y
    113     vld1.8          {d0, d1}, [r4], r2  @Load and increment src
    114     vst1.8          {d0, d1}, [r5], r3  @Store and increment dst
    115     vld1.8          {d0, d1}, [r4], r2  @Load and increment src
    116     vst1.8          {d0, d1}, [r5], r3  @Store and increment dst
    117     vld1.8          {d0, d1}, [r4], r2  @Load and increment src
    118     vst1.8          {d0, d1}, [r5], r3  @Store and increment dst
    119     vld1.8          {d0, d1}, [r4], r2  @Load and increment src
    120     vst1.8          {d0, d1}, [r5], r3  @Store and increment dst
    121     vld1.8          {d0, d1}, [r4], r2  @Load and increment src
    122     vst1.8          {d0, d1}, [r5], r3  @Store and increment dst
    123     vld1.8          {d0, d1}, [r4], r2  @Load and increment src
    124     vst1.8          {d0, d1}, [r5], r3  @Store and increment dst
    125     vld1.8          {d0, d1}, [r4], r2  @Load and increment src
    126     vst1.8          {d0, d1}, [r5], r3  @Store and increment dst
    127     vld1.8          {d0, d1}, [r4], r2  @Load and increment src
    128     vst1.8          {d0, d1}, [r5], r3  @Store and increment dst
    129     vld1.8          {d0, d1}, [r4], r2  @Load and increment src
    130     vst1.8          {d0, d1}, [r5], r3  @Store and increment dst
    131     vld1.8          {d0, d1}, [r4], r2  @Load and increment src
    132     vst1.8          {d0, d1}, [r5], r3  @Store and increment dst
    133     vld1.8          {d0, d1}, [r4], r2  @Load and increment src
    134     vst1.8          {d0, d1}, [r5], r3  @Store and increment dst
    135     vld1.8          {d0, d1}, [r4], r2  @Load and increment src
    136     vst1.8          {d0, d1}, [r5], r3  @Store and increment dst
    137     vld1.8          {d0, d1}, [r4], r2  @Load and increment src
    138     vst1.8          {d0, d1}, [r5], r3  @Store and increment dst
    139     vld1.8          {d0, d1}, [r4], r2  @Load and increment src
    140     vst1.8          {d0, d1}, [r5], r3  @Store and increment dst
    141     vld1.8          {d0, d1}, [r4], r2  @Load and increment src
    142     vst1.8          {d0, d1}, [r5], r3  @Store and increment dst
    143 
    144     mov             r2, r2, lsr #1      @src_offset /= 2
    145     mov             r3, r3, lsr #1      @dst_offset /= 2
    146 
    147     ldr             r4, [r0, #4]        @src->u
    148     ldr             r5, [r1, #4]        @dst->u
    149     @Read one row of data from the src
    150     vld1.8          {d0}, [r4], r2      @Load and increment src
    151     vst1.8          {d0}, [r5], r3      @Store and increment dst
    152 
    153     @//Repeat 7 times for u
    154     vld1.8          {d0}, [r4], r2      @Load and increment src
    155     vst1.8          {d0}, [r5], r3      @Store and increment dst
    156     vld1.8          {d0}, [r4], r2      @Load and increment src
    157     vst1.8          {d0}, [r5], r3      @Store and increment dst
    158     vld1.8          {d0}, [r4], r2      @Load and increment src
    159     vst1.8          {d0}, [r5], r3      @Store and increment dst
    160     vld1.8          {d0}, [r4], r2      @Load and increment src
    161     vst1.8          {d0}, [r5], r3      @Store and increment dst
    162     vld1.8          {d0}, [r4], r2      @Load and increment src
    163     vst1.8          {d0}, [r5], r3      @Store and increment dst
    164     vld1.8          {d0}, [r4], r2      @Load and increment src
    165     vst1.8          {d0}, [r5], r3      @Store and increment dst
    166     vld1.8          {d0}, [r4], r2      @Load and increment src
    167     vst1.8          {d0}, [r5], r3      @Store and increment dst
    168 
    169     ldr             r4, [r0, #8]        @src->v
    170     ldr             r5, [r1, #8]        @dst->v
    171     @Read one row of data from the src
    172     vld1.8          {d0}, [r4], r2      @Load and increment src
    173     vst1.8          {d0}, [r5], r3      @Store and increment dst
    174 
    175     @//Repeat 7 times for v
    176     vld1.8          {d0}, [r4], r2      @Load and increment src
    177     vst1.8          {d0}, [r5], r3      @Store and increment dst
    178     vld1.8          {d0}, [r4], r2      @Load and increment src
    179     vst1.8          {d0}, [r5], r3      @Store and increment dst
    180     vld1.8          {d0}, [r4], r2      @Load and increment src
    181     vst1.8          {d0}, [r5], r3      @Store and increment dst
    182     vld1.8          {d0}, [r4], r2      @Load and increment src
    183     vst1.8          {d0}, [r5], r3      @Store and increment dst
    184     vld1.8          {d0}, [r4], r2      @Load and increment src
    185     vst1.8          {d0}, [r5], r3      @Store and increment dst
    186     vld1.8          {d0}, [r4], r2      @Load and increment src
    187     vst1.8          {d0}, [r5], r3      @Store and increment dst
    188     vld1.8          {d0}, [r4], r2      @Load and increment src
    189     vst1.8          {d0}, [r5], r3      @Store and increment dst
    190 
    191     ldmfd           r13!, {r4, r5, pc}
    192 
    193 
    194 
    195 
    196 @/*
    197 @//---------------------------------------------------------------------------
    198 @// Function Name      :   impeg2_mc_fullx_halfy_8x8_a9q()
    199 @//
    200 @// Detail Description : This function pastes the reference block in the
    201 @//                      current frame buffer.This function is called for
    202 @//                      blocks that are not coded and have motion vectors
    203 @//                      with a half pel resolution.
    204 @//
    205 @// Inputs             : r0 - out    : Current Block Pointer
    206 @//                      r1 - ref     : Refernce Block Pointer
    207 @//                      r2 - ref_wid   : Refernce Block Width
    208 @//                      r3 - out_wid   ; Current Block Width
    209 @//
    210 @// Registers Used     : D0-D9
    211 @//
    212 @// Stack Usage        : 4 bytes
    213 @//
    214 @// Outputs            : The Motion Compensated Block
    215 @//
    216 @// Return Data        : None
    217 @//
    218 @// Programming Note   : <program limitation>
    219 @//-----------------------------------------------------------------------------
    220 @*/
    221 
    222         .global impeg2_mc_fullx_halfy_8x8_a9q
    223 
    224 impeg2_mc_fullx_halfy_8x8_a9q:
    225 
    226     stmfd           r13!, {r14}
    227     add             r14, r1, r2
    228     mov             r2, r2, lsl #1
    229 
    230 @/* Load 8 + 1 rows from reference block */
    231 @/* Do the addition with out rounding off as rounding value is 1 */
    232     vld1.8          {d0}, [r1], r2      @// first row hence r1 = D0
    233     vld1.8          {d2}, [r14], r2     @// second row hence r2 = D2
    234     vld1.8          {d4}, [r1], r2      @// third row hence r3 = D4
    235     vld1.8          {d6}, [r14], r2     @// fourth row hence r4 = D6
    236     vld1.8          {d1}, [r1], r2      @// fifth row hence r5 = D1
    237     vld1.8          {d3}, [r14], r2     @// sixth row hence r6 = D3
    238     vrhadd.u8       d9, d1, d6          @// estimated row 4 = D9
    239     vld1.8          {d5}, [r1], r2      @// seventh row hence r7 = D5
    240     vrhadd.u8       q0, q0, q1          @// estimated row 1 = D0, row 5 = D1
    241     vld1.8          {d7}, [r14], r2     @// eighth row hence r8 = D7
    242     vrhadd.u8       q1, q1, q2          @// estimated row 2 = D2, row 6 = D3
    243     vld1.8          {d8}, [r1], r2      @// ninth row hence r9 = D8
    244     vrhadd.u8       q2, q2, q3          @// estimated row 3 = D4, row 7 = D5
    245 
    246     add             r14, r0, r3
    247     mov             r3, r3, lsl #1
    248 
    249 @/* Store the eight rows calculated above */
    250     vst1.8          {d2}, [r14], r3     @// second row hence D2
    251     vrhadd.u8       d7, d7, d8          @// estimated row 8 = D7
    252     vst1.8          {d0}, [r0], r3      @// first row hence D0
    253     vst1.8          {d9}, [r14], r3     @// fourth row hence D9
    254     vst1.8          {d4}, [r0], r3      @// third row hence D4
    255     vst1.8          {d3}, [r14], r3     @// sixth row hence r6 = D3
    256     vst1.8          {d1}, [r0], r3      @// fifth row hence r5 = D1
    257     vst1.8          {d7}, [r14], r3     @// eighth row hence r8 = D7
    258     vst1.8          {d5}, [r0], r3      @// seventh row hence r7 = D5
    259 
    260     ldmfd           sp!, {pc}
    261 
    262 
    263 
    264 
    265 
    266 
    267 @/*
    268 @//---------------------------------------------------------------------------
    269 @// Function Name      :   impeg2_mc_halfx_fully_8x8_a9q()
    270 @//
    271 @// Detail Description : This function pastes the reference block in the
    272 @//                      current frame buffer.This function is called for
    273 @//                      blocks that are not coded and have motion vectors
    274 @//                      with a half pel resolutionand VopRoundingType is 0 ..
    275 @//
    276 @// Inputs             : r0 - out    : Current Block Pointer
    277 @//                      r1 - ref     : Refernce Block Pointer
    278 @//                      r2 - ref_wid   : Refernce Block Width
    279 @//                      r3 - out_wid   ; Current Block Width
    280 @//
    281 @// Registers Used     : r12, r14, d0-d10, d12-d14, d16-d18, d20-d22
    282 
    283 @//
    284 @// Stack Usage        : 8 bytes
    285 @//
    286 @// Outputs            : The Motion Compensated Block
    287 @//
    288 @// Return Data        : None
    289 @//
    290 @// Programming Note   : <program limitation>
    291 @//-----------------------------------------------------------------------------
    292 @*/
    293 
    294 
    295 
    296         .global impeg2_mc_halfx_fully_8x8_a9q
    297 
    298 
    299 
    300 impeg2_mc_halfx_fully_8x8_a9q:
    301 
    302     stmfd           sp!, {r12, lr}
    303 
    304     add             r14, r1, r2, lsl #2
    305 
    306     add             r12, r0, r3, lsl#2
    307 
    308     vld1.8          {d0, d1}, [r1], r2  @load 16 pixels of  row1
    309 
    310     vld1.8          {d2, d3}, [r14], r2 @ row5
    311 
    312 
    313     vld1.8          {d4, d5}, [r1], r2  @load 16 pixels row2
    314 
    315     vld1.8          {d6, d7}, [r14], r2 @row6
    316 
    317 
    318     vext.8          d8, d0, d1, #1      @Extract pixels (1-8) of row1
    319 
    320     vext.8          d12, d2, d3, #1     @Extract pixels (1-8) of row5
    321 
    322     vext.8          d16, d4, d5, #1     @Extract pixels (1-8) of row2
    323 
    324     vext.8          d20, d6, d7, #1     @Extract pixels (1-8) of row6
    325 
    326 
    327     vld1.8          {d9, d10}, [r1], r2 @load row3
    328 
    329     vld1.8          {d13, d14}, [r14], r2 @load row7
    330 
    331     vld1.8          {d17, d18}, [r1], r2 @load  row4
    332 
    333     vld1.8          {d21, d22}, [r14], r2 @load  row8
    334 
    335 
    336     vext.8          d1, d9, d10, #1     @Extract pixels (1-8) of row3
    337 
    338     vext.8          d3, d13, d14, #1    @Extract pixels (1-8) of row7
    339 
    340 
    341 
    342     vext.8          d5, d17, d18, #1    @Extract pixels (1-8) of row4
    343 
    344     vext.8          d7, d21, d22, #1    @Extract pixels (1-8) of row8
    345 
    346 
    347     vrhadd.u8       q0, q0, q4          @operate on row1 and row3
    348 
    349     vrhadd.u8       q1, q1, q6          @operate on row5 and row7
    350 
    351 
    352     vrhadd.u8       q2, q2, q8          @operate on row2 and row4
    353 
    354 
    355 
    356     vrhadd.u8       q3, q3, q10         @operate on row6 and row8
    357 
    358     vst1.8          d0, [r0], r3        @store row1
    359 
    360     vst1.8          d2, [r12], r3       @store row5
    361 
    362     vst1.8          d4, [r0], r3        @store row2
    363 
    364     vst1.8          d6, [r12], r3       @store row6
    365 
    366     vst1.8          d1, [r0], r3        @store row3
    367 
    368     vst1.8          d3, [r12], r3       @store row7
    369 
    370     vst1.8          d5, [r0], r3        @store row4
    371 
    372     vst1.8          d7, [r12], r3       @store row8
    373 
    374 
    375 
    376     ldmfd           sp!, {r12, pc}
    377 
    378 
    379 
    380 
    381 
    382 
    383 
    384 
    385 @/*
    386 @//---------------------------------------------------------------------------
    387 @// Function Name      :   impeg2_mc_halfx_halfy_8x8_a9q()
    388 @//
    389 @// Detail Description : This function pastes the reference block in the
    390 @//                      current frame buffer.This function is called for
    391 @//                      blocks that are not coded and have motion vectors
    392 @//                      with a half pel resolutionand VopRoundingType is 0 ..
    393 @//
    394 @// Inputs             : r0 - out    : Current Block Pointer
    395 @//                      r1 - ref     : Refernce Block Pointer
    396 @//                      r2 - ref_wid   : Refernce Block Width
    397 @//                      r3 - out_wid   ; Current Block Width
    398 @//
    399 @// Registers Used     : r14, q0-q15
    400 
    401 @//
    402 @// Stack Usage        : 4 bytes
    403 @//
    404 @// Outputs            : The Motion Compensated Block
    405 @//
    406 @// Return Data        : None
    407 @//
    408 @// Programming Note   : <program limitation>
    409 @//-----------------------------------------------------------------------------
    410 @*/
    411 
    412 
    413         .global impeg2_mc_halfx_halfy_8x8_a9q
    414 
    415 impeg2_mc_halfx_halfy_8x8_a9q:
    416 
    417     stmfd           sp!, {r14}
    418 
    419     add             r14, r1, r2, lsl #2
    420 
    421     vld1.8          {d0, d1}, [r1], r2  @load 16 pixels of  row1
    422 
    423     vld1.8          {d2, d3}, [r14], r2 @ row5
    424 
    425     vld1.8          {d4, d5}, [r1], r2  @load 16 pixels row2
    426 
    427     vld1.8          {d6, d7}, [r14], r2 @row6
    428 
    429     vext.8          d1, d0, d1, #1      @Extract pixels (1-8) of row1
    430 
    431 
    432 
    433     vext.8          d3, d2, d3, #1      @Extract pixels (1-8) of row5
    434 
    435 
    436 
    437     vext.8          d5, d4, d5, #1      @Extract pixels (1-8) of row2
    438 
    439     vext.8          d7, d6, d7, #1      @Extract pixels (1-8) of row6
    440 
    441 
    442 
    443 
    444     vld1.8          {d8, d9}, [r1], r2  @load row3
    445 
    446 
    447 
    448     vld1.8          {d10, d11}, [r14], r2 @load row7
    449 
    450     vld1.8          {d12, d13}, [r1], r2 @load  row4
    451 
    452     vld1.8          {d14, d15}, [r14], r2 @load  row8
    453 
    454     vext.8          d9, d8, d9, #1      @Extract pixels (1-8) of row3
    455 
    456     vld1.8          {d16, d17}, [r14], r2 @load  row9
    457 
    458 
    459 
    460 
    461 
    462     vext.8          d11, d10, d11, #1   @Extract pixels (1-8) of row7
    463 
    464 
    465 
    466     vext.8          d13, d12, d13, #1   @Extract pixels (1-8) of row4
    467 
    468 
    469 
    470     vext.8          d15, d14, d15, #1   @Extract pixels (1-8) of row8
    471 
    472     vext.8          d17, d16, d17, #1   @Extract pixels (1-8) of row9
    473 
    474 
    475     @interpolation in x direction
    476 
    477     vaddl.u8        q0, d0, d1          @operate row1
    478 
    479     vaddl.u8        q1, d2, d3          @operate row5
    480 
    481     vaddl.u8        q2, d4, d5          @operate row2
    482 
    483     vaddl.u8        q3, d6, d7          @operate row6
    484 
    485     vaddl.u8        q4, d8, d9          @operate row3
    486 
    487     vaddl.u8        q5, d10, d11        @operate row7
    488 
    489     vaddl.u8        q6, d12, d13        @operate row4
    490 
    491     vaddl.u8        q7, d14, d15        @operate row8
    492 
    493     vaddl.u8        q8, d16, d17        @operate row9
    494 
    495     @interpolation in y direction
    496 
    497     add             r14, r0, r3, lsl #2
    498 
    499 
    500 
    501     vadd.u16        q9, q0, q2          @operate row1 and row2
    502 
    503     vadd.u16        q13, q1, q3         @operate row5 and row6
    504 
    505     vadd.u16        q10, q2, q4         @operate row2 and row3
    506 
    507     vadd.u16        q14, q3, q5         @operate row6 and row7
    508 
    509     vrshrn.u16      d18, q9, #2         @row1
    510 
    511     vrshrn.u16      d26, q13, #2        @row5
    512 
    513     vrshrn.u16      d20, q10, #2        @row2
    514 
    515     vrshrn.u16      d28, q14, #2        @row6
    516 
    517     vadd.u16        q11, q4, q6         @operate row3 and row4
    518 
    519     vst1.8          d18, [r0], r3       @store row1
    520 
    521     vadd.u16        q15, q5, q7         @operate row7 and row8
    522 
    523     vst1.8          d26, [r14], r3      @store row5
    524 
    525     vadd.u16        q12, q6, q1         @operate row4 and row5
    526 
    527     vst1.8          d20, [r0], r3       @store row2
    528 
    529     vadd.u16        q7, q7, q8          @operate row8 and row9
    530 
    531     vst1.8          d28, [r14], r3      @store row6
    532 
    533 
    534 
    535     vrshrn.u16      d22, q11, #2        @row3
    536 
    537     vrshrn.u16      d30, q15, #2        @row7
    538 
    539     vrshrn.u16      d24, q12, #2        @row4
    540 
    541     vrshrn.u16      d14, q7, #2         @row8
    542 
    543 
    544     vst1.8          d22, [r0], r3       @store row3
    545     vst1.8          d30, [r14], r3      @store row7
    546     vst1.8          d24, [r0], r3       @store row4
    547     vst1.8          d14, [r14], r3      @store row8
    548 
    549 
    550 
    551     ldmfd           sp!, {pc}
    552 
    553 
    554 
    555 
    556 
    557 @/*
    558 @//---------------------------------------------------------------------------
    559 @// Function Name      :   impeg2_mc_fullx_fully_8x8_a9q()
    560 @//
    561 @// Detail Description : This function pastes the reference block in the
    562 @//                      current frame buffer.This function is called for
    563 @//                      blocks that are not coded and have motion vectors
    564 @//                      with a half pel resolutionand ..
    565 @//
    566 @// Inputs             : r0 - out    : Current Block Pointer
    567 @//                      r1 - ref     : Refernce Block Pointer
    568 @//                      r2 - ref_wid   : Refernce Block Width
    569 @//                      r3 - out_wid   ; Current Block Width
    570 @//
    571 @// Registers Used     : r12, r14, d0-d3
    572 
    573 @//
    574 @// Stack Usage        : 8 bytes
    575 @//
    576 @// Outputs            : The Motion Compensated Block
    577 @//
    578 @// Return Data        : None
    579 @//
    580 @// Programming Note   : <program limitation>
    581 @//-----------------------------------------------------------------------------
    582 @*/
    583 
    584 
    585         .global impeg2_mc_fullx_fully_8x8_a9q
    586 impeg2_mc_fullx_fully_8x8_a9q:
    587 
    588 
    589     stmfd           sp!, {r12, lr}
    590 
    591     add             r14, r1, r2, lsl #2
    592 
    593     add             r12, r0, r3, lsl #2
    594 
    595 
    596     vld1.8          d0, [r1], r2        @load row1
    597 
    598     vld1.8          d1, [r14], r2       @load row4
    599 
    600     vld1.8          d2, [r1], r2        @load row2
    601 
    602     vld1.8          d3, [r14], r2       @load row5
    603 
    604 
    605     vst1.8          d0, [r0], r3        @store row1
    606 
    607     vst1.8          d1, [r12], r3       @store row4
    608 
    609     vst1.8          d2, [r0], r3        @store row2
    610 
    611     vst1.8          d3, [r12], r3       @store row5
    612 
    613 
    614     vld1.8          d0, [r1], r2        @load row3
    615 
    616     vld1.8          d1, [r14], r2       @load row6
    617 
    618     vld1.8          d2, [r1], r2        @load row4
    619 
    620     vld1.8          d3, [r14], r2       @load row8
    621 
    622 
    623     vst1.8          d0, [r0], r3        @store row3
    624 
    625     vst1.8          d1, [r12], r3       @store row6
    626 
    627     vst1.8          d2, [r0], r3        @store row4
    628 
    629     vst1.8          d3, [r12], r3       @store row8
    630 
    631 
    632     ldmfd           sp!, {r12, pc}
    633 
    634 
    635 
    636 
    637 
    638 @/*
    639 @//---------------------------------------------------------------------------
    640 @// Function Name      :   impeg2_interpolate_a9q()
    641 @//
    642 @// Detail Description : interpolates two buffers and adds pred
    643 @//
    644 @// Inputs             : r0 - pointer to src1
    645 @//                      r1 - pointer to src2
    646 @//                      r2 - dest buf
    647 @//                      r3 - dst stride
    648 @// Registers Used     : r4, r5, r7, r14, d0-d15
    649 @//
    650 @// Stack Usage        : 20 bytes
    651 @//
    652 @// Outputs            : The Motion Compensated Block
    653 @//
    654 @// Return Data        : None
    655 @//
    656 @// Programming Note   : <program limitation>
    657 @//-----------------------------------------------------------------------------
    658 @*/
    659 
    660 
    661         .global impeg2_interpolate_a9q
    662 
    663 
    664 impeg2_interpolate_a9q:
    665 
    666     stmfd           r13!, {r4, r5, r7, r12, r14}
    667 
    668     ldr             r4, [r0, #0]        @ptr_y src1
    669 
    670     ldr             r5, [r1, #0]        @ptr_y src2
    671 
    672     ldr             r7, [r2, #0]        @ptr_y dst buf
    673 
    674     mov             r12, #4             @counter for number of blocks
    675 
    676 
    677 interp_lumablocks_stride:
    678 
    679     vld1.8          {d0, d1}, [r4]!     @row1 src1
    680 
    681     vld1.8          {d2, d3}, [r4]!     @row2 src1
    682 
    683     vld1.8          {d4, d5}, [r4]!     @row3 src1
    684 
    685     vld1.8          {d6, d7}, [r4]!     @row4 src1
    686 
    687 
    688     vld1.8          {d8, d9}, [r5]!     @row1 src2
    689 
    690     vld1.8          {d10, d11}, [r5]!   @row2 src2
    691 
    692     vld1.8          {d12, d13}, [r5]!   @row3 src2
    693 
    694     vld1.8          {d14, d15}, [r5]!   @row4 src2
    695 
    696 
    697 
    698 
    699     vrhadd.u8       q0, q0, q4          @operate on row1
    700 
    701     vrhadd.u8       q1, q1, q5          @operate on row2
    702 
    703     vrhadd.u8       q2, q2, q6          @operate on row3
    704 
    705     vrhadd.u8       q3, q3, q7          @operate on row4
    706 
    707 
    708 
    709     vst1.8          {d0, d1}, [r7], r3  @row1
    710 
    711     vst1.8          {d2, d3}, [r7], r3  @row2
    712 
    713     vst1.8          {d4, d5}, [r7], r3  @row3
    714 
    715     vst1.8          {d6, d7}, [r7], r3  @row4
    716 
    717     subs            r12, r12, #1
    718 
    719     bne             interp_lumablocks_stride
    720 
    721 
    722     mov             r3, r3, lsr #1      @stride >> 1
    723 
    724     ldr             r4, [r0, #4]        @ptr_u src1
    725 
    726     ldr             r5, [r1, #4]        @ptr_u src2
    727 
    728     ldr             r7 , [r2, #4]       @ptr_u dst buf
    729 
    730     mov             r12, #2             @counter for number of blocks
    731 
    732 
    733 
    734 @chroma blocks
    735 
    736 interp_chromablocks_stride:
    737 
    738     vld1.8          {d0, d1}, [r4]!     @row1 & 2 src1
    739 
    740     vld1.8          {d2, d3}, [r4]!     @row3 & 4 src1
    741 
    742     vld1.8          {d4, d5}, [r4]!     @row5 & 6 src1
    743 
    744     vld1.8          {d6, d7}, [r4]!     @row7 & 8 src1
    745 
    746 
    747     vld1.8          {d8, d9}, [r5]!     @row1 & 2 src2
    748 
    749     vld1.8          {d10, d11}, [r5]!   @row3 & 4 src2
    750 
    751     vld1.8          {d12, d13}, [r5]!   @row5 & 6 src2
    752 
    753     vld1.8          {d14, d15}, [r5]!   @row7 & 8 src2
    754 
    755 
    756 
    757 
    758     vrhadd.u8       q0, q0, q4          @operate on row1 & 2
    759 
    760     vrhadd.u8       q1, q1, q5          @operate on row3 & 4
    761 
    762     vrhadd.u8       q2, q2, q6          @operate on row5 & 6
    763 
    764     vrhadd.u8       q3, q3, q7          @operate on row7 & 8
    765 
    766 
    767     vst1.8          {d0}, [r7], r3      @row1
    768 
    769     vst1.8          {d1}, [r7], r3      @row2
    770 
    771     vst1.8          {d2}, [r7], r3      @row3
    772 
    773     vst1.8          {d3}, [r7], r3      @row4
    774 
    775     vst1.8          {d4}, [r7], r3      @row5
    776 
    777     vst1.8          {d5}, [r7], r3      @row6
    778 
    779     vst1.8          {d6}, [r7], r3      @row7
    780 
    781     vst1.8          {d7}, [r7], r3      @row8
    782 
    783 
    784 
    785     ldr             r4, [r0, #8]        @ptr_v src1
    786 
    787     ldr             r5, [r1, #8]        @ptr_v src2
    788 
    789     ldr             r7, [r2, #8]        @ptr_v dst buf
    790 
    791     subs            r12, r12, #1
    792 
    793     bne             interp_chromablocks_stride
    794 
    795 
    796     ldmfd           r13!, {r4, r5, r7, r12, pc}
    797 
    798 
    799 
    800 
    801 
    802