Home | History | Annotate | Download | only in arm
      1 @/******************************************************************************
      2 @ *
      3 @ * Copyright (C) 2015 The Android Open Source Project
      4 @ *
      5 @ * Licensed under the Apache License, Version 2.0 (the "License");
      6 @ * you may not use this file except in compliance with the License.
      7 @ * You may obtain a copy of the License at:
      8 @ *
      9 @ * http://www.apache.org/licenses/LICENSE-2.0
     10 @ *
     11 @ * Unless required by applicable law or agreed to in writing, software
     12 @ * distributed under the License is distributed on an "AS IS" BASIS,
     13 @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @ * See the License for the specific language governing permissions and
     15 @ * limitations under the License.
     16 @ *
     17 @ *****************************************************************************
     18 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 @*/
     20 
     21 @/*
     22 @//----------------------------------------------------------------------------
     23 @// File Name            : impeg2_format_conv.s
     24 @//
     25 @// Description          : This file has the format conversion implementations
     26 @//                        (YUV420P to YUV420SP) for the MPEG2 decoder on neon platform.
     27 @//
     28 @// Reference Document   :
     29 @//
     30 @// Revision History     :
     31 @//      Date            Author                  Detail Description
     32 @//   ------------    ----------------    ----------------------------------
     33 @//   Jul 07, 2008     Naveen Kumar T                Created
     34 @//
     35 @//-------------------------------------------------------------------------
     36 @*/
     37 
     38 @/*
     39 @// ----------------------------------------------------------------------------
     40 @// Include Files
     41 @// ----------------------------------------------------------------------------
     42 @*/
     43 .text                   @// Place the following code in the text (code) section
     44 .p2align 2              @// Align to a 2^2 = 4 byte boundary
     45 .equ log2_16 ,  4       @// log2(16); not referenced by the code visible in this file
     46 .equ log2_2  ,  1       @// log2(2);  not referenced by the code visible in this file
     47 @/*
     48 @// ----------------------------------------------------------------------------
     49 @// Struct/Union Types and Define
     50 @// ----------------------------------------------------------------------------
     51 @*/
     52 
     53 @/*
     54 @// ----------------------------------------------------------------------------
     55 @// Static Global Data section variables
     56 @// ----------------------------------------------------------------------------
     57 @*/
     58 @//--------------------------- NONE --------------------------------------------
     59 
     60 @/*
     61 @// ----------------------------------------------------------------------------
     62 @// Static Prototype Functions
     63 @// ----------------------------------------------------------------------------
     64 @*/
     65 @// -------------------------- NONE --------------------------------------------
     66 
     67 @/*
     68 @// ----------------------------------------------------------------------------
     69 @// Exported functions
     70 @// ----------------------------------------------------------------------------
     71 @*/
     72 
     73 @/*****************************************************************************
     74 @*                                                                            *
     75 @*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q()                      *
     76 @*                                                                            *
     77 @*  Description      : This function conversts the image from YUV420P color   *
     78 @*                     space to 420SP color space(UV interleaved).        *
     79 @*                                                                            *
     80 @*  Arguments        : R0           pu1_y                                     *
     81 @*                     R1           pu1_u                                     *
     82 @*                     R2           pu1_v                                     *
     83 @*                     R3           pu1_dest_y                                *
     84 @*                     [R13 #40]    pu1_dest_uv                               *
     85 @*                     [R13 #44]    u2_height                                 *
     86 @*                     [R13 #48]    u2_width                                  *
     87 @*                     [R13 #52]    u2_stridey                                *
     88 @*                     [R13 #56]    u2_strideu                                *
     89 @*                     [R13 #60]    u2_stridev                                *
     90 @*                     [R13 #64]    u2_dest_stride_y                          *
     91 @*                     [R13 #68]    u2_dest_stride_uv                         *
     92 @*                     [R13 #72]    convert_uv_only                           *
     93 @*                                                                            *
     94 @*  Values Returned  : None                                                   *
     95 @*                                                                            *
     96 @*  Register Usage   : R0 - R8, Q0                                            *
     97 @*                                                                            *
     98 @*  Stack Usage      : 24 Bytes                                               *
     99 @*                                                                            *
    100 @*  Interruptibility : Interruptible                                          *
    101 @*                                                                            *
    102 @*  Known Limitations                                                         *
    103 @*       Assumptions: Image Width:     Assumed to be multiple of 16 and       *
    104 @*                     greater than or equal to 16                *
    105 @*                     Image Height:    Assumed to be even.                   *
    106 @*                                                                            *
    107 @*  Revision History :                                                        *
    108 @*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
    109 @*         07 06 2010   Varshita        Draft                                 *
    110 @*         07 06 2010   Naveen Kr T     Completed                             *
    111 @*                                                                            *
    112 @*****************************************************************************/
    113                 .global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q
    114 impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q:
    115 
    116     @// push the registers on the stack
    117     stmfd           sp!, {r4-r8, lr}
    118 
    119     ldr             r4, [sp, #56]       @// Load convert_uv_only
    120 
    121     cmp             r4, #1
    122     beq             yuv420sp_uv_chroma
    123     @/* Do the preprocessing before the main loops start */
    124     @// Load the parameters from stack
    125     ldr             r4, [sp, #28]       @// Load u2_height from stack
    126 
    127     ldr             r5, [sp, #32]       @// Load u2_width from stack
    128 
    129     ldr             r7, [sp, #36]       @// Load u2_stridey from stack
    130 
    131     ldr             r8, [sp, #48]       @// Load u2_dest_stride_y from stack
    132 
    133     sub             r7, r7, r5          @// Source increment
    134 
    135     sub             r8, r8, r5          @// Destination increment
    136 
    137 
    138 yuv420sp_uv_row_loop_y:
    139     mov             r6, r5
    140 
    141 yuv420sp_uv_col_loop_y:
    142     pld             [r0, #128]
    143     vld1.8          {q0}, [r0]!
    144     vst1.8          {q0}, [r3]!
    145     sub             r6, r6, #16
    146     cmp             r6, #15
    147     bgt             yuv420sp_uv_col_loop_y
    148 
    149     cmp             r6, #0
    150     beq             yuv420sp_uv_row_loop_end_y
    151     @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
    152     @//Ex if width is 162, above loop will process 160 pixels. And
    153     @//Both source and destination will point to 146th pixel and then 16 bytes will be read
    154     @// and written using VLD1 and VST1
    155     rsb             r6, r6, #16
    156     sub             r0, r0, r6
    157     sub             r3, r3, r6
    158 
    159     vld1.8          {q0}, [r0]!
    160     vst1.8          {q0}, [r3]!
    161 
    162 yuv420sp_uv_row_loop_end_y:
    163     add             r0, r0, r7
    164     add             r3, r3, r8
    165     subs            r4, r4, #1
    166     bgt             yuv420sp_uv_row_loop_y
    167 
    168 yuv420sp_uv_chroma:
    169 
    170     ldr             r3, [sp, #24]       @// Load pu1_dest_uv from stack
    171 
    172     ldr             r4, [sp, #28]       @// Load u2_height from stack
    173 
    174     ldr             r5, [sp, #32]       @// Load u2_width from stack
    175 
    176 
    177     ldr             r7, [sp, #40]       @// Load u2_strideu from stack
    178 
    179     ldr             r8, [sp, #52]       @// Load u2_dest_stride_uv from stack
    180 
    181     sub             r7, r7, r5, lsr #1  @// Source increment
    182 
    183     sub             r8, r8, r5          @// Destination increment
    184 
    185     mov             r5, r5, lsr #1
    186     mov             r4, r4, lsr #1
    187     ldr             r3, [sp, #24]       @// Load pu1_dest_uv from stack
    188 yuv420sp_uv_row_loop_uv:
    189     mov             r6, r5
    190 
    191 
    192 yuv420sp_uv_col_loop_uv:
    193     pld             [r1, #128]
    194     pld             [r2, #128]
    195     vld1.8          d0, [r1]!
    196     vld1.8          d1, [r2]!
    197     vst2.8          {d0, d1}, [r3]!
    198     sub             r6, r6, #8
    199     cmp             r6, #7
    200     bgt             yuv420sp_uv_col_loop_uv
    201 
    202     cmp             r6, #0
    203     beq             yuv420sp_uv_row_loop_end_uv
    204     @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
    205     @//Ex if width is 162, above loop will process 160 pixels. And
    206     @//Both source and destination will point to 146th pixel and then 16 bytes will be read
    207     @// and written using VLD1 and VST1
    208     rsb             r6, r6, #8
    209     sub             r1, r1, r6
    210     sub             r2, r2, r6
    211     sub             r3, r3, r6, lsl #1
    212 
    213     vld1.8          d0, [r1]!
    214     vld1.8          d1, [r2]!
    215     vst2.8          {d0, d1}, [r3]!
    216 
    217 yuv420sp_uv_row_loop_end_uv:
    218     add             r1, r1, r7
    219     add             r2, r2, r7
    220     add             r3, r3, r8
    221     subs            r4, r4, #1
    222     bgt             yuv420sp_uv_row_loop_uv
    223     @//POP THE REGISTERS
    224     ldmfd           sp!, {r4-r8, pc}
    225 
    226 
    227 
    228 
    229 
    230 @/*****************************************************************************
    231 @*                                                                            *
    232 @*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q()                      *
    233 @*                                                                            *
    234 @*  Description      : This function conversts the image from YUV420P color   *
    235 @*                     space to 420SP color space(VU interleaved).        *
    236 @*             This function is similar to above function         *
    237 @*             IMP4D_CXA8_YUV420toYUV420SP_VU with a difference in   *
    238 @*             VLD1.8 for chroma - order of registers is different    *
    239 @*                                                                            *
    240 @*  Arguments        : R0           pu1_y                                     *
    241 @*                     R1           pu1_u                                     *
    242 @*                     R2           pu1_v                                     *
    243 @*                     R3           pu1_dest_y                                *
    244 @*                     [R13 #40]    pu1_dest_uv                               *
    245 @*                     [R13 #44]    u2_height                                 *
    246 @*                     [R13 #48]    u2_width                                  *
    247 @*                     [R13 #52]    u2_stridey                                *
    248 @*                     [R13 #56]    u2_strideu                                *
    249 @*                     [R13 #60]    u2_stridev                                *
    250 @*                     [R13 #64]    u2_dest_stride_y                          *
    251 @*                     [R13 #68]    u2_dest_stride_uv                         *
    252 @*                     [R13 #72]    convert_uv_only                           *
    253 @*                                                                            *
    254 @*  Values Returned  : None                                                   *
    255 @*                                                                            *
    256 @*  Register Usage   : R0 - R8, Q0                                            *
    257 @*                                                                            *
    258 @*  Stack Usage      : 24 Bytes                                               *
    259 @*                                                                            *
    260 @*  Interruptibility : Interruptible                                          *
    261 @*                                                                            *
    262 @*  Known Limitations                                                         *
    263 @*       Assumptions: Image Width:     Assumed to be multiple of 16 and       *
    264 @*                     greater than or equal to 16                *
    265 @*                     Image Height:    Assumed to be even.                   *
    266 @*                                                                            *
    267 @*  Revision History :                                                        *
    268 @*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
    269 @*         07 06 2010   Varshita        Draft                                 *
    270 @*         07 06 2010   Naveen Kr T     Completed                             *
    271 @*                                                                            *
    272 @*****************************************************************************/
    273 
    274                 .global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q
    275 impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q:
    276 
    277     @// push the registers on the stack
    278     stmfd           sp!, {r4-r8, lr}
    279 
    280     ldr             r4, [sp, #56]       @// Load convert_uv_only
    281 
    282     cmp             r4, #1
    283     beq             yuv420sp_vu_chroma
    284 
    285     @/* Do the preprocessing before the main loops start */
    286     @// Load the parameters from stack
    287     ldr             r4, [sp, #28]       @// Load u2_height from stack
    288 
    289     ldr             r5, [sp, #32]       @// Load u2_width from stack
    290 
    291     ldr             r7, [sp, #36]       @// Load u2_stridey from stack
    292 
    293     ldr             r8, [sp, #48]       @// Load u2_dest_stride_y from stack
    294 
    295     sub             r7, r7, r5          @// Source increment
    296 
    297     sub             r8, r8, r5          @// Destination increment
    298 
    299 
    300 yuv420sp_vu_row_loop_y:
    301     mov             r6, r5
    302 
    303 yuv420sp_vu_col_loop_y:
    304     pld             [r0, #128]
    305     vld1.8          {q0}, [r0]!
    306     vst1.8          {q0}, [r3]!
    307     sub             r6, r6, #16
    308     cmp             r6, #15
    309     bgt             yuv420sp_vu_col_loop_y
    310 
    311     cmp             r6, #0
    312     beq             yuv420sp_vu_row_loop_end_y
    313     @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
    314     @//Ex if width is 162, above loop will process 160 pixels. And
    315     @//Both source and destination will point to 146th pixel and then 16 bytes will be read
    316     @// and written using VLD1 and VST1
    317     rsb             r6, r6, #16
    318     sub             r0, r0, r6
    319     sub             r3, r3, r6
    320 
    321     vld1.8          {q0}, [r0]!
    322     vst1.8          {q0}, [r3]!
    323 
    324 yuv420sp_vu_row_loop_end_y:
    325     add             r0, r0, r7
    326     add             r3, r3, r8
    327     subs            r4, r4, #1
    328     bgt             yuv420sp_vu_row_loop_y
    329 
    330 yuv420sp_vu_chroma:
    331 
    332     ldr             r3, [sp, #24]       @// Load pu1_dest_uv from stack
    333 
    334     ldr             r4, [sp, #28]       @// Load u2_height from stack
    335 
    336     ldr             r5, [sp, #32]       @// Load u2_width from stack
    337 
    338 
    339     ldr             r7, [sp, #40]       @// Load u2_strideu from stack
    340 
    341     ldr             r8, [sp, #52]       @// Load u2_dest_stride_uv from stack
    342 
    343     sub             r7, r7, r5, lsr #1  @// Source increment
    344 
    345     sub             r8, r8, r5          @// Destination increment
    346 
    347     mov             r5, r5, lsr #1
    348     mov             r4, r4, lsr #1
    349     ldr             r3, [sp, #24]       @// Load pu1_dest_uv from stack
    350 yuv420sp_vu_row_loop_uv:
    351     mov             r6, r5
    352 
    353 
    354 yuv420sp_vu_col_loop_uv:
    355     pld             [r1, #128]
    356     pld             [r2, #128]
    357     vld1.8          d1, [r1]!
    358     vld1.8          d0, [r2]!
    359     vst2.8          {d0, d1}, [r3]!
    360     sub             r6, r6, #8
    361     cmp             r6, #7
    362     bgt             yuv420sp_vu_col_loop_uv
    363 
    364     cmp             r6, #0
    365     beq             yuv420sp_vu_row_loop_end_uv
    366     @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
    367     @//Ex if width is 162, above loop will process 160 pixels. And
    368     @//Both source and destination will point to 146th pixel and then 16 bytes will be read
    369     @// and written using VLD1 and VST1
    370     rsb             r6, r6, #8
    371     sub             r1, r1, r6
    372     sub             r2, r2, r6
    373     sub             r3, r3, r6, lsl #1
    374 
    375     vld1.8          d1, [r1]!
    376     vld1.8          d0, [r2]!
    377     vst2.8          {d0, d1}, [r3]!
    378 
    379 yuv420sp_vu_row_loop_end_uv:
    380     add             r1, r1, r7
    381     add             r2, r2, r7
    382     add             r3, r3, r8
    383     subs            r4, r4, #1
    384     bgt             yuv420sp_vu_row_loop_uv
    385     @//POP THE REGISTERS
    386     ldmfd           sp!, {r4-r8, pc}
    387 
    388 
    389 
    390 
    391 
    392