Home | History | Annotate | Download | only in arm
      1 @/******************************************************************************
      2 @ *
      3 @ * Copyright (C) 2015 The Android Open Source Project
      4 @ *
      5 @ * Licensed under the Apache License, Version 2.0 (the "License");
      6 @ * you may not use this file except in compliance with the License.
      7 @ * You may obtain a copy of the License at:
      8 @ *
      9 @ * http://www.apache.org/licenses/LICENSE-2.0
     10 @ *
     11 @ * Unless required by applicable law or agreed to in writing, software
     12 @ * distributed under the License is distributed on an "AS IS" BASIS,
     13 @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @ * See the License for the specific language governing permissions and
     15 @ * limitations under the License.
     16 @ *
     17 @ *****************************************************************************
     18 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 @*/
     20 
     21 .text
     22 .p2align 2
     23 
     24 @/*****************************************************************************
     25 @*                                                                            *
     26 @*  Function Name    : ih264e_fmt_conv_420p_to_420sp_a9q()                    *
     27 @*                                                                            *
     28 @*  Description      : This function converts the image from YUV420P color    *
     29 @*                     space to 420SP color space (UV interleaved).           *
     30 @*                                                                            *
     31 @*  Arguments        : R0           pu1_y                                     *
     32 @*                     R1           pu1_u                                     *
     33 @*                     R2           pu1_v                                     *
     34 @*                     R3           pu1_dest_y                                *
     35 @*                     [R13 #40]    pu1_dest_uv                               *
     36 @*                     [R13 #44]    u2_height                                 *
     37 @*                     [R13 #48]    u2_width                                  *
     38 @*                     [R13 #52]    u2_stridey                                *
     39 @*                     [R13 #56]    u2_strideu                                *
     40 @*                     [R13 #60]    u2_stridev                                *
     41 @*                     [R13 #64]    u2_dest_stride_y                          *
     42 @*                     [R13 #68]    u2_dest_stride_uv                         *
     43 @*                     [R13 #72]    convert_uv_only                           *
     44 @*                                                                            *
     45 @*  Values Returned  : None                                                   *
     46 @*                                                                            *
     47 @*  Register Usage   : R0 - R14                                               *
     48 @*                                                                            *
     49 @*  Stack Usage      : 40 Bytes                                               *
     50 @*                                                                            *
     51 @*  Interruptibility : Interruptible                                          *
     52 @*                                                                            *
     53 @*  Known Limitations                                                         *
     54 @*       Assumptions: Image Width:     Assumed to be multiple of 16 and       *
     55 @*                     greater than or equal to 16                *
     56 @*                     Image Height:    Assumed to be even.                   *
     57 @*                                                                            *
     58 @*  Revision History :                                                        *
     59 @*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
     60 @*         07 06 2010   Varshita        Draft                                 *
     61 @*         07 06 2010   Naveen Kr T     Completed                             *
     62 @*                                                                            *
     63 @*****************************************************************************/
     .global ih264e_fmt_conv_420p_to_420sp_a9q

@// ---------------------------------------------------------------------------
@// ih264e_fmt_conv_420p_to_420sp_a9q
@//
@// Copies the Y plane (skipped when convert_uv_only == 1) and then interleaves
@// the separate U and V planes into one UV plane, U byte first.
@//
@// Register arguments : r0 = pu1_y, r1 = pu1_u, r2 = pu1_v, r3 = pu1_dest_y
@// Stack arguments, offsets valid after the 40-byte register push below:
@//     [sp, #40] pu1_dest_uv        [sp, #44] u2_height
@//     [sp, #48] u2_width           [sp, #52] u2_stridey
@//     [sp, #56] u2_strideu         [sp, #60] u2_stridev
@//     [sp, #64] u2_dest_stride_y   [sp, #68] u2_dest_stride_uv
@//     [sp, #72] convert_uv_only
@//
@// Uses r0-r9 and d0/d1; r4-r12 are saved on entry and restored on exit.
@//
@// Both column loops move one full vector before testing the remaining count,
@// and the tail steps the pointers backwards to re-copy an overlapping vector.
@// The luma width must therefore be at least 16 (chroma width at least 8);
@// with a narrower row the tail would step back past the start of the row.
@// ---------------------------------------------------------------------------
ih264e_fmt_conv_420p_to_420sp_a9q:

    stmfd         sp!, {r4-r12, lr}     @// 10 registers = 40 bytes pushed

    ldr           r4, [sp, #72]         @// r4 = convert_uv_only
    cmp           r4, #1
    beq           yuv420sp_uv_chroma    @// exactly 1 => skip the luma copy

    @// ---------------------------- luma copy ----------------------------
    ldr           r4, [sp, #44]         @// r4 = u2_height (rows left)
    ldr           r5, [sp, #48]         @// r5 = u2_width
    ldr           r7, [sp, #52]         @// r7 = u2_stridey
    ldr           r8, [sp, #64]         @// r8 = u2_dest_stride_y
    sub           r7, r7, r5            @// r7 = source gap: end of row -> next row
    sub           r8, r8, r5            @// r8 = destination gap

yuv420sp_uv_row_loop_y:
    mov           r6, r5                @// r6 = pixels left in this row

yuv420sp_uv_col_loop_y:
    pld           [r0, #128]
    vld1.8        {d0, d1}, [r0]!       @// copy 16 luma bytes
    vst1.8        {d0, d1}, [r3]!
    sub           r6, r6, #16
    cmp           r6, #15
    bgt           yuv420sp_uv_col_loop_y @// another full vector remains

    cmp           r6, #0
    beq           yuv420sp_uv_row_loop_end_y
    @// 1..15 pixels remain. Step both pointers back by (16 - remainder) so
    @// that one more 16-byte copy ends exactly on the last pixel of the row.
    @// Ex: width 162 -> the loop covers 160 pixels, the pointers move back
    @// to pixel 146 and pixels 146..161 are copied (146..159 a second time).
    rsb           r6, r6, #16
    sub           r0, r0, r6
    sub           r3, r3, r6

    vld1.8        {d0, d1}, [r0]!
    vst1.8        {d0, d1}, [r3]!

yuv420sp_uv_row_loop_end_y:
    add           r0, r0, r7            @// next source row
    add           r3, r3, r8            @// next destination row
    subs          r4, r4, #1
    bgt           yuv420sp_uv_row_loop_y

    @// ------------------------ chroma interleave ------------------------
yuv420sp_uv_chroma:
    ldr           r3, [sp, #40]         @// r3 = pu1_dest_uv
    ldr           r4, [sp, #44]         @// r4 = u2_height
    ldr           r5, [sp, #48]         @// r5 = u2_width
    ldr           r7, [sp, #56]         @// r7 = u2_strideu
    ldr           r9, [sp, #60]         @// r9 = u2_stridev
    ldr           r8, [sp, #68]         @// r8 = u2_dest_stride_uv

    mov           r5, r5, lsr #1        @// r5 = chroma samples per row
    mov           r4, r4, lsr #1        @// r4 = chroma rows
    sub           r7, r7, r5            @// r7 = U source gap
    sub           r9, r9, r5            @// r9 = V source gap (V has its own stride)
    sub           r8, r8, r5, lsl #1    @// r8 = destination gap; a row is 2 bytes per sample

yuv420sp_uv_row_loop_uv:
    mov           r6, r5                @// r6 = chroma samples left in this row

yuv420sp_uv_col_loop_uv:
    pld           [r1, #128]
    pld           [r2, #128]
    vld1.8        d0, [r1]!             @// 8 U samples
    vld1.8        d1, [r2]!             @// 8 V samples
    vst2.8        {d0, d1}, [r3]!       @// 16 bytes: U0 V0 U1 V1 ...
    sub           r6, r6, #8
    cmp           r6, #7
    bgt           yuv420sp_uv_col_loop_uv

    cmp           r6, #0
    beq           yuv420sp_uv_row_loop_end_uv
    @// 1..7 samples remain. Step U and V back by (8 - remainder) samples and
    @// the interleaved destination back by twice that many bytes, then redo
    @// one overlapping group of 8 U/V pairs that ends on the last sample.
    rsb           r6, r6, #8
    sub           r1, r1, r6
    sub           r2, r2, r6
    sub           r3, r3, r6, lsl #1

    vld1.8        d0, [r1]!
    vld1.8        d1, [r2]!
    vst2.8        {d0, d1}, [r3]!

yuv420sp_uv_row_loop_end_uv:
    add           r1, r1, r7            @// next U row
    add           r2, r2, r9            @// next V row
    add           r3, r3, r8            @// next UV destination row
    subs          r4, r4, #1
    bgt           yuv420sp_uv_row_loop_uv

    ldmfd         sp!, {r4-r12, pc}     @// restore registers and return
    172 
    173 
    174 
    175 
    176 
    177 @ /**
    178 @ *******************************************************************************
    179 @ *
    180 @ * @brief ih264e_fmt_conv_422i_to_420sp_a9q
    181 @ *     Function used from format conversion or frame copy
    182 @ *
    183 @ *
    184 @ *
    185 @ *Inputs             : r0 - pu1_y            -   UWORD8 pointer to y plane.
    186 @ *                     r1 - pu1_u            -   UWORD8 pointer to u plane.
    187 @ *                     r2 - pu1_v            -   UWORD8 pointer to v plane.
    188 @ *                     r3 - pu2_yuv422i      -   UWORD16 pointer to yuv422iimage.
    189 @ *             stack + 40 - u4_width         -   Width of the Y plane.
    190 @ *                     44 - u4_height        -   Height of the Y plane.
    191 @ *                     48 - u4_stride_y      -   Stride in pixels of Y plane.
    192 @ *                     52 - u4_stride_u      -   Stride in pixels of U plane.
    193 @ *                     56 - u4_stride_v      -   Stride in pixels of V plane.
    194 @ *                     60 - u4_stride_yuv422i-   Stride in pixels of yuv422i image.
    195 @ *
    196 @ * @par   Description
    197 @ * Function used from copying or converting a reference frame to display buffer
    198 @ * in non shared mode
    199 @ *
    200 @ * @param[in] pu1_y_dst
    201 @ *   Output Y pointer
    202 @ *
    203 @ * @param[in] pu1_u_dst
    204 @ *   Output U/UV pointer ( UV is interleaved in the same format as that of input)
    205 @ *
    206 @ * @param[in] pu1_v_dst
    207 @ *   Output V pointer ( used in 420P output case)
    208 @ *
    209 @ * @param[in] u4_dst_y_strd
    210 @ *   Stride of destination Y buffer
    211 @ *
    212 @ * @param[in] u4_dst_u_strd
    213 @ *   Stride of destination  U/V buffer
    214 @ *
    215 @ *
    216 @ * @param[in] blocking
    217 @ *   To indicate whether format conversion should wait till frame is reconstructed
    218 @ *   and then return after complete copy is done. To be set to 1 when called at the
    219 @ *   end of frame processing and set to 0 when called between frame processing modules
    220 @ *   in order to utilize available MCPS
    221 @ *
    222 @ * @returns Error from IH264E_ERROR_T
    223 @ *
    224 @ * @remarks
    225 @ * Assumes that the stride of U and V buffers are same.
    226 @ * This is correct in most cases
    227 @ * If a case comes where this is not true we need to modify the fmt conversion functions called inside also
    228 @ * Since we read 4 pixels at a time the width should be aligned to 4
    229 @ * In assembly width should be aligned to 16 and height to 2.
    230 @ *
    231 @ *
    232 @ * Revision History :
    233 @ *         DD MM YYYY   Author(s)              Changes (Describe the changes made)
    234 @ *         07 06 2010   Harinarayanan K K       Adapeted to 422p
    235 @ *
    236 @ *******************************************************************************
    237 @ */
    238 
    239 @//`
    240 @*/
    .global ih264e_fmt_conv_422i_to_420sp_a9q

@// ---------------------------------------------------------------------------
@// ih264e_fmt_conv_422i_to_420sp_a9q
@//
@// Converts two interleaved 4:2:2 rows per pass into two luma rows plus one
@// interleaved chroma row. Within every 4-byte input group, bytes 1 and 3 are
@// stored as luma; bytes 0 and 2 are averaged (with rounding) against the row
@// below and stored as a chroma pair through r1.
@//
@// Register arguments : r0 = pu1_y, r1 = pu1_u, r2 = pu1_v (never referenced),
@//                      r3 = pu2_yuv422i
@// Stack arguments, offsets valid after the 40-byte register push below:
@//     [sp, #40] u4_width      [sp, #44] u4_height    [sp, #48] u4_stride_y
@//     [sp, #52] u4_stride_u   [sp, #56] u4_stride_v (never read)
@//     [sp, #60] u4_stride_yuv422i (in pixels, 2 bytes per pixel)
@//
@// Working registers inside the loops:
@//     r0 / r6  luma write pointers, upper / lower row of the pair
@//     r3 / r8  422i read pointers,  upper / lower row of the pair
@//     r1       chroma write pointer
@//     r4       luma step from the end of a row to the start of the next pair
@//     r5       422i step (bytes) from the end of a row to the next pair
@//     r9       chroma step from the end of a row to the next row
@//     r7       u4_width, r11 = row pairs left, r12 = pixels left in the row
@// ---------------------------------------------------------------------------
ih264e_fmt_conv_422i_to_420sp_a9q:
    stmdb         sp!, {r4-r12, lr}     @// save working registers (40 bytes)

    @// Fetch every stack argument that is needed
    ldr           r7, [sp, #40]         @// r7  = u4_width
    ldr           r11, [sp, #44]        @// r11 = u4_height
    ldr           r4, [sp, #48]         @// r4  = u4_stride_y
    ldr           r9, [sp, #52]         @// r9  = u4_stride_u
    ldr           r5, [sp, #60]         @// r5  = u4_stride_yuv422i

    @// Pointers to the lower row of the first pair
    add           r6, r0, r4            @// r6 = pu1_y + u4_stride_y
    add           r8, r3, r5, lsl #1    @// r8 = pu2_yuv422i + 2 * u4_stride_yuv422i bytes

    @// Per-pair pointer steps, applied once a full row has been consumed
    rsb           r4, r7, r4, lsl #1    @// r4 = 2 * u4_stride_y - u4_width
    rsb           r5, r7, r5, lsl #1    @// r5 = 2 * u4_stride_yuv422i - u4_width (pixels)
    mov           r5, r5, lsl #1        @// ... converted to bytes
    sub           r9, r9, r7            @// r9 = u4_stride_u - u4_width
    mov           r11, r11, asr #1      @// r11 = u4_height / 2 row pairs

fmt_conv_422i_row_pair_loop:
    mov           r12, r7               @// pixels still to convert in this row pair

fmt_conv_422i_col_loop:
    vld4.8        {d0, d1, d2, d3}, [r3]! @// 16 pixels (32 bytes) of the upper row
    vld4.8        {d4, d5, d6, d7}, [r8]! @// 16 pixels (32 bytes) of the lower row

    vst2.8        {d1, d3}, [r0]!       @// 16 luma bytes, upper row
    vst2.8        {d5, d7}, [r6]!       @// 16 luma bytes, lower row

    vrhadd.u8     d0, d0, d4            @// rounded vertical average of input byte 0
    vrhadd.u8     d2, d2, d6            @// rounded vertical average of input byte 2
    vst2.8        {d0, d2}, [r1]!       @// 8 interleaved chroma pairs (16 bytes)

    sub           r12, r12, #16
    cmp           r12, #16
    bge           fmt_conv_422i_col_loop @// at least one more full group of 16

    cmp           r12, #0
    beq           fmt_conv_422i_row_pair_end

    @// 1..15 pixels remain: move every pointer back by (16 - remainder) pixels
    @// and convert one overlapping group of 16 that ends on the last pixel.
    @// The 422i pointers move 2 bytes per pixel, the outputs 1 byte per pixel.
    rsb           r12, r12, #16
    sub           r3, r3, r12, lsl #1
    sub           r8, r8, r12, lsl #1
    sub           r0, r0, r12
    sub           r6, r6, r12
    sub           r1, r1, r12

    vld4.8        {d0, d1, d2, d3}, [r3]!
    vld4.8        {d4, d5, d6, d7}, [r8]!

    vst2.8        {d1, d3}, [r0]!
    vst2.8        {d5, d7}, [r6]!

    vrhadd.u8     d0, d0, d4
    vrhadd.u8     d2, d2, d6
    vst2.8        {d0, d2}, [r1]!

fmt_conv_422i_row_pair_end:
    @// Advance every pointer to the next pair of rows
    add           r0, r0, r4
    add           r6, r6, r4
    add           r1, r1, r9
    add           r3, r3, r5
    add           r8, r8, r5

    subs          r11, r11, #1
    bgt           fmt_conv_422i_row_pair_loop

    ldmia         sp!, {r4-r12, pc}     @// restore registers and return
    345 
    346 
    347 
    348