@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/

        .text
        .p2align 2

@/*****************************************************************************
@*                                                                            *
@*  Function Name    : ih264e_fmt_conv_420p_to_420sp_a9q()                    *
@*                     (header originally titled                              *
@*                      IH264D_CXA8_YUV420toYUV420SP_UV)                      *
@*                                                                            *
@*  Description      : Converts an image from YUV420P (separate Y, U and V   *
@*                     planes) to YUV420SP (Y plane followed by a plane of   *
@*                     interleaved U,V bytes).                                *
@*                     The Y plane is copied row by row unless                *
@*                     convert_uv_only == 1, in which case only the chroma    *
@*                     interleave is performed.                               *
@*                                                                            *
@*  Arguments        : R0           pu1_y                                     *
@*                     R1           pu1_u                                     *
@*                     R2           pu1_v                                     *
@*                     R3           pu1_dest_y                                *
@*                     [R13 #40]    pu1_dest_uv                               *
@*                     [R13 #44]    u2_height                                 *
@*                     [R13 #48]    u2_width                                  *
@*                     [R13 #52]    u2_stridey                                *
@*                     [R13 #56]    u2_strideu                                *
@*                     [R13 #60]    u2_stridev                                *
@*                     [R13 #64]    u2_dest_stride_y                          *
@*                     [R13 #68]    u2_dest_stride_uv                         *
@*                     [R13 #72]    convert_uv_only                           *
@*                     (stack offsets are relative to SP after the 40-byte    *
@*                      register push done at function entry)                 *
@*                                                                            *
@*  Values Returned  : None                                                   *
@*                                                                            *
@*  Register Usage   : R0 - R14                                               *
@*                                                                            *
@*  Stack Usage      : 40 Bytes                                               *
@*                                                                            *
@*  Interruptibility : Interruptible                                          *
@*                                                                            *
@*  Known Limitations                                                         *
@*       Assumptions: Image Width:  Assumed to be greater than or equal to   *
@*                                  16. A width that is not a multiple of 16  *
@*                                  is handled by re-processing the last      *
@*                                  vector of each row with an overlap.       *
@*                    Image Height: Assumed to be even.                       *
@*                    Both row loops test their counter after the body, so    *
@*                    each loop body runs at least once.                      *
@*                                                                            *
@*  Revision History :                                                        *
@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
@*         07 06 2010   Varshita        Draft                                 *
@*         07 06 2010   Naveen Kr T     Completed                             *
@*                                                                            *
@*****************************************************************************/
        .global ih264e_fmt_conv_420p_to_420sp_a9q

ih264e_fmt_conv_420p_to_420sp_a9q:

        @// push the registers on the stack (10 registers = 40 bytes)
        stmfd       sp!, {r4-r12, lr}

        ldr         r4, [sp, #72]           @// Load convert_uv_only

        cmp         r4, #1
        beq         yuv420sp_uv_chroma      @// convert_uv_only == 1 : skip the luma copy

        @/* Do the preprocessing before the main loops start */
        @// Load the parameters from stack
        ldr         r4, [sp, #44]           @// Load u2_height from stack
        ldr         r5, [sp, #48]           @// Load u2_width from stack
        ldr         r7, [sp, #52]           @// Load u2_stridey from stack
        ldr         r8, [sp, #64]           @// Load u2_dest_stride_y from stack
        sub         r7, r7, r5              @// Source increment      = u2_stridey - u2_width
        sub         r8, r8, r5              @// Destination increment = u2_dest_stride_y - u2_width

yuv420sp_uv_row_loop_y:
        mov         r6, r5                  @// r6 = bytes left in this luma row

yuv420sp_uv_col_loop_y:
        pld         [r0, #128]
        vld1.8      {d0, d1}, [r0]!         @// copy 16 luma bytes
        vst1.8      {d0, d1}, [r3]!
        sub         r6, r6, #16
        cmp         r6, #15
        bgt         yuv420sp_uv_col_loop_y  @// at least 16 bytes still left

        cmp         r6, #0
        beq         yuv420sp_uv_row_loop_end_y

        @// Width is not a multiple of 16: step both pointers back so that the
        @// last 16 bytes of the row are copied with one more VLD1/VST1.
        @// Ex: if width is 162, the loop above copies 160 pixels; source and
        @// destination are moved back to pixel 146 and 16 bytes are copied.
        rsb         r6, r6, #16             @// r6 = 16 - remaining bytes
        sub         r0, r0, r6
        sub         r3, r3, r6

        vld1.8      {d0, d1}, [r0]!
        vst1.8      {d0, d1}, [r3]!

yuv420sp_uv_row_loop_end_y:
        add         r0, r0, r7              @// next source row
        add         r3, r3, r8              @// next destination row
        subs        r4, r4, #1
        bgt         yuv420sp_uv_row_loop_y

yuv420sp_uv_chroma:

        ldr         r3, [sp, #40]           @// Load pu1_dest_uv from stack
        ldr         r4, [sp, #44]           @// Load u2_height from stack
        ldr         r5, [sp, #48]           @// Load u2_width from stack
        ldr         r7, [sp, #56]           @// Load u2_strideu from stack
        ldr         r9, [sp, #60]           @// Load u2_stridev from stack
        ldr         r8, [sp, #68]           @// Load u2_dest_stride_uv from stack

        sub         r7, r7, r5, lsr #1      @// U source increment    = u2_strideu - u2_width / 2
        sub         r9, r9, r5, lsr #1      @// V source increment    = u2_stridev - u2_width / 2
                                            @// (kept separate from U so planes with
                                            @//  different strides are walked correctly)
        sub         r8, r8, r5              @// Destination increment = u2_dest_stride_uv - u2_width

        mov         r5, r5, lsr #1          @// r5 = chroma samples per row per plane
        mov         r4, r4, lsr #1          @// r4 = number of chroma rows

yuv420sp_uv_row_loop_uv:
        mov         r6, r5                  @// r6 = U (and V) bytes left in this row

yuv420sp_uv_col_loop_uv:
        pld         [r1, #128]
        pld         [r2, #128]
        vld1.8      d0, [r1]!               @// 8 bytes from the U plane
        vld1.8      d1, [r2]!               @// 8 bytes from the V plane
        vst2.8      {d0, d1}, [r3]!         @// store 16 bytes interleaved: d0[0], d1[0], d0[1], d1[1], ...
        sub         r6, r6, #8
        cmp         r6, #7
        bgt         yuv420sp_uv_col_loop_uv @// at least 8 samples still left

        cmp         r6, #0
        beq         yuv420sp_uv_row_loop_end_uv

        @// Chroma row length is not a multiple of 8: step the U and V pointers
        @// back by (8 - remaining) bytes and the interleaved destination by
        @// twice that, then process the last 8 U/V samples again with
        @// VLD1/VST2.
        rsb         r6, r6, #8              @// r6 = 8 - remaining samples
        sub         r1, r1, r6
        sub         r2, r2, r6
        sub         r3, r3, r6, lsl #1

        vld1.8      d0, [r1]!
        vld1.8      d1, [r2]!
        vst2.8      {d0, d1}, [r3]!

yuv420sp_uv_row_loop_end_uv:
        add         r1, r1, r7              @// next U row
        add         r2, r2, r9              @// next V row
        add         r3, r3, r8              @// next interleaved destination row
        subs        r4, r4, #1
        bgt         yuv420sp_uv_row_loop_uv

        @//POP THE REGISTERS
        ldmfd       sp!, {r4-r12, pc}





@ /**
@ *******************************************************************************
@ *
@ * @brief ih264e_fmt_conv_422i_to_420sp_a9q
@ *     Function used from format conversion or frame copy
@ *
@ *
@ *
@ *Inputs       :   r0 - pu1_y              -   UWORD8 pointer to y plane.
@ *                 r1 - pu1_u              -   UWORD8 pointer to u plane.
@ *                 r2 - pu1_v              -   UWORD8 pointer to v plane.
@ *                                             (not read by this routine; chroma is
@ *                                              written interleaved through r1)
@ *                 r3 - pu2_yuv422i        -   UWORD16 pointer to yuv422i image.
@ *         stack + 40 - u4_width           -   Width of the Y plane.
@ *                 44 - u4_height          -   Height of the Y plane.
@ *                 48 - u4_stride_y        -   Stride in pixels of Y plane.
@ *                 52 - u4_stride_u        -   Stride in pixels of U plane.
@ *                 56 - u4_stride_v        -   Stride in pixels of V plane (not read).
@ *                 60 - u4_stride_yuv422i  -   Stride in pixels of yuv422i image.
@ *         (stack offsets are relative to SP after the 40-byte register push)
@ *
@ * @par   Description
@ * Function used from copying or converting a reference frame to display buffer
@ * in non shared mode
@ *
@ * NOTE(review): the @param list below does not match the register/stack
@ * arguments listed above; it looks inherited from another routine. Confirm
@ * against the C prototype before relying on it.
@ *
@ * @param[in] pu1_y_dst
@ *   Output Y pointer
@ *
@ * @param[in] pu1_u_dst
@ *   Output U/UV pointer ( UV is interleaved in the same format as that of input)
@ *
@ * @param[in] pu1_v_dst
@ *   Output V pointer ( used in 420P output case)
@ *
@ * @param[in] u4_dst_y_strd
@ *   Stride of destination Y buffer
@ *
@ * @param[in] u4_dst_u_strd
@ *   Stride of destination  U/V buffer
@ *
@ *
@ * @param[in] blocking
@ *   To indicate whether format conversion should wait till frame is reconstructed
@ *   and then return after complete copy is done. To be set to 1 when called at the
@ *   end of frame processing and set to 0 when called between frame processing modules
@ *   in order to utilize available MCPS
@ *
@ * @returns Error from IH264E_ERROR_T
@ *
@ * @remarks
@ * Assumes that the stride of U and V buffers are same.
@ * This is correct in most cases
@ * If a case comes where this is not true we need to modify the fmt conversion functions called inside also
@ * Since we read 4 pixels at a time the width should be aligned to 4
@ * In assembly width should be aligned to 16 and height to 2.
@ *
@ *
@ * Revision History :
@ *         DD MM YYYY   Author(s)               Changes (Describe the changes made)
@ *         07 06 2010   Harinarayanan K K       Adapted to 422p
@ *
@ *******************************************************************************
@ */

        .global ih264e_fmt_conv_422i_to_420sp_a9q
ih264e_fmt_conv_422i_to_420sp_a9q:
        stmdb       sp!, {r4-r12, lr}       @// Save the registers used below (40 bytes)

        @// ---- Fetch the stack arguments --------------------------------------
        ldr         r7,  [sp, #40]          @// u4_width
        ldr         r11, [sp, #44]          @// u4_height
        ldr         r4,  [sp, #48]          @// u4_stride_y
        ldr         r9,  [sp, #52]          @// u4_stride_u
        ldr         r5,  [sp, #60]          @// u4_stride_yuv422i (2 bytes per pixel)

        @// ---- Second-row pointers ---------------------------------------------
        add         r6, r0, r4              @// pu1_y_nxt_row       = pu1_y + u4_stride_y
        add         r8, r3, r5, lsl #1      @// pu2_yuv422i_nxt_row = pu2_yuv422i + 2 * u4_stride_yuv422i bytes

        @// ---- End-of-row increments (two rows are consumed per iteration) -----
        rsb         r4, r7, r4, lsl #1      @// y_step   = 2 * u4_stride_y - u4_width
        rsb         r5, r7, r5, lsl #1      @//            2 * u4_stride_yuv422i - u4_width
        mov         r5, r5, lsl #1          @// src_step = 4 * u4_stride_yuv422i - 2 * u4_width (bytes)
        sub         r9, r9, r7              @// uv_step  = u4_stride_u - u4_width
        mov         r11, r11, asr #1        @// row pairs = u4_height / 2

        @// Register Assignment
        @// pu1_y                       - r0
        @// pu1_y_nxt_row               - r6
        @// pu1_u (interleaved output)  - r1
        @// pu1_v (unused)              - r2
        @// pu2_yuv422i                 - r3
        @// pu2_yuv422i_nxt_row         - r8
        @// y_step                      - r4
        @// uv_step                     - r9
        @// src_step                    - r5
        @// u4_width                    - r7
        @// u4_height / 2               - r11
        @// pixels left in the row      - r12

.Lfmt422i_row_pair:
        mov         r12, r7                 @// pixels left = u4_width

.Lfmt422i_block16:
        @// VLD4 splits each 4-byte group into d0..d3 (d4..d7 for the second row):
        @// bytes 1 and 3 of a group are stored below as luma, bytes 0 and 2 as the
        @// two chroma samples (presumably U and V, i.e. UYVY order - confirm).
        vld4.8      {d0, d1, d2, d3}, [r3]! @// 16 pixels (32 bytes) of row 1
        vld4.8      {d4, d5, d6, d7}, [r8]! @// 16 pixels (32 bytes) of row 2

        vst2.8      {d1, d3}, [r0]!         @// 16 luma bytes of row 1
        vst2.8      {d5, d7}, [r6]!         @// 16 luma bytes of row 2

        vrhadd.u8   d0, d0, d4              @// rounded average of the two rows, first chroma
        vrhadd.u8   d2, d2, d6              @// rounded average of the two rows, second chroma
        vst2.8      {d0, d2}, [r1]!         @// 8 interleaved chroma pairs (16 bytes)

        sub         r12, r12, #16
        cmp         r12, #15
        bgt         .Lfmt422i_block16       @// at least 16 pixels still left
        cmp         r12, #0
        beq         .Lfmt422i_next_rows

        @// Width is not a multiple of 16: move every pointer back so that the
        @// last 16 pixels of the row pair are processed once more, overlapping
        @// the pixels already done, then run the block above exactly once.
        @// Ex: width 162 -> 160 pixels done, pointers go back to pixel 146.
        rsb         r12, r12, #16           @// r12 = 16 - pixels left
        sub         r0, r0, r12
        sub         r6, r6, r12
        sub         r1, r1, r12
        sub         r3, r3, r12, lsl #1     @// 2 source bytes per pixel
        sub         r8, r8, r12, lsl #1
        mov         r12, #16                @// exactly one more block
        b           .Lfmt422i_block16

.Lfmt422i_next_rows:
        @// Advance every pointer to the next pair of rows
        add         r0, r0, r4              @// pu1_y               += y_step
        add         r6, r6, r4              @// pu1_y_nxt_row       += y_step
        add         r1, r1, r9              @// pu1_u               += uv_step
        add         r3, r3, r5              @// pu2_yuv422i         += src_step
        add         r8, r8, r5              @// pu2_yuv422i_nxt_row += src_step

        subs        r11, r11, #1
        bgt         .Lfmt422i_row_pair

        ldmia       sp!, {r4-r12, pc}       @// Restore the registers and return