//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

///*
////----------------------------------------------------------------------------
//// File Name            : impeg2_format_conv.s
////
//// Description          : This file has the YUV420P to YUV420SP format
////                        conversion routines (UV and VU chroma order),
////                        written for AArch64 with NEON.
////
//// Reference Document   :
////
//// Revision History     :
////      Date            Author                  Detail Description
////   ------------    ----------------    ----------------------------------
////   Jul 07, 2008     Naveen Kumar T                Created
////
////-------------------------------------------------------------------------
//*/

///*
//// ----------------------------------------------------------------------------
//// Include Files
//// ----------------------------------------------------------------------------
//*/
.set log2_16, 4
.set log2_2, 1

.text
.include "impeg2_neon_macros.s"
///*
//// ----------------------------------------------------------------------------
//// Struct/Union Types and Define
//// ----------------------------------------------------------------------------
//*/

///*
//// ----------------------------------------------------------------------------
//// Static Global Data section variables
//// ----------------------------------------------------------------------------
//*/
////--------------------------- NONE --------------------------------------------

///*
//// ----------------------------------------------------------------------------
//// Static Prototype Functions
//// ----------------------------------------------------------------------------
//*/
//// -------------------------- NONE --------------------------------------------

///*
//// ----------------------------------------------------------------------------
//// Exported functions
//// ----------------------------------------------------------------------------
//*/


///*****************************************************************************
//*
//*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8()
//*
//*  Description      : This function converts the image from YUV420P color
//*                     space to 420SP color space (UV interleaved).
//*                     The luma plane is copied row by row unless
//*                     convert_uv_only is 1, in which case only the
//*                     interleaved chroma plane is written.
//*
//*  Arguments        : x0        pu1_y
//*                     x1        pu1_u
//*                     x2        pu1_v
//*                     x3        pu1_dest_y
//*                     x4        pu1_dest_uv
//*                     x5        u2_height
//*                     x6        u2_width
//*                     x7        u2_stridey
//*                     sp, #80   u2_strideu
//*                     sp, #88   u2_stridev
//*                     sp, #96   u2_dest_stride_y
//*                     sp, #104  u2_dest_stride_uv
//*                     sp, #112  convert_uv_only
//*                     (stack offsets are relative to sp after the prologue)
//*
//*  Values Returned  : None
//*
//*  Register Usage   : x8  - destination row increment (luma, then chroma)
//*                     x14 - convert_uv_only flag, then luma row counter
//*                     x16 - column counter / tail back-step
//*                     x20 - V source row increment
//*                     v0, v1
//*
//*  Stack Usage      : 80 Bytes (16 for x19/x20; the remaining 64 are
//*                     presumably pushed by push_v_regs, whose definition
//*                     is not in this file - the argument offsets above
//*                     depend on it)
//*
//*  Interruptibility : Interruptible
//*
//*  Known Limitations
//*       Assumptions : Image Width: must be greater than or equal to 16,
//*                     because every row moves at least one full vector
//*                     (16 luma bytes, 8 bytes per chroma plane). A width
//*                     that is not a multiple of 16 is handled by stepping
//*                     back and re-copying an overlapping final vector.
//*                     Image Height: Assumed to be even.
//*                     The row loops test their counter after the body, so
//*                     each loop that is entered runs at least once.
//*
//*       NOTE(review): x5, x6 and x7 are used as full 64-bit values without
//*                     extension. If the C prototype declares them as
//*                     32-bit (or narrower) integers, the upper bits are not
//*                     guaranteed by AAPCS64 - confirm against the caller.
//*
//*  Revision History :
//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
//*         07 06 2010   Varshita        Draft
//*         07 06 2010   Naveen Kr T     Completed
//*
//*****************************************************************************/
    .global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8
impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8:

    //// Save registers. All [sp, #N] argument loads below are relative to
    //// sp after both of these pushes.
    push_v_regs
    stp             x19, x20, [sp, #-16]!

    ldr             w14, [sp, #112]             //// Load convert_uv_only

    cmp             w14, #1
    beq             yuv420sp_uv_chroma          //// Chroma only: skip luma copy

    ///* Do the preprocessing before the main loops start */
    ldr             w8, [sp, #96]               //// Load u2_dest_stride_y from stack
    uxtw            x8, w8

    sub             x7, x7, x6                  //// Source increment

    sub             x8, x8, x6                  //// Destination increment

    //// Count luma rows in a scratch register. x5 must still hold the full
    //// height when the chroma section halves it; counting x5 down here
    //// left it at zero and only one chroma row was converted.
    mov             x14, x5

yuv420sp_uv_row_loop_y:
    mov             x16, x6                     //// Bytes left in this row

yuv420sp_uv_col_loop_y:
    prfm            pldl1keep, [x0, #128]
    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16
    sub             x16, x16, #16
    cmp             x16, #15
    bgt             yuv420sp_uv_col_loop_y

    cmp             x16, #0
    beq             yuv420sp_uv_row_loop__y
    ////If the width is not a multiple of 16, step both pointers back so that
    ////one more full 16-byte copy ends exactly at the end of the row.
    ////Ex: width 162 - the loop above copies 160 pixels, the pointers move
    ////back by 14 to pixel 146, and pixels 146..161 are copied.
    sub             x16, x16, #16               //// x16 = remaining - 16 (negative)
    add             x0, x0, x16
    add             x3, x3, x16

    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16

yuv420sp_uv_row_loop__y:
    add             x0, x0, x7                  //// Next source row
    add             x3, x3, x8                  //// Next destination row
    subs            x14, x14, #1
    bgt             yuv420sp_uv_row_loop_y

yuv420sp_uv_chroma:
    ldr             w7, [sp, #80]               //// Load u2_strideu from stack
    sxtw            x7, w7

    ldr             w20, [sp, #88]              //// Load u2_stridev from stack
    sxtw            x20, w20

    ldr             w8, [sp, #104]              //// Load u2_dest_stride_uv from stack
    sxtw            x8, w8

    //// Each source plane advances width/2 bytes per row and the interleaved
    //// destination advances width bytes, so these are the per-row skips.
    sub             x7, x7, x6, lsr #1          //// U source increment

    sub             x20, x20, x6, lsr #1        //// V source increment

    sub             x8, x8, x6                  //// Destination increment

    lsr             x6, x6, #1                  //// Chroma width
    lsr             x5, x5, #1                  //// Chroma height
yuv420sp_uv_row_loop_uv:
    mov             x16, x6                     //// Chroma samples left in this row


yuv420sp_uv_col_loop_uv:
    prfm            pldl1keep, [x1, #128]
    prfm            pldl1keep, [x2, #128]

    ld1             {v0.8b}, [x1], #8           //// 8 U samples
    ld1             {v1.8b}, [x2], #8           //// 8 V samples
    st2             {v0.8b, v1.8b}, [x4], #16   //// Store as U,V,U,V,...

    sub             x16, x16, #8
    cmp             x16, #7
    bgt             yuv420sp_uv_col_loop_uv

    cmp             x16, #0
    beq             yuv420sp_uv_row_loop__uv
    ////If the chroma width is not a multiple of 8, step the pointers back so
    ////that one more full 8-sample pass ends exactly at the end of the row.
    ////The destination holds two bytes per sample, so it steps back twice
    ////as far as each source plane.
    sub             x16, x16, #8                //// x16 = remaining - 8 (negative)
    add             x1, x1, x16
    add             x2, x2, x16
    add             x4, x4, x16, lsl #1

    ld1             {v0.8b}, [x1], #8
    ld1             {v1.8b}, [x2], #8
    st2             {v0.8b, v1.8b}, [x4], #16

yuv420sp_uv_row_loop__uv:
    add             x1, x1, x7                  //// Next U row
    add             x2, x2, x20                 //// Next V row
    add             x4, x4, x8                  //// Next destination row
    subs            x5, x5, #1
    bgt             yuv420sp_uv_row_loop_uv

    //// Restore registers in the reverse order of the prologue
    ldp             x19, x20, [sp], #16
    pop_v_regs
    ret





///*****************************************************************************
//*
//*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8()
//*
//*  Description      : This function converts the image from YUV420P color
//*                     space to 420SP color space (VU interleaved).
//*                     This function is similar to the above function
//*                     impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8, the
//*                     difference being the order of the registers used in
//*                     the chroma loads.
//*
//*  Arguments        : x0        pu1_y
//*                     x1        pu1_u
//*                     x2        pu1_v
//*                     x3        pu1_dest_y
//*                     x4        pu1_dest_uv
//*                     x5        u2_height
//*                     x6        u2_width
//*                     x7        u2_stridey
//*                     sp, #80   u2_strideu
//*                     sp, #88   u2_stridev
//*                     sp, #96   u2_dest_stride_y
//*                     sp, #104  u2_dest_stride_uv
//*                     sp, #112  convert_uv_only
//*
//*  Values Returned  : None
//*
//*  Register Usage   : x8, x14, x16, x20, v0, v1
//*
//*  Stack Usage      : 80 Bytes
//*
//*  Interruptibility : Interruptible
//*
//*  Known Limitations
//*       Assumptions : Image Width: Assumed to be multiple of 16 and
//*                     greater than or equal to 16
//*                     Image Height: Assumed to be even.
//*
//*  Revision History :
//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
//*         07 06 2010   Varshita        Draft
//*         07 06 2010   Naveen Kr T     Completed
//*
//*****************************************************************************/

//// Implementation notes for impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8:
////   x14 - convert_uv_only flag, then luma row counter
////   x16 - column counter / tail back-step
////   x7  - luma source increment, later U source increment
////   x20 - V source increment
////   x8  - destination increment (luma, then chroma)
////   Every row moves at least one full vector (16 luma bytes, 8 bytes per
////   chroma plane), so the width must be at least 16. A width that is not a
////   multiple of 16 is handled by re-copying an overlapping final vector.
////   NOTE(review): x5, x6 and x7 are used as full 64-bit values without
////   extension. If the C prototype declares them as 32-bit (or narrower)
////   integers, the upper bits are not guaranteed by AAPCS64 - confirm
////   against the caller.

    .global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8
impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8:

    //// Arguments:
    // pu1_y,               - x0
    // pu1_u,               - x1
    // pu1_v,               - x2
    // pu1_dest_y,          - x3
    // pu1_dest_uv,         - x4
    // u2_height,           - x5
    // u2_width,            - x6
    // u2_stridey,          - x7
    // u2_strideu,          - sp, #80
    // u2_stridev,          - sp, #88
    // u2_dest_stride_y,    - sp, #96
    // u2_dest_stride_uv,   - sp, #104
    // convert_uv_only      - sp, #112
    //// Save registers. All [sp, #N] argument loads below are relative to
    //// sp after both of these pushes.
    push_v_regs
    stp             x19, x20, [sp, #-16]!

    ldr             w14, [sp, #112]             //// Load convert_uv_only

    cmp             w14, #1
    beq             yuv420sp_vu_chroma          //// Chroma only: skip luma copy

    ///* Do the preprocessing before the main loops start */
    ldr             w8, [sp, #96]               //// Load u2_dest_stride_y from stack
    uxtw            x8, w8

    sub             x7, x7, x6                  //// Source increment

    sub             x8, x8, x6                  //// Destination increment

    //// Count luma rows in a scratch register. x5 must still hold the full
    //// height when the chroma section halves it; counting x5 down here
    //// left it at zero and only one chroma row was converted.
    mov             x14, x5

yuv420sp_vu_row_loop_y:
    mov             x16, x6                     //// Bytes left in this row

yuv420sp_vu_col_loop_y:
    prfm            pldl1keep, [x0, #128]
    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16
    sub             x16, x16, #16
    cmp             x16, #15
    bgt             yuv420sp_vu_col_loop_y

    cmp             x16, #0
    beq             yuv420sp_vu_row_loop__y
    ////If the width is not a multiple of 16, step both pointers back so that
    ////one more full 16-byte copy ends exactly at the end of the row.
    ////Ex: width 162 - the loop above copies 160 pixels, the pointers move
    ////back by 14 to pixel 146, and pixels 146..161 are copied.
    sub             x16, x16, #16               //// x16 = remaining - 16 (negative)
    add             x0, x0, x16
    add             x3, x3, x16

    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16

yuv420sp_vu_row_loop__y:
    add             x0, x0, x7                  //// Next source row
    add             x3, x3, x8                  //// Next destination row
    subs            x14, x14, #1
    bgt             yuv420sp_vu_row_loop_y

yuv420sp_vu_chroma:
    ldr             w7, [sp, #80]               //// Load u2_strideu from stack
    sxtw            x7, w7

    ldr             w20, [sp, #88]              //// Load u2_stridev from stack
    sxtw            x20, w20

    ldr             w8, [sp, #104]              //// Load u2_dest_stride_uv from stack
    sxtw            x8, w8

    //// Each source plane advances width/2 bytes per row and the interleaved
    //// destination advances width bytes, so these are the per-row skips.
    sub             x7, x7, x6, lsr #1          //// U source increment

    sub             x20, x20, x6, lsr #1        //// V source increment

    sub             x8, x8, x6                  //// Destination increment

    lsr             x6, x6, #1                  //// Chroma width
    lsr             x5, x5, #1                  //// Chroma height
yuv420sp_vu_row_loop_uv:
    mov             x16, x6                     //// Chroma samples left in this row


yuv420sp_vu_col_loop_uv:
    prfm            pldl1keep, [x1, #128]
    prfm            pldl1keep, [x2, #128]
    ld1             {v1.8b}, [x1], #8           //// 8 U samples into the second lane
    ld1             {v0.8b}, [x2], #8           //// 8 V samples into the first lane
    st2             {v0.8b, v1.8b}, [x4], #16   //// Store as V,U,V,U,...
    sub             x16, x16, #8
    cmp             x16, #7
    bgt             yuv420sp_vu_col_loop_uv

    cmp             x16, #0
    beq             yuv420sp_vu_row_loop__uv
    ////If the chroma width is not a multiple of 8, step the pointers back so
    ////that one more full 8-sample pass ends exactly at the end of the row.
    ////The destination holds two bytes per sample, so it steps back twice
    ////as far as each source plane.
    sub             x16, x16, #8                //// x16 = remaining - 8 (negative)
    add             x1, x1, x16
    add             x2, x2, x16
    add             x4, x4, x16, lsl #1

    ld1             {v1.8b}, [x1], #8
    ld1             {v0.8b}, [x2], #8
    st2             {v0.8b, v1.8b}, [x4], #16

yuv420sp_vu_row_loop__uv:
    add             x1, x1, x7                  //// Next U row
    add             x2, x2, x20                 //// Next V row
    add             x4, x4, x8                  //// Next destination row
    subs            x5, x5, #1
    bgt             yuv420sp_vu_row_loop_uv

    //// Restore registers in the reverse order of the prologue
    ldp             x19, x20, [sp], #16
    pop_v_regs
    ret
