1 ///***************************************************************************** 2 //* 3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 //* 5 //* Licensed under the Apache License, Version 2.0 (the "License"); 6 //* you may not use this file except in compliance with the License. 7 //* You may obtain a copy of the License at: 8 //* 9 //* http://www.apache.org/licenses/LICENSE-2.0 10 //* 11 //* Unless required by applicable law or agreed to in writing, software 12 //* distributed under the License is distributed on an "AS IS" BASIS, 13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 //* See the License for the specific language governing permissions and 15 //* limitations under the License. 16 //* 17 //*****************************************************************************/ 18 ///** 19 ///******************************************************************************* 20 //* //file 21 //* ihevcd_fmt_conv_420sp_to_420p.s 22 //* 23 //* //brief 24 //* contains function definitions for format conversions 25 //* 26 //* //author 27 //* ittiam 28 //* 29 //* //par list of functions: 30 //* 31 //* 32 //* //remarks 33 //* none 34 //* 35 //*******************************************************************************/ 36 37 .text 38 39 .include "ihevc_neon_macros.s" 40 41 42 43 44 ///***************************************************************************** 45 //* * 46 //* Function Name : ihevcd_fmt_conv_420sp_to_420p_av8() * 47 //* * 48 //* Description : This function converts the image from YUV420SP color * 49 //* space (UV interleaved) to YUV420P color space (separate U and V planes). 
//*
//*  Summary       : Copies the luma plane (unless disabled) and splits the
//*                  interleaved chroma plane into two separate planes.
//*
//*  Arguments     : (AArch64 mapping actually used by the code below)
//*                  x0            pu1_src_y
//*                  x1            pu1_src_uv        (interleaved chroma)
//*                  x2            pu1_dest_y
//*                  x3            pu1_dest_u
//*                  x4            pu1_dest_v
//*                  w5            u2_width
//*                  w6            u2_height
//*                  w7            u2_stridey
//*                  [sp, #80]     u2_strideuv
//*                  [sp, #88]     u2_dest_stridey
//*                  [sp, #96]     u2_dest_strideuv
//*                  [sp, #104]    is_u_first
//*                  [sp, #112]    disable_luma_copy
//*
//*                  Stack offsets are relative to sp AFTER the prologue
//*                  (push_v_regs followed by the 16-byte x19/x20 save).
//*                  NOTE(review): the offsets imply push_v_regs reserves
//*                  64 bytes - confirm against ihevc_neon_macros.s.
//*
//*                  All non-pointer arguments are treated as signed 32-bit
//*                  values and sign-extended before 64-bit arithmetic.
//*
//*  Values Returned : None
//*
//*  Register Usage  : x0 - x11, x15, x20 (x19/x20 saved and restored),
//*                    v0, v1
//*
//*  Stack Usage     : 16 bytes for x19/x20 plus whatever push_v_regs saves
//*
//*  Known Limitations / Assumptions:
//*      - Width is assumed to be a multiple of 2 and height to be even.
//*      - Width must be at least 16: each column loop always copies one
//*        full 16-byte chunk before testing the remaining count, and the
//*        tail path steps the pointers BACK so that the last 16 bytes of
//*        the row are re-copied (overlapping the previous chunk).
//*      - The row loops are bottom-tested, so at least one luma row (when
//*        luma copy is enabled) and one chroma row is always processed.
//*
//*  Revision History :
//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
//*         16 05 2012   Naveen SR       draft
//*
//*****************************************************************************/

.globl ihevcd_fmt_conv_420sp_to_420p_av8

.type ihevcd_fmt_conv_420sp_to_420p_av8, %function

ihevcd_fmt_conv_420sp_to_420p_av8:
    push_v_regs
    stp         x19, x20, [sp, #-16]!   // x20 is used as scratch in the tail paths

    mov         x15, x4                 // x15 = pu1_dest_v (x4/x5 are reused below)

    // The 32-bit arguments passed in registers get the same explicit sign
    // extension as the stack arguments: the procedure call standard leaves
    // the upper 32 bits of such registers unspecified, and these values feed
    // 64-bit pointer arithmetic and loop counters.
    sxtw        x8, w5                  // x8  = u2_width
    sxtw        x9, w6                  // x9  = u2_height
    sxtw        x7, w7                  // x7  = u2_stridey

    ldr         w5, [sp, #88]           // u2_dest_stridey
    sxtw        x5, w5

    sub         x10, x7, x8             // x10 = src Y row increment (stride - width)
    sub         x11, x5, x8             // x11 = dst Y row increment (stride - width)

    ldr         w5, [sp, #112]          // disable_luma_copy
    cbnz        w5, uv_copy_start       // skip luma if disable_luma_copy is non-zero

    ///* Copy Y */

    mov         x4, x9                  // x4 = rows remaining
y_row_loop:
    mov         x6, x8                  // x6 = bytes remaining in this row

y_col_loop:
    sub         x6, x6, #16
    ld1         {v0.8b, v1.8b}, [x0], #16
    st1         {v0.8b, v1.8b}, [x2], #16
    cmp         x6, #16
    bge         y_col_loop
    cmp         x6, #0
    beq         y_col_loop_end

    // Width is not a multiple of 16: step both pointers back by
    // (16 - remaining) bytes so one more full 16-byte copy ends exactly at
    // the end of the row. Example: for width 162 the loop above handles 160
    // bytes; the pointers are then moved back to byte 146 and bytes
    // 146..161 are copied (bytes 146..159 are written a second time).
    sub         x20, x6, #16
    neg         x6, x20                 // x6 = 16 - remaining
    sub         x0, x0, x6
    sub         x2, x2, x6
    ld1         {v0.8b, v1.8b}, [x0], #16
    st1         {v0.8b, v1.8b}, [x2], #16

y_col_loop_end:
    add         x0, x0, x10             // advance to the next source row
    add         x2, x2, x11             // advance to the next destination row
    subs        x4, x4, #1
    bgt         y_row_loop


    ///* Copy UV */
uv_copy_start:

    ldr         w5, [sp, #96]           // u2_dest_strideuv
    sxtw        x5, w5
    ldr         w7, [sp, #80]           // u2_strideuv
    sxtw        x7, w7

    lsr         x9, x9, #1              // chroma rows = height / 2

    sub         x10, x7, x8             // x10 = src UV row increment (stride - width)
    lsr         x11, x8, #1             // each output plane receives width/2 bytes per row
    sub         x11, x5, x11            // x11 = dst U/V row increment

    mov         x5, x15                 // x5 = pu1_dest_v

    ldr         w4, [sp, #104]          // is_u_first
    cmp         w4, #0                  // swap U and V destinations if is_u_first is zero
    csel        x4, x5, x4, eq          // x4 = old x5 (temporary for the swap)
    csel        x5, x3, x5, eq
    csel        x3, x4, x3, eq

    mov         x4, x9                  // x4 = chroma rows remaining
uv_row_loop:
    mov         x6, x8                  // x6 = interleaved bytes remaining in this row

uv_col_loop:
    sub         x6, x6, #16

    prfm        PLDL1KEEP, [x1, #128]
    ld2         {v0.8b, v1.8b}, [x1], #16   // de-interleave: even bytes -> v0, odd bytes -> v1
    st1         {v0.8b}, [x3], #8
    st1         {v1.8b}, [x5], #8
    cmp         x6, #16
    bge         uv_col_loop
    cmp         x6, #0
    beq         uv_col_loop_end

    // Width is not a multiple of 16: step the source back by
    // (16 - remaining) bytes and each destination back by half of that, then
    // process one more full 16-byte chunk that ends exactly at the end of
    // the row (overlapping the previous chunk).
    sub         x20, x6, #16
    neg         x6, x20                 // x6 = 16 - remaining
    sub         x1, x1, x6
    sub         x3, x3, x6, lsr #1
    sub         x5, x5, x6, lsr #1
    ld2         {v0.8b, v1.8b}, [x1], #16
    st1         {v0.8b}, [x3], #8
    st1         {v1.8b}, [x5], #8

uv_col_loop_end:
    add         x1, x1, x10             // advance to the next source row
    add         x3, x3, x11             // advance both destination planes
    add         x5, x5, x11
    subs        x4, x4, #1
    bgt         uv_row_loop

exit:
    ldp         x19, x20, [sp], #16
    pop_v_regs
    ret