1 ///***************************************************************************** 2 //* 3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 //* 5 //* Licensed under the Apache License, Version 2.0 (the "License"); 6 //* you may not use this file except in compliance with the License. 7 //* You may obtain a copy of the License at: 8 //* 9 //* http://www.apache.org/licenses/LICENSE-2.0 10 //* 11 //* Unless required by applicable law or agreed to in writing, software 12 //* distributed under the License is distributed on an "AS IS" BASIS, 13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 //* See the License for the specific language governing permissions and 15 //* limitations under the License. 16 //* 17 //*****************************************************************************/ 18 ///** 19 ///** 20 //******************************************************************************* 21 //* 22 //* //brief 23 //* interprediction luma function for copy 24 //* 25 //* //par description: 26 //* copies the array of width 'wd' and height 'ht' from the location pointed 27 //* by 'src' to the location pointed by 'dst' 28 //* 29 //* //param[in] pu1_src 30 //* uword8 pointer to the source 31 //* 32 //* //param[out] pu1_dst 33 //* uword8 pointer to the destination 34 //* 35 //* //param[in] src_strd 36 //* integer source stride 37 //* 38 //* //param[in] dst_strd 39 //* integer destination stride 40 //* 41 //* //param[in] pi1_coeff 42 //* word8 pointer to the filter coefficients 43 //* 44 //* //param[in] ht 45 //* integer height of the array 46 //* 47 //* //param[in] wd 48 //* integer width of the array 49 //* 50 //* //returns 51 //* 52 //* //remarks 53 //* none 54 //* 55 //******************************************************************************* 56 //*/ 57 //void ihevc_inter_pred_luma_copy ( 58 // uword8 *pu1_src, 59 // uword8 *pu1_dst, 60 // word32 src_strd, 61 // word32 dst_strd, 62 // word8 *pi1_coeff, 63 // word32 ht, 64 // 
//                              word32 wd   )

//-----------------------------------------------------------------------------
// Target : AArch64 (GNU as syntax), AAPCS64 calling convention, leaf function
//
// Copies a wd x ht block of bytes from pu1_src to pu1_dst.
//
//  * Rows are copied in strips of 4: the row counter is decremented by 4 per
//    strip and the loop runs while it is > 0, so a height that is not a
//    multiple of 4 is rounded up to the next multiple of 4.
//  * Columns are copied 16, 8 or 4 bytes at a time, selected by the largest of
//    those sizes that divides wd.  The end-of-strip pointer rewind uses
//    (wd - chunk), so wd is assumed to be a multiple of 4.
//  * Returns immediately when ht <= 0 or wd <= 0.
//  * pi1_coeff (x4) is not read.
//
// The four word32 arguments arrive in w2/w3/w5/w6.  AAPCS64 leaves the upper
// 32 bits of those registers unspecified, so they are sign-extended here
// before being used in 64-bit address and counter arithmetic.
//
// Clobbers : x0-x3, x8-x11, x15, x16, v0-v3, flags (all caller-saved)
// Stack    : not used
//-----------------------------------------------------------------------------

//**************variables vs registers*****************************************
//    x0  => *pu1_src      (start of the current 4-row strip / current column)
//    x1  => *pu1_dst      (start of the current 4-row strip / current column)
//    x2  => src_strd      (sign-extended from w2)
//    x3  => dst_strd      (sign-extended from w3)
//    x8  => columns still to copy in the current strip
//    x9  => pu1_src_tmp   (walks rows 1..3 of the current column)
//    x10 => pu1_dst_tmp   (walks rows 1..3 of the current column)
//    x11 => ht            (sign-extended from w5), rows still to copy
//    x15 => wd - chunk    (chunk = 4, 8 or 16 bytes per inner iteration)
//    x16 => wd            (sign-extended from w6)

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_luma_copy_av8

.type ihevc_inter_pred_luma_copy_av8, %function

ihevc_inter_pred_luma_copy_av8:
    sxtw        x2,w2                       //src_strd : word32 -> 64 bit
    sxtw        x3,w3                       //dst_strd : word32 -> 64 bit
    sxtw        x11,w5                      //ht       : word32 -> 64 bit
    sxtw        x16,w6                      //wd       : word32 -> 64 bit

    cmp         x11,#0                      //nothing to do when ht <= 0
    ble         end_loops
    cmp         x16,#0                      //nothing to do when wd <= 0
    ble         end_loops

    tst         x16,#15                     //wd multiple of 16 -> 16 byte columns
    beq         core_loop_wd_16
    tst         x16,#7                      //wd multiple of 8  -> 8 byte columns
    beq         core_loop_wd_8

    //remaining widths are copied 4 bytes per column step
    sub         x15,x16,#4                  //offset of the last column in a strip

outer_loop_wd_4:
    mov         x8,x16                      //columns left in this strip = wd (> 0 here)

inner_loop_wd_4:
    //copies a 4 byte x 4 row tile; loads and stores are interleaved with the
    //pointer updates, and only the subs below writes the flags used by bgt
    ld1         {v0.s}[0],[x0]              //row 0 load
    add         x9,x0,x2                    //pu1_src_tmp = pu1_src + src_strd
    add         x10,x1,x3                   //pu1_dst_tmp = pu1_dst + dst_strd
    st1         {v0.s}[0],[x1]              //row 0 store
    ld1         {v0.s}[0],[x9],x2           //row 1 load,  pu1_src_tmp += src_strd
    add         x0,x0,#4                    //pu1_src += 4 (next column)
    st1         {v0.s}[0],[x10],x3          //row 1 store, pu1_dst_tmp += dst_strd
    ld1         {v0.s}[0],[x9],x2           //row 2 load
    subs        x8,x8,#4                    //columns left -= 4 (sets loop flags)
    st1         {v0.s}[0],[x10],x3          //row 2 store
    ld1         {v0.s}[0],[x9],x2           //row 3 load
    add         x1,x1,#4                    //pu1_dst += 4 (next column)
    st1         {v0.s}[0],[x10],x3          //row 3 store

    bgt         inner_loop_wd_4

    //x9/x10 now point 4 rows below the last column of the strip; stepping back
    //by (wd - 4) gives the first column of the next strip
    subs        x11,x11,#4                  //ht -= 4
    sub         x0,x9,x15                   //pu1_src = start of next strip
    sub         x1,x10,x15                  //pu1_dst = start of next strip
    bgt         outer_loop_wd_4

end_loops:
    ret


core_loop_wd_8:
    sub         x15,x16,#8                  //offset of the last column in a strip

outer_loop_wd_8:
    mov         x8,x16                      //columns left in this strip = wd (> 0 here)

inner_loop_wd_8:
    //copies an 8 byte x 4 row tile
    add         x9,x0,x2                    //pu1_src_tmp = pu1_src + src_strd
    ld1         {v0.8b},[x0],#8             //row 0 load,  pu1_src += 8
    add         x10,x1,x3                   //pu1_dst_tmp = pu1_dst + dst_strd
    st1         {v0.8b},[x1],#8             //row 0 store, pu1_dst += 8
    ld1         {v1.8b},[x9],x2             //row 1 load
    st1         {v1.8b},[x10],x3            //row 1 store
    subs        x8,x8,#8                    //columns left -= 8 (sets loop flags)
    ld1         {v2.8b},[x9],x2             //row 2 load
    st1         {v2.8b},[x10],x3            //row 2 store
    ld1         {v3.8b},[x9],x2             //row 3 load
    st1         {v3.8b},[x10],x3            //row 3 store
    bgt         inner_loop_wd_8

    subs        x11,x11,#4                  //ht -= 4
    sub         x0,x9,x15                   //pu1_src = start of next strip
    sub         x1,x10,x15                  //pu1_dst = start of next strip
    bgt         outer_loop_wd_8

    ret


core_loop_wd_16:
    sub         x15,x16,#16                 //offset of the last column in a strip

outer_loop_wd_16:
    mov         x8,x16                      //columns left in this strip = wd (> 0 here)

inner_loop_wd_16:
    //copies a 16 byte x 4 row tile
    add         x9,x0,x2                    //pu1_src_tmp = pu1_src + src_strd
    ld1         {v0.16b},[x0],#16           //row 0 load,  pu1_src += 16
    add         x10,x1,x3                   //pu1_dst_tmp = pu1_dst + dst_strd
    st1         {v0.16b},[x1],#16           //row 0 store, pu1_dst += 16
    ld1         {v1.16b},[x9],x2            //row 1 load
    st1         {v1.16b},[x10],x3           //row 1 store
    subs        x8,x8,#16                   //columns left -= 16 (sets loop flags)
    ld1         {v2.16b},[x9],x2            //row 2 load
    st1         {v2.16b},[x10],x3           //row 2 store
    ld1         {v3.16b},[x9],x2            //row 3 load
    st1         {v3.16b},[x10],x3           //row 3 store
    bgt         inner_loop_wd_16

    subs        x11,x11,#4                  //ht -= 4
    sub         x0,x9,x15                   //pu1_src = start of next strip
    sub         x1,x10,x15                  //pu1_dst = start of next strip
    bgt         outer_loop_wd_16

    ret