///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_chroma_ver_neon.s
//*
//* @brief
//*  contains the function definition for chroma intra prediction, vertical mode.
//*  functions are hand-written AArch64 NEON assembly in GNU assembler syntax
//*
//* @author
//*  yogeswaran rs
//*
//* @par list of functions:
//*  - ihevc_intra_pred_chroma_ver_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*  chroma intra prediction, vertical mode
//*
//* @par description:
//*  copies the bytes that start at pu1_ref + 4*nt + 2 into every row of the
//*  destination block (presumably the top neighbour row of interleaved
//*  Cb/Cr samples - confirm against the C reference implementation).
//*
//*  bytes written, depending on nt:
//*    nt <  8 :  4 rows of  8 bytes
//*    nt == 8 :  8 rows of 16 bytes
//*    nt >  8 : nt rows of 32 bytes, 4 rows per loop iteration
//*              (nt is expected to be a multiple of 4; the row width is
//*              fixed at 32 bytes, i.e. the nt == 16 case)
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the reference samples
//*
//* @param[in] src_strd
//*  integer source stride (not read by this routine)
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] dst_strd
//*  integer destination stride, in bytes
//*
//* @param[in] nt
//*  size of transform block
//*
//* @param[in] mode
//*  prediction mode (not read by this routine)
//*
//* @returns
//*  none
//*
//* @remarks
//*  scratch registers written: x3-x6, x8, x10, x11, v0, v20-v23, flags
//*
//*******************************************************************************
//*/

//void ihevc_intra_pred_chroma_ver(uword8 *pu1_ref,
//                                 word32 src_strd,
//                                 uword8 *pu1_dst,
//                                 word32 dst_strd,
//                                 word32 nt,
//                                 word32 mode)
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd    (unused)
//x2 => *pu1_dst
//x3 => dst_strd    (word32, sign-extended on entry)
//w4 => nt          (word32, only the low 32 bits are read)
//w5 => mode        (unused, x5 is reused as scratch)

.text
.align 4
.include "ihevc_neon_macros.s"


.globl ihevc_intra_pred_chroma_ver_av8

.type ihevc_intra_pred_chroma_ver_av8, %function

ihevc_intra_pred_chroma_ver_av8:

    // NOTE(review): push_v_regs / pop_v_regs are defined in
    // ihevc_neon_macros.s, which is not visible here. This routine itself
    // only writes x3-x6, x8, x10, x11, v0 and v20-v23 - confirm whether the
    // save/restore below is still required.
    push_v_regs
    stp         x19, x20, [sp, #-16]!

    // dst_strd and nt are 32-bit arguments: the upper halves of x3 / x4 are
    // unspecified on entry, so extend dst_strd and use w4 for nt.
    sxtw        x3, w3                      //dst_strd as a 64-bit offset
    lsl         w5, w4, #2                  //4nt (32-bit write clears x5[63:32])

    cmp         w4, #8
    beq         blk_8
    blt         blk_4

copy_16:
    add         x5, x5, #2                  //4nt+2
    add         x6, x0, x5                  //&pu1_ref[4nt+2]

    add         x5, x2, x3                  //pu1_dst + 1*dst_strd
    ld2         {v20.8b, v21.8b}, [x6], #16 //source bytes 0:15
    add         x8, x5, x3                  //pu1_dst + 2*dst_strd

    add         x10, x8, x3                 //pu1_dst + 3*dst_strd
    ld2         {v22.8b, v23.8b}, [x6]      //source bytes 16:31
    lsl         x11, x3, #2                 //4*dst_strd

    sub         x11, x11, #16               //4*dst_strd - 16: after the first
                                            //half-row store (+16) this moves
                                            //each pointer down 4 rows in total

kernel_copy_16:
    // four rows per iteration: left 16 bytes, then right 16 bytes of each row
    st2         {v20.8b, v21.8b}, [x2], #16
    st2         {v20.8b, v21.8b}, [x5], #16
    st2         {v20.8b, v21.8b}, [x8], #16
    st2         {v20.8b, v21.8b}, [x10], #16

    st2         {v22.8b, v23.8b}, [x2], x11
    st2         {v22.8b, v23.8b}, [x5], x11
    st2         {v22.8b, v23.8b}, [x8], x11
    st2         {v22.8b, v23.8b}, [x10], x11

    subs        w4, w4, #4                  //rows remaining
    bgt         kernel_copy_16              //signed test: always terminates

    b           end_func

blk_8:

    add         x5, x5, #2                  //4nt+2
    add         x6, x0, x5                  //&pu1_ref[4nt+2]

    add         x5, x2, x3                  //pu1_dst + 1*dst_strd
    ld2         {v20.8b, v21.8b}, [x6]      //source bytes 0:15 (one full row)
    add         x8, x5, x3                  //pu1_dst + 2*dst_strd

    add         x10, x8, x3                 //pu1_dst + 3*dst_strd

    lsl         x11, x3, #2                 //4*dst_strd

    // rows 0-3, each pointer then steps down four rows
    st2         {v20.8b, v21.8b}, [x2], x11
    st2         {v20.8b, v21.8b}, [x5], x11
    st2         {v20.8b, v21.8b}, [x8], x11
    st2         {v20.8b, v21.8b}, [x10], x11

    // rows 4-7
    st2         {v20.8b, v21.8b}, [x2]
    st2         {v20.8b, v21.8b}, [x5]
    st2         {v20.8b, v21.8b}, [x8]
    st2         {v20.8b, v21.8b}, [x10]

    b           end_func

blk_4:

    add         x5, x5, #2                  //4nt+2
    add         x6, x0, x5                  //&pu1_ref[4nt+2]

    ld1         {v0.8b}, [x6]               //source bytes 0:7 (one full row)
    add         x5, x2, x3                  //pu1_dst + 1*dst_strd

    st1         {v0.8b}, [x2]
    add         x8, x5, x3                  //pu1_dst + 2*dst_strd
    st1         {v0.8b}, [x5]
    add         x10, x8, x3                 //pu1_dst + 3*dst_strd
    st1         {v0.8b}, [x8]
    st1         {v0.8b}, [x10]



end_func:
    ldp         x19, x20, [sp], #16
    pop_v_regs
    ret


