1 @/***************************************************************************** 2 @* 3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 @* 5 @* Licensed under the Apache License, Version 2.0 (the "License"); 6 @* you may not use this file except in compliance with the License. 7 @* You may obtain a copy of the License at: 8 @* 9 @* http://www.apache.org/licenses/LICENSE-2.0 10 @* 11 @* Unless required by applicable law or agreed to in writing, software 12 @* distributed under the License is distributed on an "AS IS" BASIS, 13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 @* See the License for the specific language governing permissions and 15 @* limitations under the License. 16 @* 17 @*****************************************************************************/ 18 @/** 19 @******************************************************************************* 20 @* @file 21 @* ihevc_intra_pred_chroma_dc_neon.s 22 @* 23 @* @brief 24 @* contains function definitions for intra prediction dc filtering. 
@*  functions are coded in neon assembly
@*  NOTE(review): the original header says "neon intrinsics ... compiled using
@*  rvct", but the directives below (.globl, .type, '@' comments) are
@*  GNU-assembler style - confirm the intended toolchain
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*  - ihevc_intra_pred_chroma_dc_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    chroma intra prediction filter for dc input
@*
@* @par description:
@*    reads interleaved byte pairs starting at pu1_ref + 2*nt and at
@*    pu1_ref + 4*nt + 2, sums the even bytes and the odd bytes separately,
@*    adds nt to each sum and shifts it right by (32 - clz(nt)), then fills
@*    the destination block with the resulting byte pair:
@*      nt == 4 : 4 rows  of  8 bytes
@*      nt == 8 : 8 rows  of 16 bytes
@*      else    : 16 rows of 32 bytes (the summation then covers 16 pairs
@*                from each of the two reference pointers)
@*    NOTE(review): nothing here loops, so nt > 16 would be neither summed
@*    nor written completely - presumably callers only pass 4, 8 or 16;
@*    confirm against the caller
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source (interleaved reference samples)
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride (not used by this routine; r1 is reused as scratch)
@*
@* @param[in] dst_strd
@*  integer destination stride, in bytes
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the planar coefficients
@*  NOTE(review): not part of the prototype below and never read here
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  type of filtering (never loaded by this routine)
@*
@* @returns
@*  none (prototype below is void)
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_chroma_dc(uword8 *pu1_ref,
@                                word32 src_strd,
@                                uword8 *pu1_dst,
@                                word32 dst_strd,
@                                word32 nt,
@                                word32 mode)
@
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 =>  src_strd   (overwritten in epil_add_loop)
@r2 => *pu1_dst
@r3 =>  dst_strd
@r4 =>  nt
@r12 => shift amount, 32 - clz(nt)
@d17 => running sum of even bytes, d18 => running sum of odd bytes

@stack contents from #40 (after the 40-byte register push below)
@   nt          [sp,#40]
@   mode        [sp,#44]   (never loaded)
@   pi1_coeff   NOTE(review): listed in the original header, but absent from
@               the prototype above - confirm

.text
.align 4




.globl ihevc_intra_pred_chroma_dc_a9q

.type ihevc_intra_pred_chroma_dc_a9q, %function

ihevc_intra_pred_chroma_dc_a9q:

    stmfd       sp!, {r4-r12, r14}          @save r4-r12 and lr: 10 registers = 40 bytes

    ldr         r4,[sp,#40]                 @loads nt (first stack argument, just above the saved registers)
    mov         r9, #0
    vmov        d17, r9, r9                 @clear the even-byte accumulator

    clz         r5, r4                      @counts leading zeros of nt

    add         r6, r0, r4,lsl #1           @r6 = pu1_ref + 2*nt (bytes)
    vmov        d18, r9, r9                 @clear the odd-byte accumulator
    rsb         r5, r5, #32                 @32 - clz(nt); equals log2(2*nt) when nt is a power of two
    add         r7, r0, r4, lsl #2          @r7 = pu1_ref + 4*nt (bytes)
    mov         r12,r5                      @keep the shift amount in r12; r5 is reused as a row pointer later
    add         r8, r7, #2                  @r8 = pu1_ref + 4*nt + 2 (bytes)

    cmp         r4, #4
    beq         dc_4                        @nt == 4 has its own path


@ first 16 bytes from each reference pointer. vld2 de-interleaves: the first
@ register of the pair receives the even bytes, the second the odd bytes.
add_loop:
    vld2.s8     {d30,d31}, [r6]!            @16 bytes at r6, then r6 += 16
    lsl         r10,r4,#1                   @2nt

    vpaddl.u8   d2, d30                     @even bytes: 8 x u8 -> 4 x u16
    subs        r10, #0x10                  @Z set when 2*nt == 16; consumed by the beq below (the NEON ops in between leave the flags alone)

    vld2.s8     {d26,d27}, [r8]!            @16 bytes at r8, then r8 += 16

    vpaddl.u8   d3, d31                     @odd bytes: 8 x u8 -> 4 x u16
    vpaddl.u16  d2, d2                      @4 x u16 -> 2 x u32
    vpaddl.u16  d3, d3

    vpadal.u32  d17, d2                     @add both u32 lanes into the even-byte sum

    vpadal.u32  d18, d3                     @add both u32 lanes into the odd-byte sum

    vpaddl.u8   d2, d26                     @same reduction for the data loaded through r8
    vpaddl.u8   d3, d27

    vpaddl.u16  d2, d2
    vpaddl.u16  d3, d3

    vpadal.u32  d17, d2
    vpadal.u32  d18, d3

    beq         epil_add_loop               @2*nt == 16: everything has been summed

@ second 16 bytes from each pointer. despite the label name there is no
@ branch back: this block runs once and falls through.
core_loop_add:
    vld2.s8     {d30,d31}, [r6]!            @next 16 bytes at r6
    vpaddl.u8   d28, d30
    vpaddl.u8   d3, d31

    vld2.s8     {d26,d27}, [r8]!            @next 16 bytes at r8

    vpaddl.u16  d3, d3
    vpaddl.u16  d29, d28

    vpadal.u32  d18, d3                     @odd-byte sum
    vpadal.u32  d17, d29                    @even-byte sum

    vpaddl.u8   d3, d27
    vpaddl.u8   d28, d26

    vpaddl.u16  d3, d3
    vpaddl.u16  d29, d28

    vpadal.u32  d18, d3
    vpadal.u32  d17, d29


@ turn the two sums into the two output bytes
epil_add_loop:

    vmov.32     r1,d18[0]                   @odd-byte sum (r1 held src_strd, which is not needed)
    vmov.32     r11,d17[0]                  @even-byte sum

    add         r1,r1,r4                    @+ nt before the shift
    add         r11,r11,r4

    lsr         r1,r1,r12                   @>> (32 - clz(nt))
    lsr         r11,r11,r12

    vdup.8      d17,r1                      @odd-position output byte in every lane
    vdup.8      d16,r11                     @even-position output byte in every lane

@ set up four row pointers; vst2 of {d16,d17} writes 16 bytes alternating
@ d16 / d17 lanes
prologue_cpy_32:

    add         r5, r2, r3                  @row 1
    subs        r9, r4, #8                  @Z set when nt == 8; used by moveq, beq and movne below
    lsl         r6, r3, #2                  @4 * dst_strd
    moveq       r11,r6                      @nt == 8: epilogue steps down four rows
    add         r8, r5, r3                  @row 2
    add         r10, r8, r3                 @row 3

    beq         epilogue_copy               @nt == 8: only 8 rows of 16 bytes to write

    vst2.8      {d16,d17}, [r2]!            @rows 0-3, bytes 0-15 (pointer += 16)
    add         r6, r6, #0xfffffff0         @r6 = 4*dst_strd - 16 (adds -16 modulo 2^32)

    vst2.8      {d16,d17}, [r5]!
    vst2.8      {d16,d17}, [r8]!
    movne       r11,#16                     @flags still from the subs above; on this path the epilogue steps 16 bytes right
    vst2.8      {d16,d17}, [r10]!


    vst2.8      {d16,d17}, [r2], r6         @rows 0-3, bytes 16-31, then on to the start of row + 4
    vst2.8      {d16,d17}, [r5], r6
    vst2.8      {d16,d17}, [r8], r6
    vst2.8      {d16,d17}, [r10], r6

@ rows 4-7 and rows 8-11, 32 bytes each; straight-line, no branch back
kernel_copy:
    vst2.8      {d16,d17}, [r2]!
    vst2.8      {d16,d17}, [r5]!
    vst2.8      {d16,d17}, [r8]!
    vst2.8      {d16,d17}, [r10]!

    vst2.8      {d16,d17}, [r2], r6
    vst2.8      {d16,d17}, [r5], r6
    vst2.8      {d16,d17}, [r8], r6
    vst2.8      {d16,d17}, [r10], r6

    vst2.8      {d16,d17}, [r2]!
    vst2.8      {d16,d17}, [r5]!
    vst2.8      {d16,d17}, [r8]!
    vst2.8      {d16,d17}, [r10]!

    vst2.8      {d16,d17}, [r2], r6
    vst2.8      {d16,d17}, [r5], r6
    vst2.8      {d16,d17}, [r8], r6
    vst2.8      {d16,d17}, [r10], r6

@ nt == 8 : r11 = 4*dst_strd -> rows 0-3 then rows 4-7, 16 bytes each
@ else    : r11 = 16         -> last four rows, bytes 0-15 then bytes 16-31
epilogue_copy:
    vst2.8      {d16,d17}, [r2],r11
    vst2.8      {d16,d17}, [r5],r11
    vst2.8      {d16,d17}, [r8],r11
    vst2.8      {d16,d17}, [r10],r11

    vst2.8      {d16,d17}, [r2]
    vst2.8      {d16,d17}, [r5]
    vst2.8      {d16,d17}, [r8]
    vst2.8      {d16,d17}, [r10]
    b           end_func

@ nt == 4: each vld2 still reads 16 bytes, but the 64-bit left shift by 32
@ zeroes all but the first four bytes of every de-interleaved register, so
@ only the first 8 bytes at each pointer contribute to the sums
dc_4:
    vld2.s8     {d30,d31},[r6]              @16 bytes at pu1_ref + 2*nt (no write-back)
    vshl.i64    d3,d30,#32                  @keep even bytes 0-3 only

    vld2.s8     {d26,d27},[r8]              @16 bytes at pu1_ref + 4*nt + 2 (no write-back)
    vshl.i64    d2,d31,#32                  @keep odd bytes 0-3 only

    vpaddl.u8   d3,d3
    vpaddl.u8   d2,d2
    vpaddl.u16  d3,d3
    vpaddl.u16  d2,d2
    vpadal.u32  d17,d3                      @even-byte sum
    vpadal.u32  d18,d2                      @odd-byte sum

    vshl.i64    d3,d26,#32
    vshl.i64    d2,d27,#32
    vpaddl.u8   d3,d3
    vpaddl.u8   d2,d2
    vpaddl.u16  d3,d3
    vpaddl.u16  d2,d2
    vpadal.u32  d17,d3
    vpadal.u32  d18,d2

    vmov.32     r10,d17[0]                  @even-byte sum
    vmov.32     r11,d18[0]                  @odd-byte sum

    add         r10,r10,r4                  @+ nt before the shift
    add         r11,r11,r4
    lsr         r10,r10,r12                 @>> (32 - clz(nt))
    lsr         r11,r11,r12
    orr         r10,r10,r11,lsl #8          @pack: even result in bits 0-7, odd result in bits 8-15
    vdup.16     d0,r10                      @replicate the packed pair across 8 bytes

    vst1.8      {d0},[r2],r3                @4 rows of 8 bytes, stepping by dst_strd
    vst1.8      {d0},[r2],r3
    vst1.8      {d0},[r2],r3
    vst1.8      {d0},[r2]

end_func:
    ldmfd       sp!,{r4-r12,r15}            @restore r4-r12 and return (saved lr is popped into pc)



