///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_luma_mode_18_34_neon.s
//*
//* @brief
//*  contains function definitions for luma intra prediction modes 18 and 34.
//*  functions are coded in armv8 (aarch64) neon assembly using gnu
//*  assembler syntax
//*
//* @author
//*  yogeswaran rs
//*
//* @par list of functions:
//*  - ihevc_intra_pred_luma_mode_18_34_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*  luma intra prediction for modes 18 and 34
//*
//* @par description:
//*  fills an nt x nt destination block by copying reference bytes.
//*  row r of the block is a copy of nt consecutive bytes starting at
//*      pu1_ref + 2 * nt + 2 + r      when mode == 34 (0x22)
//*      pu1_ref + 2 * nt     - r      for any other mode (18 is expected)
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the source (reference samples)
//*
//* @param[in] src_strd
//*  integer source stride (not read; x1 is reused as a loop counter)
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] nt
//*  size of transform block; the code has a dedicated path for nt == 4 and
//*  otherwise works in 8x8 tiles (nt presumably one of 8, 16, 32 - confirm
//*  against the callers)
//*
//* @param[in] mode
//*  prediction mode; only compared against 34, every other value takes the
//*  mode 18 path
//*
//* @returns
//*  none
//*
//* @remarks
//*  leaf function; only v0-v7 and caller-saved integer registers are used,
//*  apart from x20 which is saved and restored on the stack
//*
//*******************************************************************************
//*/

//void ihevc_intra_pred_luma_mode_18_34(uword8 *pu1_ref,
//                                      word32 src_strd,
//                                      uword8 *pu1_dst,
//                                      word32 dst_strd,
//                                      word32 nt,
//                                      word32 mode)
//
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd   (unused as an input, becomes the remaining-tile counter)
//x2 => *pu1_dst
//x3 => dst_strd
//x4 => nt
//x5 => mode
//
//internal register usage
//x6  => per-row source step (+1 for mode 34, -1 otherwise), nt >= 8 path
//x8  => running source pointer (nt >= 8 path) / per-row step (nt == 4 path)
//x10 => running destination pointer
//x11 => bytes left in the current row of tiles, load side
//x12 => bytes left in the current row of tiles, store side
//x14 => nt - 8
//x20 => scratch for the conditional selects

.text
.align 4
.include "ihevc_neon_macros.s"



.globl ihevc_intra_pred_luma_mode_18_34_av8

.type ihevc_intra_pred_luma_mode_18_34_av8, %function

ihevc_intra_pred_luma_mode_18_34_av8:

    push_v_regs                             // macro from ihevc_neon_macros.s (definition not visible here)
    stp         x19, x20, [sp, #-16]!       // x20 is used as scratch below; x19 only fills the pair

    // dst_strd, nt and mode are 32-bit arguments. AAPCS64 leaves the upper
    // 32 bits of their registers unspecified, so extend them before they are
    // used in 64-bit compares and address arithmetic.
    sxtw        x3, w3                      // dst_strd
    sxtw        x4, w4                      // nt
    sxtw        x5, w5                      // mode

    cmp         x4, #4
    beq         mode2_4                     // nt == 4 has its own straight-line path

    mov         x11, x4                     // load-side column counter  = nt
    mov         x12, x4                     // store-side column counter = nt
    sub         x14, x4, #8                 // x14 = nt - 8

    add         x0, x0, x4, lsl #1          // x0 = pu1_ref + 2 * nt

    cmp         x5, #0x22                   // mode == 34 ?
    mov         x10, x2                     // x10 = running destination pointer

    add         x0, x0, #2                  // mode 34 start: pu1_ref + 2 * nt + 2
    sub         x20, x0, #2
    csel        x0, x20, x0, ne             // other modes start at pu1_ref + 2 * nt
    mov         x20, #1
    csel        x6, x20, x6, eq             // mode 34: source advances by +1 per row
    mov         x20, #-1
    csel        x6, x20, x6, ne             // otherwise: source advances by -1 per row
    mov         x8, x0                      // x8 = running source pointer

prologue_cpy_32:
    // load the first 8x8 tile (one 8-byte row per register); the tile counter
    // computation is interleaved with the loads
    ld1         {v0.8b}, [x8], x6
    lsr         x1, x4, #3
    ld1         {v1.8b}, [x8], x6
    mul         x1, x4, x1                  // x1 = nt * (nt / 8) = 8 * number of 8x8 tiles
    ld1         {v2.8b}, [x8], x6
    ld1         {v3.8b}, [x8], x6
    subs        x1, x1, #8                  // one tile is now loaded
    ld1         {v4.8b}, [x8], x6
    ld1         {v5.8b}, [x8], x6
    ld1         {v6.8b}, [x8], x6

    ld1         {v7.8b}, [x8], x6


    beq         epilogue_mode2              // single tile (nt == 8): just store it
    sub         x11, x11, #8                // first tile of the row has been loaded

    cmp         x5, #0x22
    add         x20, x0, #8
    csel        x0, x20, x0, ne             // not mode 34: next tile to the right reads from x0 + 8
    csel        x8, x0, x8, ne              // mode 34 keeps x8, which already points 8 bytes on
    bne         kernel_mode18

kernel_mode2:
    // mode 34: store the tile held in v0-v7 while loading the next one
    st1         {v0.8b}, [x10], x3
    st1         {v1.8b}, [x10], x3
    subs        x12, x12, #8                // flags: eq when this was the last tile of the row
    st1         {v2.8b}, [x10], x3
    add         x20, x2, #8
    csel        x2, x20, x2, ne             // more tiles in this row: step 8 columns right
    st1         {v3.8b}, [x10], x3

    ld1         {v0.8b}, [x8], x6
    st1         {v4.8b}, [x10], x3

    st1         {v5.8b}, [x10], x3
    ld1         {v1.8b}, [x8], x6
    st1         {v6.8b}, [x10], x3
    ld1         {v2.8b}, [x8], x6
    st1         {v7.8b}, [x10], x3

    ld1         {v3.8b}, [x8], x6
    sub         x20, x10, x14               // x10 is 8 rows below the tile just stored
    csel        x2, x20, x2, eq             // row of tiles done: back to column 0, 8 rows down
    ld1         {v4.8b}, [x8], x6
    mov         x10, x2                     // destination of the next tile
    ld1         {v5.8b}, [x8], x6
    csel        x12, x4, x12, eq            // restart the store-side column counter
    ld1         {v6.8b}, [x8], x6
    subs        x11, x11, #8                // flags: eq when the tile just loaded ends a row

    ld1         {v7.8b}, [x8], x6

    add         x20, x0, #8
    csel        x0, x20, x0, eq             // next row of tiles reads from 8 bytes further on
    csel        x11, x4, x11, eq            // restart the load-side column counter
    csel        x8, x0, x8, eq              // within a row x8 simply keeps running forward

    subs        x1, x1, #8                  // one more tile loaded

    bne         kernel_mode2

    b           epilogue_mode2

kernel_mode18:
    // mode 18: same store/load pipeline, different source pointer stepping
    st1         {v0.8b}, [x10], x3
    st1         {v1.8b}, [x10], x3
    subs        x12, x12, #8                // flags: eq when this was the last tile of the row
    st1         {v2.8b}, [x10], x3
    add         x20, x2, #8
    csel        x2, x20, x2, ne             // more tiles in this row: step 8 columns right
    st1         {v3.8b}, [x10], x3

    ld1         {v0.8b}, [x8], x6
    st1         {v4.8b}, [x10], x3

    st1         {v5.8b}, [x10], x3
    ld1         {v1.8b}, [x8], x6

    st1         {v6.8b}, [x10], x3
    ld1         {v2.8b}, [x8], x6
    st1         {v7.8b}, [x10], x3

    ld1         {v3.8b}, [x8], x6
    sub         x20, x10, x14               // x10 is 8 rows below the tile just stored
    csel        x2, x20, x2, eq             // row of tiles done: back to column 0, 8 rows down
    ld1         {v4.8b}, [x8], x6
    mov         x10, x2                     // destination of the next tile
    ld1         {v5.8b}, [x8], x6
    csel        x12, x4, x12, eq            // restart the store-side column counter
    ld1         {v6.8b}, [x8], x6
    subs        x11, x11, #8                // flags: eq when the tile just loaded ends a row
    ld1         {v7.8b}, [x8], x6

    add         x20, x0, #8
    csel        x0, x20, x0, ne             // same row of tiles: next tile reads from x0 + 8
    csel        x11, x4, x11, eq            // restart the load-side column counter
    sub         x20, x8, x14                // x8 has stepped back 8 bytes over the tile's rows
    csel        x0, x20, x0, eq             // new row of tiles: restart from x8 - (nt - 8)
    subs        x1, x1, #8                  // one more tile loaded
    mov         x8, x0                      // mov leaves the flags from subs intact for bne

    bne         kernel_mode18


epilogue_mode2:
    // store the last tile that was loaded (shared by both modes)
    st1         {v0.8b}, [x10], x3
    st1         {v1.8b}, [x10], x3
    st1         {v2.8b}, [x10], x3
    st1         {v3.8b}, [x10], x3
    st1         {v4.8b}, [x10], x3
    st1         {v5.8b}, [x10], x3
    st1         {v6.8b}, [x10], x3
    st1         {v7.8b}, [x10], x3

    b           end_func

mode2_4:
    // nt == 4: four rows of four bytes, fully unrolled
    add         x0, x0, #10                 // mode 34 start: pu1_ref + 2 * nt + 2 with nt == 4
    cmp         x5, #0x22
    sub         x20, x0, #2
    csel        x0, x20, x0, ne             // other modes start at pu1_ref + 2 * nt

    mov         x20, #1
    csel        x8, x20, x8, eq             // mode 34: +1 per row
    mov         x20, #-1
    csel        x8, x20, x8, ne             // otherwise: -1 per row

    // NOTE(review): each row loads 8 bytes although only the low 4 are
    // stored, so the loads touch 4 bytes beyond the data that is used -
    // confirm the reference buffer is padded accordingly.
    ld1         {v0.8b}, [x0], x8
    st1         {v0.s}[0], [x2], x3

    ld1         {v0.8b}, [x0], x8
    st1         {v0.s}[0], [x2], x3

    ld1         {v0.8b}, [x0], x8
    st1         {v0.s}[0], [x2], x3

    ld1         {v0.8b}, [x0], x8
    st1         {v0.s}[0], [x2], x3

end_func:
    ldp         x19, x20, [sp], #16         // undo the stp from the function entry
    pop_v_regs                              // counterpart of push_v_regs
    ret