///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_chroma_horz_neon.s
//*
//* @brief
//*  contains function definition for intra prediction interpolation filters
//*
//*
//* @author
//*  parthiban v
//*
//* @par list of functions:
//*  - ihevc_intra_pred_chroma_horz_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
//
///**
//*******************************************************************************
//*
//* @brief
//*  intra prediction interpolation filter for horizontal chroma.
//*
//* @par description:
//*  horizontal intraprediction (mode 10) with reference samples location
//*  pointed by 'pu1_ref' to the tu block location pointed by 'pu1_dst'; refer
//*  to section 8.4.4.2.6 in the standard (special case)
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the reference samples
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride (not read by this routine)
//*
//* @param[in] dst_strd
//*  integer destination stride, in bytes
//*
//* @param[in] nt
//*  integer transform block size
//*
//* @param[in] mode
//*  integer intraprediction mode (not read by this routine)
//*
//* @returns
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
//void ihevc_intra_pred_chroma_horz(uword8 *pu1_ref,
//                                  word32 src_strd,
//                                  uword8 *pu1_dst,
//                                  word32 dst_strd,
//                                  word32 nt,
//                                  word32 mode)
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd   (unused)
//x2 => *pu1_dst
//x3 => dst_strd
//x4 => nt
//x5 => mode       (unused)
//
// ABI:      AAPCS64, leaf function, no stack usage.
// Scratch:  x6, x9, x12, v0-v6, v18-v20, flags (all caller-saved).
//
// Operation: every output row is one 16-bit unit replicated across the row.
// The units are read from the bytes immediately below pu1_ref + 4*nt; the
// unit at the highest address fills the first row, and each following row
// uses the unit two bytes lower.
// NOTE(review): each 16-bit unit is presumably one interleaved chroma sample
// pair - confirm against the caller.
//
// Row width written per path: nt == 4 -> 8 bytes, nt == 8 -> 16 bytes,
// every other nt -> 32 bytes, 16 rows per loop pass (this matches
// 2*nt bytes per row only for nt == 16).

    .text
    .align 4
    .include "ihevc_neon_macros.s"


    .globl ihevc_intra_pred_chroma_horz_av8

    .type ihevc_intra_pred_chroma_horz_av8, %function

ihevc_intra_pred_chroma_horz_av8:

    // dst_strd and nt are 32-bit arguments: AAPCS64 leaves bits 63:32 of
    // their registers unspecified, so widen them before 64-bit use.
    sxtw        x3, w3                      // dst_strd
    sxtw        x4, w4                      // nt

    lsl         x6, x4, #2                  // four_nt
    add         x12, x0, x6                 // x12 = pu1_ref + four_nt

    cmp         x4, #4                      // if nt == 4
    beq         core_loop_4

    cmp         x4, #8                      // if nt == 8
    beq         core_loop_8

    sub         x12, x12, #16               // last 16 bytes below pu1_ref + four_nt
    add         x9, x2, #16                 // x9 = second 16-byte half of each row

core_loop_16:
    ld1         {v0.8h}, [x12]              // units for rows 0-7, lane 7 feeds row 0
    sub         x12, x12, #16
    ld1         {v18.8h}, [x12]             // units for rows 8-15, lane 7 feeds row 8

    // The dup for a later row is issued ahead of the stores of an earlier
    // row; v2/v4/v6/v1 are recycled as soon as their row has been written.
    dup         v2.8h, v0.h[7]              // row 0
    dup         v4.8h, v0.h[6]              // row 1
    dup         v6.8h, v0.h[5]              // row 2
    st1         {v2.8h}, [x2], x3           // row 0, bytes 0-15
    st1         {v2.8h}, [x9], x3           // row 0, bytes 16-31

    dup         v1.8h, v0.h[4]              // row 3
    st1         {v4.8h}, [x2], x3
    st1         {v4.8h}, [x9], x3

    dup         v2.8h, v0.h[3]              // row 4
    st1         {v6.8h}, [x2], x3
    st1         {v6.8h}, [x9], x3

    dup         v4.8h, v0.h[2]              // row 5
    st1         {v1.8h}, [x2], x3
    st1         {v1.8h}, [x9], x3

    dup         v6.8h, v0.h[1]              // row 6
    st1         {v2.8h}, [x2], x3
    st1         {v2.8h}, [x9], x3

    dup         v1.8h, v0.h[0]              // row 7
    st1         {v4.8h}, [x2], x3
    st1         {v4.8h}, [x9], x3

    dup         v2.8h, v18.h[7]             // row 8
    st1         {v6.8h}, [x2], x3
    st1         {v6.8h}, [x9], x3

    dup         v4.8h, v18.h[6]             // row 9
    st1         {v1.8h}, [x2], x3
    st1         {v1.8h}, [x9], x3

    dup         v6.8h, v18.h[5]             // row 10
    st1         {v2.8h}, [x2], x3
    st1         {v2.8h}, [x9], x3

    dup         v1.8h, v18.h[4]             // row 11
    st1         {v4.8h}, [x2], x3
    st1         {v4.8h}, [x9], x3

    dup         v2.8h, v18.h[3]             // row 12
    st1         {v6.8h}, [x2], x3
    st1         {v6.8h}, [x9], x3

    dup         v4.8h, v18.h[2]             // row 13
    st1         {v1.8h}, [x2], x3
    st1         {v1.8h}, [x9], x3

    dup         v6.8h, v18.h[1]             // row 14
    st1         {v2.8h}, [x2], x3
    st1         {v2.8h}, [x9], x3
    sub         x12, x12, #16               // step to the next 16 bytes for another pass

    dup         v1.8h, v18.h[0]             // row 15
    st1         {v4.8h}, [x2], x3
    st1         {v4.8h}, [x9], x3

    subs        x4, x4, #16                 // 16 rows done; the stores below leave flags intact
    st1         {v6.8h}, [x2], x3
    st1         {v6.8h}, [x9], x3

    st1         {v1.8h}, [x2], x3
    st1         {v1.8h}, [x9], x3
    bgt         core_loop_16

    ret


core_loop_8:
    sub         x12, x12, #16               // the 16 bytes below pu1_ref + four_nt
    ld1         {v0.16b}, [x12]             // eight 16-bit units, lane 7 feeds row 0

    dup         v18.8h, v0.h[7]             // row 0
    dup         v2.8h, v0.h[6]              // row 1
    dup         v4.8h, v0.h[5]              // row 2
    dup         v6.8h, v0.h[4]              // row 3
    dup         v1.8h, v0.h[3]              // row 4

    st1         {v18.8h}, [x2], x3          // row 0 written, v18 is free again

    dup         v18.8h, v0.h[2]             // row 5
    dup         v19.8h, v0.h[1]             // row 6
    dup         v20.8h, v0.h[0]             // row 7

    st1         {v2.8h}, [x2], x3
    st1         {v4.8h}, [x2], x3
    st1         {v6.8h}, [x2], x3
    st1         {v1.8h}, [x2], x3
    st1         {v18.8h}, [x2], x3
    st1         {v19.8h}, [x2], x3
    st1         {v20.8h}, [x2], x3

    ret


core_loop_4:
    sub         x12, x12, #8                // the 8 bytes below pu1_ref + four_nt
    ld1         {v0.8b}, [x12]              // four 16-bit units, lane 3 feeds row 0

    dup         v6.4h, v0.h[3]              // row 0
    dup         v3.4h, v0.h[2]              // row 1
    dup         v4.4h, v0.h[1]              // row 2
    dup         v5.4h, v0.h[0]              // row 3

    st1         {v6.8b}, [x2], x3
    st1         {v3.8b}, [x2], x3
    st1         {v4.8b}, [x2], x3
    st1         {v5.8b}, [x2], x3

    ret