///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_filters_vert.s
//*
//* @brief
//*  contains the function definition for luma vertical intra prediction.
//*  functions are hand-written aarch64 advanced simd (neon) assembly in
//*  gnu assembler syntax.
//*
//* @author
//*  akshaya mukund
//*
//* @par list of functions:
//*  - ihevc_intra_pred_luma_ver_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*  luma intra prediction for the vertical mode
//*
//* @par description:
//*  every output row is a copy of the nt reference samples that start at
//*  pu1_ref[2*nt + 1]. on the nt <= 16 paths the first sample of each row is
//*  replaced by
//*    clip_u8(pu1_ref[2*nt + 1] + ((pu1_ref[2*nt - 1 - row] - pu1_ref[2*nt]) >> 1))
//*
//*  dispatch on nt:
//*    nt == 16 -> blk_16   (16x16, first column filtered)
//*    nt <  16 -> blk_4_8  (nt == 4 -> blk_4, 4x4; otherwise the 8x8 path)
//*    nt >  16 -> copy_32  (plain copy, unrolled for 32x32)
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the reference samples
//*
//* @param[in] src_strd
//*  integer source stride (not read by this routine)
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] nt
//*  size of transform block
//*
//* @param[in] mode
//*  type of filtering (not read by this routine)
//*
//* @returns
//*  none
//*
//* @remarks
//*  leaf routine. only caller-saved registers are written (x2-x6, x8, x10-x12,
//*  v0-v4, v6, v7, v16-v26, v28, v30 and the condition flags), so nothing is
//*  spilled to the stack.
//*
//*******************************************************************************
//*/

//void ihevc_intra_pred_luma_ver(uword8* pu1_ref,
//                               word32 src_strd,
//                               uword8* pu1_dst,
//                               word32 dst_strd,
//                               word32 nt,
//                               word32 mode)
//
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd    (unused)
//x2 => *pu1_dst
//x3 => dst_strd    (arrives in w3, sign-extended on entry)
//x4 => nt          (arrives in w4, sign-extended on entry)
//x5 => mode        (unused, x5 is reused as scratch)

.text
.align 4
.include "ihevc_neon_macros.s"



.globl ihevc_intra_pred_luma_ver_av8

.type ihevc_intra_pred_luma_ver_av8, %function

ihevc_intra_pred_luma_ver_av8:

    // dst_strd and nt are 32-bit ints. the procedure call standard leaves the
    // upper half of x3/x4 unspecified, so widen them before any 64-bit use.
    sxtw        x3, w3                      //dst_strd
    sxtw        x4, w4                      //nt

    lsl         x5, x4, #1                  //2nt

    cmp         x4, #16
    beq         blk_16
    blt         blk_4_8

    add         x5, x5, #1                  //2nt+1
    add         x6, x0, x5                  //&src[2nt+1]

    // 32x32: four row pointers (x2, x5, x8, x10) walk the block in parallel.
    // each row is written as two 16-byte halves; x11 = 4*stride - 16 moves a
    // pointer from the middle of its row to the start of the row 4 below.
copy_32:
    add         x5, x2, x3                  //dst row 1
    ld1         {v20.8b, v21.8b}, [x6],#16  //16 loads (col 0:15)
    add         x8, x5, x3                  //dst row 2

    add         x10, x8, x3                 //dst row 3
    ld1         {v22.8b, v23.8b}, [x6]      //16 loads (col 16:31)
    lsl         x11, x3, #2                 //4*stride

    sub         x11, x11, #16               //4*stride - 16
    st1         {v20.8b, v21.8b}, [x2],#16
    st1         {v20.8b, v21.8b}, [x5],#16
    st1         {v20.8b, v21.8b}, [x8],#16
    st1         {v20.8b, v21.8b}, [x10],#16

    st1         {v22.8b, v23.8b}, [x2], x11
    st1         {v22.8b, v23.8b}, [x5], x11
    st1         {v22.8b, v23.8b}, [x8], x11
    st1         {v22.8b, v23.8b}, [x10], x11

    subs        x4, x4, #8

    // each pass stores 8 rows. the flags set by the subs in the middle are
    // still valid at the bne because st1 does not touch them.
kernel_copy_32:
    st1         {v20.8b, v21.8b}, [x2],#16
    st1         {v20.8b, v21.8b}, [x5],#16
    st1         {v20.8b, v21.8b}, [x8],#16
    st1         {v20.8b, v21.8b}, [x10],#16

    st1         {v22.8b, v23.8b}, [x2], x11
    st1         {v22.8b, v23.8b}, [x5], x11
    st1         {v22.8b, v23.8b}, [x8], x11
    st1         {v22.8b, v23.8b}, [x10], x11

    subs        x4, x4, #8

    st1         {v20.8b, v21.8b}, [x2],#16
    st1         {v20.8b, v21.8b}, [x5],#16
    st1         {v20.8b, v21.8b}, [x8],#16
    st1         {v20.8b, v21.8b}, [x10],#16

    st1         {v22.8b, v23.8b}, [x2], x11
    st1         {v22.8b, v23.8b}, [x5], x11
    st1         {v22.8b, v23.8b}, [x8], x11
    st1         {v22.8b, v23.8b}, [x10], x11

    bne         kernel_copy_32

    // last four rows
    st1         {v20.8b, v21.8b}, [x2],#16
    st1         {v20.8b, v21.8b}, [x5],#16
    st1         {v20.8b, v21.8b}, [x8],#16
    st1         {v20.8b, v21.8b}, [x10],#16

    st1         {v22.8b, v23.8b}, [x2], x11
    st1         {v22.8b, v23.8b}, [x5], x11
    st1         {v22.8b, v23.8b}, [x8], x11
    st1         {v22.8b, v23.8b}, [x10], x11

    b           end_func

    // 16x16: x2 writes rows 0..7 while x5 writes rows 8..15. the filtered
    // first-column samples sit in d24 (rows 0..7) and d25 (rows 8..15), one per
    // byte; after each store the register is shifted right by 8 bits so the
    // next row's sample lands in byte 0, where the 0xff mask lets bsl pick it.
blk_16:
    add         x6, x0, x5                  //&src[2nt]

    ldrb        w11, [x6], #1               //src[2nt] (ldrb zero-extends)

    dup         v22.16b,w11                 //src[2nt]
    ldrb        w12, [x6]                   //src[2nt+1]

    ld1         {v16.8b, v17.8b}, [x6]      //ld for repl to cols src[2nt+1+col(0:15)] (0 ignored for stores)
    sub         x6, x6, #17                 //step back 17 to &src[2nt-16] = src[2nt-1-row(15)]

    dup         v24.16b,w12                 //src[2nt+1]
    dup         v30.8h,w12
    lsl         x5, x3, #3                  //8*stride

    ld1         {v26.16b}, [x6],#16         //load src[2nt-1-row](rows 0:15)
    add         x5, x2, x5                  //x5 -> dst row 8

    movi        d18, #0x00000000000000ff    //byte-0 select mask
    uhsub       v26.16b, v26.16b , v22.16b  //(src[2nt-1-row] - src[2nt])>>1

    mov         v19.d[0],v17.d[0]           //cols 8:15 are stored unmodified
    sxtl        v0.8h, v26.8b               //rows 15:8, widened to s16
    sxtl2       v28.8h, v26.16b             //rows 7:0, widened to s16
    sqadd       v0.8h, v0.8h , v30.8h       // + src[2nt+1]
    sqadd       v28.8h, v28.8h , v30.8h

    movi        d3, #0x00000000000000ff

    sqxtun      v24.8b, v28.8h              //clip to u8, rows 7:0
    sqxtun2     v24.16b, v0.8h              //clip to u8, rows 15:8

    rev64       v24.16b, v24.16b            //d24 = rows 0:7, upper half = rows 8:15
    mov         v25.d[0], v24.d[1]          //d25 = rows 8:15

    mov         v4.d[0],v17.d[0]

    bsl         v18.8b, v24.8b , v16.8b     //only select row values from v24(predpixel)
    bsl         v3.8b, v25.8b , v16.8b

    movi        d1, #0x00000000000000ff
    mov         v2.d[0],v17.d[0]

    movi        d6, #0x00000000000000ff
    mov         v7.d[0],v17.d[0]

    st1         {v18.8b, v19.8b}, [x2], x3
    sshr        d24, d24,#8

    st1         {v3.8b, v4.8b}, [x5], x3
    sshr        d25, d25,#8


    bsl         v1.8b, v24.8b , v16.8b
    bsl         v6.8b, v25.8b , v16.8b

    st1         {v1.8b, v2.8b}, [x2], x3
    sshr        d24, d24,#8

    st1         {v6.8b, v7.8b}, [x5], x3
    sshr        d25, d25,#8

    subs        x4, x4,#8

    movi        d18, #0x00000000000000ff    //bsl overwrote the mask, rebuild it

    movi        d3, #0x00000000000000ff


loop_16:


    movi        d1, #0x00000000000000ff

    movi        d6, #0x00000000000000ff

    bsl         v18.8b, v24.8b , v16.8b     //only select row values from v24(predpixel)
    bsl         v3.8b, v25.8b , v16.8b

    st1         {v18.8b, v19.8b}, [x2], x3
    sshr        d24, d24,#8

    st1         {v3.8b, v4.8b}, [x5], x3
    sshr        d25, d25,#8

    movi        d18, #0x00000000000000ff

    movi        d3, #0x00000000000000ff

    bsl         v1.8b, v24.8b , v16.8b
    bsl         v6.8b, v25.8b , v16.8b

    st1         {v1.8b, v2.8b}, [x2], x3
    sshr        d24, d24,#8

    st1         {v6.8b, v7.8b}, [x5], x3
    sshr        d25, d25,#8

    subs        x4, x4, #4

    bne         loop_16

    movi        d1, #0x00000000000000ff

    movi        d6, #0x00000000000000ff

    bsl         v18.8b, v24.8b , v16.8b     //only select row values from v24(predpixel)
    bsl         v3.8b, v25.8b , v16.8b

    st1         {v18.8b, v19.8b}, [x2], x3
    sshr        d24, d24,#8

    st1         {v3.8b, v4.8b}, [x5], x3
    sshr        d25, d25,#8

    bsl         v1.8b, v24.8b , v16.8b
    bsl         v6.8b, v25.8b , v16.8b

    st1         {v1.8b, v2.8b}, [x2], x3

    st1         {v6.8b, v7.8b}, [x5], x3

    b           end_func


    // 8x8 and 4x4: same scheme as blk_16 with a single 8-byte vector. d24 holds
    // the filtered first-column samples, row 0 in byte 0.
blk_4_8:
    movi        d4, #0x00000000000000ff     //byte-0 select mask
    add         x6, x0, x5                  //&src[2nt]

    movi        d3, #0x00000000000000ff
    ldrb        w11, [x6], #1               //src[2nt] (ldrb zero-extends)

    dup         v22.8b,w11                  //src[2nt]
    ldrb        w12, [x6]                   //src[2nt+1]

    ld1         {v16.8b},[x6]               //ld for repl to cols src[2nt+1+col(0:3 or 0:7)](0 ignored for st)
    sub         x6, x6, #9                  //step back 9 to &src[2nt-8] = src[2nt-1-row(7)]

    dup         v24.8b,w12                  //src[2nt+1]
    dup         v30.8h,w12

    ld1         {v26.8b},[x6],#8            //load src[2nt-1-row](rows 0:7)

    movi        d18, #0x00000000000000ff
    uhsub       v26.8b, v26.8b , v22.8b     //(src[2nt-1-row] - src[2nt])>>1

    movi        d19, #0x00000000000000ff
    sxtl        v26.8h, v26.8b              //widen to s16
    sqadd       v0.8h, v26.8h , v30.8h      // + src[2nt+1]

    sqxtun      v24.8b, v0.8h               //clip to u8

    rev64       v24.8b, v24.8b              //row 0 into byte 0

    cmp         x4, #4
    beq         blk_4

    bsl         v18.8b, v24.8b , v16.8b     //only select row values from v24(predpixel)

    st1         {v18.8b},[x2], x3
    sshr        d24, d24,#8

    movi        d18, #0x00000000000000ff

    bsl         v19.8b, v24.8b , v16.8b

    st1         {v19.8b},[x2], x3
    sshr        d24, d24,#8

    movi        d19, #0x00000000000000ff

    bsl         v3.8b, v24.8b , v16.8b

    st1         {v3.8b},[x2], x3
    sshr        d24, d24,#8

    movi        d3, #0x00000000000000ff

    bsl         v4.8b, v24.8b , v16.8b

    st1         {v4.8b},[x2], x3
    sshr        d24, d24,#8

    movi        d4, #0x00000000000000ff

    bsl         v18.8b, v24.8b , v16.8b     //only select row values from v24(predpixel)

    st1         {v18.8b},[x2], x3
    sshr        d24, d24,#8

    bsl         v19.8b, v24.8b , v16.8b

    st1         {v19.8b},[x2], x3
    sshr        d24, d24,#8

    bsl         v3.8b, v24.8b , v16.8b

    st1         {v3.8b},[x2], x3
    sshr        d24, d24,#8

    bsl         v4.8b, v24.8b , v16.8b

    st1         {v4.8b},[x2], x3
    sshr        d24, d24,#8

    b           end_func


    // 4x4: only the low 32 bits of each assembled row are stored.
blk_4:
    bsl         v18.8b, v24.8b , v16.8b     //only select row values from v24(predpixel)

    st1         {v18.s}[0],[x2], x3
    sshr        d24, d24,#8

    bsl         v19.8b, v24.8b , v16.8b

    st1         {v19.s}[0],[x2], x3
    sshr        d24, d24,#8

    bsl         v3.8b, v24.8b , v16.8b

    st1         {v3.s}[0],[x2], x3
    sshr        d24, d24,#8

    bsl         v4.8b, v24.8b , v16.8b
    st1         {v4.s}[0],[x2], x3


end_func:
    // nothing to restore: no callee-saved register is modified
    ret