///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevc_inter_pred_chroma_vert_neon.s
//*
//* //brief
//*  contains function definitions for inter prediction interpolation.
//*  functions are coded using neon intrinsics and can be compiled using
//*  rvct
//*
//* //author
//*  yogeswaran rs
//*
//* //par list of functions:
//*
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
///**
//*******************************************************************************
//*
//* //brief
//*  chroma interprediction filter for vertical input
//*
//* //par description:
//*  applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
//*  the elements pointed by 'pu1_src' and writes to the location pointed by
//*  'pu1_dst' the output is down shifted by 6 and clipped to 8 bits
//*  assumptions : the function is optimized considering the fact width is
//*  multiple of 2,4 or 8.
//*  and also considering height should be multiple of 2
//*  width 4,8 is optimized further
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] dst_strd
//*  integer destination stride
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
//void ihevc_inter_pred_chroma_vert(uword8 *pu1_src,
//                                  uword8 *pu1_dst,
//                                  word32 src_strd,
//                                  word32 dst_strd,
//                                  word8 *pi1_coeff,
//                                  word32 ht,
//                                  word32 wd)
//**************variables vs registers*****************************************
//x0 => *pu1_src
//x1 => *pu1_dst
//x2 => src_strd   (word32, sign-extended at entry)
//x3 => dst_strd   (word32, sign-extended at entry)
//x4 => *pi1_coeff (copied to x12; x4 then holds ht)
//x5 => ht         (word32, sign-extended into x4)
//x6 => wd         (word32, sign-extended at entry)
//
//**************register roles after setup*************************************
//x0      source pointer, starts one row above pu1_src (pu1_src - src_strd)
//x1      destination pointer
//x4      ht (row counter in the two-rows-per-pass paths)
//x10     2*wd, the number of bytes processed per row
//v0-v3   |coeff[0]| .. |coeff[3]| broadcast to all eight byte lanes
//
//each output byte is
//    -|c0|*row[n] + |c1|*row[n+1] + |c2|*row[n+2] - |c3|*row[n+3]
//accumulated in 16 bits and narrowed with sqrshrun #6
//(rounding right shift by 6, saturated to unsigned 8 bits)
//
//x20 is the only callee-saved register written; x19/x20 are saved as a pair
//so that sp stays 16-byte aligned.

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_vert_av8

.type ihevc_inter_pred_chroma_vert_av8, %function

ihevc_inter_pred_chroma_vert_av8:

    stp         x19, x20,[sp,#-16]!         //save callee-saved pair (x20 is used as scratch)

    //the prototype declares src_strd, dst_strd, ht and wd as word32. the
    //upper 32 bits of the registers carrying them are not guaranteed by the
    //procedure call standard, so widen them before any 64-bit arithmetic.
    sxtw        x2,w2                       //src_strd
    sxtw        x3,w3                       //dst_strd
    mov         x12,x4                      //pi1_coeff
    sxtw        x4,w5                       //ht
    sxtw        x6,w6                       //wd

    cmp         x4,#0                       //checks ht <= 0 (flags consumed by ble below)
    sub         x0,x0,x2                    //pu1_src - src_strd
    ldr         s0,[x12]                    //loads the four coefficients (only taps 0..3 are used)

    ble         end_loops                   //nothing to do when ht <= 0

    tst         x6,#3                       //checks (wd & 3), flags consumed by bgt below
    abs         v3.8b, v0.8b                //vabs_s8(coeff)
    lsl         x10,x6,#1                   //2*wd
    dup         v0.8b, v3.b[0]              //coeffabs_0
    dup         v1.8b, v3.b[1]              //coeffabs_1
    dup         v2.8b, v3.b[2]              //coeffabs_2
    dup         v3.8b, v3.b[3]              //coeffabs_3

    bgt         outer_loop_wd_2             //(wd & 3) != 0 : 4 bytes per step

    tst         x4,#7                       //checks ht for mul of 8
    beq         core_loop_ht_8              //when height is multiple of 8

    lsl         x7,x3,#1                    //2*dst_strd
    sub         x9,x7,x10                   //2*dst_strd - 2wd
    lsl         x12,x2,#1                   //2*src_strd
    sub         x8,x12,x10                  //2*src_strd - 2wd
    mov         x5,x10                      //2wd (column byte counter)

inner_loop_ht_2:                            //reached when (wd & 3) == 0 and (ht & 7) != 0
                                            //each pass produces 8 bytes of two output rows

    add         x6,x0,x2                    //pu1_src + src_strd
    ld1         {v17.8b},[x6],x2            //row 1
    subs        x5,x5,#8                    //2wd - 8 (flags consumed by the first bgt below)
    ld1         {v5.8b},[x0],#8             //row 0, advance 8 bytes along the row
    umull       v6.8h, v17.8b, v1.8b        //out0  = row1 * coeffabs_1
    ld1         {v4.8b},[x6],x2             //row 2
    umlsl       v6.8h, v5.8b, v0.8b         //out0 -= row0 * coeffabs_0
    ld1         {v16.8b},[x6],x2            //row 3
    umlal       v6.8h, v4.8b, v2.8b         //out0 += row2 * coeffabs_2
    umull       v4.8h, v4.8b, v1.8b         //out1  = row2 * coeffabs_1
    umlsl       v6.8h, v16.8b, v3.8b        //out0 -= row3 * coeffabs_3
    umlsl       v4.8h, v17.8b, v0.8b        //out1 -= row1 * coeffabs_0
    ld1         {v18.8b},[x6]               //row 4
    umlal       v4.8h, v16.8b, v2.8b        //out1 += row3 * coeffabs_2
    sqrshrun    v6.8b, v6.8h,#6             //round, shift right by 6, saturate to u8
    umlsl       v4.8h, v18.8b, v3.8b        //out1 -= row4 * coeffabs_3
    add         x6,x1,x3                    //pu1_dst + dst_strd
    sqrshrun    v4.8b, v4.8h,#6             //round, shift right by 6, saturate to u8
    st1         {v6.8b},[x1],#8             //stores output row 0

    st1         {v4.8b},[x6]                //stores output row 1

    bgt         inner_loop_ht_2             //more columns left in this row pair

    subs        x4,x4,#2                    //ht - 2
    add         x1,x1,x9                    //pu1_dst += (2*dst_strd - 2wd)
    mov         x5,x10                      //2wd
    add         x0,x0,x8                    //pu1_src += (2*src_strd - 2wd)

    bgt         inner_loop_ht_2             //more row pairs left

    b           end_loops                   //jumps to end

outer_loop_wd_2:                            //reached when (wd & 3) != 0
    lsl         x5,x3,#1                    //2*dst_strd
    mov         x12,x10                     //2wd (column byte counter)
    sub         x9,x5,x10                   //2*dst_strd - 2wd
    lsl         x7,x2,#1                    //2*src_strd
    sub         x8,x7,x10                   //2*src_strd - 2wd

inner_loop_wd_2:                            //each pass produces 4 bytes of two output rows;
                                            //lane s[0] carries output row 0, lane s[1] output row 1

    add         x6,x0,x2                    //pu1_src + src_strd
    ld1         {v6.s}[0],[x0]              //v6 = {row0, -}
    subs        x12,x12,#4                  //2wd - 4 (flags consumed by the first bgt below)
    add         x0,x0,#4                    //pu1_src + 4
    ld1         {v6.s}[1],[x6],x2           //v6 = {row0, row1}
    dup         v7.2s, v6.s[1]              //v7 = {row1, row1}
    ld1         {v7.s}[1],[x6],x2           //v7 = {row1, row2}
    umull       v4.8h, v7.8b, v1.8b         //acc  = {row1, row2} * coeffabs_1
    dup         v7.2s, v7.s[1]              //v7 = {row2, row2}
    ld1         {v7.s}[1],[x6],x2           //v7 = {row2, row3}
    umlsl       v4.8h, v6.8b, v0.8b         //acc -= {row0, row1} * coeffabs_0
    umlal       v4.8h, v7.8b, v2.8b         //acc += {row2, row3} * coeffabs_2
    dup         v7.2s, v7.s[1]              //v7 = {row3, row3}
    ld1         {v7.s}[1],[x6]              //v7 = {row3, row4}
    add         x6,x1,x3                    //pu1_dst + dst_strd
    umlsl       v4.8h, v7.8b, v3.8b         //acc -= {row3, row4} * coeffabs_3
    sqrshrun    v4.8b, v4.8h,#6             //round, shift right by 6, saturate to u8
    st1         {v4.s}[0],[x1]              //stores output row 0
    add         x1,x1,#4                    //pu1_dst += 4
    st1         {v4.s}[1],[x6]              //stores output row 1

    bgt         inner_loop_wd_2             //more columns left in this row pair

    //inner loop ends
    subs        x4,x4,#2                    //ht - 2
    add         x1,x1,x9                    //pu1_dst += 2*dst_strd - 2*wd
    mov         x12,x10                     //2wd
    add         x0,x0,x8                    //pu1_src += 2*src_strd - 2*wd

    bgt         inner_loop_wd_2             //more row pairs left

    b           end_loops                   //jumps to end

core_loop_ht_8:                             //reached when (wd & 3) == 0 and (ht & 7) == 0
                                            //software pipelined: every stage produces 8 bytes of
                                            //four output rows while loading rows for the next stage
                                            //  v4..v7   rows 0..3 of the current stage
                                            //  v16..v18 rows 4..6 of the current stage
                                            //  v30, v28, v26, v24 = output rows 0, 1, 2, 3

    lsl         x12,x3,#2                   //4*dst_strd
    sub         x8,x12,x10                  //4*dst_strd - 2wd
    lsl         x12,x2,#2                   //4*src_strd
    sub         x9,x12,x10                  //4*src_strd - 2wd

    bic         x5,x10,#7                   //x5 = 2wd with the low three bits cleared (column byte counter)
    lsr         x14, x10, #3                //2wd / 8 = number of 8-byte columns
    mul         x12, x4 , x14               //ht * columns
    sub         x12, x12,#4                 //every stage retires 4; hold one stage back for the epilog

prolog:
    add         x6,x0,x2                    //pu1_src + src_strd
    ld1         {v5.8b},[x6],x2             //row 1
    subs        x5,x5,#8                    //2wd - 8 (flags consumed by the three csel ..,le below)
    ld1         {v4.8b},[x0],#8             //row 0, advance 8 bytes along the row
    ld1         {v6.8b},[x6],x2             //row 2
    umull       v30.8h, v5.8b, v1.8b        //out0  = row1 * coeffabs_1
    ld1         {v7.8b},[x6],x2             //row 3
    umlsl       v30.8h, v4.8b, v0.8b        //out0 -= row0 * coeffabs_0
    add         x7,x1,x3                    //pu1_dst + dst_strd
    umlal       v30.8h, v6.8b, v2.8b        //out0 += row2 * coeffabs_2
    umlsl       v30.8h, v7.8b, v3.8b        //out0 -= row3 * coeffabs_3
    ld1         {v16.8b},[x6],x2            //row 4

    umull       v28.8h, v6.8b, v1.8b        //out1  = row2 * coeffabs_1
    add         x20,x0,x9                   //pu1_src + (4*src_strd - 2*wd)
    csel        x0, x20, x0,le              //end of row: move source down four rows
    umlsl       v28.8h, v5.8b, v0.8b        //out1 -= row1 * coeffabs_0
    bic         x20,x10,#7                  //reload value for the column byte counter
    csel        x5, x20, x5,le              //end of row: restart the column byte counter
    umlal       v28.8h, v7.8b, v2.8b        //out1 += row3 * coeffabs_2
    ld1         {v17.8b},[x6],x2            //row 5
    umlsl       v28.8h, v16.8b, v3.8b       //out1 -= row4 * coeffabs_3
    sqrshrun    v30.8b, v30.8h,#6           //out0: round, shift right by 6, saturate to u8

    ld1         {v18.8b},[x6],x2            //row 6
    umull       v26.8h, v7.8b, v1.8b        //out2  = row3 * coeffabs_1
    add         x6,x0,x2                    //next stage: pu1_src + src_strd
    umlsl       v26.8h, v6.8b, v0.8b        //out2 -= row2 * coeffabs_0
    st1         {v30.8b},[x1],#8            //stores output row 0
    umlal       v26.8h, v16.8b, v2.8b       //out2 += row4 * coeffabs_2
    ld1         {v4.8b},[x0],#8             //next stage: row 0
    umlsl       v26.8h, v17.8b, v3.8b       //out2 -= row5 * coeffabs_3
    sqrshrun    v28.8b, v28.8h,#6           //out1: round, shift right by 6, saturate to u8

    add         x20,x1,x8                   //pu1_dst + (4*dst_strd - 2*wd)
    csel        x1, x20, x1,le              //end of row: move destination down four rows
    umull       v24.8h, v16.8b, v1.8b       //out3  = row4 * coeffabs_1
    ld1         {v5.8b},[x6],x2             //next stage: row 1
    umlsl       v24.8h, v7.8b, v0.8b        //out3 -= row3 * coeffabs_0 (v7 still holds this stage's row 3)
    subs        x12,x12,#4                  //stage counter (flags consumed by ble epilog)
    ld1         {v6.8b},[x6],x2             //next stage: row 2
    umlal       v24.8h, v17.8b, v2.8b       //out3 += row5 * coeffabs_2
    ld1         {v7.8b},[x6],x2             //next stage: row 3
    umlsl       v24.8h, v18.8b, v3.8b       //out3 -= row6 * coeffabs_3

    st1         {v28.8b},[x7],x3            //stores output row 1
    sqrshrun    v26.8b, v26.8h,#6           //out2: round, shift right by 6, saturate to u8
    lsl         x11,x2,#3
    sub         x11,x11,x2                  //x11 = 7*src_strd, initial prefetch offset
    add         x14,x2,x2,lsl #1            //3*src_strd
    add         x14,x14,x11                 //x14 = 10*src_strd, prefetch offset limit
    ble         epilog                      //only one stage left: drain the pipeline

kernel_8:

    umull       v30.8h, v5.8b, v1.8b        //out0  = row1 * coeffabs_1
    subs        x5,x5,#8                    //2wd - 8 (flags consumed by every csel ..,le up to the cmp)
    umlsl       v30.8h, v4.8b, v0.8b        //out0 -= row0 * coeffabs_0
    add         x20,x0,x9                   //pu1_src + (4*src_strd - 2*wd)
    csel        x0, x20, x0,le              //end of row: move source down four rows
    umlal       v30.8h, v6.8b, v2.8b        //out0 += row2 * coeffabs_2
    lsl         x20,x2,#3
    sub         x20,x20,x2                  //7*src_strd
    csel        x11,x20,x11,le              //end of row: restart the prefetch offset
    //rsble x11,x2,x2,lsl #3
    umlsl       v30.8h, v7.8b, v3.8b        //out0 -= row3 * coeffabs_3
    st1         {v26.8b},[x7],x3            //stores output row 2 of the previous stage
    sqrshrun    v24.8b, v24.8h,#6           //previous out3: round, shift right by 6, saturate to u8

    ld1         {v16.8b},[x6],x2            //row 4

    umull       v28.8h, v6.8b, v1.8b        //out1  = row2 * coeffabs_1
    bic         x20,x10,#7                  //reload value for the column byte counter
    csel        x5, x20, x5,le              //end of row: restart the column byte counter
    umlsl       v28.8h, v5.8b, v0.8b        //out1 -= row1 * coeffabs_0
    st1         {v24.8b},[x7],x3            //stores output row 3 of the previous stage

    umlal       v28.8h, v7.8b, v2.8b        //out1 += row3 * coeffabs_2

    ld1         {v17.8b},[x6],x2            //row 5
    sqrshrun    v30.8b, v30.8h,#6           //out0: round, shift right by 6, saturate to u8

    umlsl       v28.8h, v16.8b, v3.8b       //out1 -= row4 * coeffabs_3
    ld1         {v18.8b},[x6],x2            //row 6
    add         x7,x1,x3                    //pu1_dst + dst_strd
    umull       v26.8h, v7.8b, v1.8b        //out2  = row3 * coeffabs_1
    add         x6,x0,x2                    //next stage: pu1_src + src_strd

    add         x20,x0, x11
    prfm        PLDL1KEEP,[x20]             //prefetch source at the rolling offset x11


    umlsl       v26.8h, v6.8b, v0.8b        //out2 -= row2 * coeffabs_0
    ld1         {v4.8b},[x0],#8             //next stage: row 0

    umlal       v26.8h, v16.8b, v2.8b       //out2 += row4 * coeffabs_2
    st1         {v30.8b},[x1],#8            //stores output row 0

    umlsl       v26.8h, v17.8b, v3.8b       //out2 -= row5 * coeffabs_3
    ld1         {v5.8b},[x6],x2             //next stage: row 1

    add         x11,x11,x2                  //prefetch offset += src_strd
    sqrshrun    v28.8b, v28.8h,#6           //out1: round, shift right by 6, saturate to u8

    umull       v24.8h, v16.8b, v1.8b       //out3  = row4 * coeffabs_1
    ld1         {v6.8b},[x6],x2             //next stage: row 2
    add         x20,x1,x8                   //pu1_dst + (4*dst_strd - 2*wd)
    csel        x1, x20, x1,le              //end of row: move destination down four rows

    cmp         x11,x14                     //prefetch offset beyond 10*src_strd ?
    lsl         x20,x2,#3
    sub         x20,x20,x2                  //7*src_strd
    csel        x11,x20,x11,gt              //if so, wrap it back to 7*src_strd
    //rsbgt x11,x2,x2,lsl #3

    umlsl       v24.8h, v7.8b, v0.8b        //out3 -= row3 * coeffabs_0 (v7 still holds this stage's row 3)
    subs        x12,x12,#4                  //stage counter (flags consumed by bgt kernel_8)

    umlal       v24.8h, v17.8b, v2.8b       //out3 += row5 * coeffabs_2
    ld1         {v7.8b},[x6],x2             //next stage: row 3

    umlsl       v24.8h, v18.8b, v3.8b       //out3 -= row6 * coeffabs_3
    st1         {v28.8b},[x7],x3            //stores output row 1
    sqrshrun    v26.8b, v26.8h,#6           //out2: round, shift right by 6, saturate to u8

    bgt         kernel_8                    //more stages before the final one

epilog:                                     //final stage: no loads for a following stage

    umull       v30.8h, v5.8b, v1.8b        //out0  = row1 * coeffabs_1
    umlsl       v30.8h, v4.8b, v0.8b        //out0 -= row0 * coeffabs_0
    umlal       v30.8h, v6.8b, v2.8b        //out0 += row2 * coeffabs_2
    umlsl       v30.8h, v7.8b, v3.8b        //out0 -= row3 * coeffabs_3
    st1         {v26.8b},[x7],x3            //stores output row 2 of the previous stage
    sqrshrun    v24.8b, v24.8h,#6           //previous out3: round, shift right by 6, saturate to u8

    ld1         {v16.8b},[x6],x2            //row 4
    umull       v28.8h, v6.8b, v1.8b        //out1  = row2 * coeffabs_1
    umlsl       v28.8h, v5.8b, v0.8b        //out1 -= row1 * coeffabs_0
    umlal       v28.8h, v7.8b, v2.8b        //out1 += row3 * coeffabs_2
    umlsl       v28.8h, v16.8b, v3.8b       //out1 -= row4 * coeffabs_3
    st1         {v24.8b},[x7],x3            //stores output row 3 of the previous stage
    sqrshrun    v30.8b, v30.8h,#6           //out0: round, shift right by 6, saturate to u8

    ld1         {v17.8b},[x6],x2            //row 5
    umull       v26.8h, v7.8b, v1.8b        //out2  = row3 * coeffabs_1
    add         x7,x1,x3                    //pu1_dst + dst_strd
    umlsl       v26.8h, v6.8b, v0.8b        //out2 -= row2 * coeffabs_0
    st1         {v30.8b},[x1],#8            //stores output row 0

    sqrshrun    v28.8b, v28.8h,#6           //out1: round, shift right by 6, saturate to u8
    umlal       v26.8h, v16.8b, v2.8b       //out2 += row4 * coeffabs_2
    ld1         {v18.8b},[x6],x2            //row 6
    umlsl       v26.8h, v17.8b, v3.8b       //out2 -= row5 * coeffabs_3

    umull       v24.8h, v16.8b, v1.8b       //out3  = row4 * coeffabs_1
    sqrshrun    v26.8b, v26.8h,#6           //out2: round, shift right by 6, saturate to u8
    st1         {v28.8b},[x7],x3            //stores output row 1
    umlsl       v24.8h, v7.8b, v0.8b        //out3 -= row3 * coeffabs_0
    umlal       v24.8h, v17.8b, v2.8b       //out3 += row5 * coeffabs_2
    st1         {v26.8b},[x7],x3            //stores output row 2
    umlsl       v24.8h, v18.8b, v3.8b       //out3 -= row6 * coeffabs_3

    sqrshrun    v24.8b, v24.8h,#6           //out3: round, shift right by 6, saturate to u8
    st1         {v24.8b},[x7],x3            //stores output row 3
end_loops:
    ldp         x19, x20,[sp],#16           //restore callee-saved pair

    ret