///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevc_inter_pred_chroma_vert_w16out_neon.s
//*
//* //brief
//*  contains function definitions for inter prediction interpolation.
//*  functions are coded using neon intrinsics and can be compiled using
//*  rvct
//*
//* //author
//*  yogeswaran rs/ pathiban
//*
//* //par list of functions:
//*
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* //brief
//*  interprediction chroma filter to store vertical 16bit output
//*
//* //par description:
//*  applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
//*  the elements pointed by 'pu1_src' and writes to the location pointed by
//*  'pi2_dst'. no downshifting or clipping is done and the output is used as
//*  an input for weighted prediction.
//*  assumptions : the function is optimized considering the fact width is
//*  multiple of 2,4 or 8. and also considering height should be multiple
//*  of 2. width 4,8 is optimized further
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[out] pi2_dst
//*  word16 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] dst_strd
//*  integer destination stride
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*
//* //remarks
//*  none
//*
//*****************************************************************************
//*/
//void ihevc_inter_pred_chroma_vert_w16out(uword8 *pu1_src,
//                                         word16 *pi2_dst,
//                                         word32 src_strd,
//                                         word32 dst_strd,
//                                         word8 *pi1_coeff,
//                                         word32 ht,
//                                         word32 wd)
//**************variables vs registers*****************************************
//x0 => *pu1_src
//x1 => *pi2_dst
//x2 => src_strd
//x3 => dst_strd
//x4 => *pi1_coeff
//x5 => ht
//x6 => wd
//
// Register use once the entry sequence has run:
//   x0      source pointer; starts one row above pu1_src
//   x1      destination pointer
//   x2      src_strd in bytes (sign-extended from w2)
//   x3      dst_strd in 16-bit elements (sign-extended from w3); the two
//           wd % 4 == 0 paths double it in place to a byte stride
//   x4      ht (sign-extended from w5)
//   x10     2*wd, the number of source bytes consumed per row
//   x20     scratch (callee-saved, hence the stp/ldp pair)
//   v0-v3   |coeff[0]| .. |coeff[3]|, each replicated over 8 byte lanes
//
// With s[k] the source byte k rows away from the output row, every output
// halfword is
//       |c1|*s[0] + |c2|*s[+1] - |c0|*s[-1] - |c3|*s[+2]
// accumulated in 16-bit lanes and stored without shifting or clipping.
// The signs are fixed by the umull/umlal/umlsl pattern; only the magnitudes
// of the coefficients are read.

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_vert_w16out_av8

.type ihevc_inter_pred_chroma_vert_w16out_av8, %function

ihevc_inter_pred_chroma_vert_w16out_av8:

    stp         x19, x20,[sp,#-16]!         //x20 is used as scratch below; pair store keeps sp 16-byte aligned

    // src_strd, dst_strd, ht and wd are 32-bit (word32) arguments. AAPCS64
    // leaves bits [63:32] of their registers unspecified, so widen them
    // before they take part in 64-bit pointer and counter arithmetic.
    sxtw        x2,w2                       //src_strd
    sxtw        x3,w3                       //dst_strd
    mov         x12,x4                      //pi1_coeff
    sxtw        x4,w5                       //ht
    sxtw        x6,w6                       //wd

    cmp         x4,#0                       //flags for the ht <= 0 exit below
    sub         x0,x0,x2                    //pu1_src - src_strd : first filter tap is the row above
    ld1         {v0.8b},[x12]               //loads 8 coefficient bytes; only lanes 0..3 are used

    ble         end_loops                   //nothing to do when ht <= 0

    tst         x6,#3                       //checks (wd & 3)
    abs         v3.8b, v0.8b                //vabs_s8(coeff)
    lsl         x10,x6,#1                   //2*wd
    dup         v0.8b, v3.b[0]              //coeffabs_0
    dup         v1.8b, v3.b[1]              //coeffabs_1
    dup         v2.8b, v3.b[2]              //coeffabs_2
    dup         v3.8b, v3.b[3]              //coeffabs_3 (last: overwrites the abs() result)

    bgt         outer_loop_wd_2             //taken when (wd & 3) != 0

    tst         x4,#7                       //checks ht for mul of 8
    beq         core_loop_ht_8              //when height is multiple of 8

    lsl         x7,x3,#2                    //4*dst_strd = bytes in two destination rows
    sub         x9,x7,x10,lsl #1            //4*dst_strd - 4*wd : dst step after a row pair
    lsl         x12,x2,#1                   //2*src_strd
    sub         x8,x12,x10                  //2*src_strd - 2*wd : src step after a row pair
    lsl         x3, x3, #1                  //dst_strd as a byte stride
    mov         x5,x10                      //column byte counter = 2*wd

inner_loop_ht_2:                            //wd multiple of 4, ht not a multiple of 8:
                                            //two output rows x 8 source bytes per pass

    add         x6,x0,x2                    //pu1_src + src_strd
    ld1         {v17.8b},[x6],x2            //row 0
    subs        x5,x5,#8                    //8 source bytes consumed; flags used by the bgt below
    ld1         {v5.8b},[x0],#8             //row -1, advance to the next column block
    umull       v6.8h, v17.8b, v1.8b        //out0  = row0 * coeffabs_1
    ld1         {v4.8b},[x6],x2             //row 1
    umlsl       v6.8h, v5.8b, v0.8b         //out0 -= row-1 * coeffabs_0
    ld1         {v16.8b},[x6],x2            //row 2
    umlal       v6.8h, v4.8b, v2.8b         //out0 += row1 * coeffabs_2
    umull       v4.8h, v4.8b, v1.8b         //out1  = row1 * coeffabs_1 (v4 reused as accumulator)
    ld1         {v18.8b},[x6]               //row 3
    umlsl       v6.8h, v16.8b, v3.8b        //out0 -= row2 * coeffabs_3
    umlsl       v4.8h, v17.8b, v0.8b        //out1 -= row0 * coeffabs_0
    umlal       v4.8h, v16.8b, v2.8b        //out1 += row2 * coeffabs_2
    umlsl       v4.8h, v18.8b, v3.8b        //out1 -= row3 * coeffabs_3
    add         x6,x1,x3                    //address of the second output row
    st1         { v6.8h},[x1],#16           //stores output row 0 (8 halfwords)

    st1         { v4.8h},[x6]               //stores output row 1

    bgt         inner_loop_ht_2             //more column blocks in this row pair

    subs        x4,x4,#2                    //ht - 2
    add         x1,x1,x9                    //dst: down two rows, back to column 0
    mov         x5,x10                      //reload column byte counter
    add         x0,x0,x8                    //src: down two rows, back to column 0

    bgt         inner_loop_ht_2             //next row pair

    b           end_loops                   //jumps to end

outer_loop_wd_2:                            //(wd & 3) != 0: two output rows x 4 source bytes per pass
    lsl         x5,x3,#2                    //4*dst_strd = bytes in two destination rows
    mov         x12,x10                     //column byte counter = 2*wd
    sub         x9,x5,x10,lsl #1            //4*dst_strd - 4*wd : dst step after a row pair
    lsl         x7,x2,#1                    //2*src_strd
    sub         x8,x7,x10                   //2*src_strd - 2*wd : src step after a row pair

inner_loop_wd_2:

    add         x6,x0,x2                    //pu1_src + src_strd
    ld1         {v6.s}[0],[x0]              //v6 low  = row -1 (4 bytes)
    subs        x12,x12,#4                  //4 source bytes consumed; flags used by the bgt below
    add         x0,x0,#4                    //advance to the next column block
    ld1         {v6.s}[1],[x6],x2           //v6 high = row 0
    dup         v7.2s, v6.s[1]              //v7 low  = row 0
    ld1         {v7.s}[1],[x6],x2           //v7 high = row 1
    umull       v4.8h, v7.8b, v1.8b         //{out0,out1}  = {row0,row1} * coeffabs_1
    dup         v7.2s, v7.s[1]              //v7 low  = row 1
    ld1         {v7.s}[1],[x6],x2           //v7 high = row 2
    umlsl       v4.8h, v6.8b, v0.8b         //{out0,out1} -= {row-1,row0} * coeffabs_0
    umlal       v4.8h, v7.8b, v2.8b         //{out0,out1} += {row1,row2} * coeffabs_2
    dup         v7.2s, v7.s[1]              //v7 low  = row 2
    ld1         {v7.s}[1],[x6]              //v7 high = row 3
    add         x6,x1,x3,lsl #1             //address of the second output row (x3 still in elements here)
    umlsl       v4.8h, v7.8b, v3.8b         //{out0,out1} -= {row2,row3} * coeffabs_3
    st1         {v4.d}[0],[x1]              //stores output row 0 (4 halfwords)
    add         x1,x1,#8                    //dst advances by 4 halfwords
    st1         {v4.d}[1],[x6]              //stores output row 1

    bgt         inner_loop_wd_2             //more column blocks in this row pair

    //inner loop ends
    subs        x4,x4,#2                    //ht - 2
    add         x1,x1,x9                    //dst: down two rows, back to column 0
    mov         x12,x10                     //reload column byte counter
    add         x0,x0,x8                    //src: down two rows, back to column 0

    bgt         inner_loop_wd_2             //next row pair

    b           end_loops                   //jumps to end

core_loop_ht_8:                             //wd multiple of 4 and ht multiple of 8:
                                            //software-pipelined, four output rows x 8 source bytes per pass

    lsl         x12,x3,#3                   //8*dst_strd = bytes in four destination rows
    sub         x8,x12,x10,lsl #1           //8*dst_strd - 4*wd : dst step after four rows
    lsl         x12,x2,#2                   //4*src_strd
    sub         x9,x12,x10                  //4*src_strd - 2*wd : src step after four rows

    bic         x5,x10,#7                   //column byte counter = 2*wd rounded down to a multiple of 8
    lsr         x14, x10, #3                //number of 8-byte column blocks per row
    mul         x12, x4 , x14               //ht * blocks; each pass retires 4 of these
    sub         x12, x12,#4                 //hold one pass back for the epilog
    lsl         x3, x3, #1                  //dst_strd as a byte stride

prolog:                                     //first pass: rows 0,1 stored here, rows 2,3 left in v26/v24
    add         x6,x0,x2                    //pu1_src + src_strd
    ld1         {v5.8b},[x6],x2             //row 0
    subs        x5,x5,#8                    //8 source bytes consumed; le => this row of blocks is done
    ld1         {v4.8b},[x0],#8             //row -1, advance to the next column block
    ld1         {v6.8b},[x6],x2             //row 1
    umull       v30.8h, v5.8b, v1.8b        //out0  = row0 * coeffabs_1
    ld1         {v7.8b},[x6],x2             //row 2
    umlsl       v30.8h, v4.8b, v0.8b        //out0 -= row-1 * coeffabs_0
    add         x7,x1,x3                    //address of output row 1
    umlal       v30.8h, v6.8b, v2.8b        //out0 += row1 * coeffabs_2
    umlsl       v30.8h, v7.8b, v3.8b        //out0 -= row2 * coeffabs_3
    ld1         {v16.8b},[x6],x2            //row 3

    umull       v28.8h, v6.8b, v1.8b        //out1  = row1 * coeffabs_1
    add         x20,x0,x9                   //src four rows down, column 0
    csel        x0, x20, x0,le              //taken only at the end of a row of blocks
    umlsl       v28.8h, v5.8b, v0.8b        //out1 -= row0 * coeffabs_0
    bic         x20,x10,#7                  //reload value for the column byte counter
    csel        x5, x20, x5,le
    umlal       v28.8h, v7.8b, v2.8b        //out1 += row2 * coeffabs_2
    ld1         {v17.8b},[x6],x2            //row 4
    umlsl       v28.8h, v16.8b, v3.8b       //out1 -= row3 * coeffabs_3

    ld1         {v18.8b},[x6],x2            //row 5
    umull       v26.8h, v7.8b, v1.8b        //out2  = row2 * coeffabs_1
    add         x6,x0,x2                    //next pass: pu1_src + src_strd
    umlsl       v26.8h, v6.8b, v0.8b        //out2 -= row1 * coeffabs_0
    st1         { v30.16b},[x1],#16         //stores output row 0
    umlal       v26.8h, v16.8b, v2.8b       //out2 += row3 * coeffabs_2
    ld1         {v4.8b},[x0],#8             //next pass: row -1
    umlsl       v26.8h, v17.8b, v3.8b       //out2 -= row4 * coeffabs_3

    add         x20,x1,x8                   //dst four rows down, column 0
    csel        x1, x20, x1,le              //taken only at the end of a row of blocks
    umull       v24.8h, v16.8b, v1.8b       //out3  = row3 * coeffabs_1
    ld1         {v5.8b},[x6],x2             //next pass: row 0
    umlsl       v24.8h, v7.8b, v0.8b        //out3 -= row2 * coeffabs_0
    subs        x12,x12,#4                  //one pass retired; flags used by the ble below
    ld1         {v6.8b},[x6],x2             //next pass: row 1
    umlal       v24.8h, v17.8b, v2.8b       //out3 += row4 * coeffabs_2
    ld1         {v7.8b},[x6],x2             //next pass: row 2
    umlsl       v24.8h, v18.8b, v3.8b       //out3 -= row5 * coeffabs_3
    sub         x20,x2,x2,lsl #3            //-7*src_strd
    neg         x11, x20                    //x11 = 7*src_strd : prefetch offset used in kernel_8
    add         x14,x2,x2,lsl #1            //3*src_strd
    add         x14,x14,x11                 //x14 = 10*src_strd : upper bound for the prefetch offset
    st1         { v28.16b},[x7],x3          //stores output row 1

    ble         epilog                      //only the held-back pass remains

kernel_8:                                   //steady state: finish the previous pass's rows 2,3
                                            //while computing this pass and loading the next

    umull       v30.8h, v5.8b, v1.8b        //out0  = row0 * coeffabs_1
    subs        x5,x5,#8                    //8 source bytes consumed; le => this row of blocks is done
    umlsl       v30.8h, v4.8b, v0.8b        //out0 -= row-1 * coeffabs_0
    add         x20,x0,x9                   //src four rows down, column 0
    csel        x0, x20, x0,le
    umlal       v30.8h, v6.8b, v2.8b        //out0 += row1 * coeffabs_2

    lsl         x20,x2,#3
    sub         x20,x20,x2                  //7*src_strd
    csel        x11,x20,x11,le              //reset the prefetch offset at the end of a row of blocks
    //rsble         x11,x2,x2,lsl #3
    umlsl       v30.8h, v7.8b, v3.8b        //out0 -= row2 * coeffabs_3
    st1         { v26.16b},[x7],x3          //stores the previous pass's output row 2

    ld1         {v16.8b},[x6],x2            //row 3

    umull       v28.8h, v6.8b, v1.8b        //out1  = row1 * coeffabs_1
    bic         x20,x10,#7                  //reload value for the column byte counter
    csel        x5, x20, x5,le
    umlsl       v28.8h, v5.8b, v0.8b        //out1 -= row0 * coeffabs_0
    st1         { v24.16b},[x7],x3          //stores the previous pass's output row 3

    umlal       v28.8h, v7.8b, v2.8b        //out1 += row2 * coeffabs_2
    ld1         {v17.8b},[x6],x2            //row 4

    umlsl       v28.8h, v16.8b, v3.8b       //out1 -= row3 * coeffabs_3
    ld1         {v18.8b},[x6],x2            //row 5
    add         x7,x1,x3                    //address of this pass's output row 1
    umull       v26.8h, v7.8b, v1.8b        //out2  = row2 * coeffabs_1
    add         x6,x0,x2                    //next pass: pu1_src + src_strd
    add         x20,x0, x11
    prfm        PLDL1KEEP,[x20]             //prefetch source at the current prefetch offset

    umlsl       v26.8h, v6.8b, v0.8b        //out2 -= row1 * coeffabs_0
    ld1         {v4.8b},[x0],#8             //next pass: row -1

    add         x11,x11,x2                  //move the prefetch offset down one row
    umlal       v26.8h, v16.8b, v2.8b       //out2 += row3 * coeffabs_2
    st1         { v30.16b},[x1],#16         //stores output row 0

    umlsl       v26.8h, v17.8b, v3.8b       //out2 -= row4 * coeffabs_3
    ld1         {v5.8b},[x6],x2             //next pass: row 0

    umull       v24.8h, v16.8b, v1.8b       //out3  = row3 * coeffabs_1
    ld1         {v6.8b},[x6],x2             //next pass: row 1
    add         x20,x1,x8                   //dst four rows down, column 0
    csel        x1, x20, x1,le              //last use of the flags from subs x5

    cmp         x11,x14                     //prefetch offset past 10*src_strd ?

    lsl         x20,x2,#3
    sub         x20,x20,x2                  //7*src_strd
    csel        x11,x20,x11,gt              //wrap the prefetch offset back to 7*src_strd
    //rsbgt         x11,x2,x2,lsl #3

    umlsl       v24.8h, v7.8b, v0.8b        //out3 -= row2 * coeffabs_0
    subs        x12,x12,#4                  //one pass retired; flags used by the bgt below


    umlal       v24.8h, v17.8b, v2.8b       //out3 += row4 * coeffabs_2
    ld1         {v7.8b},[x6],x2             //next pass: row 2

    umlsl       v24.8h, v18.8b, v3.8b       //out3 -= row5 * coeffabs_3
    st1         { v28.16b},[x7],x3          //stores output row 1

    bgt         kernel_8                    //jumps to kernel_8

epilog:                                     //last pass: drain the pipeline, no further look-ahead loads

    umull       v30.8h, v5.8b, v1.8b        //out0  = row0 * coeffabs_1
    umlsl       v30.8h, v4.8b, v0.8b        //out0 -= row-1 * coeffabs_0
    umlal       v30.8h, v6.8b, v2.8b        //out0 += row1 * coeffabs_2
    umlsl       v30.8h, v7.8b, v3.8b        //out0 -= row2 * coeffabs_3
    st1         { v26.16b},[x7],x3          //stores the previous pass's output row 2

    ld1         {v16.8b},[x6],x2            //row 3
    umull       v28.8h, v6.8b, v1.8b        //out1  = row1 * coeffabs_1
    umlsl       v28.8h, v5.8b, v0.8b        //out1 -= row0 * coeffabs_0
    umlal       v28.8h, v7.8b, v2.8b        //out1 += row2 * coeffabs_2
    umlsl       v28.8h, v16.8b, v3.8b       //out1 -= row3 * coeffabs_3
    st1         { v24.16b},[x7],x3          //stores the previous pass's output row 3

    ld1         {v17.8b},[x6],x2            //row 4
    umull       v26.8h, v7.8b, v1.8b        //out2  = row2 * coeffabs_1
    add         x7,x1,x3                    //address of this pass's output row 1
    umlsl       v26.8h, v6.8b, v0.8b        //out2 -= row1 * coeffabs_0
    st1         { v30.16b},[x1],#16         //stores output row 0
    umlal       v26.8h, v16.8b, v2.8b       //out2 += row3 * coeffabs_2
    ld1         {v18.8b},[x6],x2            //row 5
    umlsl       v26.8h, v17.8b, v3.8b       //out2 -= row4 * coeffabs_3

    umull       v24.8h, v16.8b, v1.8b       //out3  = row3 * coeffabs_1
    st1         { v28.16b},[x7],x3          //stores output row 1
    umlsl       v24.8h, v7.8b, v0.8b        //out3 -= row2 * coeffabs_0
    umlal       v24.8h, v17.8b, v2.8b       //out3 += row4 * coeffabs_2
    st1         { v26.16b},[x7],x3          //stores output row 2
    umlsl       v24.8h, v18.8b, v3.8b       //out3 -= row5 * coeffabs_3

    st1         { v24.16b},[x7],x3          //stores output row 3

end_loops:
    ldp         x19, x20,[sp],#16           //restore the callee-saved pair pushed at entry

    ret