@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@******************************************************************************
@* @file
@*  ihevc_inter_pred_filters_luma_vert_w16inp.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded using neon intrinsics and can be compiled using
@*  rvct
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*
@*  - ihevc_inter_pred_luma_vert()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
@/* include reconstruction */
@

@/**
@*******************************************************************************
@*
@* @brief
@*  luma vertical filter for 16bit input.
@*
@* @par description:
@*  applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
@*  the elements pointed by 'pi2_src' and writes to the location pointed by
@*  'pu1_dst'.  the input is 16 bit; in this w16out variant the rounding
@*  offset (0x80000) is removed from the filter output, which is then
@*  downshifted by 6 and stored as 16 bit data (no clipping to 0-255).
@*  assumptions : the function is optimized considering the fact width is
@*  multiple of 4 and height as multiple of 2.
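@*
@*  a rough scalar equivalent of what this routine computes per output
@*  sample (illustrative sketch only; the variable names below are not
@*  part of this file):
@*
@*      sum = 0;
@*      for(i = 0; i < 8; i++)
@*          sum += pi2_src[col + (i - 3) * src_strd] * pi1_coeff[i];
@*      pi2_dst[col] = (sum - 0x80000) >> 6;    @ 0x80000 = 8192 << 6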
@*
@* @param[in] pi2_src
@*  word16 pointer to the source
@*
@* @param[out] pu1_dst
@*  word16 pointer to the destination (16 bit intermediate output)
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_vert_w16inp_w16out(word16 *pi2_src,
@                                              word16 *pi2_dst,
@                                              word32 src_strd,
@                                              word32 dst_strd,
@                                              word8 *pi1_coeff,
@                                              word32 ht,
@                                              word32 wd)

@**************variables vs registers*****************************************
@   r0 => *pi2_src
@   r1 => *pu1_dst
@   r2 =>  src_strd
@   r3 =>  dst_strd
@   r4 => *pi1_coeff
@   r5 =>  ht
@   r6 =>  wd

.text
.align 4

.globl ihevc_inter_pred_luma_vert_w16inp_w16out_a9q

.type ihevc_inter_pred_luma_vert_w16inp_w16out_a9q, %function

ihevc_inter_pred_luma_vert_w16inp_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}      @stack stores the values of the arguments

    ldr         r12,[sp,#40]            @load pi1_coeff
    mov         r6,r3,lsl #1            @dst_strd in bytes (16 bit output)
    ldr         r5,[sp,#48]             @load wd
    vld1.8      {d0},[r12]              @coeff = vld1_s8(pi1_coeff)
    mov         r2,r2,lsl #1            @src_strd in bytes (16 bit input)
    sub         r12,r2,r2,lsl #2        @r12 = -3*src_strd
    @vabs.s8    d0,d0                   @vabs_s8(coeff)
    add         r0,r0,r12               @r0 -> pu1_src - 3*src_strd
    ldr         r3,[sp,#44]             @load ht
    subs        r7,r3,#0                @r3->ht
    @ble        end_loops               @end loop jump
    vmovl.s8    q0,d0                   @sign extend coefficients to 16 bit
    vdup.16     d22,d0[0]               @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
    vdup.16     d23,d0[1]               @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
    vdup.16     d24,d0[2]               @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
    vdup.16     d25,d0[3]               @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
    vdup.16     d26,d1[0]               @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
    vdup.16     d27,d1[1]               @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
    vdup.16     d28,d1[2]               @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
    vdup.16     d29,d1[3]               @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
    vmov.i32    q15,#0x80000            @offset (8192 << 6), removed before the final >>6

    rsb         r9,r5,r6,lsl #2         @r6->dst_strd   r5 ->wd
    rsb         r8,r5,r2,lsl #2         @r2->src_strd
    sub         r8,r8,r5
    sub         r9,r9,r5
    mov         r3,r5,lsr #2            @divide wd by 4
    mul         r7,r3                   @multiply height by width/4
    sub         r7,#4                   @subtract one iteration (4) for the epilog
    mov         r4,r5                   @r5 ->wd
    @mov        r2,r2,lsl #1
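@/* loop structure (software pipelined), as read from the code below:
@   the block is processed in strips of 4 rows; within a strip the code walks
@   across the width in 4-sample columns.  for each column, q4-q7 accumulate
@   the 8-tap sums of 4 consecutive output rows, the 0x80000 offset is
@   subtracted and vshrn.s32 #6 narrows the results to 16 bit.  r1 stores the
@   first row of the column and r14 (= r1 + dst_strd) the remaining three.
@   prolog primes the pipeline, kernel_8 is the steady state and
@   epilog/epilog_end drain it. */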
prolog:

    add         r3,r0,r2                @pu1_src_tmp += src_strd@
    vld1.16     {d1},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vld1.16     {d0},[r0]!              @src_tmp1 = vld1_u8(pu1_src_tmp)@
    subs        r4,r4,#4
    vld1.16     {d2},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.s16   q4,d1,d23               @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vld1.16     {d3},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q4,d0,d22               @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
    vld1.16     {d4},[r3],r2            @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q4,d2,d24               @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
    vld1.16     {d5},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q4,d3,d25               @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vld1.16     {d6},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q4,d4,d26               @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vld1.16     {d7},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q4,d5,d27               @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.s16   q4,d6,d28               @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlal.s16   q4,d7,d29               @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@

    vld1.16     {d16},[r3],r2           @src_tmp1 = vld1_u8(pu1_src_tmp)@

    vmull.s16   q5,d2,d23               @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    addle       r0,r0,r8,lsl #0
    vmlal.s16   q5,d1,d22               @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
    movle       r4,r5                   @r5 ->wd
    vmlal.s16   q5,d3,d24               @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
    vld1.16     {d17},[r3],r2           @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q5,d4,d25               @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vld1.16     {d18},[r3],r2           @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q5,d5,d26               @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    add         r3,r0,r2                @pu1_src_tmp += src_strd@
    vmlal.s16   q5,d6,d27               @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.s16   q5,d7,d28               @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vmlal.s16   q5,d16,d29              @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@
    vsub.s32    q4, q4, q15

    vld1.16     {d1},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.s16   q6,d3,d23
    vld1.16     {d0},[r0]!              @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q6,d2,d22
    vld1.16     {d2},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q6,d4,d24
    vmlal.s16   q6,d5,d25
    vmlal.s16   q6,d6,d26
    vmlal.s16   q6,d7,d27
    vmlal.s16   q6,d16,d28
    vmlal.s16   q6,d17,d29
    add         r14,r1,r6
    vsub.s32    q5, q5, q15
    vshrn.s32   d8, q4, #6
    @vqrshrun.s16 d8,q4,#6              @sto_res = vqmovun_s16(sto_res_tmp)@

    vmull.s16   q7,d4,d23
    vmlal.s16   q7,d3,d22
    vmlal.s16   q7,d5,d24
    vmlal.s16   q7,d6,d25
    vld1.16     {d3},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d7,d26
    vld1.16     {d4},[r3],r2            @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d16,d27
    vld1.16     {d5},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d17,d28
    vld1.16     {d6},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d18,d29
    vld1.16     {d7},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vst1.32     {d8},[r1]!              @vst1_u8(pu1_dst,sto_res)@
    vsub.s32    q6, q6, q15
    vshrn.s32   d10, q5, #6
    @vqrshrun.s16 d10,q5,#6             @sto_res = vqmovun_s16(sto_res_tmp)@
    addle       r1,r1,r9

    subs        r7,r7,#4

    blt         epilog_end              @jumps to epilog_end
    beq         epilog                  @jumps to epilog

kernel_8:

    vmull.s16   q4,d1,d23               @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    subs        r4,r4,#4
    vmlal.s16   q4,d0,d22               @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
    addle       r0,r0,r8,lsl #0
    vmlal.s16   q4,d2,d24               @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.s16   q4,d3,d25               @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.s16   q4,d4,d26               @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vmlal.s16   q4,d5,d27               @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.s16   q4,d6,d28               @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlal.s16   q4,d7,d29               @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.32     {d10},[r14],r6          @vst1_u8(pu1_dst_tmp,sto_res)@

    vsub.s32    q7, q7, q15
    vshrn.s32   d12, q6, #6
    @vqrshrun.s16 d12,q6,#6
    vld1.16     {d16},[r3],r2           @src_tmp1 = vld1_u8(pu1_src_tmp)@

    vmull.s16   q5,d2,d23               @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    vmlal.s16   q5,d1,d22               @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
    vmlal.s16   q5,d3,d24               @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.s16   q5,d4,d25               @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.s16   q5,d5,d26               @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    vmlal.s16   q5,d6,d27               @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
    vst1.32     {d12},[r14],r6

    vmlal.s16   q5,d7,d28               @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vld1.16     {d17},[r3],r2           @src_tmp2 = vld1_u8(pu1_src_tmp)@

    vmlal.s16   q5,d16,d29              @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@

    vsub.s32    q4, q4, q15
    vshrn.s32   d14, q7, #6
    @vqrshrun.s16 d14,q7,#6

    vmull.s16   q6,d3,d23
    movle       r4,r5                   @r5 ->wd

    vmlal.s16   q6,d2,d22
    vld1.16     {d18},[r3],r2           @src_tmp3 = vld1_u8(pu1_src_tmp)@

    vmlal.s16   q6,d4,d24
    add         r3,r0,r2                @pu1_src_tmp += src_strd@

    vmlal.s16   q6,d5,d25

    vmlal.s16   q6,d6,d26
    vst1.32     {d14},[r14],r6

    vmlal.s16   q6,d7,d27
    vld1.16     {d1},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@

    vmlal.s16   q6,d16,d28
    add         r14,r1,r6

    vmlal.s16   q6,d17,d29
    vld1.16     {d0},[r0]!              @src_tmp1 = vld1_u8(pu1_src_tmp)@

    vsub.s32    q5, q5, q15
    vshrn.s32   d8, q4, #6
    @vqrshrun.s16 d8,q4,#6              @sto_res = vqmovun_s16(sto_res_tmp)@
    vld1.16     {d2},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@

    vmull.s16   q7,d4,d23
    vmlal.s16   q7,d3,d22
    vmlal.s16   q7,d5,d24
    vld1.16     {d3},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vmlal.s16   q7,d6,d25
    vld1.16     {d4},[r3],r2            @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d7,d26
    vld1.16     {d5},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d16,d27
    vld1.16     {d6},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d17,d28
    vld1.16     {d7},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d18,d29
    vst1.32     {d8},[r1]!              @vst1_u8(pu1_dst,sto_res)@
    vsub.s32    q6, q6, q15
    vshrn.s32   d10, q5, #6
    addle       r1,r1,r9

    @vqrshrun.s16 d10,q5,#6             @sto_res = vqmovun_s16(sto_res_tmp)@
    subs        r7,r7,#4

    bgt         kernel_8                @jumps to kernel_8

epilog:

    vmull.s16   q4,d1,d23               @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vmlal.s16   q4,d0,d22               @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
    vmlal.s16   q4,d2,d24               @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.s16   q4,d3,d25               @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.s16   q4,d4,d26               @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vmlal.s16   q4,d5,d27               @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.s16   q4,d6,d28               @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlal.s16   q4,d7,d29               @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.32     {d10},[r14],r6

    vsub.s32    q7, q7, q15
    vshrn.s32   d12, q6, #6
    @vqrshrun.s16 d12,q6,#6

    vld1.16     {d16},[r3],r2           @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmull.s16   q5,d2,d23               @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    vmlal.s16   q5,d1,d22               @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
    vmlal.s16   q5,d3,d24               @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.s16   q5,d4,d25               @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.s16   q5,d5,d26               @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    vmlal.s16   q5,d6,d27               @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.s16   q5,d7,d28               @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vmlal.s16   q5,d16,d29              @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@
    vst1.32     {d12},[r14],r6

    vsub.s32    q4, q4, q15
    vshrn.s32   d14, q7, #6
    @vqrshrun.s16 d14,q7,#6

    vld1.16     {d17},[r3],r2           @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmull.s16   q6,d3,d23
    vmlal.s16   q6,d2,d22
    vmlal.s16   q6,d4,d24
    vmlal.s16   q6,d5,d25
    vmlal.s16   q6,d6,d26
    vmlal.s16   q6,d7,d27
    vmlal.s16   q6,d16,d28
    vmlal.s16   q6,d17,d29
    vst1.32     {d14},[r14],r6
    vsub.s32    q5, q5, q15
    vshrn.s32   d8, q4, #6
    @vqrshrun.s16 d8,q4,#6              @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.16     {d18},[r3],r2           @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.s16   q7,d4,d23
    vmlal.s16   q7,d3,d22
    vmlal.s16   q7,d5,d24
    vmlal.s16   q7,d6,d25
    vmlal.s16   q7,d7,d26
    vmlal.s16   q7,d16,d27
    vmlal.s16   q7,d17,d28
    vmlal.s16   q7,d18,d29
    vsub.s32    q6, q6, q15
    vshrn.s32   d10, q5, #6
    @vqrshrun.s16 d10,q5,#6             @sto_res = vqmovun_s16(sto_res_tmp)@

    add         r14,r1,r6
    vst1.32     {d8},[r1]!              @vst1_u8(pu1_dst,sto_res)@

epilog_end:
    vst1.32     {d10},[r14],r6          @vst1_u8(pu1_dst_tmp,sto_res)@
    vshrn.s32   d12, q6, #6
    @vqrshrun.s16 d12,q6,#6

    vst1.32     {d12},[r14],r6
    vsub.s32    q7, q7, q15
    vshrn.s32   d14, q7, #6
    @vqrshrun.s16 d14,q7,#6

    vst1.32     {d14},[r14],r6


end_loops:

    ldmfd       sp!,{r4-r12,r15}        @reload the registers from sp