///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevc_inter_pred_chroma_vert_neon_w16inp_w16out_neon.s
//*
//* //brief
//*  contains function definitions for inter prediction interpolation.
//*  functions are coded using neon intrinsics and can be compiled using
//*  rvct
//*
//* //author
//*  yogeswaran rs / parthiban
//*
//* //par list of functions:
//*
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* //brief
//*  chroma interprediction filter for 16bit vertical input and output.
//*
//* //par description:
//*  applies a 4-tap vertical filter with the coefficients pointed to by
//*  'pi1_coeff' to the 16-bit elements pointed to by 'pi2_src' and writes
//*  16-bit results to 'pi2_dst'.  for every output element the four source
//*  rows (row-1 .. row+2) are multiplied by coeff_0 .. coeff_3, accumulated
//*  in 32 bits, shifted right by 6 and narrowed to 16 bits with signed
//*  saturation (sqshrn).  no further offset is applied in this routine.
//*
//*  assumptions made by the pointer arithmetic below:
//*   - strides are given in 16-bit elements (they are doubled to get bytes)
//*   - one row holds 2*wd 16-bit elements (4*wd bytes)
//*   - wd and ht are multiples of 2
//*
//* //param[in] pi2_src
//*  word16 pointer to the source
//*
//* //param[out] pi2_dst
//*  word16 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] dst_strd
//*  integer destination stride
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*  none
//*
//* //remarks
//*  leaf function, AAPCS64.  x20 (callee-saved) is used as scratch and is
//*  therefore saved/restored; all other registers touched are caller-saved
//*  (x0-x14, v0-v7, v16-v19, v24-v30).
//*
//*******************************************************************************
//*/
//void ihevc_inter_pred_chroma_vert_w16inp_w16out(word16 *pi2_src,
//                                                 word16 *pi2_dst,
//                                                 word32 src_strd,
//                                                 word32 dst_strd,
//                                                 word8 *pi1_coeff,
//                                                 word32 ht,
//                                                 word32 wd)
//**************variables vs registers*****************************************
//x0 => *pi2_src
//x1 => *pi2_dst
//w2 => src_strd   (32-bit, sign-extended to x2 on entry)
//w3 => dst_strd   (32-bit, sign-extended to x3 on entry)
//x4 => *pi1_coeff
//w5 => ht         (32-bit, sign-extended to x5 on entry)
//w6 => wd         (32-bit, sign-extended to x6 on entry)
.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_vert_w16inp_w16out_av8

.type ihevc_inter_pred_chroma_vert_w16inp_w16out_av8, %function

ihevc_inter_pred_chroma_vert_w16inp_w16out_av8:

    // x20 is callee-saved and used as scratch below; x19 is stored with it
    // only so that sp stays 16-byte aligned.
    stp         x19, x20,[sp,#-16]!

    // The four word32 arguments arrive in w registers; AAPCS64 leaves the
    // upper 32 bits of the x registers unspecified, so widen them before
    // they are used in 64-bit address arithmetic and loop counters.
    sxtw        x2,w2                       //src_strd
    sxtw        x3,w3                       //dst_strd
    sxtw        x5,w5                       //ht
    sxtw        x6,w6                       //wd

    lsl         x2,x2,#1                    //src_strd in bytes (16-bit elements)
    // NOTE(review): 8 bytes are read from pi1_coeff although only the first
    // four taps are used below - confirm the coefficient table is readable
    // for 8 bytes from every entry.
    ld1         {v0.8b},[x4]                //load filter taps
    sub         x4,x0,x2                    //x4 = pi2_src - one row (row -1)
    sxtl        v0.8h, v0.8b                //sign-extend taps to 16 bit

    tst         x6,#3                       //wd a multiple of 4 ?
    dup         v16.4h, v0.h[0]             //coeff_0
    dup         v17.4h, v0.h[1]             //coeff_1
    dup         v18.4h, v0.h[2]             //coeff_2
    dup         v19.4h, v0.h[3]             //coeff_3

    bne         core_loop_ht_2              //wd not a multiple of 4 -> 2-row loop

    tst         x5,#3                       //ht a multiple of 4 ?
    beq         core_loop_ht_4              //yes -> pipelined 4-row loop

//------------------------------------------------------------------------------
// 2-row path: each inner iteration produces 4 elements of two output rows.
//------------------------------------------------------------------------------
core_loop_ht_2:
    lsl         x7,x2,#1                    //two source rows, in bytes
    lsl         x3,x3,#1                    //dst_strd in bytes
    lsl         x9,x6,#2                    //4*wd = bytes per row
    sub         x6,x3,x6,lsl #1             //dst row bytes - 2*wd (doubled when applied)
    sub         x8,x7,x9                    //src step to next row pair, after a full row
    mov         x12,x9                      //bytes left in the current row

inner_loop_ht_2:
    add         x0,x4,x2                    //x0 = row 0 of this column
    ld1         {v0.4h},[x4],#8             //row -1, advance to next column
    smull       v0.4s, v0.4h, v16.4h        //out0  = row-1 * coeff_0
    subs        x12,x12,#8                  //8 bytes of the row consumed (sets flags for bgt)
    ld1         {v2.4h},[x0],x2             //row 0
    smull       v7.4s, v2.4h, v16.4h        //out1  = row0 * coeff_0
    ld1         {v3.4h},[x0],x2             //row 1
    smlal       v0.4s, v2.4h, v17.4h        //out0 += row0 * coeff_1
    ld1         {v6.4h},[x0],x2             //row 2
    smlal       v7.4s, v3.4h, v17.4h        //out1 += row1 * coeff_1
    ld1         {v2.4h},[x0]                //row 3
    add         x7,x1,x3                    //second destination row
    smlal       v0.4s, v3.4h, v18.4h        //out0 += row1 * coeff_2
    smlal       v7.4s, v6.4h, v18.4h        //out1 += row2 * coeff_2
    smlal       v0.4s, v6.4h, v19.4h        //out0 += row2 * coeff_3
    smlal       v7.4s, v2.4h, v19.4h        //out1 += row3 * coeff_3
    sqshrn      v0.4h, v0.4s,#6             //>> 6, saturate to 16 bit
    sqshrn      v30.4h, v7.4s,#6            //>> 6, saturate to 16 bit
    st1         {v0.2s},[x1],#8             //store 4 elements of output row 0
    st1         {v30.2s},[x7]               //store 4 elements of output row 1
    bgt         inner_loop_ht_2             //more columns in this row pair

    //row pair finished
    subs        x5,x5,#2                    //two rows done
    add         x1,x1,x6,lsl #1             //dst: down two rows, back to column 0
    mov         x12,x9                      //reload bytes per row
    add         x4,x4,x8                    //src: down two rows, back to column 0
    bgt         inner_loop_ht_2             //next row pair

    b           end_loops

//------------------------------------------------------------------------------
// 4-row path (wd and ht multiples of 4): software pipelined.  Every stage
// produces a 4-element x 4-row block held in v30/v28/v26/v24; loads for the
// next block and stores of the previous one are interleaved with the MACs.
//   x11 = elements left in the current band of 4 rows (reloaded with 2*wd)
//   x12 = overall block counter, decremented by 4 per block
//   x8  = src step from the end of a band to the start of the next band
//   x14 = half of the matching dst step (applied with lsl #1)
//------------------------------------------------------------------------------
core_loop_ht_4:
    lsl         x7,x2,#2                    //four source rows, in bytes
    lsl         x10,x3,#2                   //4*dst_strd (elements)
    lsr         x11, x6, #1                 //wd/2
    sub         x14,x10,x6,lsl #1           //4*dst_strd - 2*wd
    sub         x8,x7,x6,lsl #2             //four source rows - 4*wd bytes

    mul         x12, x5 , x11               //ht * wd/2
    sub         x12, x12,#4                 //one block is completed by the epilog
    lsl         x11, x6, #1                 //2*wd elements per row
    lsl         x3,x3,#1                    //dst_strd in bytes

prolog:
    add         x0,x4,x2                    //x0 = row 0 of this column
    ld1         {v0.4h},[x4],#8             //row -1, advance to next column
    ld1         {v1.4h},[x0],x2             //row 0
    subs        x11,x11,#4                  //4 elements of the band consumed (flags for csel)
    ld1         {v2.4h},[x0],x2             //row 1
    smull       v30.4s, v0.4h, v16.4h       //out0  = row-1 * coeff_0
    ld1         {v3.4h},[x0],x2             //row 2
    smlal       v30.4s, v1.4h, v17.4h       //out0 += row0 * coeff_1
    smlal       v30.4s, v2.4h, v18.4h       //out0 += row1 * coeff_2
    add         x9,x1,x3                    //x9 = destination row 1
    smlal       v30.4s, v3.4h, v19.4h       //out0 += row2 * coeff_3

    ld1         {v4.4h},[x0],x2             //row 3
    smull       v28.4s, v1.4h, v16.4h       //out1  = row0 * coeff_0
    add         x20,x4,x8
    csel        x4, x20, x4,le              //band finished: src moves to next band
    lsl         x20,x6,#1
    csel        x11, x20, x11,le            //band finished: reload 2*wd
    smlal       v28.4s, v2.4h, v17.4h       //out1 += row1 * coeff_1
    smlal       v28.4s, v3.4h, v18.4h       //out1 += row2 * coeff_2
    ld1         {v5.4h},[x0],x2             //row 4
    smlal       v28.4s, v4.4h, v19.4h       //out1 += row3 * coeff_3

    sqshrn      v30.4h, v30.4s,#6           //out0 >> 6, saturate to 16 bit

    ld1         {v6.4h},[x0],x2             //row 5
    smull       v26.4s, v2.4h, v16.4h       //out2  = row1 * coeff_0
    smlal       v26.4s, v3.4h, v17.4h       //out2 += row2 * coeff_1
    smlal       v26.4s, v4.4h, v18.4h       //out2 += row3 * coeff_2
    add         x0,x4,x2                    //row 0 of the next block
    ld1         {v0.4h},[x4],#8             //next block: row -1
    smlal       v26.4s, v5.4h, v19.4h       //out2 += row4 * coeff_3

    sqshrn      v28.4h, v28.4s,#6           //out1 >> 6, saturate to 16 bit

    ld1         {v1.4h},[x0],x2             //next block: row 0
    smull       v24.4s, v3.4h, v16.4h       //out3  = row2 * coeff_0
    st1         {v30.2s},[x1],#8            //store out0, advance dst column
    smlal       v24.4s, v4.4h, v17.4h       //out3 += row3 * coeff_1
    ld1         {v2.4h},[x0],x2             //next block: row 1
    smlal       v24.4s, v5.4h, v18.4h       //out3 += row4 * coeff_2
    ld1         {v3.4h},[x0],x2             //next block: row 2
    smlal       v24.4s, v6.4h, v19.4h       //out3 += row5 * coeff_3
    add         x20,x1,x14,lsl #1
    csel        x1, x20, x1,le              //band finished: dst moves to next band

    sqshrn      v26.4h, v26.4s,#6           //out2 >> 6, saturate to 16 bit
    subs        x12,x12,#4                  //one block issued

    beq         epilog                      //only one block left -> drain

kernel_4:
    smull       v30.4s, v0.4h, v16.4h       //out0  = row-1 * coeff_0
    subs        x11,x11,#4                  //4 elements of the band consumed (flags for csel)
    smlal       v30.4s, v1.4h, v17.4h       //out0 += row0 * coeff_1
    st1         {v28.2s},[x9],x3            //previous block: store out1
    smlal       v30.4s, v2.4h, v18.4h       //out0 += row1 * coeff_2
    smlal       v30.4s, v3.4h, v19.4h       //out0 += row2 * coeff_3

    sqshrn      v24.4h, v24.4s,#6           //previous block: out3 >> 6, saturate

    ld1         {v4.4h},[x0],x2             //row 3
    smull       v28.4s, v1.4h, v16.4h       //out1  = row0 * coeff_0
    smlal       v28.4s, v2.4h, v17.4h       //out1 += row1 * coeff_1
    smlal       v28.4s, v3.4h, v18.4h       //out1 += row2 * coeff_2
    smlal       v28.4s, v4.4h, v19.4h       //out1 += row3 * coeff_3
    st1         {v26.2s},[x9],x3            //previous block: store out2
    add         x20,x4,x8
    csel        x4, x20, x4,le              //band finished: src moves to next band
    lsl         x20,x6,#1
    csel        x11, x20, x11,le            //band finished: reload 2*wd

    sqshrn      v30.4h, v30.4s,#6           //out0 >> 6, saturate to 16 bit

    ld1         {v5.4h},[x0],x2             //row 4
    smull       v26.4s, v2.4h, v16.4h       //out2  = row1 * coeff_0
    ld1         {v6.4h},[x0],x2             //row 5
    smlal       v26.4s, v3.4h, v17.4h       //out2 += row2 * coeff_1
    st1         {v24.2s},[x9]               //previous block: store out3
    add         x0,x4,x2                    //row 0 of the next block
    smlal       v26.4s, v4.4h, v18.4h       //out2 += row3 * coeff_2
    ld1         {v0.4h},[x4],#8             //next block: row -1
    smlal       v26.4s, v5.4h, v19.4h       //out2 += row4 * coeff_3

    sqshrn      v28.4h, v28.4s,#6           //out1 >> 6, saturate to 16 bit

    ld1         {v1.4h},[x0],x2             //next block: row 0
    smull       v24.4s, v3.4h, v16.4h       //out3  = row2 * coeff_0
    ld1         {v2.4h},[x0],x2             //next block: row 1
    smlal       v24.4s, v4.4h, v17.4h       //out3 += row3 * coeff_1
    add         x9,x1,x3                    //x9 = destination row 1 of this block
    ld1         {v3.4h},[x0],x2             //next block: row 2
    smlal       v24.4s, v5.4h, v18.4h       //out3 += row4 * coeff_2

    st1         {v30.2s},[x1],#8            //store out0, advance dst column
    smlal       v24.4s, v6.4h, v19.4h       //out3 += row5 * coeff_3

    sqshrn      v26.4h, v26.4s,#6           //out2 >> 6, saturate to 16 bit
    add         x20,x1,x14,lsl #1
    csel        x1, x20, x1,le              //band finished: dst moves to next band

    subs        x12,x12,#4                  //one block issued

    bgt         kernel_4                    //keep the pipeline running

epilog:
    // Drain: finish the block whose first three rows were already loaded
    // and flush the results still pending from the previous block.
    smull       v30.4s, v0.4h, v16.4h       //out0  = row-1 * coeff_0
    st1         {v28.2s},[x9],x3            //previous block: store out1
    smlal       v30.4s, v1.4h, v17.4h       //out0 += row0 * coeff_1
    smlal       v30.4s, v2.4h, v18.4h       //out0 += row1 * coeff_2
    smlal       v30.4s, v3.4h, v19.4h       //out0 += row2 * coeff_3

    sqshrn      v24.4h, v24.4s,#6           //previous block: out3 >> 6, saturate

    smull       v28.4s, v1.4h, v16.4h       //out1  = row0 * coeff_0
    ld1         {v4.4h},[x0],x2             //row 3
    smlal       v28.4s, v2.4h, v17.4h       //out1 += row1 * coeff_1
    st1         {v26.2s},[x9],x3            //previous block: store out2
    smlal       v28.4s, v3.4h, v18.4h       //out1 += row2 * coeff_2
    smlal       v28.4s, v4.4h, v19.4h       //out1 += row3 * coeff_3

    sqshrn      v30.4h, v30.4s,#6           //out0 >> 6, saturate to 16 bit

    smull       v26.4s, v2.4h, v16.4h       //out2  = row1 * coeff_0
    ld1         {v5.4h},[x0],x2             //row 4
    smlal       v26.4s, v3.4h, v17.4h       //out2 += row2 * coeff_1
    smlal       v26.4s, v4.4h, v18.4h       //out2 += row3 * coeff_2
    smlal       v26.4s, v5.4h, v19.4h       //out2 += row4 * coeff_3

    sqshrn      v28.4h, v28.4s,#6           //out1 >> 6, saturate to 16 bit

    st1         {v24.2s},[x9]               //previous block: store out3
    smull       v24.4s, v3.4h, v16.4h       //out3  = row2 * coeff_0
    smlal       v24.4s, v4.4h, v17.4h       //out3 += row3 * coeff_1
    add         x9,x1,x3                    //x9 = destination row 1 of the last block
    ld1         {v6.4h},[x0],x2             //row 5
    smlal       v24.4s, v5.4h, v18.4h       //out3 += row4 * coeff_2
    smlal       v24.4s, v6.4h, v19.4h       //out3 += row5 * coeff_3
    st1         {v30.2s},[x1],#8            //store out0

    sqshrn      v26.4h, v26.4s,#6           //out2 >> 6, saturate to 16 bit

    st1         {v28.2s},[x9],x3            //store out1

    sqshrn      v24.4h, v24.4s,#6           //out3 >> 6, saturate to 16 bit
    st1         {v26.2s},[x9],x3            //store out2

    st1         {v24.2s},[x9]               //store out3

end_loops:
    ldp         x19, x20,[sp],#16           //restore callee-saved pair

    ret