///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevc_inter_pred_chroma_copy_w16out_neon.s
//*
//* //brief
//*  contains function definitions for inter prediction interpolation.
//*  functions are coded using neon intrinsics and can be compiled using
//*  rvct
//*
//* //author
//*  yogeswaran rs
//*
//* //par list of functions:
//*  - ihevc_inter_pred_chroma_copy_w16out_av8
//*
//* //remarks
//*  syntax: GNU as, AArch64 (comments use //). abi: AAPCS64.
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* //brief
//*  chroma interprediction filter for copy with 16-bit output
//*
//* //par description:
//*  reads an array of 2*wd bytes by ht rows from 'src', zero-extends every
//*  byte to 16 bit, shifts it left by 6 and stores the result to 'dst'
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[out] pi2_dst
//*  word16 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride (added to the source pointer as a byte offset)
//*
//* //param[in] dst_strd
//*  integer destination stride (in 16-bit elements; the code doubles it to
//*  obtain the byte offset between destination rows)
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients (not read by this function)
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array (2*wd bytes are processed per row)
//*
//* //returns
//*  nothing; returns immediately when ht <= 0 or wd <= 0
//*
//* //remarks
//*  - the narrow path loads 8 source bytes per 4 bytes consumed, and the wide
//*    path pre-loads one tile ahead, so bytes past the block are read (never
//*    stored).
//*  - NOTE(review): on the 8-column path only ht rounded down to a multiple
//*    of 4 is processed when ht >= 4 (exactly 2 rows when ht < 4); the
//*    ht % 4 remainder rows are not written there. presumably callers only
//*    use heights for which this is sufficient - confirm against callers.
//*
//*******************************************************************************
//*/

//void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src,
//                                            word16 *pi2_dst,
//                                            word32 src_strd,
//                                            word32 dst_strd,
//                                            word8 *pi1_coeff,
//                                            word32 ht,
//                                            word32 wd)
//**************variables vs registers*****************************************
//x0 => *pu1_src
//x1 => *pi2_dst
//x2 => src_strd   (w2 on entry, sign-extended below)
//x3 => dst_strd   (w3 on entry, sign-extended below)
//x4 => *pi1_coeff (unused; x4 is reused as the column counter)
//x5 => ht         (w5 on entry; x5 is reused as a pointer / row pitch)
//x6 => wd         (w6 on entry; x6 is reused as a pointer / row pitch)
//
//working registers after the entry sequence:
//x7  => ht, later the tile counter of the 8-column path
//x8  => ht % 4, later the source band rewind of the 8-column path
//x9  => ht rounded down to a multiple of 4
//x12 => 2*wd
//x20 => scratch (callee-saved, hence the stp/ldp pair)
//clobbers: x4-x12, x20 (restored), v0-v7, v16, v18, v20, v22, v24, v26, flags

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_copy_w16out_av8

.type ihevc_inter_pred_chroma_copy_w16out_av8, %function

ihevc_inter_pred_chroma_copy_w16out_av8:

    // stmfd sp!, {x4-x12, x14}             //stack stores the values of the arguments

    stp         x19, x20,[sp,#-16]!         //x20 is used as scratch below; keeps sp 16-byte aligned

    // the four word32 arguments arrive in w2/w3/w5/w6. AAPCS64 leaves the
    // upper 32 bits of those registers unspecified, so they are sign-extended
    // before being used in 64-bit address arithmetic and loop counts.
    sxtw        x2,w2                       //src_strd
    sxtw        x3,w3                       //dst_strd
    sxtw        x7,w5                       //loads ht
    sxtw        x12,w6                      //loads wd
    lsl         x12,x12,#1                  //2*wd (presumably interleaved cb/cr - confirm)

    cmp         x12,#0                      //wd condition(wd <= 0)
    ble         end_loops                   //empty width: nothing to do (the loops below assume 2*wd > 0)
    cmp         x7,#0                       //ht condition(ht <= 0)
    ble         end_loops                   //empty height: nothing to do
    and         x8,x7,#3                    //x8 = ht % 4 (remainder rows)
    sub         x9,x7,x8                    //x9 = ht rounded down to a multiple of 4
    and         x11,x7,#6
    cmp         x11,#6
    beq         loop_ht_6                   //(ht & 6) == 6: always take the 4-column path
    tst         x12,#7                      //conditional check for wd (multiples)
    beq         core_loop_wd_8              //2*wd is a multiple of 8: take the 8-column path

//------------------------------------------------------------------------------
// 4-column path: 4 source bytes -> 4 word16 outputs per row and iteration.
// x11 = 2*wd - 4 (offset of the last column group), x6 = dst row pitch in bytes
//------------------------------------------------------------------------------
loop_ht_6:
    sub         x11,x12,#4
    lsl         x6, x3,#1                   //dst_strd * sizeof(word16)
    adds        x6, x6,#0                   //value unchanged; the flags set here are overwritten by the cmp below
    cmp         x9,#0
    beq         outer_loop_wd_4_ht_2        //ht < 4: only the 2-row loop runs

outer_loop_wd_4:
    subs        x4,x12,#0                   //x4 = remaining bytes in this row band
    ble         end_inner_loop_wd_4

inner_loop_wd_4:
    // no instruction between the subs below and the closing bgt writes the
    // flags, so bgt tests the updated column counter.
    ld1         {v0.8b},[x0]                //vld1_u8(pu1_src_tmp)
    add         x5,x0,x2                    //pu1_src +src_strd
    uxtl        v0.8h, v0.8b                //vmovl_u8(vld1_u8(pu1_src_tmp)
    add         x10,x1,x6                   //dst row 1
    subs        x4,x4,#4                    //wd - 4
    shl         v0.2d, v0.2d,#6             //vshlq_n_s64(temp, 6); lanes hold values <= 255, so nothing crosses a 16-bit lane
    ld1         {v22.8b},[x5],x2            //vld1_u8(pu1_src_tmp)
    add         x0,x0,#4                    //pu1_src += 4
    st1         {v0.1d},[x1]                //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    add         x1,x1,#8                    //pi2_dst += 4 elements
    uxtl        v22.8h, v22.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
    ld1         {v24.8b},[x5],x2            //vld1_u8(pu1_src_tmp)
    shl         v22.2d, v22.2d,#6           //vshlq_n_s64(temp, 6)
    uxtl        v24.8h, v24.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
    st1         {v22.1d},[x10],x6           //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    shl         v24.2d, v24.2d,#6           //vshlq_n_s64(temp, 6)
    ld1         {v26.8b},[x5],x2            //vld1_u8(pu1_src_tmp)
    st1         {v24.1d},[x10],x6           //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    uxtl        v26.8h, v26.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
    shl         v26.2d, v26.2d,#6           //vshlq_n_s64(temp, 6)
    st1         {v26.1d},[x10],x6           //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    bgt         inner_loop_wd_4

end_inner_loop_wd_4:
    subs        x9,x9,#4                    //ht - 4
    sub         x0,x5,x11                   //x5 is 4 rows below the last column group: rewind to column 0
    sub         x1,x10,x11,lsl #1           //same for dst (2 bytes per element)
    bgt         outer_loop_wd_4
    cmp         x8,#0
    bgt         outer_loop_wd_4_ht_2        //remainder rows left: run the 2-row loop


end_loops:
    // ldmfd sp!,{x4-x12,x15}               //reload the registers from sp
    ldp         x19, x20,[sp],#16

    ret


outer_loop_wd_4_ht_2:
    subs        x4,x12,#0                   //x4 = remaining bytes in the row
    ble         end_inner_loop_wd_4         //not taken: the entry check guarantees 2*wd > 0

inner_loop_wd_4_ht_2:
    // two rows per iteration; the row pointers are rebuilt from x0/x1 each pass
    ld1         {v0.8b},[x0]                //vld1_u8(pu1_src_tmp)
    add         x5,x0,x2                    //pu1_src +src_strd
    uxtl        v0.8h, v0.8b                //vmovl_u8(vld1_u8(pu1_src_tmp)
    add         x10,x1,x6                   //dst row 1
    subs        x4,x4,#4                    //wd - 4
    shl         v0.2d, v0.2d,#6             //vshlq_n_s64(temp, 6)
    ld1         {v22.8b},[x5],x2            //vld1_u8(pu1_src_tmp)
    add         x0,x0,#4                    //pu1_src += 4
    st1         {v0.1d},[x1]                //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    add         x1,x1,#8                    //pi2_dst += 4 elements
    uxtl        v22.8h, v22.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
    shl         v22.2d, v22.2d,#6           //vshlq_n_s64(temp, 6)
    st1         {v22.1d},[x10],x6           //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    bgt         inner_loop_wd_4_ht_2
    b           end_loops


//------------------------------------------------------------------------------
// 8-column path: 8 source bytes x 4 rows (one tile) per step. loads of the next
// tile are issued before the stores of the current one.
// x5  = dst row pitch in bytes
// x11 = 4*dst_strd - 2*wd (elements): moves dst to the next 4-row band
// x8  = 4*src_strd - 2*wd (bytes):    moves src to the next 4-row band
// x4  = bytes left in the current band, reloaded from x12 at each band end
// x7  = 4 * (number of tiles) - 4, decremented by 4 per tile
//------------------------------------------------------------------------------
core_loop_wd_8:
    //sub x11,x12,#8
    lsl         x5, x3,#1                   //dst_strd * sizeof(word16)
    adds        x5, x5,#0                   //value unchanged; the flags set here are overwritten by the cmp below
    sub         x20,x12,x3, lsl #2          // x11 = (dst_strd * 4) - width
    neg         x11, x20
    sub         x20,x12,x2,lsl #2           //x2->src_strd
    neg         x8, x20                     // x8 = (src_strd * 4) - width
    lsr         x4, x12, #3                 // divide by 8
    mov         x7,x9
    mul         x7, x7, x4                  //rounded ht * tiles per band
    sub         x4,x12,#0                   //wd conditional check
    sub         x7,x7,#4                    //subtract one for epilog
    cmp         x9,#0
    beq         core_loop_wd_8_ht_2         //ht < 4: only the 2-row loop runs

prolog:
    add         x6,x0,x2                    //pu1_src_tmp += src_strd
    add         x10,x1,x5                   //dst row 1
    ld1         {v1.8b},[x0],#8             //vld1_u8(pu1_src_tmp)
    ld1         {v3.8b},[x6],x2             //vld1_u8(pu1_src_tmp)
    ld1         {v5.8b},[x6],x2             //vld1_u8(pu1_src_tmp)
    ld1         {v7.8b},[x6],x2             //vld1_u8(pu1_src_tmp)
    uxtl        v16.8h, v1.8b               //vmovl_u8(vld1_u8(pu1_src_tmp))
    uxtl        v18.8h, v3.8b               //vmovl_u8(vld1_u8(pu1_src_tmp)
    uxtl        v20.8h, v5.8b               //vmovl_u8(vld1_u8(pu1_src_tmp)
    uxtl        v22.8h, v7.8b               //vmovl_u8(vld1_u8(pu1_src_tmp)
    subs        x4,x4,#8                    //wd decrements by 8; these flags drive the three csel..le below
    shl         v0.8h, v16.8h,#6            //vshlq_n_s16(tmp, 6)
    shl         v2.8h, v18.8h,#6            //vshlq_n_s16(tmp, 6)
    shl         v4.8h, v20.8h,#6            //vshlq_n_s16(tmp, 6)
    shl         v6.8h, v22.8h,#6            //vshlq_n_s16(tmp, 6)
    add         x20,x0,x8
    csel        x0, x20, x0,le              //band finished: src moves to the next 4-row band
    add         x6,x0,x2                    //pu1_src_tmp += src_strd
    ld1         {v1.8b},[x0],#8             //vld1_u8(pu1_src_tmp) - pre-load of the next tile
    ld1         {v3.8b},[x6],x2             //vld1_u8(pu1_src_tmp)
    ld1         {v5.8b},[x6],x2             //vld1_u8(pu1_src_tmp)
    ld1         {v7.8b},[x6],x2             //vld1_u8(pu1_src_tmp)

    st1         {v0.8h},[x1],#16            //vst1q_s16(pi2_dst_tmp, tmp)
    add         x20,x1,x11,lsl #1
    csel        x1, x20, x1,le              //band finished: dst moves to the next 4-row band
    sub         x20,x12,#0                  //wd conditional check
    csel        x4, x20, x4,le              //band finished: reload the column counter

    subs        x7,x7,#4                    //ht - 4

    blt         epilog_end                  //jumps to epilog_end (single tile: only rows 1-3 remain to be stored)
    beq         epilog                      //jumps to epilog (exactly one more tile)



outer_loop_wd_8:

    st1         {v2.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
    uxtl        v16.8h, v1.8b               //vmovl_u8(vld1_u8(pu1_src_tmp))

    st1         {v4.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
    uxtl        v18.8h, v3.8b               //vmovl_u8(vld1_u8(pu1_src_tmp)

    st1         {v6.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
    uxtl        v20.8h, v5.8b               //vmovl_u8(vld1_u8(pu1_src_tmp)

    uxtl        v22.8h, v7.8b               //vmovl_u8(vld1_u8(pu1_src_tmp)

    subs        x4,x4,#8                    //wd decrements by 8; these flags drive the three csel..le below
    add         x20,x0,x8
    csel        x0, x20, x0,le              //band finished: src moves to the next 4-row band

    add         x6,x0,x2                    //pu1_src_tmp += src_strd

    ld1         {v1.8b},[x0],#8             //vld1_u8(pu1_src_tmp)
    shl         v0.8h, v16.8h,#6            //vshlq_n_s16(tmp, 6)

    ld1         {v3.8b},[x6],x2             //vld1_u8(pu1_src_tmp)
    shl         v2.8h, v18.8h,#6            //vshlq_n_s16(tmp, 6)

    ld1         {v5.8b},[x6],x2             //vld1_u8(pu1_src_tmp)
    shl         v4.8h, v20.8h,#6            //vshlq_n_s16(tmp, 6)

    ld1         {v7.8b},[x6],x2             //vld1_u8(pu1_src_tmp)
    add         x10,x1,x5                   //dst row 1 of the tile being stored

    shl         v6.8h, v22.8h,#6            //vshlq_n_s16(tmp, 6)

    st1         {v0.8h},[x1],#16            //vst1q_s16(pi2_dst_tmp, tmp)

    add         x20,x1,x11,lsl #1
    csel        x1, x20, x1,le              //band finished: dst moves to the next 4-row band
    sub         x20,x12,#0                  //wd conditional check
    csel        x4, x20, x4,le              //band finished: reload the column counter

    subs        x7,x7,#4                    //ht - 4
    bgt         outer_loop_wd_8

epilog:
    // drain: store rows 1-3 of the previous tile, then convert and store the
    // tile that is already loaded; no further loads are issued.
    st1         {v2.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
    uxtl        v16.8h, v1.8b               //vmovl_u8(vld1_u8(pu1_src_tmp))

    st1         {v4.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
    uxtl        v18.8h, v3.8b               //vmovl_u8(vld1_u8(pu1_src_tmp)

    st1         {v6.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
    uxtl        v20.8h, v5.8b               //vmovl_u8(vld1_u8(pu1_src_tmp)

    uxtl        v22.8h, v7.8b               //vmovl_u8(vld1_u8(pu1_src_tmp)
    //add x6,x0,x2                          //pu1_src_tmp += src_strd

    shl         v0.8h, v16.8h,#6            //vshlq_n_s16(tmp, 6)
    shl         v2.8h, v18.8h,#6            //vshlq_n_s16(tmp, 6)
    shl         v4.8h, v20.8h,#6            //vshlq_n_s16(tmp, 6)
    add         x10,x1,x5                   //dst row 1 of the last tile
    shl         v6.8h, v22.8h,#6            //vshlq_n_s16(tmp, 6)

    st1         {v0.8h},[x1],#16            //vst1q_s16(pi2_dst_tmp, tmp)
epilog_end:
    st1         {v2.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
    st1         {v4.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
    st1         {v6.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
    b           end_loops

core_loop_wd_8_ht_2:
    // two rows, 8 columns per iteration; x12 itself is the column counter here
    add         x6,x0,x2                    //pu1_src_tmp += src_strd
    add         x10,x1,x5                   //dst row 1
    ld1         {v1.8b},[x0],#8             //vld1_u8(pu1_src_tmp)
    ld1         {v3.8b},[x6],x2             //vld1_u8(pu1_src_tmp)
    uxtl        v16.8h, v1.8b               //vmovl_u8(vld1_u8(pu1_src_tmp))
    uxtl        v18.8h, v3.8b               //vmovl_u8(vld1_u8(pu1_src_tmp)
    subs        x12,x12,#8                  //wd decrements by 8
    shl         v0.8h, v16.8h,#6            //vshlq_n_s16(tmp, 6)
    shl         v2.8h, v18.8h,#6            //vshlq_n_s16(tmp, 6)
    st1         {v0.8h},[x1],#16            //vst1q_s16(pi2_dst_tmp, tmp)
    st1         {v2.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
    bgt         core_loop_wd_8_ht_2

    // ldmfd sp!,{x4-x12,x15}               //reload the registers from sp
    ldp         x19, x20,[sp],#16

    ret