///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_luma_horz_neon.s
//*
//* @brief
//*  contains function definition for intra prediction interpolation filters
//*
//*
//* @author
//*  parthiban v
//*
//* @par list of functions:
//*  - ihevc_intra_pred_luma_horz()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
//
///**
//*******************************************************************************
//*
//* @brief
//*     intra prediction interpolation filter for horizontal luma variable.
//*
//* @par description:
//*    horizontal intraprediction(mode 10) with reference samples location
//*    pointed by 'pu1_ref' to the tu block location pointed by 'pu1_dst' refer
//*    to section 8.4.4.2.6 in the standard (special case)
//*
//*    What the code below computes (r = row, c = column, two_nt = 2 * nt):
//*      nt == 4, 8, 16:
//*        dst[0][c] = sat_u8(ref[two_nt - 1]
//*                           + ((ref[two_nt + 1 + c] - ref[two_nt]) >> 1))
//*        dst[r][c] = ref[two_nt - 1 - r]                 for r = 1 .. nt-1
//*      any other nt (handled by the 32-wide loop, 16 rows per iteration):
//*        dst[r][c] = ref[two_nt - 1 - r]                 for c = 0 .. 31
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the reference samples
//*
//* @param[in] src_strd
//*  integer source stride (not read by this routine)
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] nt
//*  integer transform block size
//*
//* @param[in] mode
//*  integer intraprediction mode (not read by this routine)
//*
//* @returns
//*
//* @remarks
//*  - Only caller-saved registers are written (x3, x4, x6, x9, x12, x14,
//*    v0-v7, v16-v20, v22, v24, v26, v28, v30, v31) plus the condition flags,
//*    so no stack frame is created.
//*  - The 4x4 and 8x8 paths use 8-byte vector loads, and the 16x16 path a
//*    16-byte load, around pu1_ref[two_nt]; for nt == 4 only the low 4 lanes
//*    of each load are consumed.
//*
//*******************************************************************************
//*/
//void ihevc_intra_pred_luma_horz(uword8 *pu1_ref,
//                                word32 src_strd,
//                                uword8 *pu1_dst,
//                                word32 dst_strd,
//                                word32 nt,
//                                word32 mode)
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd   (unused)
//x2 => *pu1_dst
//x3 => dst_strd   (32-bit argument, sign-extended on entry)
//x4 => nt         (32-bit argument, sign-extended on entry)
//x5 => mode       (unused)

.text
.align 4
.include "ihevc_neon_macros.s"



.globl ihevc_intra_pred_luma_horz_av8

.type ihevc_intra_pred_luma_horz_av8, %function

ihevc_intra_pred_luma_horz_av8:

    // dst_strd and nt are word32 arguments: the procedure call standard
    // leaves bits [63:32] of x3/x4 unspecified, but both are consumed below
    // as 64-bit values (post-index offset, compares, address arithmetic).
    sxtw        x3, w3                      // x3 = (int64) dst_strd
    sxtw        x4, w4                      // x4 = (int64) nt

    lsl         x6, x4, #1                  // x6 = two_nt
    add         x12, x0, x6                 // x12 = &pu1_ref[two_nt]

    cmp         x4, #4                      // nt == 4 ?
    beq         core_loop_4

    cmp         x4, #8                      // nt == 8 ?
    beq         core_loop_8

    cmp         x4, #16                     // nt == 16 ?
    beq         core_loop_16

    sub         x12, x12, #16               // x12 = &pu1_ref[two_nt - 16]
    add         x9, x2, #16                 // x9  = dst + 16 (right half of each row)

// 32-wide path: every iteration loads 16 reference bytes and emits 16 rows.
// Lane 15 of v0 is pu1_ref[two_nt - 1 - 16*k] and feeds the first row of the
// iteration; lanes are consumed downwards. x2 writes columns 0-15, x9 writes
// columns 16-31; both advance by dst_strd after every store.
core_loop_32:
    ld1         {v0.16b}, [x12]             // 16 reference samples for this pass

    dup         v2.16b, v0.b[15]            // row 0 value
    dup         v4.16b, v0.b[14]            // row 1 value
    dup         v6.16b, v0.b[13]            // row 2 value
    st1         {v2.16b}, [x2], x3          // row 0, columns 0-15
    st1         {v2.16b}, [x9], x3          // row 0, columns 16-31

    dup         v1.16b, v0.b[12]
    st1         {v4.16b}, [x2], x3
    st1         {v4.16b}, [x9], x3

    dup         v2.16b, v0.b[11]
    st1         {v6.16b}, [x2], x3
    st1         {v6.16b}, [x9], x3

    dup         v4.16b, v0.b[10]
    st1         {v1.16b}, [x2], x3
    st1         {v1.16b}, [x9], x3

    dup         v6.16b, v0.b[9]
    st1         {v2.16b}, [x2], x3
    st1         {v2.16b}, [x9], x3

    dup         v1.16b, v0.b[8]
    st1         {v4.16b}, [x2], x3
    st1         {v4.16b}, [x9], x3

    dup         v2.16b, v0.b[7]
    st1         {v6.16b}, [x2], x3
    st1         {v6.16b}, [x9], x3

    dup         v4.16b, v0.b[6]
    st1         {v1.16b}, [x2], x3
    st1         {v1.16b}, [x9], x3

    dup         v6.16b, v0.b[5]
    st1         {v2.16b}, [x2], x3
    st1         {v2.16b}, [x9], x3

    dup         v1.16b, v0.b[4]
    st1         {v4.16b}, [x2], x3
    st1         {v4.16b}, [x9], x3

    dup         v2.16b, v0.b[3]
    st1         {v6.16b}, [x2], x3
    st1         {v6.16b}, [x9], x3

    dup         v4.16b, v0.b[2]
    st1         {v1.16b}, [x2], x3
    st1         {v1.16b}, [x9], x3

    dup         v6.16b, v0.b[1]
    st1         {v2.16b}, [x2], x3
    st1         {v2.16b}, [x9], x3
    sub         x12, x12, #16               // step back to the next 16 reference samples

    dup         v1.16b, v0.b[0]
    st1         {v4.16b}, [x2], x3
    st1         {v4.16b}, [x9], x3

    subs        x4, x4, #16                 // 16 rows done; flags drive the bgt below
    st1         {v6.16b}, [x2], x3
    st1         {v6.16b}, [x9], x3

    st1         {v1.16b}, [x2], x3
    st1         {v1.16b}, [x9], x3
    bgt         core_loop_32                // st1 does not alter flags

    ret

// 16x16 path. Row 0 is the edge-filtered row, built in two 8-column halves;
// rows 1-15 are plain replications of v0 lanes 14 down to 0.
core_loop_16:
    ldrb        w14, [x12], #1              // w14 = pu1_ref[two_nt] (zero-extended)
    ld1         {v30.8b}, [x12], #8         // pu1_ref[two_nt + 1 .. two_nt + 8]
    ld1         {v31.8b}, [x12]             // pu1_ref[two_nt + 9 .. two_nt + 16]

    dup         v28.8b, w14                 // broadcast pu1_ref[two_nt]
    sub         x12, x12, #25               // x12 = &pu1_ref[two_nt - 16]  (from two_nt + 9)
    ld1         {v0.16b}, [x12]             // lane 15 = pu1_ref[two_nt - 1]
    dup         v26.8b, v0.b[15]
    uxtl        v26.8h, v26.8b              // widen pu1_ref[two_nt - 1] to 16 bit

    dup         v2.16b, v0.b[14]
    usubl       v24.8h, v30.8b, v28.8b      // ref[two_nt + 1 + c] - ref[two_nt], c = 0..7

    dup         v4.16b, v0.b[13]
    sshr        v24.8h, v24.8h, #1          // arithmetic >> 1 of the signed difference

    dup         v6.16b, v0.b[12]
    sqadd       v22.8h, v26.8h, v24.8h      // + ref[two_nt - 1]

    dup         v1.16b, v0.b[11]
    sqxtun      v22.8b, v22.8h              // saturate to unsigned 8 bit

    st1         {v22.8b}, [x2], #8          // row 0, columns 0-7

    dup         v18.16b, v0.b[10]
    usubl       v24.8h, v31.8b, v28.8b      // same filter for columns 8-15

    dup         v19.16b, v0.b[9]
    sshr        v24.8h, v24.8h, #1

    dup         v20.16b, v0.b[8]
    sqadd       v22.8h, v26.8h, v24.8h

    dup         v16.16b, v0.b[7]
    sqxtun      v22.8b, v22.8h

    st1         {v22.8b}, [x2], x3          // row 0, columns 8-15; advance one row
    sub         x2, x2, #8                  // back to column 0 of row 1

    st1         {v2.16b}, [x2], x3          // row 1
    st1         {v4.16b}, [x2], x3          // row 2
    st1         {v6.16b}, [x2], x3          // row 3
    st1         {v1.16b}, [x2], x3          // row 4

    dup         v2.16b, v0.b[6]
    st1         {v18.16b}, [x2], x3         // row 5

    dup         v4.16b, v0.b[5]
    st1         {v19.16b}, [x2], x3         // row 6

    dup         v6.16b, v0.b[4]
    st1         {v20.16b}, [x2], x3         // row 7

    dup         v1.16b, v0.b[3]
    st1         {v16.16b}, [x2], x3         // row 8

    dup         v18.16b, v0.b[2]
    st1         {v2.16b}, [x2], x3          // row 9

    dup         v19.16b, v0.b[1]
    st1         {v4.16b}, [x2], x3          // row 10

    dup         v20.16b, v0.b[0]
    st1         {v6.16b}, [x2], x3          // row 11

    st1         {v1.16b}, [x2], x3          // row 12
    st1         {v18.16b}, [x2], x3         // row 13
    st1         {v19.16b}, [x2], x3         // row 14
    st1         {v20.16b}, [x2], x3         // row 15

    ret


// 8x8 path. Row 0 is edge-filtered; rows 1-7 replicate v0 lanes 6 down to 0.
core_loop_8:
    ldrb        w14, [x12]                  // w14 = pu1_ref[two_nt] (zero-extended)
    add         x12, x12, #1                // x12 = &pu1_ref[two_nt + 1]
    ld1         {v30.8b}, [x12]             // pu1_ref[two_nt + 1 .. two_nt + 8]

    sub         x12, x12, #9                // x12 = &pu1_ref[two_nt - 8]
    ld1         {v0.8b}, [x12]              // lane 7 = pu1_ref[two_nt - 1]
    dup         v26.8b, v0.b[7]
    dup         v28.8b, w14                 // broadcast pu1_ref[two_nt]

    dup         v3.8b, v0.b[6]
    uxtl        v26.8h, v26.8b              // widen pu1_ref[two_nt - 1] to 16 bit

    dup         v4.8b, v0.b[5]
    usubl       v24.8h, v30.8b, v28.8b      // ref[two_nt + 1 + c] - ref[two_nt]

    dup         v5.8b, v0.b[4]
    sshr        v24.8h, v24.8h, #1          // arithmetic >> 1 of the signed difference

    dup         v6.8b, v0.b[3]
    sqadd       v22.8h, v26.8h, v24.8h      // + ref[two_nt - 1]

    dup         v7.8b, v0.b[2]
    sqxtun      v22.8b, v22.8h              // saturate to unsigned 8 bit

    st1         {v22.8b}, [x2], x3          // row 0 (filtered)
    st1         {v3.8b}, [x2], x3           // row 1

    dup         v1.8b, v0.b[1]
    st1         {v4.8b}, [x2], x3           // row 2
    st1         {v5.8b}, [x2], x3           // row 3

    dup         v17.8b, v0.b[0]
    st1         {v6.8b}, [x2], x3           // row 4
    st1         {v7.8b}, [x2], x3           // row 5

    st1         {v1.8b}, [x2], x3           // row 6
    st1         {v17.8b}, [x2], x3          // row 7

    ret


// 4x4 path. Vector loads are 8 bytes wide, but only the low 32 bits of each
// result register are stored per row.
core_loop_4:
    ldrb        w14, [x12]                  // w14 = pu1_ref[two_nt] (zero-extended)
    add         x12, x12, #1                // x12 = &pu1_ref[two_nt + 1]
    ld1         {v30.8b}, [x12]             // pu1_ref[two_nt + 1 ...] (8 bytes loaded)

    sub         x12, x12, #5                // x12 = &pu1_ref[two_nt - 4]
    ld1         {v0.8b}, [x12]              // lane 3 = pu1_ref[two_nt - 1]
    dup         v28.8b, w14                 // broadcast pu1_ref[two_nt]
    dup         v26.8b, v0.b[3]
    uxtl        v26.8h, v26.8b              // widen pu1_ref[two_nt - 1] to 16 bit

    dup         v3.8b, v0.b[2]
    usubl       v24.8h, v30.8b, v28.8b      // ref[two_nt + 1 + c] - ref[two_nt]

    dup         v4.8b, v0.b[1]
    sshr        v24.8h, v24.8h, #1          // arithmetic >> 1 of the signed difference

    dup         v5.8b, v0.b[0]
    sqadd       v22.8h, v26.8h, v24.8h      // + ref[two_nt - 1]

    sqxtun      v22.8b, v22.8h              // saturate to unsigned 8 bit

    st1         {v22.s}[0], [x2], x3        // row 0 (filtered), 4 bytes
    st1         {v3.s}[0], [x2], x3         // row 1
    st1         {v4.s}[0], [x2], x3         // row 2
    st1         {v5.s}[0], [x2], x3         // row 3

    ret

// Label kept from the original layout; nothing in this routine branches to it.
end_func:

.size ihevc_intra_pred_luma_horz_av8, .-ihevc_intra_pred_luma_horz_av8


