///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
// *******************************************************************************
// * //file
// *  ihevc_padding_neon.s
// *
// * //brief
// *  contains function definitions for horizontal (left / right) padding
// *
// * //author
// *  naveen sr
// *
// * //par list of functions:
// *  - ihevc_pad_left_luma_av8()
// *  - ihevc_pad_left_chroma_av8()
// *  - ihevc_pad_right_luma_av8()
// *  - ihevc_pad_right_chroma_av8()
// *
// * //remarks
// *  Target : AArch64, GNU as syntax, AAPCS64 calling convention.
// *
// *  Every function stores exactly 80 bytes (5 x 16-byte NEON stores) per row,
// *  independent of the value passed in pad_size. The left variants only use
// *  pad_size to locate the start of the pad area (pu1_src - pad_size); the
// *  right variants do not read pad_size at all. Callers are therefore
// *  expected to pass pad_size == 80.
// *
// *  The word32 arguments arrive in w1..w3. AAPCS64 leaves the upper 32 bits
// *  of x1..x3 unspecified, so each function sign-extends what it uses as a
// *  64-bit quantity and counts rows in w2.
// *
// *  Rows are processed four at a time; a one-row tail loop handles heights
// *  that are not a multiple of four. ht <= 0 writes nothing.
// *
// *  All functions are leaf functions, use no stack, and only touch
// *  caller-saved registers.
// *
// *******************************************************************************
// */

///**
//*******************************************************************************
//*
//* //brief
//*  padding (luma block) at the left of a 2d array
//*
//* //par description:
//*  for each of ht rows, the byte at pu1_src is replicated into the 80 bytes
//*  starting at (pu1_src - pad_size)
//*
//* //param[in] pu1_src
//*  uword8 pointer to the left-most pixel of the first row
//*
//* //param[in] src_strd
//*  integer source stride in bytes
//*
//* //param[in] ht
//*  integer height of the array (number of rows to pad)
//*
//* //param[in] pad_size
//*  integer padding size; expected to be 80 (see file remarks)
//*
//* //returns
//*  none
//*
//* //remarks
//*  clobbers x0-x11 (except x2's use as w2 counter), v0, v2, v4, v6, flags
//*
//*******************************************************************************
//*/
//void ihevc_pad_left_luma(uword8 *pu1_src,
//                         word32 src_strd,
//                         word32 ht,
//                         word32 pad_size)
//**************variables vs registers*************************
//    x0 => *pu1_src
//    w1 => src_strd
//    w2 => ht
//    w3 => pad_size

.text
.align 4

.globl ihevc_pad_left_luma_av8

.type ihevc_pad_left_luma_av8, %function

ihevc_pad_left_luma_av8:

    sxtw        x1, w1                      // src_strd as signed 64-bit byte offset
    sxtw        x3, w3                      // pad_size as signed 64-bit byte offset
    cmp         w2, #4
    b.lt        .Lpad_left_luma_tail        // fewer than four rows (or ht <= 0)

.Lpad_left_luma_loop4:
    sub         x4, x0, x3                  // x4 = pad start of row 0

    // fetch the left-most pixel of four consecutive rows
    ldrb        w8, [x0]
    add         x0, x0, x1
    ldrb        w9, [x0]
    add         x0, x0, x1
    ldrb        w10, [x0]
    add         x0, x0, x1
    ldrb        w11, [x0]
    add         x0, x0, x1                  // x0 -> first row of the next group

    add         x5, x4, x1                  // x5 = pad start of row 1
    add         x6, x5, x1                  // x6 = pad start of row 2
    add         x7, x6, x1                  // x7 = pad start of row 3

    // broadcast each pixel to all 16 byte lanes
    dup         v0.16b, w8
    dup         v2.16b, w9
    dup         v4.16b, w10
    dup         v6.16b, w11

    // row 0 : 5 x 16 = 80 bytes
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4]

    // row 1 : 5 x 16 = 80 bytes
    st1         {v2.16b}, [x5], #16
    st1         {v2.16b}, [x5], #16
    st1         {v2.16b}, [x5], #16
    st1         {v2.16b}, [x5], #16
    st1         {v2.16b}, [x5]

    // row 2 : 5 x 16 = 80 bytes
    st1         {v4.16b}, [x6], #16
    st1         {v4.16b}, [x6], #16
    st1         {v4.16b}, [x6], #16
    st1         {v4.16b}, [x6], #16
    st1         {v4.16b}, [x6]

    // row 3 : 5 x 16 = 80 bytes
    st1         {v6.16b}, [x7], #16
    st1         {v6.16b}, [x7], #16
    st1         {v6.16b}, [x7], #16
    st1         {v6.16b}, [x7], #16
    st1         {v6.16b}, [x7]

    sub         w2, w2, #4                  // four rows done
    cmp         w2, #4
    b.ge        .Lpad_left_luma_loop4       // at least four rows still pending

.Lpad_left_luma_tail:
    cmp         w2, #0
    b.le        .Lpad_left_luma_done        // nothing left (also covers ht <= 0)

.Lpad_left_luma_row:
    // invariant: w2 = rows remaining, 1..3
    ldrb        w8, [x0]                    // left-most pixel of this row
    sub         x4, x0, x3                  // pad start of this row
    add         x0, x0, x1                  // advance to the next row
    dup         v0.16b, w8

    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4]

    subs        w2, w2, #1
    b.gt        .Lpad_left_luma_row

.Lpad_left_luma_done:
    ret





///**
//*******************************************************************************
//*
//* //brief
//*  padding (chroma block) at the left of a 2d array
//*
//* //par description:
//*  for each of ht rows, the 16-bit value at pu1_src is replicated into the
//*  80 bytes (40 halfwords) starting at (pu1_src - pad_size)
//*
//* //param[in] pu1_src
//*  uword8 pointer to the left-most 2-byte element of the first row
//*  (presumably an interleaved chroma pair - confirm against the caller)
//*
//* //param[in] src_strd
//*  integer source stride in bytes
//*
//* //param[in] ht
//*  integer height of the array (number of rows to pad)
//*
//* //param[in] pad_size
//*  integer padding size in bytes; expected to be 80 (see file remarks)
//*
//* //returns
//*  none
//*
//* //remarks
//*  clobbers x0-x11 (except x2's use as w2 counter), v0, v2, v4, v6, flags
//*
//*******************************************************************************
//*/
//void ihevc_pad_left_chroma(uword8 *pu1_src,
//                           word32 src_strd,
//                           word32 ht,
//                           word32 pad_size)
//**************variables vs registers*************************
//    x0 => *pu1_src
//    w1 => src_strd
//    w2 => ht
//    w3 => pad_size

.globl ihevc_pad_left_chroma_av8

.type ihevc_pad_left_chroma_av8, %function

ihevc_pad_left_chroma_av8:

    sxtw        x1, w1                      // src_strd as signed 64-bit byte offset
    sxtw        x3, w3                      // pad_size as signed 64-bit byte offset
    cmp         w2, #4
    b.lt        .Lpad_left_chroma_tail      // fewer than four rows (or ht <= 0)

.Lpad_left_chroma_loop4:
    sub         x4, x0, x3                  // x4 = pad start of row 0

    // fetch the left-most 2-byte element of four consecutive rows
    ldrh        w8, [x0]
    add         x0, x0, x1
    ldrh        w9, [x0]
    add         x0, x0, x1
    ldrh        w10, [x0]
    add         x0, x0, x1
    ldrh        w11, [x0]
    add         x0, x0, x1                  // x0 -> first row of the next group

    add         x5, x4, x1                  // x5 = pad start of row 1
    add         x6, x5, x1                  // x6 = pad start of row 2
    add         x7, x6, x1                  // x7 = pad start of row 3

    // broadcast each 16-bit element to all 8 halfword lanes
    dup         v0.8h, w8
    dup         v2.8h, w9
    dup         v4.8h, w10
    dup         v6.8h, w11

    // row 0 : 5 x 16 = 80 bytes
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4]

    // row 1 : 5 x 16 = 80 bytes
    st1         {v2.16b}, [x5], #16
    st1         {v2.16b}, [x5], #16
    st1         {v2.16b}, [x5], #16
    st1         {v2.16b}, [x5], #16
    st1         {v2.16b}, [x5]

    // row 2 : 5 x 16 = 80 bytes
    st1         {v4.16b}, [x6], #16
    st1         {v4.16b}, [x6], #16
    st1         {v4.16b}, [x6], #16
    st1         {v4.16b}, [x6], #16
    st1         {v4.16b}, [x6]

    // row 3 : 5 x 16 = 80 bytes
    st1         {v6.16b}, [x7], #16
    st1         {v6.16b}, [x7], #16
    st1         {v6.16b}, [x7], #16
    st1         {v6.16b}, [x7], #16
    st1         {v6.16b}, [x7]

    sub         w2, w2, #4                  // four rows done
    cmp         w2, #4
    b.ge        .Lpad_left_chroma_loop4     // at least four rows still pending

.Lpad_left_chroma_tail:
    cmp         w2, #0
    b.le        .Lpad_left_chroma_done      // nothing left (also covers ht <= 0)

.Lpad_left_chroma_row:
    // invariant: w2 = rows remaining, 1..3
    ldrh        w8, [x0]                    // left-most 2-byte element of this row
    sub         x4, x0, x3                  // pad start of this row
    add         x0, x0, x1                  // advance to the next row
    dup         v0.8h, w8

    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4]

    subs        w2, w2, #1
    b.gt        .Lpad_left_chroma_row

.Lpad_left_chroma_done:
    ret





///**
//*******************************************************************************
//*
//* //brief
//*  padding (luma block) at the right of a 2d array
//*
//* //par description:
//*  for each of ht rows, the byte at (pu1_src - 1) is replicated into the
//*  80 bytes starting at pu1_src
//*
//* //param[in] pu1_src
//*  uword8 pointer to the first byte to the right of the row's last pixel
//*
//* //param[in] src_strd
//*  integer source stride in bytes
//*
//* //param[in] ht
//*  integer height of the array (number of rows to pad)
//*
//* //param[in] pad_size
//*  integer padding size; not read by this implementation, which always
//*  writes 80 bytes per row (see file remarks)
//*
//* //returns
//*  none
//*
//* //remarks
//*  clobbers x0, x1, w2, x4-x11, v0, v2, v4, v6, flags
//*
//*******************************************************************************
//*/
//void ihevc_pad_right_luma(uword8 *pu1_src,
//                          word32 src_strd,
//                          word32 ht,
//                          word32 pad_size)
//{
//    word32 row;
//
//    for(row = 0; row < ht; row++)
//    {
//        memset(pu1_src, *(pu1_src - 1), pad_size);
//
//        pu1_src += src_strd;
//    }
//}
//
//**************variables vs registers*************************
//    x0 => *pu1_src
//    w1 => src_strd
//    w2 => ht
//    w3 => pad_size (unused)

.globl ihevc_pad_right_luma_av8

.type ihevc_pad_right_luma_av8, %function

ihevc_pad_right_luma_av8:

    sxtw        x1, w1                      // src_strd as signed 64-bit byte offset
    cmp         w2, #4
    b.lt        .Lpad_right_luma_tail       // fewer than four rows (or ht <= 0)

.Lpad_right_luma_loop4:
    mov         x4, x0                      // x4 = pad start of row 0

    // fetch the right-most pixel of four consecutive rows
    ldrb        w8, [x0, #-1]
    add         x0, x0, x1
    ldrb        w9, [x0, #-1]
    add         x0, x0, x1
    ldrb        w10, [x0, #-1]
    add         x0, x0, x1
    ldrb        w11, [x0, #-1]
    add         x0, x0, x1                  // x0 -> first row of the next group

    add         x5, x4, x1                  // x5 = pad start of row 1
    add         x6, x5, x1                  // x6 = pad start of row 2
    add         x7, x6, x1                  // x7 = pad start of row 3

    // broadcast each pixel to all 16 byte lanes
    dup         v0.16b, w8
    dup         v2.16b, w9
    dup         v4.16b, w10
    dup         v6.16b, w11

    // row 0 : 5 x 16 = 80 bytes
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4]

    // row 1 : 5 x 16 = 80 bytes
    st1         {v2.16b}, [x5], #16
    st1         {v2.16b}, [x5], #16
    st1         {v2.16b}, [x5], #16
    st1         {v2.16b}, [x5], #16
    st1         {v2.16b}, [x5]

    // row 2 : 5 x 16 = 80 bytes
    st1         {v4.16b}, [x6], #16
    st1         {v4.16b}, [x6], #16
    st1         {v4.16b}, [x6], #16
    st1         {v4.16b}, [x6], #16
    st1         {v4.16b}, [x6]

    // row 3 : 5 x 16 = 80 bytes
    st1         {v6.16b}, [x7], #16
    st1         {v6.16b}, [x7], #16
    st1         {v6.16b}, [x7], #16
    st1         {v6.16b}, [x7], #16
    st1         {v6.16b}, [x7]

    sub         w2, w2, #4                  // four rows done
    cmp         w2, #4
    b.ge        .Lpad_right_luma_loop4      // at least four rows still pending

.Lpad_right_luma_tail:
    cmp         w2, #0
    b.le        .Lpad_right_luma_done       // nothing left (also covers ht <= 0)

.Lpad_right_luma_row:
    // invariant: w2 = rows remaining, 1..3
    ldrb        w8, [x0, #-1]               // right-most pixel of this row
    mov         x4, x0                      // pad start of this row
    add         x0, x0, x1                  // advance to the next row
    dup         v0.16b, w8

    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4]

    subs        w2, w2, #1
    b.gt        .Lpad_right_luma_row

.Lpad_right_luma_done:
    ret





///**
//*******************************************************************************
//*
//* //brief
//*  padding (chroma block) at the right of a 2d array
//*
//* //par description:
//*  for each of ht rows, the 16-bit value at (pu1_src - 2) is replicated into
//*  the 80 bytes (40 halfwords) starting at pu1_src
//*
//* //param[in] pu1_src
//*  uword8 pointer to the first byte to the right of the row's last 2-byte
//*  element (presumably an interleaved chroma pair - confirm against the
//*  caller)
//*
//* //param[in] src_strd
//*  integer source stride in bytes
//*
//* //param[in] ht
//*  integer height of the array (number of rows to pad)
//*
//* //param[in] pad_size
//*  integer padding size; not read by this implementation, which always
//*  writes 80 bytes per row (see file remarks)
//*
//* //returns
//*  none
//*
//* //remarks
//*  clobbers x0, x1, w2, x4-x11, v0, v2, v4, v6, flags
//*
//*******************************************************************************
//*/
//void ihevc_pad_right_chroma(uword8 *pu1_src,
//                            word32 src_strd,
//                            word32 ht,
//                            word32 pad_size)
//**************variables vs registers*************************
//    x0 => *pu1_src
//    w1 => src_strd
//    w2 => ht
//    w3 => pad_size (unused)

.globl ihevc_pad_right_chroma_av8

.type ihevc_pad_right_chroma_av8, %function

ihevc_pad_right_chroma_av8:

    sxtw        x1, w1                      // src_strd as signed 64-bit byte offset
    cmp         w2, #4
    b.lt        .Lpad_right_chroma_tail     // fewer than four rows (or ht <= 0)

.Lpad_right_chroma_loop4:
    mov         x4, x0                      // x4 = pad start of row 0

    // fetch the right-most 2-byte element of four consecutive rows
    ldrh        w8, [x0, #-2]
    add         x0, x0, x1
    ldrh        w9, [x0, #-2]
    add         x0, x0, x1
    ldrh        w10, [x0, #-2]
    add         x0, x0, x1
    ldrh        w11, [x0, #-2]
    add         x0, x0, x1                  // x0 -> first row of the next group

    add         x5, x4, x1                  // x5 = pad start of row 1
    add         x6, x5, x1                  // x6 = pad start of row 2
    add         x7, x6, x1                  // x7 = pad start of row 3

    // broadcast each 16-bit element to all 8 halfword lanes
    dup         v0.8h, w8
    dup         v2.8h, w9
    dup         v4.8h, w10
    dup         v6.8h, w11

    // row 0 : 5 x 16 = 80 bytes
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4]

    // row 1 : 5 x 16 = 80 bytes
    st1         {v2.16b}, [x5], #16
    st1         {v2.16b}, [x5], #16
    st1         {v2.16b}, [x5], #16
    st1         {v2.16b}, [x5], #16
    st1         {v2.16b}, [x5]

    // row 2 : 5 x 16 = 80 bytes
    st1         {v4.16b}, [x6], #16
    st1         {v4.16b}, [x6], #16
    st1         {v4.16b}, [x6], #16
    st1         {v4.16b}, [x6], #16
    st1         {v4.16b}, [x6]

    // row 3 : 5 x 16 = 80 bytes
    st1         {v6.16b}, [x7], #16
    st1         {v6.16b}, [x7], #16
    st1         {v6.16b}, [x7], #16
    st1         {v6.16b}, [x7], #16
    st1         {v6.16b}, [x7]

    sub         w2, w2, #4                  // four rows done
    cmp         w2, #4
    b.ge        .Lpad_right_chroma_loop4    // at least four rows still pending

.Lpad_right_chroma_tail:
    cmp         w2, #0
    b.le        .Lpad_right_chroma_done     // nothing left (also covers ht <= 0)

.Lpad_right_chroma_row:
    // invariant: w2 = rows remaining, 1..3
    ldrh        w8, [x0, #-2]               // right-most 2-byte element of this row
    mov         x4, x0                      // pad start of this row
    add         x0, x0, x1                  // advance to the next row
    dup         v0.8h, w8

    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4], #16
    st1         {v0.16b}, [x4]

    subs        w2, w2, #1
    b.gt        .Lpad_right_chroma_row

.Lpad_right_chroma_done:
    ret