///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///*******************************************************************************
//* //file
//*  ihevcd_fmt_conv_420sp_to_rgba8888.s
//*
//* //brief
//*  contains function definitions for format conversions
//*
//* //author
//*  ittiam
//*
//* //par list of functions:
//*  ihevcd_fmt_conv_420sp_to_rgba8888_av8
//*
//* //remarks
//*  Syntax : GNU as, AArch64 (A64 + Advanced SIMD)
//*  ABI    : AAPCS64
//*
//*******************************************************************************/

    // Not referenced anywhere in this file; kept so that the set of assembler
    // symbols defined by the file is unchanged.
    .equ DO1STROUNDING, 0

    // Chroma weights, signed 16-bit fixed point with 13 fractional bits.
    .equ FC_RGBA_COEFF_V_TO_R, 0x3311       // C1 =  13073 / 8192 ~  1.596
    .equ FC_RGBA_COEFF_U_TO_G, 0xF379       // C2 =  -3207 / 8192 ~ -0.391
    .equ FC_RGBA_COEFF_V_TO_G, 0xE5F8       // C3 =  -6664 / 8192 ~ -0.813
    .equ FC_RGBA_COEFF_U_TO_B, 0x4092       // C4 =  16530 / 8192 ~  2.018
    .equ FC_RGBA_COEFF_SHIFT,  13           // fractional bits of C1..C4
    .equ FC_RGBA_CHROMA_BIAS,  128          // subtracted from every chroma byte

.text
.p2align 2

.include "ihevc_neon_macros.s"

//------------------------------------------------------------------------------
// Helper macros. They work on fixed registers; the register roles are listed
// in the function header further down.
//------------------------------------------------------------------------------

// Separate the 16 interleaved chroma bytes held in v2:v3 and remove the bias.
//   in      : v2.8b, v3.8b  16 consecutive chroma bytes, v1.8b = bias
//   out     : v4.8h = even bytes - 128 (called U below)
//             v6.8h = odd  bytes - 128 (called V below)
//   clobbers: v3, v27
// usubl wraps modulo 2^16, so the results are correct when read as signed.
.macro fc_rgba_split_chroma
    uzp1    v27.8b, v2.8b, v3.8b            // even bytes
    uzp2    v3.8b,  v2.8b, v3.8b            // odd bytes
    usubl   v4.8h,  v27.8b, v1.8b           // U - 128
    usubl   v6.8h,  v3.8b,  v1.8b           // V - 128
.endm

// Turn the biased chroma into per-channel offsets that are later added to luma.
//   in      : v4.8h, v6.8h, v0.h[0..3] = C1..C4
//   out     : v5.8h  = (U * C4) >> 13               (first stored byte)
//             v12.8h = (U * C2 + V * C3) >> 13      (second stored byte)
//             v7.8h  = (V * C1) >> 13               (third stored byte)
//   clobbers: v14, v20, v22
.macro fc_rgba_chroma_weights
    smull   v5.4s,  v4.4h, v0.h[3]          // U * C4, lanes 0-3
    smull2  v7.4s,  v4.8h, v0.h[3]          // U * C4, lanes 4-7
    smull   v20.4s, v6.4h, v0.h[0]          // V * C1, lanes 0-3
    smull2  v22.4s, v6.8h, v0.h[0]          // V * C1, lanes 4-7
    smull   v12.4s, v4.4h, v0.h[1]          // U * C2, lanes 0-3
    smlal   v12.4s, v6.4h, v0.h[2]          //   + V * C3
    smull2  v14.4s, v4.8h, v0.h[1]          // U * C2, lanes 4-7
    smlal2  v14.4s, v6.8h, v0.h[2]          //   + V * C3

    // The high half of each product is still live in v7 / v22 / v14 when the
    // matching sqshrn2 reads it, so the order of the pairs below matters.
    sqshrn  v5.4h,  v5.4s,  #FC_RGBA_COEFF_SHIFT
    sqshrn2 v5.8h,  v7.4s,  #FC_RGBA_COEFF_SHIFT
    sqshrn  v7.4h,  v20.4s, #FC_RGBA_COEFF_SHIFT
    sqshrn2 v7.8h,  v22.4s, #FC_RGBA_COEFF_SHIFT
    sqshrn  v12.4h, v12.4s, #FC_RGBA_COEFF_SHIFT
    sqshrn2 v12.8h, v14.4s, #FC_RGBA_COEFF_SHIFT
.endm

// Add eight even-column and eight odd-column luma bytes to the chroma offsets.
// One chroma sample is shared by an even/odd column pair, so both halves use
// the same v5 / v7 / v12.
//   in      : y_even, y_odd (8b vectors produced by ld2), v5, v7, v12
//   out     : v14 / v18 / v16 = channel sums for even columns (16-bit)
//             v20 / v24 / v22 = channel sums for odd  columns (16-bit)
.macro fc_rgba_add_luma y_even, y_odd
    uaddw   v14.8h, v5.8h,  \y_even
    uaddw   v16.8h, v7.8h,  \y_even
    uaddw   v18.8h, v12.8h, \y_even
    uaddw   v20.8h, v5.8h,  \y_odd
    uaddw   v22.8h, v7.8h,  \y_odd
    uaddw   v24.8h, v12.8h, \y_odd
.endm

// Saturate the channel sums to 0..255, interleave them into 16 four-byte
// pixels and store those 64 bytes through dst (post-incremented by 64).
// Bytes of one pixel in increasing address order:
//   clamp(Y + v5), clamp(Y + v12), clamp(Y + v7), 0
//   in      : v14, v16, v18, v20, v22, v24 from fc_rgba_add_luma, v17 = 0
//   clobbers: v14, v15, v16, v19, v20, v21, v22, v25, v26, v27
.macro fc_rgba_pack_store dst
    sqxtun  v14.8b, v14.8h                  // even columns, byte 0
    sqxtun  v15.8b, v18.8h                  // even columns, byte 1
    sqxtun  v16.8b, v16.8h                  // even columns, byte 2
    sqxtun  v20.8b, v20.8h                  // odd columns, byte 0
    sqxtun  v21.8b, v24.8h                  // odd columns, byte 1
    sqxtun  v22.8b, v22.8h                  // odd columns, byte 2

    // zip1 on .16b interleaves the low 8 bytes of both sources, which is the
    // same 16 bytes as the 8-byte zip1 + zip2 pair glued back together.
    zip1    v14.16b, v14.16b, v15.16b       // bytes 0,1 of the even columns
    zip1    v16.16b, v16.16b, v17.16b       // bytes 2,3 of the even columns
    zip1    v20.16b, v20.16b, v21.16b       // bytes 0,1 of the odd columns
    zip1    v22.16b, v22.16b, v17.16b       // bytes 2,3 of the odd columns

    zip1    v27.8h, v14.8h, v16.8h          // pixels 0, 2, 4, 6
    zip2    v26.8h, v14.8h, v16.8h          // pixels 8, 10, 12, 14
    zip1    v25.8h, v20.8h, v22.8h          // pixels 1, 3, 5, 7
    zip2    v19.8h, v20.8h, v22.8h          // pixels 9, 11, 13, 15

    zip1    v14.4s, v27.4s, v25.4s          // pixels 0..3
    zip2    v20.4s, v27.4s, v25.4s          // pixels 4..7
    zip1    v16.4s, v26.4s, v19.4s          // pixels 8..11
    zip2    v22.4s, v26.4s, v19.4s          // pixels 12..15

    st1     {v14.4s}, [\dst], #16
    st1     {v20.4s}, [\dst], #16
    st1     {v16.4s}, [\dst], #16
    st1     {v22.4s}, [\dst], #16
.endm

///*****************************************************************************
//*
//*  Function Name : ihevcd_fmt_conv_420sp_to_rgba8888_av8
//*
//*  Description   : Converts a YUV 4:2:0 semi-planar picture (one luma plane,
//*                  one plane of interleaved chroma bytes) into 32-bit pixels.
//*                  Two luma rows that share one chroma row are processed per
//*                  outer iteration, 16 columns per inner iteration.
//*
//*                  Per pixel, with U = even chroma byte - 128 and
//*                  V = odd chroma byte - 128, the four bytes stored are
//*                      byte 0 = clamp(Y + ((U * C4) >> 13))
//*                      byte 1 = clamp(Y + ((U * C2 + V * C3) >> 13))
//*                      byte 2 = clamp(Y + ((V * C1) >> 13))
//*                      byte 3 = 0
//*                  Luma is used as is; no offset or gain is applied to it.
//*                  NOTE(review): with U stored before V this is B, G, R, 0 in
//*                  memory - confirm that this is the layout callers expect
//*                  from a function named rgba8888.
//*
//*  Arguments     : x0  pointer to the luma plane
//*                  x1  pointer to the interleaved chroma plane
//*                  x2  pointer to the output pixels
//*                  w3  picture width in pixels
//*                  w4  picture height in rows
//*                  w5  luma stride in bytes
//*                  w6  chroma stride in bytes
//*                  w7  output stride in pixels (scaled by 4 for addressing)
//*                  Nothing is read from the stack.
//*                  w3..w7 are sign-extended on entry because AAPCS64 leaves
//*                  the upper 32 bits of a register that carries a 32-bit
//*                  argument unspecified.
//*                  NOTE(review): assumes the C prototype declares these five
//*                  arguments as 32-bit integers - confirm against the header.
//*
//*  Values Returned : None
//*
//*  Register Usage : x0   luma pointer, first row of the pair
//*                   x1   chroma pointer
//*                   x2   output pointer, first row of the pair
//*                   x3   width
//*                   x5   remaining row pairs
//*                   x6   remaining 16-column groups in the current pair
//*                   x7   luma pointer, second row of the pair
//*                   x8   output pointer, second row of the pair
//*                   x9   output stride (pixels)
//*                   x10  luma stride - width
//*                   x11  chroma stride - width
//*                   x12  luma stride
//*                   x14  output stride - width (pixels)
//*                   v0   C1..C4 in h[0..3]      v1   chroma bias
//*                   v17  zero (fourth byte of every pixel)
//*                   v2-v7, v12, v14-v16, v18-v22, v24-v31 scratch
//*                   Condition flags are clobbered.
//*
//*  Stack Usage   : 32 bytes (d12, d14, d15; only the low 64 bits of v8-v15
//*                  are callee-saved under AAPCS64)
//*
//*  Interruptibility : Interruptible
//*
//*  Known Limitations
//*  Assumptions   : Image Width : multiple of 16. Widths below 16 return
//*                                without writing anything.
//*                  Image Height: even. Heights below 2 return without
//*                                writing anything; for an odd height the
//*                                last row is not converted.
//*
//*  Revision History :
//*  DD MM YYYY    Author(s)     Changes (Describe the changes made)
//*  07 06 2010    Varshita      Draft
//*  07 06 2010    Naveen Kr T   Completed
//*  05 08 2013    Naveen K P    Modified for HEVC
//*****************************************************************************/
    .global ihevcd_fmt_conv_420sp_to_rgba8888_av8
    .type ihevcd_fmt_conv_420sp_to_rgba8888_av8, function
ihevcd_fmt_conv_420sp_to_rgba8888_av8:

    // Widen the 32-bit arguments before they take part in 64-bit pointer
    // arithmetic. Each source register is read before it is reused.
    sxtw    x3,  w3                         // width
    sxtw    x9,  w7                         // output stride (pixels)
    sxtw    x11, w6                         // chroma stride
    sxtw    x12, w5                         // luma stride
    sxtw    x5,  w4                         // height

    // A zero column or row-pair count would wrap in the subs / b.ne loops
    // below, so leave before touching the stack or any buffer.
    cmp     x3, #16
    b.lt    .Lfc_rgba_return
    cmp     x5, #2
    b.lt    .Lfc_rgba_return

    // sp has to stay 16-byte aligned whenever it addresses memory, hence one
    // 32-byte frame rather than an 8-byte push per register.
    stp     d14, d15, [sp, #-32]!
    str     d12, [sp, #16]

    mov     x10, #FC_RGBA_COEFF_V_TO_R
    mov     v0.h[0], w10                    // C1
    mov     x10, #FC_RGBA_COEFF_U_TO_G
    mov     v0.h[1], w10                    // C2
    mov     x10, #FC_RGBA_COEFF_V_TO_G
    mov     v0.h[2], w10                    // C3
    mov     x10, #FC_RGBA_COEFF_U_TO_B
    mov     v0.h[3], w10                    // C4

    movi    v1.8b,   #FC_RGBA_CHROMA_BIAS
    movi    v17.16b, #0                     // loop-invariant fourth byte

    // Distance from the end of the processed part of a row to the next row.
    sub     x10, x12, x3                    // luma
    sub     x11, x11, x3                    // chroma (width bytes per row)
    sub     x14, x9,  x3                    // output, in pixels

    lsr     x5, x5, #1                      // row pairs = height / 2

    add     x7, x0, x12                     // second luma row
    add     x8, x2, x9, lsl #2              // second output row

.Lfc_rgba_row_pair_loop:
    // Prime the pipeline with the first 16 columns of this row pair.
    ld1     {v2.8b, v3.8b}, [x1], #16       // 8 chroma pairs
    lsr     x6, x3, #4                      // 16-column groups = width / 16
    ld2     {v30.8b, v31.8b}, [x0], #16     // row 1: even / odd columns
    ld2     {v28.8b, v29.8b}, [x7], #16     // row 2: even / odd columns

    subs    x6, x6, #1
    b.eq    .Lfc_rgba_last_group            // exactly one group in the row

.Lfc_rgba_column_loop:
    // Convert the group loaded earlier while fetching the next one.
    fc_rgba_split_chroma
    ld1     {v2.8b, v3.8b}, [x1], #16       // chroma for the next group
    prfm    PLDL1KEEP, [x1]
    fc_rgba_chroma_weights

    fc_rgba_add_luma   v30.8b, v31.8b       // row 1
    fc_rgba_pack_store x2

    fc_rgba_add_luma   v28.8b, v29.8b       // row 2, before its luma is reloaded
    ld2     {v30.8b, v31.8b}, [x0], #16     // luma for the next group
    ld2     {v28.8b, v29.8b}, [x7], #16
    prfm    PLDL1KEEP, [x0]
    prfm    PLDL1KEEP, [x7]
    fc_rgba_pack_store x8

    subs    x6, x6, #1
    b.ne    .Lfc_rgba_column_loop

.Lfc_rgba_last_group:
    // Final group of the row pair: same arithmetic, nothing further to load.
    fc_rgba_split_chroma
    fc_rgba_chroma_weights

    fc_rgba_add_luma   v30.8b, v31.8b       // row 1
    fc_rgba_pack_store x2

    fc_rgba_add_luma   v28.8b, v29.8b       // row 2
    fc_rgba_pack_store x8

    // x7 / x8 sit at the end of the second row; skipping the row padding
    // lands on the first row of the next pair.
    add     x0, x7, x10
    add     x2, x8, x14, lsl #2
    add     x1, x1, x11                     // one chroma row per pair
    add     x7, x0, x12                     // second luma row of the next pair
    add     x8, x2, x9, lsl #2              // second output row of the next pair

    subs    x5, x5, #1
    b.ne    .Lfc_rgba_row_pair_loop

    ldr     d12, [sp, #16]
    ldp     d14, d15, [sp], #32

.Lfc_rgba_return:
    ret

    .section .note.GNU-stack,"",%progbits