///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///*******************************************************************************
//* //file
//*  ihevcd_fmt_conv_420sp_to_rgba8888.s
//*
//* //brief
//*  contains function definitions for format conversions
//*
//* //author
//*  ittiam
//*
//* //par list of functions:
//*  - ihevcd_fmt_conv_420sp_to_rgba8888_av8()
//*
//* //remarks
//*  none
//*
//*******************************************************************************/

     37     .equ DO1STROUNDING, 0
     38 
     39     // ARM
     40     //
     41     // PRESERVE8
     42 
     43 .text
     44 .p2align 2
     45 
     46 .include "ihevc_neon_macros.s"
     47 
     48 
     49 
     50 ///*****************************************************************************
     51 //*                                                                            *
     52 //*  Function Name    : ihevcd_fmt_conv_420sp_to_rgba8888()                    *
     53 //*                                                                            *
     54 //*  Description      : This function conversts the image from YUV422 color    *
     55 //*                     space to RGB888 color space. The function can be       *
     56 //*                     invoked at the MB level.                               *
     57 //*                                                                            *
     58 //*  Arguments        : x0           pubY                                      *
     59 //*                     x1           pubUV                                     *
     60 //*                     x2           pusRGB                                    *
     61 //*                     x3           pusRGB                                    *
     62 //*                     [x13 #40]    usHeight                                  *
     63 //*                     [x13 #44]    usWidth                                   *
     64 //*                     [x13 #48]    usStrideY                                 *
     65 //*                     [x13 #52]    usStrideU                                 *
     66 //*                     [x13 #56]    usStrideV                                 *
     67 //*                     [x13 #60]    usStrideRGB                               *
     68 //*                                                                            *
     69 //*  Values Returned  : None                                                   *
     70 //*                                                                            *
     71 //*  Register Usage   : x0 - x14                                               *
     72 //*                                                                            *
     73 //*  Stack Usage      : 40 Bytes                                               *
     74 //*                                                                            *
     75 //*  Interruptibility : Interruptible                                          *
     76 //*                                                                            *
     77 //*  Known Limitations                                                         *
     78 //*       Assumptions: Image Width:     Assumed to be multiple of 16 and       *
     79 //*                     greater than or equal to 16                  *
     80 //*                     Image Height:    Assumed to be even.                   *
     81 //*                                                                            *
     82 //*  Revision History :                                                        *
     83 //*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
     84 //*         07 06 2010   Varshita        Draft                                 *
     85 //*         07 06 2010   Naveen Kr T     Completed                             *
     86 //*         05 08 2013   Naveen K P      Modified for HEVC                     *
     87 //*****************************************************************************/
     88     .global ihevcd_fmt_conv_420sp_to_rgba8888_av8
     89 .type ihevcd_fmt_conv_420sp_to_rgba8888_av8, function
     90 ihevcd_fmt_conv_420sp_to_rgba8888_av8:
     91 
     92     //// push the registers on the stack
     93     // STMFD sp!,{x4-x12,x14}
     94 
     95     stp         d12,d14,[sp,#-16]!
     96     stp         d8,d15,[sp,#-16]!           // Storing d15 using { sub sp,sp,#8; str d15,[sp] } is giving bus error.
     97                                             // d8 is used as dummy register and stored along with d15 using stp. d8 is not used in the function.
     98     stp         x19, x20,[sp,#-16]!
     99 
    100 
    101     ////x0 - Y PTR
    102     ////x1 - UV PTR
    103     ////x2 - RGB PTR
    104     ////x3 - RGB PTR
    105     ////x4 - PIC WIDTH
    106     ////x5 - PIC HT
    107     ////x6 - STRIDE Y
    108     ////x7 - STRIDE U
    109     ////x8 - STRIDE V
    110     ////x9 - STRIDE RGB
    111 
    112     ////ONE ROW PROCESSING AT A TIME
    113 
    114     ////THE FOUR CONSTANTS ARE:
    115     ////C1=0x3311,C2=0xF379,C3=0xE5F8,C4=0x4092
    116 
    117     //PLD        [x0]
    118     //PLD        [x1]
    119     //PLD        [x2]
    120 
    121 
    122     ///* can be loaded from a defined const type */
    123     mov         x10,#0x3311
    124     mov         v0.h[0], w10               ////C1
    125 
    126     mov         x10,#0xF379
    127     mov         v0.h[1], w10               ////C2
    128 
    129     mov         x10,#0xE5F8
    130     mov         v0.h[2], w10               ////C3
    131 
    132     mov         x10,#0x4092
    133     mov         v0.h[3], w10               ////C4
    134 
    135     ////LOAD CONSTANT 128 INTO A CORTEX REGISTER
    136     MOV         x10,#128
    137     dup         v1.8b,w10
    138 
    139     ////D0 HAS C1-C2-C3-C4
    140     //// load other parameters from stack
    141     mov         x9, x7
    142     mov         x7, x6
    143     mov         x6, x5
    144     mov         x5, x4
    145     //LDR  x4,[sp,#44]
    146     //LDR  x8,[sp,#52]
    147 
    148     //// calculate offsets, offset = stride - width
    149     SUB         x10,x6,x3                   //// luma offset
    150     SUB         x11,x7,x3
    151     //, LSR #1    @// u offset
    152     //SUB     x12,x8,x3, LSR #1    @// v offset
    153     SUB         x14,x9,x3                   //// rgb offset in pixels
    154 
    155     //// calculate height loop count
    156     LSR         x5, x5, #1                  //// height_cnt = height / 16
    157 
    158     //// create next row pointers for rgb and luma data
    159     ADD         x7,x0,x6                    //// luma_next_row = luma + luma_stride
    160     ADD         x8,x2,x9,LSL #2             //// rgb_next_row = rgb + rgb_stride
    161 
    162 LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP:
    163 
    164     ////LOAD VALUES OF U&V AND COMPUTE THE R,G,B WEIGHT VALUES.
    165     LD1         {v2.8b, v3.8b},[x1],#16     ////LOAD 8 VALUES OF UV
    166     ////VLD1.8 {D3},[x2]!             @//LOAD 8 VALUES OF V
    167 
    168     //// calculate width loop count
    169     LSR         x6, x3, #4                  //// width_cnt = width / 16
    170 
    171     ////COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
    172     ////LOAD VALUES OF Y 8-BIT VALUES
    173     LD2         {v30.8b, v31.8b},[x0],#16   ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
    174                                             ////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
    175     LD2         {v28.8b, v29.8b},[x7],#16   ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row2
    176                                             ////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
    177 
    178     SUBS        x6,x6,#1
    179     BEQ         LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP
    180 
    181 LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
    182     //VMOV.I8 Q1,#128
    183     UZP1        v27.8b, v2.8b, v3.8b
    184     UZP2        v3.8b, v2.8b, v3.8b
    185     mov         v2.d[0], v27.d[0]
    186 
    187     ////NEED TO SUBTRACT (U-128) AND (V-128)
    188     ////(D2-D1),(D3-D1)
    189     uSUBL       v4.8h, v2.8b, v1.8b         ////(U-128)
    190     uSUBL       v6.8h, v3.8b, v1.8b         ////(V-128)
    191 
    192     ////LOAD VALUES OF U&V for next row
    193     LD1         {v2.8b, v3.8b},[x1],#16     ////LOAD 8 VALUES OF U
    194     ////VLD1.8 {D3},[x2]!             @//LOAD 8 VALUES OF V
    195 
    196     //PLD        [x0]
    197     prfm        PLDL1KEEP,[x1]
    198 
    199     ////NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS
    200     sMULL       v5.4s, v4.4h, v0.h[3]      ////(U-128)*C4 FOR B
    201     sMULL2      v7.4s, v4.8h, v0.h[3]      ////(U-128)*C4 FOR B
    202 
    203     sMULL       v20.4s, v6.4h, v0.h[0]     ////(V-128)*C1 FOR R
    204     sMULL2      v22.4s, v6.8h, v0.h[0]     ////(V-128)*C1 FOR R
    205 
    206     sMULL       v12.4s, v4.4h, v0.h[1]     ////(U-128)*C2 FOR G
    207     sMLAL       v12.4s, v6.4h, v0.h[2]     ////Q6 = (U-128)*C2 + (V-128)*C3
    208     sMULL2      v14.4s, v4.8h, v0.h[1]     ////(U-128)*C2 FOR G
    209     sMLAL2      v14.4s, v6.8h, v0.h[2]     ////Q7 = (U-128)*C2 + (V-128)*C3
    210 
    211     ////NARROW RIGHT SHIFT BY 13 FOR R&B
    212     sqshrn      v5.4h, v5.4s,#13            ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
    213     sqshrn2     v5.8h, v7.4s,#13            ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
    214     ////Q4 - WEIGHT FOR B
    215 
    216     ////NARROW RIGHT SHIFT BY 13 FOR R&B
    217     sqshrn      v7.4h, v20.4s,#13           ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
    218     sqshrn2     v7.8h, v22.4s,#13           ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
    219     ////Q5 - WEIGHT FOR R
    220 
    221     ////NARROW RIGHT SHIFT BY 13 FOR G
    222     sqshrn      v12.4h, v12.4s,#13          ////D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
    223     sqshrn2     v12.8h, v14.4s,#13          ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
    224     ////Q6 - WEIGHT FOR G
    225 
    226     UADDW       v14.8h,  v5.8h ,  v30.8b    ////Q7 - HAS Y + B
    227     UADDW       v16.8h,  v7.8h ,  v30.8b    ////Q8 - HAS Y + R
    228     UADDW       v18.8h,  v12.8h ,  v30.8b   ////Q9 - HAS Y + G
    229 
    230     UADDW       v20.8h,  v5.8h ,  v31.8b    ////Q10 - HAS Y + B
    231     UADDW       v22.8h,  v7.8h ,  v31.8b    ////Q11 - HAS Y + R
    232     UADDW       v24.8h,  v12.8h ,  v31.8b   ////Q12 - HAS Y + G
    233 
    234     sqxtun      v14.8b, v14.8h
    235     sqxtun      v15.8b, v18.8h
    236     sqxtun      v16.8b, v16.8h
    237     movi        v17.8b, #0
    238 
    239     sqxtun      v20.8b, v20.8h
    240     sqxtun      v21.8b, v24.8h
    241     sqxtun      v22.8b, v22.8h
    242     movi        v23.8b, #0
    243 
    244     ZIP1        v27.8b, v14.8b, v15.8b
    245     ZIP2        v15.8b, v14.8b, v15.8b
    246     mov         v14.d[0], v27.d[0]
    247     ZIP1        v27.8b, v16.8b, v17.8b
    248     ZIP2        v17.8b, v16.8b, v17.8b
    249     mov         v16.d[0], v27.d[0]
    250 
    251     ZIP1        v27.8b, v20.8b, v21.8b
    252     ZIP2        v21.8b, v20.8b, v21.8b
    253     mov         v20.d[0], v27.d[0]
    254     ZIP1        v27.8b, v22.8b, v23.8b
    255     ZIP2        v23.8b, v22.8b, v23.8b
    256     mov         v22.d[0], v27.d[0]
    257 
    258     mov         v14.d[1], v15.d[0]
    259     mov         v20.d[1], v21.d[0]
    260     mov         v16.d[1], v17.d[0]
    261     mov         v22.d[1], v23.d[0]
    262 
    263     ZIP1        v27.8h, v14.8h, v16.8h
    264     ZIP2        v26.8h, v14.8h, v16.8h
    265 
    266     ZIP1        v25.8h, v20.8h, v22.8h
    267     ZIP2        v19.8h, v20.8h, v22.8h
    268 
    269     ZIP1        v14.4s, v27.4s, v25.4s
    270     ZIP2        v20.4s, v27.4s, v25.4s
    271 
    272     ZIP1        v16.4s, v26.4s, v19.4s
    273     ZIP2        v22.4s, v26.4s, v19.4s
    274 
    275     ST1         {v14.4s},[x2],#16
    276     ST1         {v20.4s},[x2],#16
    277     ST1         {v16.4s},[x2],#16
    278     ST1         {v22.4s},[x2],#16
    279 
    280     ////D14-D20 - TOALLY HAVE 16 VALUES
    281     ////WE NEED TO SHIFT R,G,B VALUES TO GET 5BIT,6BIT AND 5BIT COMBINATIONS
    282     UADDW       v14.8h,  v5.8h ,  v28.8b    ////Q7 - HAS Y + B
    283     UADDW       v16.8h,  v7.8h ,  v28.8b    ////Q2 - HAS Y + R
    284     UADDW       v18.8h,  v12.8h ,  v28.8b   ////Q3 - HAS Y + G
    285 
    286     UADDW       v20.8h,  v5.8h ,  v29.8b    ////Q10 - HAS Y + B
    287     UADDW       v22.8h,  v7.8h ,  v29.8b    ////Q11 - HAS Y + R
    288     UADDW       v24.8h,  v12.8h ,  v29.8b   ////Q12 - HAS Y + G
    289 
    290     ////COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
    291     ////LOAD VALUES OF Y 8-BIT VALUES
    292     LD2         {v30.8b, v31.8b},[x0],#16   ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
    293                                             ////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
    294     LD2         {v28.8b, v29.8b},[x7],#16   ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row2
    295                                             ////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
    296 
    297     prfm        PLDL1KEEP,[x0]
    298     prfm        PLDL1KEEP,[x7]
    299 
    300     sqxtun      v14.8b, v14.8h
    301     sqxtun      v15.8b, v18.8h
    302     sqxtun      v16.8b, v16.8h
    303     movi        v17.8b, #0
    304 
    305     sqxtun      v20.8b, v20.8h
    306     sqxtun      v21.8b, v24.8h
    307     sqxtun      v22.8b, v22.8h
    308     movi        v23.8b, #0
    309 
    310     ZIP1        v27.8b, v14.8b, v15.8b
    311     ZIP2        v15.8b, v14.8b, v15.8b
    312     mov         v14.d[0], v27.d[0]
    313     ZIP1        v27.8b, v16.8b, v17.8b
    314     ZIP2        v17.8b, v16.8b, v17.8b
    315     mov         v16.d[0], v27.d[0]
    316 
    317     ZIP1        v27.8b, v20.8b, v21.8b
    318     ZIP2        v21.8b, v20.8b, v21.8b
    319     mov         v20.d[0], v27.d[0]
    320     ZIP1        v27.8b, v22.8b, v23.8b
    321     ZIP2        v23.8b, v22.8b, v23.8b
    322     mov         v22.d[0], v27.d[0]
    323 
    324     mov         v14.d[1], v15.d[0]
    325     mov         v20.d[1], v21.d[0]
    326     mov         v16.d[1], v17.d[0]
    327     mov         v22.d[1], v23.d[0]
    328 
    329     ZIP1        v27.8h, v14.8h, v16.8h
    330     ZIP2        v26.8h, v14.8h, v16.8h
    331 
    332     ZIP1        v25.8h, v20.8h, v22.8h
    333     ZIP2        v19.8h, v20.8h, v22.8h
    334 
    335     ZIP1        v14.4s, v27.4s, v25.4s
    336     ZIP2        v20.4s, v27.4s, v25.4s
    337 
    338     ZIP1        v16.4s, v26.4s, v19.4s
    339     ZIP2        v22.4s, v26.4s, v19.4s
    340 
    341     ST1         {v14.4s},[x8],#16
    342     ST1         {v20.4s},[x8],#16
    343     ST1         {v16.4s},[x8],#16
    344     ST1         {v22.4s},[x8],#16
    345 
    346     SUBS        x6,x6,#1                    //// width_cnt -= 1
    347     BNE         LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP
    348 
    349 LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
    350     //VMOV.I8 Q1,#128
    351     UZP1        v27.8b, v2.8b, v3.8b
    352     UZP2        v3.8b, v2.8b, v3.8b
    353     mov         v2.d[0], v27.d[0]
    354 
    355 
    356     ////NEED TO SUBTRACT (U-128) AND (V-128)
    357     ////(D2-D1),(D3-D1)
    358     uSUBL       v4.8h, v2.8b, v1.8b         ////(U-128)
    359     uSUBL       v6.8h, v3.8b, v1.8b         ////(V-128)
    360 
    361 
    362     ////NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS
    363     sMULL       v5.4s, v4.4h, v0.h[3]      ////(U-128)*C4 FOR B
    364     sMULL2      v7.4s, v4.8h, v0.h[3]      ////(U-128)*C4 FOR B
    365 
    366     sMULL       v20.4s, v6.4h, v0.h[0]     ////(V-128)*C1 FOR R
    367     sMULL2      v22.4s, v6.8h, v0.h[0]     ////(V-128)*C1 FOR R
    368 
    369     sMULL       v12.4s, v4.4h, v0.h[1]     ////(U-128)*C2 FOR G
    370     sMLAL       v12.4s, v6.4h, v0.h[2]     ////Q6 = (U-128)*C2 + (V-128)*C3
    371     sMULL2      v14.4s, v4.8h, v0.h[1]     ////(U-128)*C2 FOR G
    372     sMLAL2      v14.4s, v6.8h, v0.h[2]     ////Q7 = (U-128)*C2 + (V-128)*C3
    373 
    374     ////NARROW RIGHT SHIFT BY 13 FOR R&B
    375     sqshrn      v5.4h, v5.4s,#13            ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
    376     sqshrn2     v5.8h, v7.4s,#13            ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
    377     ////Q4 - WEIGHT FOR B
    378 
    379     ////NARROW RIGHT SHIFT BY 13 FOR R&B
    380     sqshrn      v7.4h, v20.4s,#13           ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
    381     sqshrn2     v7.8h, v22.4s,#13           ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
    382     ////Q5 - WEIGHT FOR R
    383 
    384     ////NARROW RIGHT SHIFT BY 13 FOR G
    385     sqshrn      v12.4h, v12.4s,#13          ////D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
    386     sqshrn2     v12.8h, v14.4s,#13          ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
    387     ////Q6 - WEIGHT FOR G
    388 
    389     UADDW       v14.8h,  v5.8h ,  v30.8b    ////Q7 - HAS Y + B
    390     UADDW       v16.8h,  v7.8h ,  v30.8b    ////Q8 - HAS Y + R
    391     UADDW       v18.8h,  v12.8h ,  v30.8b   ////Q9 - HAS Y + G
    392 
    393     UADDW       v20.8h,  v5.8h ,  v31.8b    ////Q10 - HAS Y + B
    394     UADDW       v22.8h,  v7.8h ,  v31.8b    ////Q11 - HAS Y + R
    395     UADDW       v24.8h,  v12.8h ,  v31.8b   ////Q12 - HAS Y + G
    396 
    397     sqxtun      v14.8b, v14.8h
    398     sqxtun      v15.8b, v18.8h
    399     sqxtun      v16.8b, v16.8h
    400     movi        v17.8b, #0
    401 
    402     sqxtun      v20.8b, v20.8h
    403     sqxtun      v21.8b, v24.8h
    404     sqxtun      v22.8b, v22.8h
    405     movi        v23.8b, #0
    406 
    407     ZIP1        v27.8b, v14.8b, v15.8b
    408     ZIP2        v15.8b, v14.8b, v15.8b
    409     mov         v14.d[0], v27.d[0]
    410     ZIP1        v27.8b, v16.8b, v17.8b
    411     ZIP2        v17.8b, v16.8b, v17.8b
    412     mov         v16.d[0], v27.d[0]
    413 
    414     ZIP1        v27.8b, v20.8b, v21.8b
    415     ZIP2        v21.8b, v20.8b, v21.8b
    416     mov         v20.d[0], v27.d[0]
    417     ZIP1        v27.8b, v22.8b, v23.8b
    418     ZIP2        v23.8b, v22.8b, v23.8b
    419     mov         v22.d[0], v27.d[0]
    420 
    421     mov         v14.d[1], v15.d[0]
    422     mov         v20.d[1], v21.d[0]
    423     mov         v16.d[1], v17.d[0]
    424     mov         v22.d[1], v23.d[0]
    425 
    426     ZIP1        v27.8h, v14.8h, v16.8h
    427     ZIP2        v26.8h, v14.8h, v16.8h
    428 
    429     ZIP1        v25.8h, v20.8h, v22.8h
    430     ZIP2        v19.8h, v20.8h, v22.8h
    431 
    432     ZIP1        v14.4s, v27.4s, v25.4s
    433     ZIP2        v20.4s, v27.4s, v25.4s
    434 
    435     ZIP1        v16.4s, v26.4s, v19.4s
    436     ZIP2        v22.4s, v26.4s, v19.4s
    437 
    438     ST1         {v14.4s},[x2],#16
    439     ST1         {v20.4s},[x2],#16
    440     ST1         {v16.4s},[x2],#16
    441     ST1         {v22.4s},[x2],#16
    442 
    443     ////D14-D20 - TOALLY HAVE 16 VALUES
    444     ////WE NEED TO SHIFT R,G,B VALUES TO GET 5BIT,6BIT AND 5BIT COMBINATIONS
    445     UADDW       v14.8h,  v5.8h ,  v28.8b    ////Q7 - HAS Y + B
    446     UADDW       v16.8h,  v7.8h ,  v28.8b    ////Q2 - HAS Y + R
    447     UADDW       v18.8h,  v12.8h ,  v28.8b   ////Q3 - HAS Y + G
    448 
    449     UADDW       v20.8h,  v5.8h ,  v29.8b    ////Q10 - HAS Y + B
    450     UADDW       v22.8h,  v7.8h ,  v29.8b    ////Q11 - HAS Y + R
    451     UADDW       v24.8h,  v12.8h ,  v29.8b   ////Q12 - HAS Y + G
    452 
    453     sqxtun      v14.8b, v14.8h
    454     sqxtun      v15.8b, v18.8h
    455     sqxtun      v16.8b, v16.8h
    456     movi        v17.8b, #0
    457 
    458     sqxtun      v20.8b, v20.8h
    459     sqxtun      v21.8b, v24.8h
    460     sqxtun      v22.8b, v22.8h
    461     movi        v23.8b, #0
    462 
    463     ZIP1        v27.8b, v14.8b, v15.8b
    464     ZIP2        v15.8b, v14.8b, v15.8b
    465     mov         v14.d[0], v27.d[0]
    466     ZIP1        v27.8b, v16.8b, v17.8b
    467     ZIP2        v17.8b, v16.8b, v17.8b
    468     mov         v16.d[0], v27.d[0]
    469 
    470     ZIP1        v27.8b, v20.8b, v21.8b
    471     ZIP2        v21.8b, v20.8b, v21.8b
    472     mov         v20.d[0], v27.d[0]
    473     ZIP1        v27.8b, v22.8b, v23.8b
    474     ZIP2        v23.8b, v22.8b, v23.8b
    475     mov         v22.d[0], v27.d[0]
    476 
    477     mov         v14.d[1], v15.d[0]
    478     mov         v20.d[1], v21.d[0]
    479     mov         v16.d[1], v17.d[0]
    480     mov         v22.d[1], v23.d[0]
    481 
    482     ZIP1        v27.8h, v14.8h, v16.8h
    483     ZIP2        v26.8h, v14.8h, v16.8h
    484 
    485     ZIP1        v25.8h, v20.8h, v22.8h
    486     ZIP2        v19.8h, v20.8h, v22.8h
    487 
    488     ZIP1        v14.4s, v27.4s, v25.4s
    489     ZIP2        v20.4s, v27.4s, v25.4s
    490 
    491     ZIP1        v16.4s, v26.4s, v19.4s
    492     ZIP2        v22.4s, v26.4s, v19.4s
    493 
    494     ST1         {v14.4s},[x8],#16
    495     ST1         {v20.4s},[x8],#16
    496     ST1         {v16.4s},[x8],#16
    497     ST1         {v22.4s},[x8],#16
    498 
    499     //// Adjust the address pointers
    500     ADD         x0,x7,x10                   //// luma = luma_next + offset
    501     ADD         x2,x8,x14,LSL #2            //// rgb = rgb_next + offset
    502 
    503     ADD         x7,x0,x3                    //// luma_next = luma + width
    504     ADD         x8,x2,x3,LSL #2             //// rgb_next_row = rgb + width
    505 
    506     ADD         x1,x1,x11                   //// adjust u pointer
    507     //ADD        x2,x2,x12            @// adjust v pointer
    508 
    509     ADD         x7,x7,x10                   //// luma_next = luma + width + offset (because of register crunch)
    510     ADD         x8,x8,x14,LSL #2            //// rgb_next_row = rgb + width + offset
    511 
    512     SUBS        x5,x5,#1                    //// height_cnt -= 1
    513 
    514     BNE         LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP
    515 
    516     ////POP THE REGISTERS
    517     // LDMFD sp!,{x4-x12,PC}
    518     ldp         x19, x20,[sp],#16
    519     ldp         d8,d15,[sp],#16             // Loading d15 using { ldr d15,[sp]; add sp,sp,#8 } is giving bus error.
    520                                             // d8 is used as dummy register and loaded along with d15 using ldp. d8 is not used in the function.
    521     ldp         d12,d14,[sp],#16
    522     ret
    523 
    524 
    525 
    526 
    527     .section .note.GNU-stack,"",%progbits
    528 
    529