//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
// *******************************************************************************
// * @file
// *  impeg2_idct.s
// *
// * @brief
// *  contains function definitions for single stage inverse transform
// *
// * @author
// *  anand s
// *
// * @par list of functions:
// *  - impeg2_idct_recon_dc_av8()
// *  - impeg2_idct_recon_dc_mismatch_av8()
// *  - impeg2_idct_recon_av8()
// *
// * @remarks
// *  none
// *
// *******************************************************************************
//*/

///**
// *******************************************************************************
// *
// * @brief
// *  this function performs inverse transform and reconstruction for 8x8
// * input block
// *
// * @par description:
// *  performs inverse transform and adds the prediction data and clips output
// * to 8 bit
// *
// * @param[in] pi2_src
// *  input 8x8 coefficients
// *
// * @param[in] pi2_tmp
// *  temporary 8x8 buffer for storing inverse transform 1st stage output
// *
// * @param[in] pu1_pred
// *  prediction 8x8 block
// *
// * @param[out] pu1_dst
// *  output 8x8 block
// *
// * @param[in] src_strd
// *  input stride
// *
// * @param[in] pred_strd
// *  prediction stride
// *
// * @param[in] dst_strd
// *  output stride
// *
// * @param[in] shift
// *  output shift
// *
// * @param[in] zero_cols
// *  zero columns in pi2_src
// *
// * @param[in] zero_rows
// *  zero rows in pi2_src
// *
// * @returns  void
// *
// * @remarks
// *  none
// *
// *******************************************************************************
// */

//void impeg2_itrans_recon_8x8(word16 *pi2_src,
//                            word16 *pi2_tmp,
//                            uword8 *pu1_pred,
//                            uword8 *pu1_dst,
//                            word32 src_strd,
//                            word32 pred_strd,
//                            word32 dst_strd,
//                            word32 zero_cols,
//                            word32 zero_rows)

//**************variables vs registers*************************
//    x0 => *pi2_src
//    x1 => *pi2_tmp
//    x2 => *pu1_pred
//    x3 => *pu1_dst
//    src_strd
//    pred_strd
//    dst_strd
//    zero_cols



.text
.align 4
.include "impeg2_neon_macros.s"

.set idct_stg1_shift        ,           12
.set idct_stg2_shift        ,           16
.set idct_stg1_round        ,           (1 << (idct_stg1_shift - 1))
.set idct_stg2_round        ,           (1 << (idct_stg2_shift - 1))

.extern gai2_impeg2_idct_q15
.extern gai2_impeg2_idct_q11
.extern gai2_impeg2_idct_first_col_q15
.extern gai2_impeg2_idct_first_col_q11
.extern gai2_impeg2_mismatch_stg2_additive

.global impeg2_idct_recon_dc_av8
impeg2_idct_recon_dc_av8:
    // STMFD sp!,{x4,x6,x12,x14}
    push_v_regs
    ////x0: pi2_src
    ////x1: pi2_tmp - not used
    ////x2: pu1_pred
    ////x3: pu1_dst
    ////x4: src_strd - not used, register reused as scratch
    ////x5: pred_strd
    ////x6: dst_strd

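    //// What follows is the DC-only IDCT and reconstruction. A minimal C reference
    //// sketch of the same arithmetic (an illustration inferred from this assembly,
    //// not taken from the C sources; CLIP_U8 stands for a hypothetical
    //// clip-to-[0,255] helper):
    ////
    ////     val = pi2_src[0] * gai2_impeg2_idct_q15[0];          /* stage 1 scale   */
    ////     val = (val + idct_stg1_round) >> idct_stg1_shift;    /* (1 << 11), >> 12 */
    ////     val = val * gai2_impeg2_idct_q11[0];                 /* stage 2 scale   */
    ////     val = (val + idct_stg2_round) >> idct_stg2_shift;    /* (1 << 15), >> 16 */
    ////     for(i = 0; i < 8; i++)
    ////         for(j = 0; j < 8; j++)
    ////             pu1_dst[i * dst_strd + j] =
    ////                 CLIP_U8(pu1_pred[i * pred_strd + j] + val);
    ////
    //// The eight prediction rows are loaded up front and interleaved with the scalar
    //// math below to hide load latency; v30 holds the broadcast DC value.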
    ldrsh           x4, [x0]
    adrp            x14, :got:gai2_impeg2_idct_q15
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_q15]
    ldrsh           x12, [x14]

    ld1             {v0.8b}, [x2], x5
    mul             x4, x4, x12

    ld1             {v1.8b}, [x2], x5
    add             x4, x4, #idct_stg1_round

    ld1             {v2.8b}, [x2], x5
    asr             x4, x4, #idct_stg1_shift

    adrp            x14, :got:gai2_impeg2_idct_q11
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_q11]
    ldrsh           x12, [x14]

    ld1             {v3.8b}, [x2], x5
    mul             x4, x4, x12

    ld1             {v4.8b}, [x2], x5
    add             x4, x4, #idct_stg2_round

    ld1             {v5.8b}, [x2], x5
    asr             x4, x4, #idct_stg2_shift

    ld1             {v6.8b}, [x2], x5
    dup             v30.8h, w4


    ld1             {v7.8b}, [x2], x5

    uaddw           v8.8h, v30.8h , v0.8b

    uaddw           v10.8h, v30.8h , v1.8b
    sqxtun          v0.8b, v8.8h

    uaddw           v12.8h, v30.8h , v2.8b
    sqxtun          v1.8b, v10.8h
    st1             {v0.8b}, [x3], x6

    uaddw           v14.8h, v30.8h , v3.8b
    sqxtun          v2.8b, v12.8h
    st1             {v1.8b}, [x3], x6

    uaddw           v16.8h, v30.8h , v4.8b
    sqxtun          v3.8b, v14.8h
    st1             {v2.8b}, [x3], x6

    uaddw           v18.8h, v30.8h , v5.8b
    sqxtun          v4.8b, v16.8h
    st1             {v3.8b}, [x3], x6

    uaddw           v20.8h, v30.8h , v6.8b
    sqxtun          v5.8b, v18.8h
    st1             {v4.8b}, [x3], x6

    uaddw           v22.8h, v30.8h , v7.8b
    sqxtun          v6.8b, v20.8h
    st1             {v5.8b}, [x3], x6

    sqxtun          v7.8b, v22.8h
    st1             {v6.8b}, [x3], x6


    st1             {v7.8b}, [x3], x6

    // LDMFD sp!,{x4,x6,x12,pc}
    pop_v_regs
    ret


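///**
// *******************************************************************************
// *
// * @brief
// *  DC-only IDCT and reconstruction with mismatch control. (Summary and C sketch
// *  are inferred from the assembly below; the table layout and CLIP_U8, a
// *  clip-to-[0,255] helper, are assumptions for illustration.) The DC coefficient
// *  goes through the stage-1 scale and shift, is multiplied by the stage-2
// *  constant, and the per-pixel term from gai2_impeg2_mismatch_stg2_additive is
// *  added before the stage-2 rounding shift:
// *
// *      val = pi2_src[0] * gai2_impeg2_idct_q15[0];
// *      val = (val + idct_stg1_round) >> idct_stg1_shift;
// *      val = val * gai2_impeg2_idct_q11[0];            /* kept unshifted, 32 bit */
// *      for(i = 0; i < 8; i++)
// *          for(j = 0; j < 8; j++)
// *          {
// *              sum = val + gai2_impeg2_mismatch_stg2_additive[i * 8 + j];
// *              sum = (sum + idct_stg2_round) >> idct_stg2_shift;
// *              pu1_dst[i * dst_strd + j] =
// *                  CLIP_U8(pu1_pred[i * pred_strd + j] + sum);
// *          }
// *
// *******************************************************************************
//*/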
.global impeg2_idct_recon_dc_mismatch_av8
.extern gai2_impeg2_idct_last_row_q11
.extern gai2_impeg2_mismatch_stg1_outp
impeg2_idct_recon_dc_mismatch_av8:
    // STMFD sp!,{x4-x12,x14}
    push_v_regs

    ldrsh           x4, [x0]
    adrp            x14, :got:gai2_impeg2_idct_q15
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_q15]
    ldrsh           x12, [x14]

    mul             x4, x4, x12
    add             x4, x4, #idct_stg1_round
    asr             x4, x4, #idct_stg1_shift

    adrp            x14, :got:gai2_impeg2_idct_q11
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_q11]
    ldrsh           x12, [x14]
    mul             x4, x4, x12
    dup             v0.4s, w4

    mov             x14, #16            ////Increment for table read
    adrp            x4, :got:gai2_impeg2_mismatch_stg2_additive
    ldr             x4, [x4, #:got_lo12:gai2_impeg2_mismatch_stg2_additive]

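    //// The eight blocks below are a fully unrolled row loop. Per row: the eight
    //// 16-bit additive terms are sign-extended to 32 bit (sxtl), added to the
    //// broadcast DC product in v0, and raddhn/raddhn2 keep the rounded high halves,
    //// i.e. (sum + (1 << 15)) >> 16, performing the stage-2 rounding and shift in
    //// one step; the result is then added to the prediction row (uaddw) and
    //// saturated to 8 bit (sqxtun) before being stored.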
    ld1             {v2.4h, v3.4h}, [x4], x14
    ld1             {v30.8b}, [x2], x5
    sxtl            v8.4s, v2.4h
    sxtl            v10.4s, v3.4h
    raddhn          v12.4h, v0.4s, v8.4s
    raddhn2         v12.8h, v0.4s, v10.4s
    uaddw           v14.8h, v12.8h , v30.8b
    sqxtun          v30.8b, v14.8h
    st1             {v30.8b}, [x3], x6

    ld1             {v2.4h, v3.4h}, [x4], x14
    ld1             {v30.8b}, [x2], x5
    sxtl            v8.4s, v2.4h
    sxtl            v10.4s, v3.4h
    raddhn          v12.4h, v0.4s, v8.4s
    raddhn2         v12.8h, v0.4s, v10.4s
    uaddw           v14.8h, v12.8h , v30.8b
    sqxtun          v30.8b, v14.8h
    st1             {v30.8b}, [x3], x6

    ld1             {v2.4h, v3.4h}, [x4], x14
    ld1             {v30.8b}, [x2], x5
    sxtl            v8.4s, v2.4h
    sxtl            v10.4s, v3.4h
    raddhn          v12.4h, v0.4s, v8.4s
    raddhn2         v12.8h, v0.4s, v10.4s
    uaddw           v14.8h, v12.8h , v30.8b
    sqxtun          v30.8b, v14.8h
    st1             {v30.8b}, [x3], x6

    ld1             {v2.4h, v3.4h}, [x4], x14
    ld1             {v30.8b}, [x2], x5
    sxtl            v8.4s, v2.4h
    sxtl            v10.4s, v3.4h
    raddhn          v12.4h, v0.4s, v8.4s
    raddhn2         v12.8h, v0.4s, v10.4s
    uaddw           v14.8h, v12.8h , v30.8b
    sqxtun          v30.8b, v14.8h
    st1             {v30.8b}, [x3], x6

    ld1             {v2.4h, v3.4h}, [x4], x14
    ld1             {v30.8b}, [x2], x5
    sxtl            v8.4s, v2.4h
    sxtl            v10.4s, v3.4h
    raddhn          v12.4h, v0.4s, v8.4s
    raddhn2         v12.8h, v0.4s, v10.4s
    uaddw           v14.8h, v12.8h , v30.8b
    sqxtun          v30.8b, v14.8h
    st1             {v30.8b}, [x3], x6

    ld1             {v2.4h, v3.4h}, [x4], x14
    ld1             {v30.8b}, [x2], x5
    sxtl            v8.4s, v2.4h
    sxtl            v10.4s, v3.4h
    raddhn          v12.4h, v0.4s, v8.4s
    raddhn2         v12.8h, v0.4s, v10.4s
    uaddw           v14.8h, v12.8h , v30.8b
    sqxtun          v30.8b, v14.8h
    st1             {v30.8b}, [x3], x6

    ld1             {v2.4h, v3.4h}, [x4], x14
    ld1             {v30.8b}, [x2], x5
    sxtl            v8.4s, v2.4h
    sxtl            v10.4s, v3.4h
    raddhn          v12.4h, v0.4s, v8.4s
    raddhn2         v12.8h, v0.4s, v10.4s
    uaddw           v14.8h, v12.8h , v30.8b
    sqxtun          v30.8b, v14.8h
    st1             {v30.8b}, [x3], x6

    ld1             {v2.4h, v3.4h}, [x4], x14
    ld1             {v30.8b}, [x2], x5
    sxtl            v8.4s, v2.4h
    sxtl            v10.4s, v3.4h
    raddhn          v12.4h, v0.4s, v8.4s
    raddhn2         v12.8h, v0.4s, v10.4s
    uaddw           v14.8h, v12.8h , v30.8b
    sqxtun          v30.8b, v14.8h
    st1             {v30.8b}, [x3], x6


    // LDMFD sp!,{x4-x12,pc}
    pop_v_regs
    ret

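///**
// *******************************************************************************
// *
// * @brief
// *  Full 8x8 IDCT and reconstruction. Stage 1 transforms the columns (Q15
// *  constants, results rounded and shifted by idct_stg1_shift), the results are
// *  transposed, stage 2 transforms the rows (Q11 constants, idct_stg2_shift), and
// *  the output is added to the prediction and clipped to 8 bit. zero_rows and
// *  zero_cols flag all-zero rows / columns so that part of the work can be skipped.
// *
// *  A minimal C sketch of the 1-D 8-point butterfly used by both stages, written
// *  from the operand comments in the code below (illustrative only; cosk / sink
// *  stand for cos(k*pi/16) / sin(k*pi/16) in the fixed-point scale of the stage):
// *
// *      c0 = y0 * cos4 + y4 * cos4;    c1 = y0 * cos4 - y4 * cos4;
// *      d0 = y2 * cos2 + y6 * sin2;    d1 = y2 * sin2 - y6 * cos2;
// *      a0 = c0 + d0;  a1 = c1 + d1;  a2 = c1 - d1;  a3 = c0 - d0;
// *      b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1;
// *      b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3;
// *      b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3;
// *      b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1;
// *      x0 = round(a0 + b0);    x7 = round(a0 - b0);
// *      x1 = round(a1 + b1);    x6 = round(a1 - b1);
// *      x2 = round(a2 + b2);    x5 = round(a2 - b2);
// *      x3 = round(a3 + b3);    x4 = round(a3 - b3);
// *
// *  where round(v) = (v + (1 << (shift - 1))) >> shift (the sqrshrn instructions).
// *
// *******************************************************************************
//*/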
.globl impeg2_idct_recon_av8

.type impeg2_idct_recon_av8, %function

impeg2_idct_recon_av8:
//// register usage - loading and idct of columns
////    cosine constants     -     d0
////    sine constants       -     d1
////    row 0 first half     -     d2        -    y0
////    row 1 first half     -     d6        -    y1
////    row 2 first half     -     d3        -    y2
////    row 3 first half     -     d7        -    y3
////    row 4 first half     -     d10       -    y4
////    row 5 first half     -     d14       -    y5
////    row 6 first half     -     d11       -    y6
////    row 7 first half     -     d15       -    y7

////    row 0 second half    -     d4        -    y0
////    row 1 second half    -     d8        -    y1
////    row 2 second half    -     d5        -    y2
////    row 3 second half    -     d9        -    y3
////    row 4 second half    -     d12       -    y4
////    row 5 second half    -     d16       -    y5
////    row 6 second half    -     d13       -    y6
////    row 7 second half    -     d17       -    y7

    //// copy the input pointer to another register
    //// step 1 : load all constants
    // stmfd sp!,{x4-x12,x14}

    ldr             w11, [sp]           // zero rows

    push_v_regs
    stp             x19, x20, [sp, #-16]!

    mov             x12, x7             // zero columns
    mov             x8, x5              // prediction stride
    mov             x7, x6              // destination stride
    mov             x6, x4              // src stride
    lsl             x6, x6, #1          // * sizeof(word16)
    add             x9, x0, x6, lsl #1  // 2 rows

    add             x10, x6, x6, lsl #1 // 3 rows

    sub             x10, x10, #8        // - 4 cols * sizeof(word16)
    sub             x5, x6, #8          // src_strd - 4 cols * sizeof(word16)

    adrp            x14, :got:gai2_impeg2_idct_first_col_q15
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15]
    ld1             {v0.4h, v1.4h}, [x14] ////d0,d1 are used for storing the constant data
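    //// Assumed layout of gai2_impeg2_idct_first_col_q15, inferred from the lane
    //// indices used below: v0 = {cos(4*pi/16), cos(1*pi/16), cos(2*pi/16), cos(3*pi/16)}
    //// and v1 = {-, sin(3*pi/16), sin(2*pi/16), sin(1*pi/16)}, all in Q15 (lane 0 of
    //// v1 is not referenced on this path). The *_q11 tables used for stage 2 hold the
    //// same values in Q11.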

    ////step 2 load all the input data
    ////step 3 operate first 4 columns at a time

    and             x11, x11, #0xff
    and             x12, x12, #0xff

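    //// Each bit of zero_rows / zero_cols marks an all-zero row / column of pi2_src.
    //// If the top four bits are all set (masked value >= 0xf0), rows 4-7 are zero and
    //// their contribution to the column butterflies can be dropped.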
    cmp             x11, #0xf0
    bge             skip_last4_rows


    ld1             {v2.4h}, [x0], #8
    ld1             {v3.4h}, [x9], #8
    ld1             {v4.4h}, [x0], x5
    smull           v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
    ld1             {v5.4h}, [x9], x5
    smull           v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
    ld1             {v6.4h}, [x0], #8
    ld1             {v7.4h}, [x9], #8
    smull           v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
    ld1             {v8.4h}, [x0], x10
    smull           v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
    ld1             {v9.4h}, [x9], x10
    smull           v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
    ld1             {v10.4h}, [x0], #8
    smull           v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
    ld1             {v11.4h}, [x9], #8
    smlal           v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    ld1             {v12.4h}, [x0], x5
    smlsl           v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    ld1             {v13.4h}, [x9], x5
    smlsl           v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    ld1             {v14.4h}, [x0], #8
    smlsl           v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
    ld1             {v15.4h}, [x9], #8
    smull           v22.4s, v10.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
    ld1             {v16.4h}, [x0], x10
    smull           v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
    ld1             {v17.4h}, [x9], x10

    ///* The following loads were used when the input was not guaranteed to be aligned */
////    vld1.16        d2,[x0]!
////    vld1.16        d3,[x2]!
////    vld1.16        d4,[x0]!
////    vld1.16        d5,[x2]!
////    vld1.16        d6,[x0]!
////    vld1.16        d7,[x2]!
////    vld1.16        d8,[x0],x3
////    vld1.16        d9,[x2],x3
////    vld1.16        d10,[x0]!
////    vld1.16        d11,[x2]!
////    vld1.16        d12,[x0]!
////    vld1.16        d13,[x2]!
////    vld1.16        d14,[x0]!
////    vld1.16        d15,[x2]!
////    vld1.16        d16,[x0],x3
////    vld1.16        d17,[x2],x3




    smlal           v24.4s, v14.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    smlsl           v26.4s, v14.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    smlal           v28.4s, v14.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    smlal           v30.4s, v14.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    smlsl           v18.4s, v11.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    smlal           v6.4s, v11.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add             v10.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub             v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal           v24.4s, v15.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
    smlsl           v26.4s, v15.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    smlal           v28.4s, v15.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    smlsl           v30.4s, v15.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)

    add             v14.4s, v10.4s , v6.4s ////   a0 = c0 + d0(part of x0,x7)
    sub             v10.4s, v10.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
    sub             v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
    add             v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)

    add             v20.4s, v14.4s , v24.4s //// a0 + b0(part of x0)
    sub             v6.4s, v14.4s , v24.4s //// a0 - b0(part of x7)

    add             v24.4s, v22.4s , v28.4s //// a2 + b2(part of x2)
    sub             v22.4s, v22.4s , v28.4s //// a2 - b2(part of x5)

    add             v28.4s, v18.4s , v26.4s //// a1 + b1(part of x1)
    sub             v18.4s, v18.4s , v26.4s //// a1 - b1(part of x6)

    add             v26.4s, v10.4s , v30.4s //// a3 + b3(part of x3)
    sub             v30.4s, v10.4s , v30.4s //// a3 - b3(part of x4)

    sqrshrn         v2.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> idct_stg1_shift
    sqrshrn         v15.4h, v6.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> idct_stg1_shift
    sqrshrn         v3.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> idct_stg1_shift
    sqrshrn         v14.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> idct_stg1_shift
    sqrshrn         v6.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> idct_stg1_shift
    sqrshrn         v11.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> idct_stg1_shift
    sqrshrn         v7.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> idct_stg1_shift
    sqrshrn         v10.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> idct_stg1_shift


    b               last4_cols



skip_last4_rows:
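    //// Rows 4-7 of the input are all zero, so only rows 0-3 are loaded and the
    //// y4..y7 terms of the butterfly (including the second cos4 product) drop out.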
    adrp            x14, :got:gai2_impeg2_idct_first_col_q15
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15]
    ld1             {v0.4h, v1.4h}, [x14]

    ld1             {v2.4h}, [x0], #8
    ld1             {v3.4h}, [x9], #8
    ld1             {v4.4h}, [x0], x5
    ld1             {v5.4h}, [x9], x5
    ld1             {v6.4h}, [x0], #8
    ld1             {v7.4h}, [x9], #8
    ld1             {v8.4h}, [x0], x10
    ld1             {v9.4h}, [x9], x10


    movi            v12.4h, #0
    movi            v13.4h, #0
    movi            v16.4h, #0
    movi            v17.4h, #0


    smull           v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
    smull           v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
    smull           v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
    smull           v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)

    smlal           v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl           v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl           v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl           v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)

    smull           v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
    smull           v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)

    smull           v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)


    add             v14.4s, v20.4s , v6.4s ////   a0 = c0 + d0(part of x0,x7)
    sub             v10.4s, v20.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
    sub             v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
    add             v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)

    add             v20.4s, v14.4s , v24.4s //// a0 + b0(part of x0)
    sub             v6.4s, v14.4s , v24.4s //// a0 - b0(part of x7)

    add             v24.4s, v22.4s , v28.4s //// a2 + b2(part of x2)
    sub             v22.4s, v22.4s , v28.4s //// a2 - b2(part of x5)

    add             v28.4s, v18.4s , v26.4s //// a1 + b1(part of x1)
    sub             v18.4s, v18.4s , v26.4s //// a1 - b1(part of x6)

    add             v26.4s, v10.4s , v30.4s //// a3 + b3(part of x3)
    sub             v30.4s, v10.4s , v30.4s //// a3 - b3(part of x4)

    sqrshrn         v2.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> idct_stg1_shift
    sqrshrn         v15.4h, v6.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> idct_stg1_shift
    sqrshrn         v3.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> idct_stg1_shift
    sqrshrn         v14.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> idct_stg1_shift
    sqrshrn         v6.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> idct_stg1_shift
    sqrshrn         v11.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> idct_stg1_shift
    sqrshrn         v7.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> idct_stg1_shift
    sqrshrn         v10.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> idct_stg1_shift


last4_cols:
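    //// Stage 1 for the right-hand 4x8 half (second-half registers v4,v5,v8,v9 and,
    //// when the last four rows are present, v12,v13,v16,v17). The compare below
    //// skips these butterflies entirely when columns 4-7 are flagged as zero.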
    adrp            x14, :got:gai2_impeg2_idct_first_col_q15
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15]
    ld1             {v0.4h, v1.4h}, [x14]


    cmp             x12, #0xf0
    bge             skip_last4cols

    smull           v24.4s, v8.4h, v0.4h[1] //// y1 * cos1(part of b0)
    smull           v26.4s, v8.4h, v0.4h[3] //// y1 * cos3(part of b1)
    smull           v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
    smull           v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)

    smlal           v24.4s, v9.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl           v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl           v28.4s, v9.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl           v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)

    smull           v18.4s, v5.4h, v1.4h[2] //// y2 * sin2 (q4 is freed by this time)(part of d1)
    smull           v8.4s, v5.4h, v0.4h[2] //// y2 * cos2(part of d0)

    smull           v20.4s, v4.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
    smull           v22.4s, v12.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)

    smlal           v24.4s, v16.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    smlsl           v26.4s, v16.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    smlal           v28.4s, v16.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    smlal           v30.4s, v16.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    smlsl           v18.4s, v13.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    smlal           v8.4s, v13.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add             v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub             v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal           v24.4s, v17.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
    smlsl           v26.4s, v17.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
    smlal           v28.4s, v17.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
    smlsl           v30.4s, v17.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)

    add             v16.4s, v12.4s , v8.4s ////   a0 = c0 + d0(part of e0,e7)
    sub             v12.4s, v12.4s , v8.4s //// a3 = c0 - d0(part of e3,e4)
    sub             v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of e2,e5)
    add             v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of e1,e6)

    add             v20.4s, v16.4s , v24.4s //// a0 + b0(part of e0)
    sub             v8.4s, v16.4s , v24.4s //// a0 - b0(part of e7)

    add             v24.4s, v22.4s , v28.4s //// a2 + b2(part of e2)
    sub             v22.4s, v22.4s , v28.4s //// a2 - b2(part of e5)

    add             v28.4s, v18.4s , v26.4s //// a1 + b1(part of e1)
    sub             v18.4s, v18.4s , v26.4s //// a1 - b1(part of e6)

    add             v26.4s, v12.4s , v30.4s //// a3 + b3(part of e3)
    sub             v30.4s, v12.4s , v30.4s //// a3 - b3(part of e4)

    sqrshrn         v4.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> idct_stg1_shift
    sqrshrn         v17.4h, v8.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> idct_stg1_shift
    sqrshrn         v5.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> idct_stg1_shift
    sqrshrn         v16.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> idct_stg1_shift
    sqrshrn         v8.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> idct_stg1_shift
    sqrshrn         v13.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> idct_stg1_shift
    sqrshrn         v9.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> idct_stg1_shift
    sqrshrn         v12.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> idct_stg1_shift
    b               end_skip_last4cols



skip_last4cols:
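    //// Columns 4-7 of the input are zero, so stage 1 produced only the left 4x8
    //// half (v2,v3,v6,v7 and v10,v11,v14,v15). Transpose those 4x4 quadrants and
    //// run stage 2 directly on 4-wide vectors, using the Q11 constant table.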
    adrp            x14, :got:gai2_impeg2_idct_first_col_q11
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q11]
    ld1             {v0.4h, v1.4h}, [x14]

    umov            x15, v25.d[0]

    trn1            v25.4h, v2.4h, v6.4h
    trn2            v29.4h, v2.4h, v6.4h ////[x3,x1],[x2,x0] first quadrant transposing

    trn1            v27.4h, v3.4h, v7.4h
    trn2            v31.4h, v3.4h, v7.4h ////[x3,x1],[x2,x0] first quadrant transposing

    trn1            v6.2s, v29.2s, v31.2s
    trn2            v7.2s, v29.2s, v31.2s ////x0,x1,x2,x3 first quadrant transposing continued.....
    trn1            v2.2s, v25.2s, v27.2s
    trn2            v3.2s, v25.2s, v27.2s ////x0,x1,x2,x3 first quadrant transposing continued.....


    trn1            v25.4h, v10.4h, v14.4h
    trn2            v29.4h, v10.4h, v14.4h ////[x7,x5],[x6,x4] third quadrant transposing

    trn1            v27.4h, v11.4h, v15.4h
    trn2            v31.4h, v11.4h, v15.4h ////[x7,x5],[x6,x4] third quadrant transposing

    trn1            v10.2s, v25.2s, v27.2s
    trn2            v11.2s, v25.2s, v27.2s ////x4,x5,x6,x7 third quadrant transposing continued.....
    trn1            v14.2s, v29.2s, v31.2s
    trn2            v15.2s, v29.2s, v31.2s ////x4,x5,x6,x7 third quadrant transposing continued.....

    mov             v25.d[0], x15

    smull           v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
    smull           v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
    smull           v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
    smull           v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)

    smlal           v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl           v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl           v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl           v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)

    smull           v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
//    vmull.s16    q11,d4,d0[0]                    @// y4 * cos4(part of c0 and c1)

    smull           v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
    smull           v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)




    sub             v22.4s, v20.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
    add             v4.4s, v20.4s , v6.4s ////    a0 = c0 + d0(part of x0,x7)


    add             v2.4s, v4.4s , v24.4s

    sub             v6.4s, v4.4s , v24.4s

    add             v8.4s, v22.4s , v30.4s

    sub             v24.4s, v22.4s , v30.4s

    sqrshrn         v5.4h, v8.4s, #idct_stg2_shift
    sqrshrn         v2.4h, v2.4s, #idct_stg2_shift
    sqrshrn         v9.4h, v6.4s, #idct_stg2_shift
    sqrshrn         v6.4h, v24.4s, #idct_stg2_shift

    sub             v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
    add             v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)


    add             v30.4s, v22.4s , v28.4s

    sub             v24.4s, v22.4s , v28.4s

    add             v28.4s, v18.4s , v26.4s

    sub             v22.4s, v18.4s , v26.4s
    sqrshrn         v4.4h, v30.4s, #idct_stg2_shift
    sqrshrn         v7.4h, v24.4s, #idct_stg2_shift
    sqrshrn         v3.4h, v28.4s, #idct_stg2_shift
    sqrshrn         v8.4h, v22.4s, #idct_stg2_shift


    umov            x19, v25.d[0]
    umov            x20, v25.d[1]

    trn1            v27.4h, v2.4h, v3.4h
    trn2            v29.4h, v2.4h, v3.4h
    trn1            v25.4h, v4.4h, v5.4h
    trn2            v31.4h, v4.4h, v5.4h

    trn1            v2.2s, v27.2s, v25.2s
    trn2            v4.2s, v27.2s, v25.2s
    trn1            v3.2s, v29.2s, v31.2s
    trn2            v5.2s, v29.2s, v31.2s

    trn1            v27.4h, v6.4h, v7.4h
    trn2            v29.4h, v6.4h, v7.4h
    trn1            v25.4h, v8.4h, v9.4h
    trn2            v31.4h, v8.4h, v9.4h

    trn1            v6.2s, v27.2s, v25.2s
    trn2            v8.2s, v27.2s, v25.2s
    trn1            v7.2s, v29.2s, v31.2s
    trn2            v9.2s, v29.2s, v31.2s

    mov             v25.d[0], x19
    mov             v25.d[1], x20

    smull           v24.4s, v14.4h, v0.4h[1] //// y1 * cos1(part of b0)

    smull           v26.4s, v14.4h, v0.4h[3] //// y1 * cos3(part of b1)
    smull           v28.4s, v14.4h, v1.4h[1] //// y1 * sin3(part of b2)
    smull           v30.4s, v14.4h, v1.4h[3] //// y1 * sin1(part of b3)

    smlal           v24.4s, v15.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl           v26.4s, v15.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl           v28.4s, v15.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl           v30.4s, v15.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
    smull           v20.4s, v10.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
    smull           v18.4s, v11.4h, v1.4h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
    smull           v14.4s, v11.4h, v0.4h[2] //// y2 * cos2(part of d0)


    add             x4, x2, x8, lsl #1  // x4 = x2 + pred_strd * 2    => x4 points to 3rd row of pred data


    add             x5, x8, x8, lsl #1  //


    add             x0, x3, x7, lsl #1  // x0 points to 3rd row of dest data


    add             x10, x7, x7, lsl #1 //

    // swapping v3 and v6
    mov             v31.d[0], v3.d[0]
    mov             v3.d[0], v6.d[0]
    mov             v6.d[0], v31.d[0]

    // swapping v5 and v8
    mov             v31.d[0], v5.d[0]
    mov             v5.d[0], v8.d[0]
    mov             v8.d[0], v31.d[0]


    sub             v22.4s, v20.4s , v14.4s //// a3 = c0 - d0(part of x3,x4)
    add             v12.4s, v20.4s , v14.4s ////    a0 = c0 + d0(part of x0,x7)


    add             v0.4s, v12.4s , v24.4s


    sub             v24.4s, v12.4s , v24.4s


    add             v12.4s, v22.4s , v30.4s


    sub             v14.4s, v22.4s , v30.4s

    sqrshrn         v10.4h, v0.4s, #idct_stg2_shift
    sqrshrn         v17.4h, v24.4s, #idct_stg2_shift
    sqrshrn         v13.4h, v12.4s, #idct_stg2_shift
    sqrshrn         v14.4h, v14.4s, #idct_stg2_shift

    sub             v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
    add             v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)


    add             v0.4s, v22.4s , v28.4s


    sub             v24.4s, v22.4s , v28.4s


    add             v28.4s, v18.4s , v26.4s


    sub             v26.4s, v18.4s , v26.4s
    ld1             {v18.8b}, [x2], x8

    sqrshrn         v12.4h, v0.4s, #idct_stg2_shift
    ld1             {v20.8b}, [x2], x5


    sqrshrn         v15.4h, v24.4s, #idct_stg2_shift
    ld1             {v19.8b}, [x2], x8


    sqrshrn         v11.4h, v28.4s, #idct_stg2_shift
    ld1             {v22.8b}, [x4], x8


    sqrshrn         v16.4h, v26.4s, #idct_stg2_shift
    ld1             {v21.8b}, [x2], x5


    b               pred_buff_addition
end_skip_last4cols:
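    //// Full stage-1 results are available for all eight columns. Transpose all four
    //// 4x4 quadrants and run stage 2 on the rows: first on rows 0-3 (v2..v9), then
    //// on rows 4-7 (v10..v17), interleaving the prediction loads with the arithmetic.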
    adrp            x14, :got:gai2_impeg2_idct_first_col_q11
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q11]
    ld1             {v0.4h, v1.4h}, [x14]


    umov            x19, v25.d[0]
    umov            x20, v25.d[1]

///* now the idct of columns is done, transpose so that row idct done efficiently(step5) */
    trn1            v27.4h, v2.4h, v6.4h
    trn2            v29.4h, v2.4h, v6.4h ////[x3,x1],[x2,x0] first quadrant transposing
    trn1            v25.4h, v3.4h, v7.4h
    trn2            v31.4h, v3.4h, v7.4h ////[x3,x1],[x2,x0] first quadrant transposing

    trn1            v2.2s, v27.2s, v25.2s
    trn2            v3.2s, v27.2s, v25.2s ////x0,x1,x2,x3 first quadrant transposing continued.....
    trn1            v6.2s, v29.2s, v31.2s
    trn2            v7.2s, v29.2s, v31.2s ////x0,x1,x2,x3 first quadrant transposing continued.....

    trn1            v27.4h, v4.4h, v8.4h
    trn2            v29.4h, v4.4h, v8.4h ////[x3,x1],[x2,x0] second quadrant transposing
    trn1            v25.4h, v5.4h, v9.4h
    trn2            v31.4h, v5.4h, v9.4h ////[x3,x1],[x2,x0] second quadrant transposing

    trn1            v4.2s, v27.2s, v25.2s
    trn2            v5.2s, v27.2s, v25.2s ////x0,x1,x2,x3 second quadrant transposing continued.....
    trn1            v8.2s, v29.2s, v31.2s
    trn2            v9.2s, v29.2s, v31.2s ////x0,x1,x2,x3 second quadrant transposing continued.....

    trn1            v27.4h, v10.4h, v14.4h
    trn2            v29.4h, v10.4h, v14.4h ////[x7,x5],[x6,x4] third quadrant transposing
    trn1            v25.4h, v11.4h, v15.4h
    trn2            v31.4h, v11.4h, v15.4h ////[x7,x5],[x6,x4] third quadrant transposing

    trn1            v10.2s, v27.2s, v25.2s
    trn2            v11.2s, v27.2s, v25.2s ////x4,x5,x6,x7 third quadrant transposing continued.....
    trn1            v14.2s, v29.2s, v31.2s
    trn2            v15.2s, v29.2s, v31.2s ////x4,x5,x6,x7 third quadrant transposing continued.....

    trn1            v27.4h, v12.4h, v16.4h
    trn2            v29.4h, v12.4h, v16.4h ////[x7,x5],[x6,x4] fourth quadrant transposing
    trn1            v25.4h, v13.4h, v17.4h
    trn2            v31.4h, v13.4h, v17.4h ////[x7,x5],[x6,x4] fourth quadrant transposing

    trn1            v12.2s, v27.2s, v25.2s
    trn2            v13.2s, v27.2s, v25.2s ////x4,x5,x6,x7 fourth quadrant transposing continued.....
    trn1            v16.2s, v29.2s, v31.2s
    trn2            v17.2s, v29.2s, v31.2s ////x4,x5,x6,x7 fourth quadrant transposing continued.....

    mov             v25.d[0], x19
    mov             v25.d[1], x20

    ////step6 operate on first four rows and find their idct
    //// register usage - storing and idct of rows
////    cosine constants     -     d0
////    sine constants       -     d1
////    element 0 first four     -     d2        -    y0
////    element 1 first four     -     d6        -    y1
////    element 2 first four     -     d3        -    y2
////    element 3 first four     -     d7        -    y3
////    element 4 first four     -     d4        -    y4
////    element 5 first four     -     d8        -    y5
////    element 6 first four     -     d5        -    y6
////    element 7 first four     -     d9        -    y7
////    element 0 second four    -     d10       -    y0
////    element 1 second four    -     d14       -    y1
////    element 2 second four    -     d11       -    y2
////    element 3 second four    -     d15       -    y3
////    element 4 second four    -     d12       -    y4
////    element 5 second four    -     d16       -    y5
////    element 6 second four    -     d13       -    y6
////    element 7 second four    -     d17       -    y7

    //// map between first kernel code seq and current
////        d2    ->    d2
////        d6    ->    d6
////        d3    ->    d3
////        d7    ->    d7
////        d10    ->    d4
////        d14    ->    d8
////        d11    ->    d5
////        d15    ->    d9
////        q3    ->    q3
////        q5    ->    q2
////        q7    ->    q4

    smull           v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
    smull           v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
    smull           v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
    smull           v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)

    smlal           v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl           v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl           v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl           v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)

    smull           v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
    smull           v22.4s, v4.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)

    smull           v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
    smull           v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)


    smlal           v24.4s, v8.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    smlsl           v26.4s, v8.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    smlal           v28.4s, v8.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    smlal           v30.4s, v8.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    smlsl           v18.4s, v5.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    smlal           v6.4s, v5.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add             v2.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub             v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal           v24.4s, v9.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
    smlsl           v26.4s, v9.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    smlal           v28.4s, v9.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    smlsl           v30.4s, v9.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)

    sub             v22.4s, v2.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
    add             v4.4s, v2.4s , v6.4s ////    a0 = c0 + d0(part of x0,x7)


    add             v2.4s, v4.4s , v24.4s

    sub             v6.4s, v4.4s , v24.4s

    add             v8.4s, v22.4s , v30.4s

    sub             v24.4s, v22.4s , v30.4s

    sqrshrn         v5.4h, v8.4s, #idct_stg2_shift
    sqrshrn         v2.4h, v2.4s, #idct_stg2_shift
    sqrshrn         v9.4h, v6.4s, #idct_stg2_shift
    sqrshrn         v6.4h, v24.4s, #idct_stg2_shift

    sub             v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
    add             v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)


    add             v30.4s, v22.4s , v28.4s

    sub             v24.4s, v22.4s , v28.4s

    add             v28.4s, v18.4s , v26.4s

    sub             v22.4s, v18.4s , v26.4s
    sqrshrn         v4.4h, v30.4s, #idct_stg2_shift
    sqrshrn         v7.4h, v24.4s, #idct_stg2_shift
    sqrshrn         v3.4h, v28.4s, #idct_stg2_shift
    sqrshrn         v8.4h, v22.4s, #idct_stg2_shift


    umov            x19, v25.d[0]
    umov            x20, v25.d[1]

    trn1            v27.4h, v2.4h, v3.4h
    trn2            v29.4h, v2.4h, v3.4h
    trn1            v25.4h, v4.4h, v5.4h
    trn2            v31.4h, v4.4h, v5.4h

    trn1            v2.2s, v27.2s, v25.2s
    trn2            v4.2s, v27.2s, v25.2s
    trn1            v3.2s, v29.2s, v31.2s
    trn2            v5.2s, v29.2s, v31.2s

    trn1            v27.4h, v6.4h, v7.4h
    trn2            v29.4h, v6.4h, v7.4h
    trn1            v25.4h, v8.4h, v9.4h
    trn2            v31.4h, v8.4h, v9.4h

    trn1            v6.2s, v27.2s, v25.2s
    trn2            v8.2s, v27.2s, v25.2s
    trn1            v7.2s, v29.2s, v31.2s
    trn2            v9.2s, v29.2s, v31.2s

    mov             v25.d[0], x19
    mov             v25.d[1], x20


    smull           v24.4s, v14.4h, v0.4h[1] //// y1 * cos1(part of b0)
    smull           v26.4s, v14.4h, v0.4h[3] //// y1 * cos3(part of b1)
    smull           v28.4s, v14.4h, v1.4h[1] //// y1 * sin3(part of b2)
    smull           v30.4s, v14.4h, v1.4h[3] //// y1 * sin1(part of b3)
    smlal           v24.4s, v15.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl           v26.4s, v15.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl           v28.4s, v15.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl           v30.4s, v15.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
    smull           v20.4s, v10.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
    smull           v22.4s, v12.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
    smull           v18.4s, v11.4h, v1.4h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
    smull           v14.4s, v11.4h, v0.4h[2] //// y2 * cos2(part of d0)
    smlal           v24.4s, v16.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)

    add             x4, x2, x8, lsl #1  // x4 = x2 + pred_strd * 2    => x4 points to 3rd row of pred data
    smlsl           v26.4s, v16.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)

    add             x5, x8, x8, lsl #1  //
    smlal           v28.4s, v16.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)

    add             x0, x3, x7, lsl #1  // x0 points to 3rd row of dest data
    smlal           v30.4s, v16.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    add             x10, x7, x7, lsl #1 //
    smlsl           v18.4s, v13.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)


    smlal           v14.4s, v13.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add             v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub             v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal           v24.4s, v17.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)

    // swapping v3 and v6
    mov             v31.d[0], v3.d[0]
    mov             v3.d[0], v6.d[0]
    mov             v6.d[0], v31.d[0]

    smlsl           v26.4s, v17.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    // swapping v5 and v8
    mov             v31.d[0], v5.d[0]
    mov             v5.d[0], v8.d[0]
    mov             v8.d[0], v31.d[0]

    smlal           v28.4s, v17.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    smlsl           v30.4s, v17.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)

    sub             v22.4s, v12.4s , v14.4s //// a3 = c0 - d0(part of x3,x4)
    add             v12.4s, v12.4s , v14.4s ////    a0 = c0 + d0(part of x0,x7)


    add             v0.4s, v12.4s , v24.4s


    sub             v24.4s, v12.4s , v24.4s


    add             v12.4s, v22.4s , v30.4s


    sub             v14.4s, v22.4s , v30.4s

    sqrshrn         v10.4h, v0.4s, #idct_stg2_shift
    sqrshrn         v17.4h, v24.4s, #idct_stg2_shift
    sqrshrn         v13.4h, v12.4s, #idct_stg2_shift
    sqrshrn         v14.4h, v14.4s, #idct_stg2_shift

    sub             v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
    add             v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)


    add             v0.4s, v22.4s , v28.4s


    sub             v24.4s, v22.4s , v28.4s


    add             v28.4s, v18.4s , v26.4s


    sub             v26.4s, v18.4s , v26.4s
    ld1             {v18.8b}, [x2], x8

    sqrshrn         v12.4h, v0.4s, #idct_stg2_shift
    ld1             {v20.8b}, [x2], x5


    sqrshrn         v15.4h, v24.4s, #idct_stg2_shift
    ld1             {v19.8b}, [x2], x8


    sqrshrn         v11.4h, v28.4s, #idct_stg2_shift
    ld1             {v22.8b}, [x4], x8


    sqrshrn         v16.4h, v26.4s, #idct_stg2_shift
    ld1             {v21.8b}, [x2], x5



pred_buff_addition:
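    //// Final step: transpose the stage-2 results for rows 4-7, widen-add the 8-bit
    //// prediction to the 16-bit residual (uaddw), saturate back to 8 bit (sqxtun)
    //// and store. Two destination pointers are interleaved: x3 writes rows 0,1 then
    //// 4,5 and x0 (= pu1_dst + 2 * dst_strd) writes rows 2,3 then 6,7, stepping by
    //// dst_strd (x7) and 3 * dst_strd (x10); the prediction rows were fetched in the
    //// same order through x2 and x4 above.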

    umov            x19, v25.d[0]
    umov            x20, v25.d[1]

    trn1            v27.4h, v10.4h, v11.4h
    trn2            v29.4h, v10.4h, v11.4h
    trn1            v25.4h, v12.4h, v13.4h
    trn2            v31.4h, v12.4h, v13.4h

    trn1            v10.2s, v27.2s, v25.2s
    trn2            v12.2s, v27.2s, v25.2s
    trn1            v11.2s, v29.2s, v31.2s
    trn2            v13.2s, v29.2s, v31.2s

    trn1            v27.4h, v14.4h, v15.4h
    trn2            v29.4h, v14.4h, v15.4h
    trn1            v25.4h, v16.4h, v17.4h
    trn2            v31.4h, v16.4h, v17.4h

    trn1            v14.2s, v27.2s, v25.2s
    trn2            v16.2s, v27.2s, v25.2s
    trn1            v15.2s, v29.2s, v31.2s
    trn2            v17.2s, v29.2s, v31.2s


    mov             v25.d[0], x19
    mov             v25.d[1], x20


    ld1             {v24.8b}, [x4], x5
    ld1             {v23.8b}, [x4], x8
    ld1             {v25.8b}, [x4], x5
    mov             v2.d[1], v3.d[0]
    mov             v4.d[1], v5.d[0]
    mov             v6.d[1], v7.d[0]
    mov             v8.d[1], v9.d[0]
    uaddw           v2.8h, v2.8h , v18.8b
    uaddw           v4.8h, v4.8h , v22.8b
    uaddw           v6.8h, v6.8h , v20.8b
    uaddw           v8.8h, v8.8h , v24.8b

    // swapping v11 and v14
    mov             v31.d[0], v11.d[0]
    mov             v11.d[0], v14.d[0]
    mov             v14.d[0], v31.d[0]

    // swapping v13 and v16
    mov             v31.d[0], v13.d[0]
    mov             v13.d[0], v16.d[0]
    mov             v16.d[0], v31.d[0]
// row values stored in the q register.

//q1 :x0
//q3: x1
//q2: x2
//q4: x3
//q5: x4
//q7: x5
//q6: x6
//q8: x7


    ///// adding the prediction buffer

    // load prediction data

    // adding recon with prediction

    mov             v10.d[1], v11.d[0]
    mov             v12.d[1], v13.d[0]
    mov             v14.d[1], v15.d[0]
    mov             v16.d[1], v17.d[0]
    uaddw           v10.8h, v10.8h , v19.8b
    sqxtun          v2.8b, v2.8h
    uaddw           v14.8h, v14.8h , v21.8b
    sqxtun          v4.8b, v4.8h
    uaddw           v12.8h, v12.8h , v23.8b
    sqxtun          v6.8b, v6.8h
    uaddw           v16.8h, v16.8h , v25.8b
    sqxtun          v8.8b, v8.8h


    st1             {v2.8b}, [x3], x7
    sqxtun          v10.8b, v10.8h
    st1             {v6.8b}, [x3], x10
    sqxtun          v14.8b, v14.8h
    st1             {v4.8b}, [x0], x7
    sqxtun          v12.8b, v12.8h
    st1             {v8.8b}, [x0], x10
    sqxtun          v16.8b, v16.8h


    st1             {v10.8b}, [x3], x7
    st1             {v14.8b}, [x3], x10
    st1             {v12.8b}, [x0], x7
    st1             {v16.8b}, [x0], x10


    // ldmfd sp!,{x4-x12,pc}
    ldp             x19, x20, [sp], #16
    pop_v_regs
    ret