      1 @/******************************************************************************
      2 @ *
      3 @ * Copyright (C) 2018 The Android Open Source Project
      4 @ *
      5 @ * Licensed under the Apache License, Version 2.0 (the "License");
      6 @ * you may not use this file except in compliance with the License.
      7 @ * You may obtain a copy of the License at:
      8 @ *
      9 @ * http://www.apache.org/licenses/LICENSE-2.0
     10 @ *
     11 @ * Unless required by applicable law or agreed to in writing, software
     12 @ * distributed under the License is distributed on an "AS IS" BASIS,
     13 @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @ * See the License for the specific language governing permissions and
     15 @ * limitations under the License.
     16 @ *
     17 @ *****************************************************************************
     18 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 @*/
     20 
     21 .text
     22 .align 4
     23 
     24 @/**
     25 @/*******************************************************************************
     26 @/*
     27 @/* @brief
     28 @/*  Residue calculation and Forward Transform for 4x4 block with 8-bit input
     29 @/*
     30 @/* @par Description:
     31 @/*  Performs residue calculation by subtracting source and  prediction and
     32 @/*  followed by forward transform
     33 @/*
     34 @/* @param[in] pu1_src
     35 @/*  Input 4x4 pixels
     36 @/*
     37 @/* @param[in] pu1_pred
     38 @/*  Prediction data
     39 @/*
     40 @/* @param[in] pi4_tmp
     41 @/*  Temporary buffer of size 4x4
     42 @/*
     43 @/* @param[out] pi2_dst
     44 @/*  Output 4x4 coefficients
     45 @/*
     46 @/* @param[in] src_strd
     47 @/*  Input stride
     48 @/*
     49 @/* @param[in] pred_strd
     50 @/*  Prediction Stride
     51 @/*
     52 @/* @param[in] dst_strd_chr_flag
     53 @/*  Output Stride and Chroma Flag packed in the MS and LS 16-bit
     54 @/*
     55 @/* @returns  Void
     56 @/*
     57 @/* @remarks
     58 @/*  None
     59 @/*
     60 @/*******************************************************************************
     61 @/*/
     62 
     63 @/**************Variables Vs Registers*****************************************
     64 @    r0 => *pu1_src
     65 @    r1 => *pu1_pred
     66 @    r2 => *pi4_temp
     67 @    r3 => *pi2_dst
     68 @    r4 => src_strd
     69 @    r5 => pred_strd
     70 @    r6 => dst_strd_chr_flag
     71 
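        @ For readability, the following is a minimal C sketch (not taken from the
        @ library; function and variable names are illustrative) of what this routine
        @ computes, assuming the HEVC 4x4 forward-transform coefficients {64, 83, 36}
        @ and the single combined rounding shift of 9 used below:
        @
        @     #include <stdint.h>
        @     #include <stdlib.h>
        @
        @     static int32_t resi_trans_4x4_sketch(const uint8_t *src, const uint8_t *pred,
        @                                          int16_t *dst, int src_strd,
        @                                          int pred_strd, int dst_strd)
        @     {
        @         int32_t resi[4][4], tmp[4][4], sad = 0;
        @
        @         for (int i = 0; i < 4; i++)            /* residue and SAD */
        @             for (int j = 0; j < 4; j++) {
        @                 resi[i][j] = src[i * src_strd + j] - pred[i * pred_strd + j];
        @                 sad += abs(resi[i][j]);
        @             }
        @
        @         for (int i = 0; i < 4; i++) {          /* stage 1: transform the columns */
        @             int e0 = resi[0][i] + resi[3][i], o0 = resi[0][i] - resi[3][i];
        @             int e1 = resi[1][i] + resi[2][i], o1 = resi[1][i] - resi[2][i];
        @             tmp[0][i] = 64 * (e0 + e1);
        @             tmp[2][i] = 64 * (e0 - e1);
        @             tmp[1][i] = 83 * o0 + 36 * o1;
        @             tmp[3][i] = 36 * o0 - 83 * o1;
        @         }
        @
        @         for (int i = 0; i < 4; i++) {          /* stage 2: transform the rows, round */
        @             int e0 = tmp[i][0] + tmp[i][3], o0 = tmp[i][0] - tmp[i][3];
        @             int e1 = tmp[i][1] + tmp[i][2], o1 = tmp[i][1] - tmp[i][2];
        @             dst[i * dst_strd + 0] = (int16_t)((64 * (e0 + e1) + 256) >> 9);
        @             dst[i * dst_strd + 2] = (int16_t)((64 * (e0 - e1) + 256) >> 9);
        @             dst[i * dst_strd + 1] = (int16_t)((83 * o0 + 36 * o1 + 256) >> 9);
        @             dst[i * dst_strd + 3] = (int16_t)((36 * o0 - 83 * o1 + 256) >> 9);
        @         }
        @         return sad;                            /* the SAD is returned in r0 */
        @     }
        @
        @ The chroma path below only de-interleaves the samples first; the arithmetic
        @ is unchanged.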
     72     .global ihevc_resi_trans_4x4_a9q
     73 
     74 ihevc_resi_trans_4x4_a9q:
     75 
     76     STMFD          sp!, {r4-r7, r14}   @ save r4-r7 and the return address (r14) on the stack
     77     LDR            r4, [sp,#20]        @ r4 contains src_strd
     78     LDR            r5, [sp,#24]        @ r5 contains pred_strd
     79     LDR            r6, [sp,#28]        @ r6 contains dst_strd_chr_flag
     80 
     81     ANDS           r7, r6, #1          @ extract chroma flag; when set, the input data is interleaved
     82     CMP            r7, #0
     83     BEQ            NON_INTERLEAVE_LOAD @ if flag == 0, use non-interleaving loads
     84 
     85     VLD1.64        d0, [r0], r4        @ load row 0 src
     86     VLD1.64        d4, [r0], r4        @ load row 1 src
     87     VLD1.64        d1, [r0], r4        @ load row 2 src
     88     VLD1.64        d5, [r0], r4        @ load row 3 src
     89     VUZP.8         d0, d4              @ de-interleaving unzip to extract one plane of pu1_src into d0
     90     VUZP.8         d1, d5              @ de-interleaving unzip to extract one plane of pu1_src into d1
     91 
     92     VLD1.64        d2, [r1], r5        @ load row 0 pred
     93     VLD1.64        d6, [r1], r5        @ load row 1 pred
     94     VLD1.64        d3, [r1], r5        @ load row 2 pred
     95     VLD1.64        d7, [r1], r5        @ load row 3 pred
     96     VUZP.8         d2, d6              @ de-interleaving unzip to extract one plane of pu1_pred into d2
     97     VUZP.8         d3, d7              @ de-interleaving unzip to extract one plane of pu1_pred into d3
     98 
     99     B LOAD_END
    100 
    101 NON_INTERLEAVE_LOAD:
    102     VLD1.U32     d0[0], [r0], r4       @ load row 0 src
    103     VLD1.U32     d0[1], [r0], r4       @ load row 1 src
    104     VLD1.U32     d1[0], [r0], r4       @ load row 2 src
    105     VLD1.U32     d1[1], [r0], r4       @ load row 3 src
    106 
    107     VLD1.U32     d2[0], [r1], r5       @ load row 0 pred
    108     VLD1.U32     d2[1], [r1], r5       @ load row 1 pred
    109     VLD1.U32     d3[0], [r1], r5       @ load row 2 pred
    110     VLD1.U32     d3[1], [r1], r5       @ load row 3 pred
    111 
    112 LOAD_END:
    113     @ Finding the residue
    114     VSUBL.U8    q2, d0, d2             @ q2 contains the first 8 residues as 16-bit values
    115     VSUBL.U8    q3, d1, d3             @ q3 contains the next 8 residues as 16-bit values
    116 
    117     @ SAD calculation
    118     VABDL.U8    q12, d0, d2            @ q12 contains absolute differences
    119     VABAL.U8    q12, d1, d3            @ q12 accumulates absolute differences
    120     VADD.U16    d26, d24, d25          @ add d-registers of q12
    121     VPADDL.U16  d27, d26               @ d27 contains 2 32-bit values that have to be added
    122     VPADDL.U32  d28, d27               @ d28 contains 64-bit SAD, only LSB important
    123     VMOV.32     r0, d28[0]             @ SAD stored in r0 for return
    124     @ SAD calculation ends
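        @ A scalar C picture of the SAD reduction above (illustrative only): the widened
        @ absolute differences accumulated in q12 are folded down to one 32-bit sum,
        @ which is the value this kernel returns.
        @
        @     uint32_t sad = 0;
        @     for (int i = 0; i < 4; i++)
        @         for (int j = 0; j < 4; j++)
        @             sad += abs((int)pu1_src[i * src_strd + j] -
        @                        (int)pu1_pred[i * pred_strd + j]);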
    125 
    126     @ Forward transform - step 1
    127     VMOV.I16    d2, #64                @ generate immediate constant in d2 for even row multiplication
    128     VTRN.16     d4, d5                 @ 3-step transpose of residue matrix starts
    129     VTRN.16     d6, d7                 @ 2nd step of the 3-step matrix transpose
    130     VMOV.I16    d0, #83                @ generate immediate constant in d0 for odd row multiplication
    131     VTRN.32     q2, q3                 @ Final step of matrix transpose
    132 
    133     VMOV.I16    d1, #36                @ generate immediate constant in d1 for odd row multiplication
    134     VSWP        d6, d7                 @ vector swap to allow even and odd row calculation using Q registers
    135     VADD.S16    q10, q2, q3            @ q10 has the even array
    136     VSUB.S16    q11, q2, q3            @ q11 has the odd array
    137     VMULL.S16   q12, d20, d2           @ e[0]*64
    138     VMLAL.S16   q12, d21, d2[0]        @ row 1 of results: e[0]*64 + e[1]*64
    139     VMULL.S16   q13, d20, d2           @ e[0]*64
    140     VMLSL.S16   q13, d21, d2[0]        @ row 3 of results: e[0]*64 - e[1]*64
    141     VMULL.S16   q8, d22, d0            @ o[0]*83
    142     VMLAL.S16   q8, d23, d1[0]         @ row 2 of results: o[0]*83 + o[1]*36
    143     VMULL.S16   q9, d22, d1            @ o[0]*36
    144     VMLSL.S16   q9, d23, d0[0]         @ row 4 of results: o[0]*36 - o[1]*83
    145 
    146     @ Forward transform - step 2
    147     VMOV.I32    d2, #64                @ generate immediate constant in d2 for even row multiplication
    148     VMOV.I32    d0, #83                @ generate immediate constant in d0 for odd row multiplication
    149     VTRN.32     q12, q8                @ 4-step transpose of residue matrix starts
    150     VTRN.32     q13, q9                @ 2nd step of the 4-step matrix transpose
    151 
    152     VMOV.I32    d1, #36                @ generate immediate constant in d1 for odd row multiplication
    153     VSWP        d25, d26               @ 3rd step of the 4-step matrix transpose
    154     VSWP        d17, d18               @ 4th step of the 4-step matrix transpose
    155     VADD.S32    q2, q12, q9            @ e[0]
    156     VADD.S32    q3, q8, q13            @ e[1]
    157     VSUB.S32    q10, q12, q9           @ o[0]
    158     VSUB.S32    q11, q8, q13           @ o[1]
    159 
    160     VMUL.S32    q12, q2, d2[0]         @ e[0]*64
    161     VMLA.S32    q12, q3, d2[0]         @ row 1 of results: e[0]*64 + e[1]*64
    162     VMUL.S32    q13, q2, d2[0]         @ e[0]*64
    163     VMLS.S32    q13, q3, d2[0]         @ row 3 of results: e[0]*64 - e[1]*64
    164     VMUL.S32    q8, q10, d0[0]         @ o[0]*83
    165     VMLA.S32    q8, q11, d1[0]         @ row 2 of results: o[0]*83 + o[1]*36
    166     VMUL.S32    q9, q10, d1[0]         @ o[0]*36
    167     VMLS.S32    q9, q11, d0[0]         @ row 4 of results: o[0]*36 - o[1]*83
    168 
    169     VRSHRN.S32  d0, q12, #9            @ (row1 + 256)/512
    170     VRSHRN.S32  d1, q8, #9             @ (row2 + 256)/512
    171     VRSHRN.S32  d2, q13, #9            @ (row3 + 256)/512
    172     VRSHRN.S32  d3, q9, #9             @ (row4 + 256)/512
    173 
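        @ Note: VRSHRN #9 computes (x + 256) >> 9, so both transform stages are kept in
        @ full 32-bit precision and the combined rounding shift (presumably
        @ shift1 + shift2 = 1 + 8 for 8-bit input) is applied only once, here.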
    174     LSR         r7, r6, #15            @ r7 = 2*dst_strd, since pi2_dst holds 16-bit (2-byte) coefficients
    175     VST1.U16    d0, [r3], r7           @ store 1st row of result
    176     VST1.U16    d1, [r3], r7           @ store 2nd row of result
    177     VST1.U16    d2, [r3], r7           @ store 3rd row of result
    178     VST1.U16    d3, [r3], r7           @ store 4th row of result
    179 
    180     LDMFD       sp!,{r4-r7,r15}        @ restore registers and return (r15 = pc)
    181 
    182     @ Function End
    183 
    184 @/**
    185 @*******************************************************************************
    186 @*
    187 @* @brief
    188 @*  This function performs residue calculation and forward  transform type 1
    189 @*  on input pixels
    190 @*
    191 @* @description
    192 @*  Performs residue calculation by subtracting source and  prediction and
    193 @*  followed by forward transform
    194 @*
    195 @* @param[in] pu1_src
    196 @*  Input 4x4 pixels
    197 @*
    198 @* @param[in] pu1_pred
    199 @*  Prediction data
    200 @*
    201 @* @param[in] pi2_tmp
    202 @*  Temporary buffer of size 4x4
    203 @*
    204 @* @param[out] pi2_dst
    205 @*  Output 4x4 coefficients
    206 @*
    207 @* @param[in] src_strd
    208 @*  Input stride
    209 @*
    210 @* @param[in] pred_strd
    211 @*  Prediction Stride
    212 @*
    213 @* @param[in] dst_strd_chr_flag
    214 @*  Output Stride and Chroma Flag packed in the MS and LS 16-bit
    215 @*
    216 @* @returns void
    217 @*
    218 @* @remarks
    219 @*  None
    220 @*
    221 @*******************************************************************************
    222 @*/
    223 @ UWORD32 ihevc_resi_trans_4x4_ttype1(UWORD8 *pu1_src,
    224 @                                     UWORD8 *pu1_pred,
    225 @                                     WORD32 *pi4_temp,
    226 @                                     WORD16 *pi2_dst,
    227 @                                     WORD32 src_strd,
    228 @                                     WORD32 pred_strd,
    229 @                                     WORD32 dst_strd_chr_flag);
    230 @
    231 @**************Variables Vs Registers*******************************************
    232 @
    233 @ r0 - pu1_src
    234 @ r1 - pu1_pred
    235 @ r2 - pi4_temp
    236 @ r3 - pi2_dst
    237 @
    238 @ [sp]   - src_strd
    239 @ [sp+4] - pred_strd
    240 @ [sp+8] - dst_strd_chr_flag
    241 @
    242 @*******************************************************************************
    243 
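        @ A minimal C sketch (hypothetical names, not from the library) of the 4-point
        @ DST-style butterfly this routine applies twice, assuming the coefficient rows
        @ {29,55,74,84}, {74,74,0,-74}, {84,-29,-74,55}, {55,-84,74,-29} visible in the
        @ code below, with rounding shifts of 1 (stage 1) and 8 (stage 2):
        @
        @     static void fwd_dst4_sketch(const int32_t c[4], int32_t p[4], int shift)
        @     {
        @         int add = 1 << (shift - 1);
        @         p[0] = (29 * c[0] + 55 * c[1] + 74 * c[2] + 84 * c[3] + add) >> shift;
        @         p[1] = (74 * c[0] + 74 * c[1]             - 74 * c[3] + add) >> shift;
        @         p[2] = (84 * c[0] - 29 * c[1] - 74 * c[2] + 55 * c[3] + add) >> shift;
        @         p[3] = (55 * c[0] - 84 * c[1] + 74 * c[2] - 29 * c[3] + add) >> shift;
        @     }
        @
        @ It is applied once per residue column with shift = 1 and once per row of that
        @ intermediate with shift = 8, mirroring the VRSHR #1 / VRSHRN #8 pairs below.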
    244     .global ihevc_resi_trans_4x4_ttype1_a9q
    245 
    246 ihevc_resi_trans_4x4_ttype1_a9q:
    247 
    248     PUSH {r4}
    249     vpush {d8 - d15}
    250 
    251     LDR r2,[sp,#68]                 @ r2 = src_strd
    252     LDR r4,[sp,#72]                 @ r4 = pred_strd
    253 
    254     VLD1.32 d2[0],[r0],r2           @ Row 1 of source in d2[0]
    255     VLD1.32 d3[0],[r1],r4           @ Row 1 of prediction in d3[0]
    256     VLD1.32 d2[1],[r0],r2           @ Row 2 of source in d2[1]
    257     VLD1.32 d3[1],[r1],r4           @ Row 2 of prediction in d3[1]
    258 
    259     VLD1.32 d8[0],[r0],r2           @ Row 3 of source in d8[0]
    260     VABDL.U8 q0,d2,d3               @ Absolute differences of rows 1 and 2 in q0
    261                                     @ R2:[d11[3] d11[2] d11[1] d11[0]] => Row 2 of residue
    262     VLD1.32 d9[0],[r1],r4           @ Row 3 of prediction in d9[0]
    263     VSUBL.U8 q5,d2,d3               @ R1:[d10[3] d10[2] d10[1] d10[0]] => Row 1 of residue
    264     VLD1.32 d8[1],[r0]              @ Row 4 of source in d8[1]
    265     VTRN.16 d10,d11                 @ Transpose step 1
    266     VLD1.32 d9[1],[r1]              @ Row 4 of prediction in d9[1]
    267 
    268     VSUBL.U8 q6,d8,d9               @ R3:[d12[3] d12[2] d12[1] d12[0]] => Row 3 of residue
    269                                     @ R4:[d13[3] d13[2] d13[1] d13[0]] => Row 4 of residue
    270     VABAL.U8 q0,d8,d9               @ Absolute differences of rows 3 and 4 accumulated in q0
    271     VTRN.16 d12,d13                 @ Transpose step 2
    272     VTRN.32 q5,q6                   @ Transpose step 3, Residue block transposed
    273                                     @ Columns are in C1:d10, C2:d11, C3:d12 and C4:d13
    274     VADD.S16 d23,d11,d13            @ d23 = C2 + C4
    275     VMOV.I32 d6,#55                 @ Constant used for multiplication
    276     VADD.S16 d22,d10,d13            @ d22 = C1 + C4
    277     VADD.U16 d0,d1,d0               @ Accumulating SAD step 1
    278     VMOV.I32 d7,#84                 @ Constant used for multiplication
    279     VMULL.S16 q7,d23,d6[0]          @ q7  = 55*C2 + 55*C4
    280     VMOV.I32 d4,#74                 @ Constant used for multiplication
    281     VMULL.S16 q9,d22,d7[0]          @ q9  = 84*C1 + 84*C4
    282     VADD.S16 d16,d10,d11            @ d16 = C1 + C2
    283     VMUL.S16 d12,d12,d4[0]          @ d12 = 74*C3
    284     VMOV.I32 d5,#29                 @ Constant used for multiplication
    285     VPADDL.U16 d0,d0                @ Accumulating SAD step 2
    286     VSUB.S16 d16,d16,d13            @ d16 = C1 + C2 - C4
    287     VMLAL.S16 q7,d22,d5[0]          @ q7  = 29*C1 + 55*C2 + 84*C4
    288     VMLSL.S16 q9,d23,d5[0]          @ q9  = 84*C1 - 29*C2 + 55*C4
    289     VMULL.S16 q8,d16,d4[0]          @ q8  = 74*C1 + 74*C2 - 74*C4
    290     VPADDL.U32 d0,d0                @ Accumulating SAD step 3, SAD in d0
    291     VSUB.S32 q10,q9,q7              @ q10 = q9 - q7 = 55*C1 - 84*C2 - 29*C4
    292     VMOV.32 r0,d0[0]                @ Return SAD value
    293     VRSHR.S32 q8,q8,#1              @ Rounding shift right by 1 in q8
    294 
    295     VADDW.S16 q7,q7,d12             @ q7  = 29*C1 + 55*C2 + 74*C3 + 84*C4
    296     VSUBW.S16 q9,q9,d12             @ q9  = 84*C1 - 29*C2 - 74*C3 + 55*C4
    297     VADDW.S16 q10,q10,d12           @ q10 = 55*C1 - 84*C2 + 74*C3 - 29*C4
    298 
    299     VRSHR.S32 q7,q7,#1              @ Rounding shift right by 1 in q7
    300     VRSHR.S32 q9,q9,#1              @ Rounding shift right by 1 in q9
    301     VRSHR.S32 q10,q10,#1            @ Rounding shift right by 1 in q10
    302                                     @ Transform stage 1 is in P1:q7, P2:q8, P3:q9 and P4:q10
    303     VTRN.32 q7,q8
    304     VTRN.32 q9,q10
    305     VSWP d15,d18
    306     VSWP d17,d20                    @ Residue block transposed
    307                                     @ Corresponding columns are in S1:q7, S2:q8, S3:q9 and S4:q10
    308     VADD.S32 q13,q7,q8              @ q13 = S1 + S2
    309     VADD.S32 q1,q7,q10              @ q1 = S1 + S4
    310     VADD.S32 q4,q8,q10              @ q4 = S2 + S4
    311     VSUB.S32 q13,q13,q10            @ q13 = S1 + S2 - S4
    312     VMUL.S32 q12,q1,d5[0]           @ q12 = 29*S1 + 29*S4
    313     VMUL.S32 q14,q1,d7[0]           @ q14 = 84*S1 + 84*S4
    314     VMUL.S32 q13,q13,d4[0]          @ q13 = 74*S1 + 74*S2 - 74*S4
    315 
    316     VMLA.S32 q12,q4,d6[0]           @ q12 = 29*S1 + 55*S2 + 84*S4
    317     VMLS.S32 q14,q4,d5[0]           @ q14 = 84*S1 - 29*S2 + 55*S4
    318     VMUL.S32 q9,q9,d4[0]            @ q9 = 74*S3
    319 
    320     LDR r4,[sp,#76]                 @ r4 = dst_strd_chr_flag
    321     ASR r4,r4,#16                   @ r4 = dst_strd
    322     LSL r4,r4,#1                    @ r4 = 2*dst_strd
    323 
    324     VRSHRN.S32 d26,q13,#8
    325     VSUB.S32 q15,q14,q12            @ q15 = q14 - q12 = 55*S1 - 84*S2 - 29*S4
    326 
    327     VADD.S32 q12,q12,q9             @ q12 = 29*S1 + 55*S2 + 74*S3 + 84*S4
    328     VSUB.S32 q14,q14,q9             @ q14 = 84*S1 - 29*S2 - 74*S3 + 55*S4
    329     VADD.S32 q15,q15,q9             @ q15 = 55*S1 - 84*S2 + 74*S3 - 29*S4
    330 
    331     VRSHRN.S32 d24,q12,#8
    332     VRSHRN.S32 d28,q14,#8
    333     VRSHRN.S32 d30,q15,#8           @ Rounding shift right by 8 and narrow to 16 bits
    334                                     @ Transform stage 2 is in U1:d24, U2:d26, U3:d28 and U4:d30
    335     VST1.64 d24,[r3],r4             @ Storing row 1 of transform stage 2
    336     VST1.64 d26,[r3],r4             @ Storing row 2 of transform stage 2
    337     VST1.64 d28,[r3],r4             @ Storing row 3 of transform stage 2
    338     VST1.64 d30,[r3]                @ Storing row 4 of transform stage 2
    339 
    340     vpop {d8 - d15}
    341     POP {r4}
    342     MOV pc,lr
    343 
    344 @/**
    345 @*******************************************************************************
    346 @*
    347 @* @brief
    348 @*  This function performs residue calculation and DCT integer forward transform
    349 @*  on 8x8 block
    350 @*
    351 @* @description
    352 @*  Performs residue calculation by subtracting source and prediction and
    353 @*  followed by DCT integer forward transform
    354 @*
    355 @* @param[in] pu1_src
    356 @*  Input 4x4 pixels
    357 @*
    358 @* @param[in] pu1_pred
    359 @*  Prediction data
    360 @*
    361 @* @param[in] pi2_tmp
    362 @*  Temporary buffer of size 8x8
    363 @*
    364 @* @param[out] pi2_dst
    365 @*  Output 8x8 coefficients
    366 @*
    367 @* @param[in] src_strd
    368 @*  Input stride
    369 @*
    370 @* @param[in] pred_strd
    371 @*  Prediction Stride
    372 @*
    373 @* @param[in] dst_strd_chr_flag
    374 @*  Output Stride and Chroma Flag packed in the MS and LS 16-bit
    375 @*
    376 @* @returns void
    377 @*
    378 @* @remarks
    379 @*  None
    380 @*
    381 @*******************************************************************************
    382 @*/
    383 @ UWORD32 ihevc_resi_trans_8x8(UWORD8 *pu1_src,
    384 @                              UWORD8 *pu1_pred,
    385 @                              WORD32 *pi4_temp,
    386 @                              WORD16 *pi2_dst,
    387 @                              WORD32 src_strd,
    388 @                              WORD32 pred_strd,
    389 @                              WORD32 dst_strd_chr_flag);
    390 @
    391 @**************Variables Vs Registers*******************************************
    392 @
    393 @ r0 - pu1_src
    394 @ r1 - pu1_pred
    395 @ r2 - pi4_temp
    396 @ r3 - pi2_dst
    397 @
    398 @ [sp]   - src_strd
    399 @ [sp+4] - pred_strd
    400 @ [sp+8] - dst_strd_chr_flag
    401 @
    402 @*******************************************************************************
    403 
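        @ An illustrative C sketch (hypothetical names, not from the library) of the
        @ 8-point butterfly both passes below implement, assuming the coefficient sets
        @ {64, 83, 36} for the even half and {89, 75, 50, 18} for the odd half that
        @ appear in the code:
        @
        @     static void fwd_dct8_sketch(const int32_t c[8], int32_t f[8])
        @     {
        @         int32_t e[4], o[4], ee[2], eo[2];
        @         for (int k = 0; k < 4; k++) {
        @             e[k] = c[k] + c[7 - k];          /* C0+C7, C1+C6, C2+C5, C3+C4 */
        @             o[k] = c[k] - c[7 - k];          /* C0-C7, C1-C6, C2-C5, C3-C4 */
        @         }
        @         ee[0] = e[0] + e[3];  eo[0] = e[0] - e[3];
        @         ee[1] = e[1] + e[2];  eo[1] = e[1] - e[2];
        @         f[0] = 64 * (ee[0] + ee[1]);
        @         f[4] = 64 * (ee[0] - ee[1]);
        @         f[2] = 83 * eo[0] + 36 * eo[1];
        @         f[6] = 36 * eo[0] - 83 * eo[1];
        @         f[1] = 89 * o[0] + 75 * o[1] + 50 * o[2] + 18 * o[3];
        @         f[3] = 75 * o[0] - 18 * o[1] - 89 * o[2] - 50 * o[3];
        @         f[5] = 50 * o[0] - 89 * o[1] + 18 * o[2] + 75 * o[3];
        @         f[7] = 18 * o[0] - 50 * o[1] + 75 * o[2] - 89 * o[3];
        @     }
        @
        @ The first (column) pass stores these f[] values to pi4_temp without scaling;
        @ the second (row) pass applies the single combined rounding shift (#11, or #5
        @ where the *64 factor is folded into the shift).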
    404     .global ihevc_resi_trans_8x8_a9q
    405 
    406 ihevc_resi_trans_8x8_a9q:
    407 
    408     PUSH {r4,r5}
    409     vpush {d8 - d15}
    410 
    411     @ Load the 8x8 prediction and source blocks
    412 
    413     LDR r4,[sp,#80]                 @ r4 = dst_strd_chr_flag
    414     AND r4,r4,#1                    @ r4 = chroma flag
    415     CMP r4,#1
    416     BNE CHROMA_LOAD                 @ chroma flag == 0: take the non-interleaved load path
    417 
    418 LUMA_LOAD:                          @ interleaved load path (taken when the chroma flag is set)
    419 
    420     LDR r5,[sp,#72]                 @ r5 = src_strd
    421     LDR r4,[sp,#76]                 @ r4 = pred_strd
    422 
    423     VLD2.8 {d0,d2},[r1],r4          @ Row 1 of prediction in d0
    424     VLD2.8 {d1,d3},[r0],r5          @ Row 1 of source in d1
    425 
    426     VABDL.U8 q15,d1,d0              @ Row 1 of absolute difference in q15
    427     VLD2.8 {d2,d4},[r1],r4          @ Row 2 of prediction in d2
    428     VSUBL.U8 q0,d1,d0               @ Row 1 of residue in q0
    429     VLD2.8 {d3,d5},[r0],r5          @ Row 2 of source in d3
    430 
    431     VABDL.U8 q9,d3,d2               @ Row 2 of absolute difference in q9
    432     VLD2.8 {d4,d6},[r1],r4          @ Row 3 of prediction in d4
    433     VSUBL.U8 q1,d3,d2               @ Row 2 of residue in q1
    434     VLD2.8 {d5,d7},[r0],r5          @ Row 3 of source in d5
    435 
    436     VABAL.U8 q15,d5,d4              @ Row 3 of absolute difference accumulated in q15
    437     VLD2.8 {d6,d8},[r1],r4          @ Row 4 of prediction in d6
    438     VSUBL.U8 q2,d5,d4               @ Row 3 of residue in q2
    439     VLD2.8 {d7,d9},[r0],r5          @ Row 4 of source in d7
    440 
    441     VABAL.U8 q9,d7,d6               @ Row 4 of absolute difference accumulated in q9
    442     VLD2.8 {d8,d10},[r1],r4         @ Row 5 of prediction in d8
    443     VSUBL.U8 q3,d7,d6               @ Row 4 of residue in q3
    444     VLD2.8 {d9,d11},[r0],r5         @ Row 5 of source in d9
    445 
    446     VABDL.U8 q10,d9,d8              @ Row 5 of absolute difference in q10
    447     VLD2.8 {d10,d12},[r1],r4        @ Row 6 of prediction in d10
    448     VSUBL.U8 q4,d9,d8               @ Row 5 of residue in q4
    449     VLD2.8 {d11,d13},[r0],r5        @ Row 6 of source in d11
    450 
    451     VABAL.U8 q15,d11,d10            @ Row 6 of absolute difference accumulated in q15
    452     VLD2.8 {d12,d14},[r1],r4        @ Row 7 of prediction in d12
    453     VSUBL.U8 q5,d11,d10             @ Row 6 of residue in q5
    454     VLD2.8 {d13,d15},[r0],r5        @ Row 7 of source in d13
    455 
    456     VABAL.U8 q9,d13,d12             @ Row 7 of absolute difference accumulated in q9
    457     VLD2.8 {d14,d16},[r1]           @ Row 8 of prediction in d14
    458     VSUBL.U8 q6,d13,d12             @ Row 7 of residue in q6
    459     VLD2.8 {d15,d17},[r0]           @ Row 8 of source in d15
    460 
    461     B CHROMA_LOAD_END
    462 
    463 CHROMA_LOAD:                        @ non-interleaved load path (taken when the chroma flag is clear)
    464 
    465     LDR r5,[sp,#72]                 @ r5 = src_strd
    466     LDR r4,[sp,#76]                 @ r4 = pred_strd
    467 
    468     VLD1.64 d0,[r1],r4              @ Row 1 of prediction in d0
    469     VLD1.64 d1,[r0],r5              @ Row 1 of source in d1
    470 
    471     VABDL.U8 q15,d1,d0              @ Row 1 of absolute difference in q15
    472     VLD1.64 d2,[r1],r4              @ Row 2 of prediction in d2
    473     VSUBL.U8 q0,d1,d0               @ Row 1 of residue in q0
    474     VLD1.64 d3,[r0],r5              @ Row 2 of source in d3
    475 
    476     VABDL.U8 q9,d3,d2               @ Row 2 of absolute difference in q9
    477     VLD1.64 d4,[r1],r4              @ Row 3 of prediction in d4
    478     VSUBL.U8 q1,d3,d2               @ Row 2 of residue in q1
    479     VLD1.64 d5,[r0],r5              @ Row 3 of source in d5
    480 
    481     VABAL.U8 q15,d5,d4              @ Row 3 of absolute difference accumulated in q15
    482     VLD1.64 d6,[r1],r4              @ Row 4 of prediction in d6
    483     VSUBL.U8 q2,d5,d4               @ Row 3 of residue in q2
    484     VLD1.64 d7,[r0],r5              @ Row 4 of source in d7
    485 
    486     VABAL.U8 q9,d7,d6               @ Row 4 of absolute difference accumulated in q9
    487     VLD1.64 d8,[r1],r4              @ Row 5 of prediction in d8
    488     VSUBL.U8 q3,d7,d6               @ Row 4 of residue in q3
    489     VLD1.64 d9,[r0],r5              @ Row 5 of source in d9
    490 
    491     VABDL.U8 q10,d9,d8              @ Row 5 of absolute difference in q10
    492     VLD1.64 d10,[r1],r4             @ Row 6 of prediction in d10
    493     VSUBL.U8 q4,d9,d8               @ Row 5 of residue in q4
    494     VLD1.64 d11,[r0],r5             @ Row 6 of source in d11
    495 
    496     VABAL.U8 q15,d11,d10            @ Row 6 of absolute difference accumulated in q15
    497     VLD1.64 d12,[r1],r4             @ Row 7 of prediction in d12
    498     VSUBL.U8 q5,d11,d10             @ Row 6 of residue in q5
    499     VLD1.64 d13,[r0],r5             @ Row 7 of source in d13
    500 
    501     VABAL.U8 q9,d13,d12             @ Row 7 of absolute difference accumulated in q9
    502     VLD1.64 d14,[r1]                @ Row 8 of prediction in d14
    503     VSUBL.U8 q6,d13,d12             @ Row 7 of residue in q6
    504     VLD1.64 d15,[r0]                @ Row 8 of source in d15
    505 
    506 CHROMA_LOAD_END:
    507 
    508     @ Transform stage 1
    509     @ Transposing residue matrix
    510 
    511     VABAL.U8 q10,d15,d14            @ Row 8 of absolute difference accumulated in q10
    512     VTRN.16 q0,q1                   @ Transpose residue matrix step (1a)
    513     VSUBL.U8 q7,d15,d14             @ Row 8 of residue in q7
    514     VTRN.16 q2,q3                   @ Transpose residue matrix step (1b)
    515 
    516     VTRN.16 q4,q5                   @ Transpose residue matrix step (1c)
    517     VTRN.16 q6,q7                   @ Transpose residue matrix step (1d)
    518     VTRN.32 q0,q2                   @ Transpose residue matrix step (2a)
    519     VTRN.32 q1,q3                   @ Transpose residue matrix step (2b)
    520 
    521     VADD.U16 q8,q15,q9              @ SAD calculation (1)
    522     VTRN.32 q4,q6                   @ Transpose residue matrix step (2c)
    523     VTRN.32 q5,q7                   @ Transpose residue matrix step (2d)
    524 
    525     VADD.U16 q8,q8,q10              @ SAD calculation (2)
    526     VSWP d1,d8                      @ Transpose residue matrix step (3a)
    527     VSWP d3,d10                     @ Transpose residue matrix step (3b)
    528 
    529     VADD.U16 d16,d16,d17            @ SAD calculation (3)
    530     VSWP d7,d14                     @ Transpose residue matrix step (3c)
    531     VSWP d5,d12                     @ Transpose residue matrix step (3d)
    532                                     @ Columns of residue C0-C7 (8x8 matrix) in q0-q7
    533     VPADDL.U16 d16,d16              @ SAD calculation (4)
    534 
    535     @ Evaluating first step in Butterfly diagram
    536 
    537     VADD.S16 q10,q0,q7              @ q10 = C0 + C7
    538     VADD.S16 q11,q1,q6              @ q11 = C1 + C6
    539     VPADDL.U32 d16,d16              @ SAD calculation (5)
    540     VADD.S16 q12,q2,q5              @ q12 = C2 + C5
    541     VADD.S16 q13,q3,q4              @ q13 = C3 + C4
    542 
    543     VSUB.S16 q4,q3,q4               @ q4  = C3 - C4
    544     VSUB.S16 q5,q2,q5               @ q5  = C2 - C5
    545     VSUB.S16 q6,q1,q6               @ q6  = C1 - C6
    546     VSUB.S16 q7,q0,q7               @ q7  = C0 - C7
    547 
    548     @ Calculating F0, F2, F4 and F6
    549 
    550     VADD.S16 q1,q11,q12             @ q1  = C1 + C2 + C5 + C6
    551     VADD.S16 q2,q10,q13             @ q2  = C0 + C3 + C4 + C7
    552 
    553     MOV r4,#50
    554     LSL r4,r4,#16
    555     ADD r4,r4,#18
    556     MOV r5,#89
    557     LSL r5,r5,#16
    558     ADD r5,r5,#75
    559     VMOV d0,r4,r5                   @ 16-bit aligned, d0[3] = 89, d0[2] = 75, d0[1] = 50, d0[0]=18
    560 
    561     MOV r4,#83
    562     LSL r4,r4,#16
    563     ADD r4,r4,#36
    564     VMOV d1,r4,r4                   @ 16-bit aligned, d1[3] = 83, d1[2] = 36, d1[1] = 83, d1[0]=36
    565 
    566     VSUB.S16 q10,q10,q13            @ q10 = C0 - C3 - C4 + C7
    567     VSUB.S16 q11,q11,q12            @ q11 = C1 - C2 - C5 + C6
    568     VMOV.32 r0,d16[0]               @ SAD calculation (6) : Return value = SAD
    569 
    570     VSUB.S16 q3,q2,q1               @ q3 = C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7
    571     VADD.S16 q2,q2,q1               @ q2 = C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7
    572 
    573     VMULL.S16 q14,d20,d1[1]         @ q14 = [0] of 83*(C0 - C3 - C4 + C7)
    574     VMULL.S16 q15,d21,d1[1]         @ q15 = [1] of 83*(C0 - C3 - C4 + C7)
    575     VMULL.S16 q9,d20,d1[0]          @ q9  = [0] of 36*(C0 - C3 - C4 + C7)
    576     VMULL.S16 q10,d21,d1[0]         @ q10 = [1] of 36*(C0 - C3 - C4 + C7)
    577 
    578     VMLAL.S16 q14,d22,d1[0]         @ q14 = F2[0] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6)
    579     VSHLL.S16 q13,d6,#6             @ q13 = F4[0] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7)
    580     VMLAL.S16 q15,d23,d1[0]         @ q15 = F2[1] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6)
    581     VSHLL.S16 q3,d7,#6              @ q3  = F4[1] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7)
    582     VMLSL.S16 q9,d22,d1[1]          @ q9  = F6[0] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6)
    583     VSHLL.S16 q12,d4,#6             @ q12 = F0[0] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7)
    584     VMLSL.S16 q10,d23,d1[1]         @ q10 = F6[1] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6)
    585     VSHLL.S16 q2,d5,#6              @ q2  = F0[1] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7)
    586 
    587     @ Calculating F1, F3, F5 and F7
    588 
    589     MOV r4,#48
    590     VST1.64 {d24,d25},[r2]!         @ Row 1 of transform stage 1 F0[0] stored
    591     VST1.64 {d4,d5},[r2],r4         @ Row 1 of transform stage 1 F0[1] stored
    592     VST1.64 {d28,d29},[r2]!         @ Row 3 of transform stage 1 F2[0] stored
    593     VST1.64 {d30,d31},[r2],r4       @ Row 3 of transform stage 1 F2[1] stored
    594 
    595     VST1.64 {d26,d27},[r2]!         @ Row 5 of transform stage 1 F4[0] stored
    596     VMULL.S16 q1,d14,d0[3]          @ q1  = [0] of 89*(C0 - C7)
    597     VMULL.S16 q8,d15,d0[3]          @ q8  = [1] of 89*(C0 - C7)
    598     VST1.64 {d6,d7},[r2],r4         @ Row 5 of transform stage 1 F4[1] stored
    599     VMULL.S16 q11,d14,d0[2]         @ q11 = [0] of 75*(C0 - C7)
    600     VMULL.S16 q13,d15,d0[2]         @ q13 = [1] of 75*(C0 - C7)
    601     VST1.64 {d18,d19},[r2]!         @ Row 7 of transform stage 1 F6[0] stored
    602     VMULL.S16 q3,d14,d0[1]          @ q3  = [0] of 50*(C0 - C7)
    603     VMULL.S16 q9,d15,d0[1]          @ q9  = [1] of 50*(C0 - C7)
    604     VST1.64 {d20,d21},[r2]          @ Row 7 of transform stage 1 F6[1] stored
    605     VMULL.S16 q10,d14,d0[0]         @ q10 = [0] of 18*(C0 - C7)
    606     VMULL.S16 q7,d15,d0[0]          @ q7  = [1] of 18*(C0 - C7)
    607 
    608     VMLAL.S16 q1,d12,d0[2]          @ q1  = [0] of 89*(C0 - C7) + 75*(C1 - C6)
    609     VMLAL.S16 q8,d13,d0[2]          @ q8  = [1] of 89*(C0 - C7) + 75*(C1 - C6)
    610     VMLSL.S16 q11,d12,d0[0]         @ q11 = [0] of 75*(C0 - C7) - 18*(C1 - C6)
    611     VMLSL.S16 q13,d13,d0[0]         @ q13 = [1] of 75*(C0 - C7) - 18*(C1 - C6)
    612     VMLSL.S16 q3,d12,d0[3]          @ q3  = [0] of 50*(C0 - C7) - 89*(C1 - C6)
    613     VMLSL.S16 q9,d13,d0[3]          @ q9  = [1] of 50*(C0 - C7) - 89*(C1 - C6)
    614     VMLSL.S16 q10,d12,d0[1]         @ q10 = [0] of 18*(C0 - C7) - 50*(C1 - C6)
    615     VMLSL.S16 q7,d13,d0[1]          @ q7  = [1] of 18*(C0 - C7) - 50*(C1 - C6)
    616 
    617     VMLAL.S16 q1,d10,d0[1]          @ q1  = [0] of 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5)
    618     VMLAL.S16 q8,d11,d0[1]          @ q8  = [1] of 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5)
    619     VMLSL.S16 q11,d10,d0[3]         @ q11 = [0] of 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5)
    620     VMLSL.S16 q13,d11,d0[3]         @ q13 = [1] of 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5)
    621     VMLAL.S16 q3,d10,d0[0]          @ q3  = [0] of 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5)
    622     VMLAL.S16 q9,d11,d0[0]          @ q9  = [1] of 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5)
    623     VMLAL.S16 q10,d10,d0[2]         @ q10 = [0] of 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5)
    624     VMLAL.S16 q7,d11,d0[2]          @ q7  = [1] of 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5)
    625 
    626     VMLAL.S16 q1,d8,d0[0]           @ q1  = F1[0] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4)
    627     VMLAL.S16 q8,d9,d0[0]           @ q8  = F1[1] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4)
    628     VMLSL.S16 q11,d8,d0[1]          @ q11 = F3[0] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4)
    629     VMLSL.S16 q13,d9,d0[1]          @ q13 = F3[1] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4)
    630     SUB r2,r2,#176                  @ r2 now points to the second row
    631     VMLAL.S16 q3,d8,d0[2]           @ q3  = F5[0] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4)
    632     VMLAL.S16 q9,d9,d0[2]           @ q9  = F5[1] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4)
    633     VST1.64 {d2,d3},[r2]!           @ Row 2 of transform stage 1 F1[0] stored
    634     VMLSL.S16 q10,d8,d0[3]          @ q10 = F7[0] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4)
    635     VMLSL.S16 q7,d9,d0[3]           @ q7  = F7[1] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4)
    636 
    637     VST1.64 {d16,d17},[r2],r4       @ Row 2 of transform stage 1 F1[1] stored
    638     VST1.64 {d22,d23},[r2]!         @ Row 4 of transform stage 1 F3[0] stored
    639     VST1.64 {d26,d27},[r2],r4       @ Row 4 of transform stage 1 F3[1] stored
    640     VST1.64 {d6,d7},[r2]!           @ Row 6 of transform stage 1 F5[0] stored
    641     VST1.64 {d18,d19},[r2],r4       @ Row 6 of transform stage 1 F5[1] stored
    642     VST1.64 {d20,d21},[r2]!         @ Row 8 of transform stage 1 F7[0] stored
    643     VST1.64 {d14,d15},[r2]          @ Row 8 of transform stage 1 F7[1] stored
    644 
    645     @ Transform stage 2 (for rows 1-4 of transform stage 1)
    646     @ Transposing the 4 rows (F0, F1, F2, F3)
    647     @ F0 = {q2,q12},  F1 = {q8,q1}, F2 = {q15,q14} and F3 = {q13,q11}
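        @ In this second pass each output is logically 64*(sum) with a rounding shift
        @ right by 11. For the G0/G4 (and later H0/H4) terms the *64 multiply (<<6) is
        @ omitted, so the same value is produced with a rounding shift of 5 instead of
        @ 11; that is why the VSHL #6 instructions below are commented out.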
    648 
    649     VTRN.32 q12,q1                  @ Transposing first half of transform stage 1 (1a)
    650     VTRN.32 q14,q11                 @ Transposing first half of transform stage 1 (1b)
    651     VSWP d25,d28                    @ Transposing first half of transform stage 1 (2a)
    652     VSWP d22,d3                     @ Transposing first half of transform stage 1 (2b)
    653 
    654     VTRN.32 q2,q8                   @ Transposing first half of transform stage 1 (3a)
    655     VTRN.32 q15,q13                 @ Transposing first half of transform stage 1 (3b)
    656     VSWP d5,d30                     @ Transposing first half of transform stage 1 (4a)
    657     VSWP d26,d17                    @ Transposing first half of transform stage 1 (4b)
    658                                     @ B0:q12, B1:q1, B2:q14, B3:q11, B4:q2, B5:q8, B6:q15 and B7:q13
    659 
    660     @ Evaluating first step in Butterfly diagram
    661 
    662     VADD.S32 q0,q12,q13             @ q0  = B0 + B7
    663     VADD.S32 q5,q11,q2              @ q5  = B3 + B4
    664     VADD.S32 q3,q1,q15              @ q3  = B1 + B6
    665     VADD.S32 q4,q14,q8              @ q4  = B2 + B5
    666 
    667     VSUB.S32 q7,q14,q8              @ q7  = B2 - B5
    668     VSUB.S32 q8,q1,q15              @ q8  = B1 - B6
    669     VSUB.S32 q6,q11,q2              @ q6  = B3 - B4
    670     VSUB.S32 q9,q12,q13             @ q9  = B0 - B7
    671 
    672     @ Calculating G0, G2, G4 and G6
    673 
    674     MOV r4,#18
    675     MOV r5,#50
    676     VMOV d2,r4,r5                   @ 32-bit aligned, d2[1] = 50, d2[0] = 18
    677     VSUB.S32 q2,q0,q5               @ q2  = B0 - B3 - B4 + B7
    678 
    679     MOV r4,#75
    680     MOV r5,#89
    681     VMOV d3,r4,r5                   @ 32-bit aligned, d3[1] = 89, d3[0] = 75
    682     VADD.S32 q10,q0,q5              @ q10 = B0 + B3 + B4 + B7
    683 
    684     MOV r4,#36
    685     MOV r5,#83
    686     VMOV d0,r4,r5                   @ 32-bit aligned, d0[1] = 83, d0[0] = 36
    687     VSUB.S32 q11,q3,q4              @ q11 = B1 - B2 - B5 + B6
    688     VADD.S32 q3,q3,q4               @ q3  = B1 + B2 + B5 + B6
    689 
    690     VMUL.S32 q12,q2,d0[1]           @ q12 = 83*(B0 - B3 - B4 + B7)
    691     VMUL.S32 q2,q2,d0[0]            @ q2  = 36*(B0 - B3 - B4 + B7)
    692     VMUL.S32 q5,q9,d3[1]            @ q5 = 89*(B0 - B7)
    693     VADD.S32 q14,q10,q3             @ q14 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7
    694     VMUL.S32 q4,q9,d3[0]            @ q4 = 75*(B0 - B7)
    695     VSUB.S32 q15,q10,q3             @ q15 = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7
    696 @    VSHL.S32 q14,q14,#6             ; q14 = G0 = 64*(B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7)
    697 @    VSHL.S32 q15,q15,#6             ; q15 = G4 = 64*(B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7)
    698 
    699     VMLA.S32 q12,q11,d0[0]          @ q12 = G2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6)
    700     VRSHRN.I32 d28,q14,#5           @ Round G0: (64*sum + 1024) >> 11, done as a rounding shift by 5 since the *64 (<<6) is folded in
    701     VMLS.S32 q2,q11,d0[1]           @ q2  = G6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6)
    702     VRSHRN.I32 d30,q15,#5           @ Round G4: (64*sum + 1024) >> 11, done as a rounding shift by 5 since the *64 (<<6) is folded in
    703 
    704     LDR r4,[sp,#80]                 @ r4 = dst_strd_chr_flag
    705     ASR r4,r4,#16                   @ r4 = dst_strd
    706     LSL r4,r4,#2                    @ r4 = 4*dst_strd = byte offset of two rows of 16-bit coefficients
    707 
    708     VMUL.S32 q3,q9,d2[1]            @ q3 = 50*(B0 - B7)
    709     VRSHRN.I32 d24,q12,#11          @ Truncating last 11 bits in G2
    710     VMUL.S32 q9,q9,d2[0]            @ q9 = 18*(B0 - B7)
    711     VRSHRN.I32 d4,q2,#11            @ Truncating last 11 bits in G6
    712 
    713     VMLA.S32 q5,q8,d3[0]            @ q5 = 89*(B0 - B7) + 75*(B1 - B6)
    714     VST1.64 d28,[r3],r4             @ First half-row of row 1 of transform stage 2 (G0) stored
    715     VMLS.S32 q4,q8,d2[0]            @ q4 = 75*(B0 - B7) - 18*(B1 - B6)
    716 
    717     VMLS.S32 q3,q8,d3[1]            @ q3 = 50*(B0 - B7) - 89*(B1 - B6)
    718     VST1.64 d24,[r3],r4             @ First half-row of row 3 of transform stage 2 (G2) stored
    719     VMLS.S32 q9,q8,d2[1]            @ q9 = 18*(B0 - B7) - 50*(B1 - B6)
    720 
    721     VMLA.S32 q5,q7,d2[1]            @ q5 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5)
    722     VST1.64 d30,[r3],r4             @ First half-row of row 5 of transform stage 2 (G4) stored
    723     VMLS.S32 q4,q7,d3[1]            @ q4 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5)
    724 
    725     VMLA.S32 q3,q7,d2[0]            @ q3 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5)
    726     VST1.64 d4,[r3]                 @ First half-row of row 7 of transform stage 2 (G6) stored
    727     VMLA.S32 q9,q7,d3[0]            @ q9 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5)
    728 
    729     VMLA.S32 q5,q6,d2[0]            @ q5 = G1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4)
    730     VMLS.S32 q4,q6,d2[1]            @ q4 = G3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4)
    731     VMLA.S32 q3,q6,d3[0]            @ q3 = G5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4)
    732     VMLS.S32 q9,q6,d3[1]            @ q9 = G7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4)
    733 
    734     SUB r3,r3,r4,LSL #1
    735     SUB r3,r3,r4,ASR #1             @ r3 = r3 - 5*dst_strd*2
    736                                     @ r3 is moved from row 7 to row 2
    737     VRSHRN.I32 d10,q5,#11           @ Truncating last 11 bits in G1
    738     VRSHRN.I32 d8,q4,#11            @ Truncating last 11 bits in G3
    739     VRSHRN.I32 d6,q3,#11            @ Truncating last 11 bits in G5
    740     VST1.64 d10,[r3],r4             @ First half-row of row 2 of transform stage 2 (G1) stored
    741     VRSHRN.I32 d18,q9,#11           @ Truncating last 11 bits in G7
    742 
    743     VST1.64 d8,[r3],r4              @ First half-row of row 4 of transform stage 2 (G3) stored
    744     VST1.64 d6,[r3],r4              @ First half-row of row 6 of transform stage 2 (G5) stored
    745     VST1.64 d18,[r3]!               @ First half-row of row 8 of transform stage 2 (G7) stored
    746 
    747     @ Transform stage 2 (for rows 5-8 of transform stage 1)
    748     @ Loading the 4 rows (F4, F5, F6, F7)
    749 
    750     SUB r2,r2,#112                  @ r2 jumps from row 8 to row 5 in temporary memory
    751     VLD1.64 {d20,d21},[r2]!         @ q10 = F4[0]
    752     VLD1.64 {d22,d23},[r2]!         @ q11 = F4[1]
    753     VLD1.64 {d8,d9},[r2]!           @ q4  = F5[0]
    754     @ Transposing the 4 rows
    755     @ F4 = {q11,q10}, F5 = {q5,q4}, F6 = {q3,q2} and F7 = {q13,q12}
    756 
    757     VTRN.32 q10,q4                  @ Transposing second half of transform stage 1 (1a)
    758     VLD1.64 {d10,d11},[r2]!         @ q5  = F5[1]
    759     VLD1.64 {d4,d5},[r2]!           @ q2  = F6[0]
    760     VLD1.64 {d6,d7},[r2]!           @ q3  = F6[1]
    761     VLD1.64 {d24,d25},[r2]!         @ q12 = F7[0]
    762     VTRN.32 q2,q12                  @ Transposing second half of transform stage 1 (1b)
    763     VLD1.64 {d26,d27},[r2]          @ q13 = F7[1]
    764 
    765     VSWP d21,d4                     @ Transposing second half of transform stage 1 (2a)
    766     VSWP d24,d9                     @ Transposing second half of transform stage 1 (2b)
    767 
    768     VTRN.32 q11,q5                  @ Transposing second half of transform stage 1 (3a)
    769     VTRN.32 q3,q13                  @ Transposing second half of transform stage 1 (3b)
    770     VSWP d26,d11                    @ Transposing second half of transform stage 1 (4b)
    771     VSWP d23,d6                     @ Transposing second half of transform stage 1 (4a)
    772                                     @ B0:q10, B1:q4, B2:q2, B3:q12, B4:q11, B5:q5, B6:q3 and B7:q13
    773 
    774     @ Evaluating first step in Butterfly diagram
    775 
    776     VADD.S32 q0,q10,q13             @ q0  = B0 + B7
    777     VADD.S32 q15,q12,q11            @ q15 = B3 + B4
    778     VADD.S32 q1,q4,q3               @ q1  = B1 + B6
    779     VADD.S32 q14,q2,q5              @ q14 = B2 + B5
    780 
    781     VSUB.S32 q9,q10,q13             @ q9  = B0 - B7
    782     VSUB.S32 q6,q12,q11             @ q6  = B3 - B4
    783     VSUB.S32 q7,q2,q5               @ q7  = B2 - B5
    784     VSUB.S32 q8,q4,q3               @ q8  = B1 - B6
    785 
    786     @ Calculating H0, H2, H4 and H6
    787 
    788     VADD.S32 q3,q1,q14              @ q3 = B1 + B2 + B5 + B6
    789     VSUB.S32 q5,q1,q14              @ q5 = B1 - B2 - B5 + B6
    790 
    791     MOV r4,#18
    792     MOV r5,#50
    793     VSUB.S32 q4,q0,q15              @ q4 = B0 - B3 - B4 + B7
    794     VMOV d2,r4,r5                   @ 32-bit aligned, d2[1] = 50, d2[0] = 18
    795 
    796     MOV r4,#75
    797     MOV r5,#89
    798     VADD.S32 q2,q0,q15              @ q2 = B0 + B3 + B4 + B7
    799     VMOV d3,r4,r5                   @ 32-bit aligned, d3[1] = 89, d3[0] = 75
    800 
    801     MOV r4,#36
    802     MOV r5,#83
    803 
    804     @ Calculating H1, H3, H5 and H7
    805 
    806     VMUL.S32 q10,q9,d3[1]           @ q10 = 89*(B0 - B7)
    807     VMOV d0,r4,r5                   @ 32-bit aligned, d0[1] = 83, d0[0] = 36
    808 
    809     VMUL.S32 q13,q9,d3[0]           @ q13 = 75*(B0 - B7)
    810 
    811     VMUL.S32 q12,q4,d0[1]           @ q12 = 83*(B0 - B3 - B4 + B7)
    812     VADD.S32 q14,q2,q3              @ q14 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7
    813     VMUL.S32 q4,q4,d0[0]            @ q4  = 36*(B0 - B3 - B4 + B7)
    814     VSUB.S32 q2,q2,q3               @ q2  = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7
    815 
    816 
    817     VMLA.S32 q12,q5,d0[0]           @ q12 = H2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6)
    818 @    VSHL.S32 q14,q14,#6             ; q14 = H0 = 64*(B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7)
    819     VMLS.S32 q4,q5,d0[1]            @ q4 = H6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6)
    820 @    VSHL.S32 q2,q15,#6              ; q2 = H4 = 64*(B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7)
    821 
    822     VMUL.S32 q11,q9,d2[1]           @ q11 = 50*(B0 - B7)
    823     VRSHRN.I32 d28,q14,#5           @ Round H0: (64*sum + 1024) >> 11, done as a rounding shift by 5 since the *64 (<<6) is folded in
    824     VMUL.S32 q9,q9,d2[0]            @ q9  = 18*(B0 - B7)
    825     VRSHRN.I32 d24,q12,#11          @ Truncating last 11 bits in H2
    826 
    827     VMLA.S32 q10,q8,d3[0]           @ q10 = 89*(B0 - B7) + 75*(B1 - B6)
    828     VRSHRN.I32 d4,q2,#5             @ Round H4: (64*sum + 1024) >> 11, done as a rounding shift by 5 since the *64 (<<6) is folded in
    829     VMLS.S32 q13,q8,d2[0]           @ q13 = 75*(B0 - B7) - 18*(B1 - B6)
    830     VRSHRN.I32 d8,q4,#11            @ Truncating last 11 bits in H6
    831 
    832     LDR r4,[sp,#80]                 @ r4 = dst_strd_chr_flag
    833     ASR r4,r4,#16                   @ r4 = dst_strd
    834     LSL r4,r4,#2                    @ r4 = 4*dst_strd = byte offset of two rows of 16-bit coefficients
    835 
    836     SUB r3,r3,r4,LSL #2
    837     ADD r3,r3,r4,ASR #1             @ r3 = r3 - 7*dst_strd*2
    838                                     @ r3 is moved from row 8 to row 1
    839     VMLS.S32 q11,q8,d3[1]           @ q11 = 50*(B0 - B7) - 89*(B1 - B6)
    840     VST1.64 d28,[r3],r4             @ Second half-row of row 1 of transform stage 2 (H0) stored
    841     VMLS.S32 q9,q8,d2[1]            @ q9  = 18*(B0 - B7) - 50*(B1 - B6)
    842 
    843     VMLA.S32 q10,q7,d2[1]           @ q10 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5)
    844     VST1.64 d24,[r3],r4             @ Second half-row of row 3 of transform stage 2 (H2) stored
    845     VMLS.S32 q13,q7,d3[1]           @ q13 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5)
    846 
    847     VMLA.S32 q11,q7,d2[0]           @ q11 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5)
    848     VST1.64 d4,[r3],r4              @ Second half-row of row 5 of transform stage 2 (H4) stored
    849     VMLA.S32 q9,q7,d3[0]            @ q9  = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5)
    850 
    851     VMLA.S32 q10,q6,d2[0]           @ q10 = H1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4)
    852     VST1.64 d8,[r3]                 @ Second half-row of row 7 of transform stage 2 (H6) stored
    853     VMLS.S32 q13,q6,d2[1]           @ q13 = H3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4)
    854 
    855     VMLA.S32 q11,q6,d3[0]           @ q11 = H5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4)
    856     VMLS.S32 q9,q6,d3[1]            @ q9  = H7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4)
    857 
    858     SUB r3,r3,r4,LSL #1
    859     SUB r3,r3,r4,ASR #1             @ r3 = r3 - 5*dst_strd*2 (bytes)
    860                                     @ r3 is moved from row 7 to row 2
    861     VRSHRN.I32 d20,q10,#11          @ Truncating last 11 bits in H1
    862     VRSHRN.I32 d26,q13,#11          @ Truncating last 11 bits in H3
    863     VRSHRN.I32 d22,q11,#11          @ Truncating last 11 bits in H5
    864     VST1.64 d20,[r3],r4             @ Second half-row of row 2 of transform stage 2 (H1) stored
    865     VRSHRN.I32 d18,q9,#11           @ Truncating last 11 bits in H7
    866 
    867     VST1.64 d26,[r3],r4             @ Second half-row of row 4 of transform stage 2 (H3) stored
    868     VST1.64 d22,[r3],r4             @ Second half-row of row 6 of transform stage 2 (H5) stored
    869     VST1.64 d18,[r3]                @ Second half-row of row 8 of transform stage 2 (H7) stored
    870 
    871     vpop {d8 - d15}
    872     POP {r4,r5}
    873     MOV pc,lr
    874 
    875 @/**
    876 @*******************************************************************************
    877 @*
    878 @* @brief
    879 @*  This function performs residue calculation and forward transform on
    880 @*  input pixels
    881 @*
    882 @* @par Description:
    883 @*  Performs residue calculation by subtracting source and prediction and
    884 @*  followed by forward transform
    885 @*
    886 @* @param[in] pu1_src
    887 @*  Input 16x16 pixels
    888 @*
    889 @* @param[in] pu1_pred
    890 @*  Prediction data
    891 @*
    892 @* @param[in] pi2_tmp
    893 @*  Temporary buffer of size 16x16
    894 @*
    895 @* @param[out] pi2_dst
    896 @*  Output 16x16 coefficients
    897 @*
    898 @* @param[in] src_strd
    899 @*  Input stride
    900 @*
    901 @* @param[in] pred_strd
    902 @*  Prediction Stride
    903 @*
    904 @* @param[in] dst_strd_chr_flag
    905 @*  Output Stride and Chroma Flag packed in the MS and LS 16-bit
    906 @*
    907 @* @returns  Void
    908 @*
    909 @* @remarks
    910 @*  None
    911 @*
    912 @*******************************************************************************
    913 @*/
    914 
    915 .extern g_ai2_ihevc_trans_16
    916 .extern g_ai4_ihevc_trans_16
    917 
    918 g_ai2_ihevc_trans_16_addr_1:
    919 .long g_ai2_ihevc_trans_16 - ulbl1 - 8
    920 
    921 g_ai2_ihevc_trans_16_addr_2:
    922 .long g_ai2_ihevc_trans_16 - ulbl2 - 8
    923 
    924 g_ai4_ihevc_trans_16_addr:
    925 .long g_ai4_ihevc_trans_16 - ulbl3 - 8
    926 
    927     .global ihevc_resi_trans_16x16_a9q
    928 
    929 ihevc_resi_trans_16x16_a9q:
    930 
    931 .equ TMP_STRIDE        ,  64            @16*4, stride of the temporary buffer in bytes
    932 .equ SHIFT             ,  13            @shift = 13; // log2(iWidth) - 1 + g_uiBitIncrement
    933 .equ RADD              ,  4096          @1 << (shift - 1);
    934 
    935 .equ COFF_STD_2B       ,  32            @Stride for g_ai2_ihevc_trans_16 in bytes
    936 .equ COFF_STD_W        ,  32            @Stride for g_ai4_ihevc_trans_16 in bytes
    937 
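        @ RADD is the rounding offset matching SHIFT; the final coefficients are
        @ presumably formed as (accumulated sum + RADD) >> SHIFT, i.e. in C terms
        @ (illustrative only):
        @
        @     dst[k] = (int16_t)((sum[k] + 4096) >> 13);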
    938 @ Function prologue: save registers and load the arguments
    939     STMFD          SP!,{r4-r12,LR}      @ save r4-r12 and LR on the stack
    940     vpush          {d8 - d15}
    941     SUB            SP,SP,#32
    942 
    943     LDR             R4,[SP,#136]            @get src_strd
    944     LDR             R5,[SP,#140]         @get pred_strd
    945     LDR             R6,[SP,#144]         @get dst_strd_chr_flag
    946 
    947     MOV R8,#0                           @Set loop counter
    948     LDR R9,g_ai2_ihevc_trans_16_addr_1    @get 16 bit transform matrix
    949 ulbl1:
    950     ADD R9, R9, PC
    951     @Read [0 0] [4 0] [8 0] [12 0],[0 1] [4 1] [8 1] [12 1] values of g_ai2_ihevc_trans_16
    952     @and write to stack
    953     MOV R12,#COFF_STD_2B
    954     LSL R12,#2
    955 
    956     VLD1.S32 D30[0],[R9],R12
    957     VLD1.S32 D30[1],[R9],R12
    958     VLD1.S32 D31[0],[R9],R12
    959     VLD1.S32 D31[1],[R9],R12
    960 
    961     VTRN.S32 D30,D31
    962     VTRN.S16 D30,D31
    963     VST1.S16 {d30,d31},[SP]
    964 
    965     LDR R9,g_ai2_ihevc_trans_16_addr_2      @get back 16 bit transform matrix
    966 ulbl2:
    967     ADD R9, R9, PC
    968 
    969     MOV R7,#TMP_STRIDE
    970     AND R14,R6,#0x1
    971 
    972     VMOV.S32 Q14,#0
    973 
    974 @R0         pu1_src
    975 @R1         pu1_pred
    976 @R2         pi4_tmp
    977 @R3         pi2_dst
    978 @R4         src_strd
    979 @R5         pred_strd
    980 @R6         dst_strd_chr_flag
    981 @R7         tmp_dst Nx4 block stride
    982 @R8         loop cntr
    983 @R9         g_ai2_ihevc_trans_16
    984 @R10        tmp_dst Nx4 block offset
    985 @R11        tmp register
    986 @R12        coefficient-stride scratch
    987 @R14        chroma flag (dst_strd_chr_flag & 1)
    988 @q14        shift 32 bit
    989 @q15        add 32 bit
    990 
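        @ A hedged C sketch (names illustrative) of the per-row even/odd split that the
        @ horizontal pass below performs before multiplying by g_ai2_ihevc_trans_16:
        @
        @     /* resi[16] holds one row of residues */
        @     int e[8], o[8], ee[4], eo[4], eee[2], eeo[2];
        @     for (int k = 0; k < 8; k++) {
        @         e[k] = resi[k] + resi[15 - k];
        @         o[k] = resi[k] - resi[15 - k];
        @     }
        @     for (int k = 0; k < 4; k++) {
        @         ee[k] = e[k] + e[7 - k];
        @         eo[k] = e[k] - e[7 - k];
        @     }
        @     eee[0] = ee[0] + ee[3];  eeo[0] = ee[0] - ee[3];
        @     eee[1] = ee[1] + ee[2];  eeo[1] = ee[1] - ee[2];
        @
        @ Outputs 0 and 8 are then formed from eee[], 4 and 12 from eeo[], 2/6/10/14
        @ from eo[0..3], and the odd outputs from o[0..7], each using the matching row
        @ of g_ai2_ihevc_trans_16.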
    991 CORE_LOOP_16X16_HORIZ:
    992 
    993     CMP R14,#1
    994     BEQ INTERLEAVED_LOAD_S1
    995 
    996     VLD1.U8 {d0,d1},[R0],R4             @LOAD 1-16 src row 1
    997     VLD1.U8 {d2,d3},[R1],R5             @LOAD 1-16 pred row 1
    998     VLD1.U8 {d4,d5},[R0],R4             @LOAD 1-16 src row 2
    999     VLD1.U8 {d6,d7},[R1],R5             @LOAD 1-16 pred row 2
   1000     B    LOAD_DONE
   1001 
   1002 INTERLEAVED_LOAD_S1:
   1003 
   1004     VLD2.U8 {Q0,Q1},[R0],R4             @LOAD 1-16 src row 1
   1005     VLD2.U8 {Q1,Q2},[R1],R5             @LOAD 1-16 pred row 1
   1006     VLD2.U8 {Q2,Q3},[R0],R4             @LOAD 1-16 src row 2
   1007     VLD2.U8 {Q3,Q4},[R1],R5             @LOAD 1-16 pred row 2
   1008 LOAD_DONE:
   1009 
   1010     VSUBL.U8 Q4,D0,D2                   @Get residue 1-8 row 1
   1011     VSUBL.U8 Q5,D1,D3                   @Get residue 9-16 row 1
   1012     VSUBL.U8 Q6,D4,D6                   @Get residue 1-8 row 2
   1013     VSUBL.U8 Q7,D5,D7                   @Get residue 9-16 row 2
   1014 
   1015     @ Accumulate the block SAD
   1016     VABDL.U8 Q15,D0,D2
   1017     VABAL.U8 Q15,D1,D3
   1018     VABAL.U8 Q15,D4,D6
   1019     VABAL.U8 Q15,D5,D7
   1020     VADDW.S16 Q14,Q14,D30
   1021     VADDW.S16 Q14,Q14,D31
   1022 
   1023     VREV64.S16 Q5,Q5                    @Rev row 1
   1024     VREV64.S16 Q7,Q7                    @Rev row 2
   1025     VSWP D10,D11
   1026     VSWP D14,D15
   1027 
   1028     VADD.S16 Q8 ,Q4,Q5                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 1-8 row 1
   1029     VSUB.S16 Q9 ,Q4,Q5                  @o[k] = resi_tmp_1 - resi_tmp_2     k ->9-16 row 1
   1030     VADD.S16 Q10,Q6,Q7                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 1-8 row 2
   1031     VSUB.S16 Q11,Q6,Q7                  @o[k] = resi_tmp_1 - resi_tmp_2     k ->9-16 row 2
   1032 
   1033     VREV64.S16    D24,D17               @rev e[k] k-> 4-7 row 1
   1034     VREV64.S16    D25,D21               @rev e[k] k-> 4-7 row 2
   1035     VMOV.S16    D17,D20
   1036 
   1037     @arrangement OF DATA
   1038     @Q8     A1 A2 A3 A4 B1 B2 B3 B4
   1039     @Q12    A8 A7 A6 A5 B8 B7 B6 B5
   1040 
   1041     VADD.S16 Q13,Q8,Q12                 @ee[k] = e[k] + e[7 - k] row 1 & 2
   1042     VSUB.S16 Q0,Q8,Q12                  @eo[k] = e[k] - e[7 - k] row 1 & 2
   1043 
   1044     @D26 R1ee[0] R1ee[1] R1ee[2] R1ee[3]
   1045     @D27 R2ee[0] R2ee[1] R2ee[2] R2ee[3]
   1046     VTRN.S32 D26,D27                    @1-cycle stall before it?
   1047     @D26 R1ee[0] R1ee[1] R2ee[0] R2ee[1]
   1048     @D27 R1ee[2] R1ee[3] R2ee[2] R2ee[3]
   1049     VREV32.16 D2,D27                    @1-cycle stall before it?
   1050     @D26 R1ee[0] R1ee[1] R2ee[0] R2ee[1]
   1051     @D2 R1ee[3] R1ee[2] R2ee[3] R2ee[2]
   1052     VMOV.S16 D27,D26
   1053     VNEG.S16 D3,D2
   1054     @Q13 R1ee[0] R1ee[1] R2ee[0] R2ee[1]  R1ee[0]  R1ee[1]  R2ee[0]  R2ee[1]
   1055     @Q1  R1ee[3] R1ee[2] R2ee[3] R2ee[2] -R1ee[3] -R1ee[2] -R2ee[3] -R2ee[2]
   1056 
   1057     @D8 : [0 0] [4 0] [8 0] [12 0]
   1058     @D9 : [0 1] [4 1] [8 1] [12 1]
   1059     VLD1.S16 {d8,d9},[SP]               @[0 0] [4 0] [8 0] [12 0] [0 1] [4 1] [8 1] [12 1]
   1060     VADD.S16 Q1,Q13,Q1                  @ 1-cycle stall before it?
   1061     @Q1  R1eee[0] R1eee[1] R2eee[0] R2eee[1] R1eeo[0] R1eeo[1] R2eeo[0] R2eeo[1]
   1062 
   1063     @Q1  R1eee[0] R1eee[1] R2eee[0] R2eee[1]
   1064     @    R1eeo[0] R1eeo[1] R2eeo[0] R2eeo[1]
   1065     VTRN.S16 D2,D3                      @2-cycle stall before it?
   1066     @Q1  R1eee[0] R1eeo[0] R2eee[0] R2eeo[0]
   1067     @     R1eee[1] R1eeo[1] R2eee[1] R2eeo[1]
   1068 
   1069     VDUP.S32 D4,D2[0]    @R1eee[0] R1eeo[0] R1eee[0] R1eeo[0]    ;1-cycle stall?
   1070     VDUP.S32 D5,D2[1]    @R2eee[0] R2eeo[0] R2eee[0] R2eeo[0]
   1071     VDUP.S32 D6,D3[0]    @R1eee[1] R1eeo[1] R1eee[1] R1eeo[1]
   1072     VDUP.S32 D7,D3[1]    @R2eee[1] R2eeo[1] R2eee[1] R2eeo[1]
   1073 
   1074     @---------------Process EO--------------------
   1075     @ Early start to avoid stalls
   1076     MOV R12,#COFF_STD_2B                @Get stride of coeffs
   1077 
   1078     VMULL.S16 Q5,D4,D8                  @   g_ai2_ihevc_trans_16 * R1eee[0] R1eeo[0] R1eee[0] R1eeo[0]
   1079     VMLAL.S16 Q5,D6,D9                  @ + g_ai2_ihevc_trans_16 * R1eee[1] R1eeo[1] R1eee[1] R1eeo[1]
   1080     VMULL.S16 Q6,D5,D8                  @   g_ai2_ihevc_trans_16 * R2eee[0] R2eeo[0] R2eee[0] R2eeo[0]
   1081     VMLAL.S16 Q6,D7,D9                  @ + g_ai2_ihevc_trans_16 * R2eee[1] R2eeo[1] R2eee[1] R2eeo[1]
   1082 
   1083     ADD R11,R9,R12,LSL #1               @Load address of g_ai2_ihevc_trans_16[2]
   1084     LSL R12,R12,#2
   1085 
   1086     VLD1.S16 D26,[R11],R12              @LOAD g_ai2_ihevc_trans_16[2][0-4]]
   1087 
   1088     VLD1.S16 D27,[R11],R12              @LOAD g_ai2_ihevc_trans_16[6][0-4]
   1089     VMULL.S16 Q1,D26,D0                 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4]    R1
   1090 
   1091     VMULL.S16 Q2,D26,D1                 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4]    R2
   1092 
   1093     VZIP.S32 Q5,Q6                      @3-cycle instruction
   1094     VMULL.S16 Q3,D27,D0                 @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4]    R1
   1095 
   1096 
   1097     VLD1.S16 D26,[R11],R12              @LOAD g_ai2_ihevc_trans_16[10][0-4]
   1098     VMULL.S16 Q4,D27,D1                 @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4]    R2
   1099 
    1100     @These values must go to columns 0, 4, 8 and 12, hence stride * 4
   1101     LSL R10,R7,#2
   1102 
   1103     VLD1.S16 D27,[R11],R12              @LOAD g_ai2_ihevc_trans_16[14][0-4]
   1104 
   1105     VST1.32 D10,[R2],R10
   1106     VMULL.S16 Q8,D27,D1                 @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4] R2
   1107 
   1108     VST1.32 D11,[R2],R10
   1109     VMULL.S16 Q7,D27,D0                 @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4] R1
   1110 
   1111     VST1.32 D12,[R2],R10
   1112     VMULL.S16 Q5,D26,D0                 @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4] R1
   1113 
   1114     VST1.32 D13,[R2],R10
   1115     VMULL.S16 Q6,D26,D1                 @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4] R2
   1116 
   1117     SUB R2,R2,R10,LSL #2
   1118 
   1119     @transpose the 4x4 matrix row1
   1120     VTRN.32 Q1, Q3                      @R1 transpose1 -- 2 cycles
   1121 
   1122     @transpose the 4x4 matrix row2
   1123     VTRN.32 Q2,Q4                       @R2 transpose1 -- 2 cycles
   1124 
   1125     VTRN.32 Q5, Q7                      @R1 transpose1 -- 2 cycles
   1126 
   1127     VTRN.32 Q6,Q8                       @R2 transpose1 -- 2 cycles
   1128 
   1129     VSWP    D10,D3                      @R1 transpose2
   1130     VSWP    D14,D7                      @R1 transpose2
   1131 
   1132     VSWP    D12,D5                      @R2 transpose2
   1133     VSWP    D16,D9                      @R2 transpose2
   1134 
   1135     VADD.S32 Q5,Q5,Q1                   @R1 add
   1136     VADD.S32 Q3,Q3,Q7                   @R1 add
   1137 
   1138     VADD.S32 Q2,Q2,Q4                   @R2 add
   1139     VADD.S32 Q6,Q6,Q8                   @R2 add
   1140 
   1141     VADD.S32 Q5,Q5,Q3                   @R1 add
   1142 
   1143     VADD.S32 Q4,Q6,Q2                   @R2 add
   1144 
   1145     @-----------------------Processing O ----------------------------
   1146     @ Early start to avoid stalls
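             @ Each odd output column c = 1, 3, ..., 15 is an 8-tap dot product of
             @ o[0-7] with g_ai2_ihevc_trans_16[c][0-7]. The VMULL/VMLAL pairs below
             @ build the partial products for both rows; they are reduced with the same
             @ transpose-then-add pattern as the EO path, interleaved with VZIP and
             @ stored to columns 1, 3, 5, 7 and then 9, 11, 13, 15.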
   1147     MOV R12,#COFF_STD_2B                @Get coeffs stride
   1148     LSL R12,R12,#1
   1149     ADD R11,R9,#COFF_STD_2B             @Get address of g_ai2_ihevc_trans_16[1]
   1150 
   1151     VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[1][0-7] -- 2 cycles
   1152 
   1153     VZIP.S32 Q5,Q4                      @ 3 cycle instruction
   1154     VMULL.S16 Q6,D18,D4                 @o[0][0-3]*  R1
   1155 
   1156 
   1157     VMLAL.S16 Q6,D19,D5                 @o[0][4-7]*  R1     ; follows MULL instruction: Multiplier accumulator forwarding
   1158     @write to memory
   1159     @this should go to 2 6 10 14
   1160     LSL R10,R7,#2
   1161     ADD R2,R2,R7,LSL #1                 @move to third row
   1162     VST1.32 D10,[R2],R10
   1163     VMULL.S16 Q7,D22,D4                 @o[0][0-3]*  R2
   1164 
   1165     VST1.32 D11,[R2],R10
   1166     VMLAL.S16 Q7,D23,D5                 @o[0][4-7]*  R2
   1167 
   1168     VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[3][0-7]
   1169 
   1170     VST1.32 D8,[R2],R10
   1171     VMULL.S16 Q8,D18,D4                 @o[1][0-3]*  R1
   1172 
   1173     VST1.32 D9,[R2],R10
   1174     VMLAL.S16 Q8,D19,D5                 @o[1][4-7]*  R1
   1175     SUB R2,R2,R10,LSL #2
   1176     SUB R2,R2,R7,LSL #1
   1177 
    1178     @--------------------Done processing EO -------------------------
   1179 
   1180     @ -----------------Processing O continues------------------------
   1181 
   1182     VMULL.S16 Q10,D22,D4                @o[1][0-3]*  R2
   1183     VMLAL.S16 Q10,D23,D5                @o[1][4-7]*  R2
   1184 
   1185     VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[5][0-7]
   1186 
   1187     VLD1.S16 {d6,d7},[R11],R12          @g_ai2_ihevc_trans_16[7][0-7]
   1188     VMULL.S16 Q12,D18,D4                @o[2][0-3]*  R1
   1189 
   1190     VMLAL.S16 Q12,D19,D5                @o[2][4-7]*  R1
   1191     VMULL.S16 Q0,D18,D6                 @o[3][0-3]*  R1
   1192     VMLAL.S16 Q0,D19,D7                 @o[3][4-7]*  R1
   1193 
   1194     VMULL.S16 Q13,D22,D4                @o[2][0-3]*  R2
   1195     VMLAL.S16 Q13,D23,D5                @o[2][4-7]*  R2
   1196     VMULL.S16 Q1,D22,D6                 @o[3][0-3]*  R2
   1197     VMLAL.S16 Q1,D23,D7                 @o[3][4-7]*  R2
   1198 
   1199     @transpose the 4x4 matrix R1
   1200     VTRN.32 Q6, Q8                      @ 2-cycle instruction
   1201 
   1202     VTRN.32 Q12,Q0                      @ 2-cycle instruction
   1203 
   1204     @transpose the 4x4 matrix R2
   1205     VTRN.32 Q7,Q10                      @ 2-cycle instruction
   1206 
   1207     VTRN.32 Q13,Q1                      @ 2-cycle instruction
   1208 
   1209     VSWP    D24,D13
   1210     VSWP    D0, D17
   1211 
   1212     VSWP     D26,D15
   1213     VSWP    D2,D21
   1214 
   1215     VADD.S32 Q8 ,Q8 ,Q6
   1216     VADD.S32 Q12,Q12,Q0
   1217 
   1218     VADD.S32 Q10,Q10,Q7
   1219     VADD.S32 Q13,Q13,Q1
   1220 
   1221     VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[9][0-7]
   1222     VADD.S32 Q12 ,Q12 ,Q8
   1223 
   1224     VADD.S32 Q13,Q13,Q10
   1225     VMULL.S16 Q3,D18,D4                 @o[4][0-3]*  R1
   1226     VMLAL.S16 Q3,D19,D5                 @o[4][4-7]*  R1
   1227 
   1228     VZIP.S32 Q12,Q13
    1229     VMULL.S16 Q4,D22,D4                 @o[4][0-3]*  R2
   1230 
   1231 
    1232     VMLAL.S16 Q4,D23,D5                 @o[4][4-7]*  R2
   1233     @write to memory
   1234     @this should go to 1 3 5 7
   1235     ADD R2,R2,R7
   1236     LSL R7,R7,#1
   1237     VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[11][0-7]
   1238 
   1239     VST1.32 D24,[R2],R7
   1240     VMULL.S16 Q5,D18,D4                 @o[5][0-3]*  R1
   1241 
   1242     VST1.32 D25,[R2],R7
   1243     VMLAL.S16 Q5,D19,D5                 @o[5][4-7]*  R1
   1244 
   1245     VST1.32 D26,[R2],R7
    1246     VMULL.S16 Q6,D22,D4                 @o[5][0-3]*  R2
   1247 
   1248     VST1.32 D27,[R2],R7
    1249     VMLAL.S16 Q6,D23,D5                 @o[5][4-7]*  R2
   1250 
   1251     VLD1.S16 {d4,d5},[R11],R12          @g_ai2_ihevc_trans_16[13][0-7]
   1252 
   1253     VLD1.S16 {d2,d3},[R11],R12          @g_ai2_ihevc_trans_16[15][0-7]
   1254     VMULL.S16 Q7,D18,D4                 @o[6][0-3]*  R1
   1255 
   1256     VMLAL.S16 Q7,D19,D5                 @o[6][4-7]*  R1
   1257     VMULL.S16 Q10,D18,D2                @o[7][0-3]*  R1
   1258     VMLAL.S16 Q10,D19,D3                @o[7][4-7]*  R1
   1259 
    1260     VMULL.S16 Q8,D22,D4                 @o[6][0-3]*  R2
    1261     VMLAL.S16 Q8,D23,D5                 @o[6][4-7]*  R2
    1262     VMULL.S16 Q12,D22,D2                @o[7][0-3]*  R2
    1263     VMLAL.S16 Q12,D23,D3                @o[7][4-7]*  R2
   1264 
   1265 
   1266     @transpose the 4x4 matrix R1
   1267     VTRN.32 Q3 ,Q5                      @ 2-cycle instruction
   1268 
   1269     VTRN.32 Q7 ,Q10                     @ transpose step 2 R1 , 2-cycle instruction
   1270 
   1271     @transpose the 4x4 matrix R2
   1272     VTRN.32 Q4 ,Q6                      @ 2-cycle instruction
   1273 
   1274     VTRN.32 Q8 ,Q12                     @ transpose step 2 R2 , 2-cycle instruction
   1275 
   1276     VSWP    D14,D7                      @ transpose step 3, R1
   1277     VSWP    D20,D11                     @ transpose step 4, R1
   1278     VSWP    D16,D9                      @ transpose step 3, R2
   1279     VSWP    D24,D13                     @ transpose step 4, R2
   1280 
   1281     VADD.S32 Q5 ,Q5 ,Q3
   1282     VADD.S32 Q10,Q10,Q7
   1283     VADD.S32 Q6 ,Q6 ,Q4
   1284     VADD.S32 Q12,Q12,Q8
   1285     VADD.S32 Q10,Q10,Q5
   1286     VADD.S32 Q12,Q12,Q6
   1287 
   1288     @ 2-cycle stall
   1289     VZIP.S32 Q10,Q12                    @ 3-cycle instruction
   1290 
   1291     @ 2-cycle stall
   1292     @this should go to 9 11 13 15
   1293     VST1.32 D20,[R2],R7
   1294 
   1295     VST1.32 D21,[R2],R7
   1296 
   1297     VST1.32 D24,[R2],R7
   1298 
   1299     VST1.32 D25,[R2],R7
   1300 
   1301     SUB R2,R2,R7,LSL #3
   1302     LSR R7,R7,#1
   1303     SUB R2,R2,R7
   1304 
    1305     ADD R2,R2,#8                        @move to the next pair of columns in pi4_tmp
   1306 
   1307     ADD R8,R8,#2                        @increment loop cntr
    1308     CMP R8,#16                          @check loop cntr
    1309     BNE CORE_LOOP_16X16_HORIZ           @loop back while rows remain
   1310 
   1311 
   1312 @*****************Vertical transform************************************
   1313 
   1314 @Initialization for vert transform
   1315 @pi4_tmp will be the new src
   1316 @tmp stride will be new src stride
   1317 @dst will be new pi4_tmp
   1318 @dst stride will be new tmp stride
   1319 @trans table will be of 32 bit
   1320 
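             @ Illustrative sketch (C-style pseudocode, not part of the build) of what the
             @ vertical pass below computes; g[][] stands for g_ai4_ihevc_trans_16 and
             @ tmp[k][c] for the horizontal-pass result of input row k at frequency c:
             @
             @   pi2_dst[r][c] = (int16_t)((sum(k = 0..15) g[r][k] * tmp[k][c] + RADD) >> SHIFT);
             @
             @ The same even/odd butterfly as the horizontal pass is applied to the
             @ 32-bit intermediates, and the rounded, shifted sums are narrowed to
             @ 16 bits with VSHRN before being stored.
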
   1321     LDR R9,g_ai4_ihevc_trans_16_addr        @get 32 bit transform matrix
   1322 ulbl3:
   1323     ADD R9, R9, PC
   1324 
    1325     SUB R0,R2,#64                       @set tmp as src [-64 to move back to origin]
   1326     MOV R2,R3                           @set dst as tmp
   1327     MOV R4,#TMP_STRIDE                  @set tmp stride as src stride
    1328     LSR R7,R6,#15                       @Set dst stride in bytes (2 * dst_strd) as the new tmp stride
    1329     SUB R4,#48                          @Adjust stride for the 3 preceding post-incremented loads
   1330 
   1331     @Block SAD
   1332     VADD.S32 D28,D28,D29
   1333     VPADD.S32 D28,D28,D29
   1334     VMOV.S32 R3,D28[0]
   1335     @ SAD calculation ends -- final value in R3.
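             @ The 32-bit lanes of Q14 (D28/D29) hold the partial SAD sums accumulated
             @ earlier; VADD folds D29 into D28 and VPADD adds the two remaining lanes,
             @ leaving the block SAD in D28[0]. It is parked in R3 and moved to R0 as
             @ the return value just before the function exits.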
   1336 
   1337     @Read [0 0] [4 0] [8 0] [12 0],[0 1] [4 1] [8 1] [12 1]
   1338     @values of g_ai4_ihevc_trans_16 and write to stack
   1339     MOV R12,#COFF_STD_W
   1340     LSL R12,R12,#2
   1341     VLD1.S32 D28,[R9],R12
   1342     VLD1.S32 D29,[R9],R12
   1343     VLD1.S32 D30,[R9],R12
   1344     VLD1.S32 D31,[R9],R12
   1345     SUB R9,R9,R12,LSL #2
   1346 
   1347     VREV64.32 Q15,Q15
   1348     VTRN.S32 Q14,Q15
   1349     VST1.S32 {Q14-Q15},[SP]
   1350 
   1351     VMOV.U32 Q14,#RADD                  @get the round factor to q14
   1352     VMOV.U32 Q15,#SHIFT                 @Get the shift to neon
   1353 
   1354     MOV R8,#0                           @INIT LOOP
   1355 
   1356 CORE_LOOP_16X16_VERT:
   1357 
    1358     VLD1.S32 {D0,D1},[R0]!              @LOAD 1-4 src R1
    1359     VLD1.S32 {D2,D3},[R0]!              @LOAD 5-8 src R1
    1360     VLD1.S32 {D4,D5},[R0]!              @LOAD 9-12 src R1
    1361     VLD1.S32 {D6,D7},[R0],R4            @LOAD 13-16 src R1
    1362 
    1363     VLD1.S32 {D8,D9},[R0]!              @LOAD 1-4 src R2
    1364     VLD1.S32 {D10,D11},[R0]!            @LOAD 5-8 src R2
    1365     VLD1.S32 {D12,D13},[R0]!            @LOAD 9-12 src R2
    1366     VLD1.S32 {D14,D15},[R0],R4          @LOAD 13-16 src R2
   1367 
   1368     VREV64.S32 Q2,Q2                    @Rev 9-12 R1
    1369     VREV64.S32 Q3,Q3                    @Rev 13-16 R1
   1370     VREV64.S32 Q6,Q6                    @Rev 9-12 R2
    1371     VREV64.S32 Q7,Q7                    @Rev 13-16 R2
   1372 
   1373     VSWP D6,D7
   1374     VSWP D4,D5
   1375     VADD.S32 Q8 ,Q0,Q3                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 1-4  R1
   1376     VSWP D12,D13                        @ dual issued with prev. instruction
   1377     VADD.S32 Q9 ,Q1,Q2                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 5-8  R1
   1378     VSWP D14,D15                        @ dual issued with prev. instruction
   1379     VSUB.S32 Q10,Q0,Q3                  @o[k] = resi_tmp_1 - resi_tmp_2  k -> 1-4  R1
   1380     VSUB.S32 Q11,Q1,Q2                  @o[k] = resi_tmp_1 - resi_tmp_2  k -> 5-8  R1
   1381 
   1382     VADD.S32 Q12,Q4,Q7                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 1-4  R2
   1383     VREV64.S32    Q9 ,Q9                @rev e[k] k-> 4-7 R1, dual issued with prev. instruction
   1384     VADD.S32 Q13,Q5,Q6                  @e[k] = resi_tmp_1 + resi_tmp_2  k -> 5-8  R2
   1385     VSUB.S32 Q0 ,Q4,Q7                  @o[k] = resi_tmp_1 - resi_tmp_2  k -> 1-4  R2
   1386     VSWP D18,D19                        @ dual issued with prev. instruction
   1387     VSUB.S32 Q1 ,Q5,Q6                  @o[k] = resi_tmp_1 - resi_tmp_2  k -> 5-8  R2
   1388     VREV64.S32    Q13,Q13               @rev e[k] k-> 4-7 R2, dual issued with prev. instruction
   1389 
   1390     VADD.S32 Q2,Q8,Q9                   @ee[k] = e[k] + e[7 - k] row R1
   1391     VSUB.S32 Q3,Q8,Q9                   @eo[k] = e[k] - e[7 - k] row R1
   1392     VSWP D26,D27
   1393 
   1394 
   1395     VADD.S32 Q4,Q12,Q13                 @ee[k] = e[k] + e[7 - k] row R2
   1396     VSUB.S32 Q5,Q12,Q13                 @eo[k] = e[k] - e[7 - k] row R2
   1397     VREV64.S32 D5,D5                    @rev ee[k] 4-7 R1, dual issued with prev. instruction
   1398 
   1399     VADD.S32 D12,D4,D5                  @eee[0] eee[1]    R1
   1400     VSUB.S32 D13,D4,D5                  @eeo[0] eeo[1]    R1
   1401     VREV64.S32 D9,D9                    @rev ee[k] 4-7 R2, dual issued with prev. instruction
   1402 
   1403 
   1404     VADD.S32 D14,D8,D9                  @eee[0] eee[1]    R2
   1405     VSUB.S32 D15,D8,D9                  @eeo[0] eeo[1]    R2
   1406 
    1407     VLD1.S32 {Q12,Q13},[SP]             @Load shuffled g_ai4_ihevc_trans_16 values from stack ->  Q12 : [0 0] [8 1] [4 0] [12 1]  Q13 : [0 1] [8 0] [4 1] [12 0]
   1408     VREV64.S32 Q8,Q6                    @Q6 : eee[0] eee[1] eeo[0] eeo[1] R1   ->     ;Q8 : eee[1] eee[0] eeo[1] eeo[0] R1
   1409 
   1410     VREV64.S32 Q9,Q7                    @Q7 : eee[0] eee[1] eeo[0] eeo[1] R2     ->    ;Q9 : eee[1] eee[0] eeo[1] eeo[0] R2
   1411 
   1412 
   1413     VMUL.S32 Q4,Q6,Q12                  @g_ai2_ihevc_trans_16 * eee[0] eee[1] eeo[0] eeo[1]    R1
   1414     VMLA.S32 Q4,Q8,Q13                  @g_ai2_ihevc_trans_16 * eee[1] eee[0] eeo[1] eeo[0]    R1
   1415 
   1416     VMUL.S32 Q6,Q7,Q12                  @g_ai2_ihevc_trans_16 * eee[0] eee[1] eeo[0] eeo[1]    R2
   1417     VMLA.S32 Q6,Q9,Q13                  @g_ai2_ihevc_trans_16 * eee[1] eee[0] eeo[1] eeo[0] R2
   1418 
   1419                                         @Q3    :R1E00 R1E01 R1E02 R1E03
   1420                                         @Q5    :R2E00 R2E01 R2E02 R2E03
   1421     VSWP D7,D10                         @ dual issued with prev. instruction
   1422                                         @Q3    :R1E00 R1E01 R2E00 R2E01
   1423                                         @Q5    :R1E02 R1E03 R2E02 R2E03
   1424     VSWP D7,D11
   1425                                         @Q3    :R1E00 R1E01 R2E02 R2E03
   1426                                         @Q5    :R1E02 R1E03 R2E00 R2E01
   1427 
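             @ After the two swaps above, Q3 and Q5 each mix the eo halves of both
             @ columns, so each VMUL/VMLA pair below forms partial products for both
             @ columns at once and a single VPADD per coefficient row yields one result
             @ per column (see the D18 comment further down).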
   1428     MOV R12,#COFF_STD_W
    1429     ADD R11,R9,R12,LSL #1               @Get address of row 2 of the transform matrix
   1430     LSL R12,R12,#2
   1431 
   1432     VLD1.S32  {D14,D15},[R11],R12       @LOAD g_ai2_ihevc_trans_16[2][0-4] -> 2G0 2G1 2G2 2G3, 2-cycle instr.
   1433 
   1434     VADD.S32  Q4,Q4,Q14                 @ROUND  R1
   1435     VMUL.S32  Q12,Q3,Q7                 @2G0 2G1 2G2 2G3 * R1E00 R1E01 R2E02 R2E03, 4-cycle instruction
   1436     VSWP      D14,D15                   @2G0 2G1 2G2 2G3 -> 2G2 2G3 2G0 2G1, dual issued with prev. instruction
   1437 
   1438     VADD.S32 Q6,Q6,Q14                  @ROUND  R2
   1439 
   1440     VSHRN.S32 D8,Q4,#SHIFT              @NARROW R1
   1441 
   1442     VLD1.S32  {D16,D17},[R11],R12       @LOAD g_ai2_ihevc_trans_16[6][0-4]
   1443     VSHRN.S32 D9,Q6,#SHIFT              @NARROW R2, dual issued in 2nd cycle
   1444 
   1445     VMUL.S32  Q2,Q3,Q8                  @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4], 4-cycle instruction
   1446     VSWP      D16,D17                   @dual issued with prev. instr.
   1447 
   1448     VZIP.S16 D8,D9                      @INTERLEAVE R1 R2 R1 R2 R1 R2 to write
   1449     VMLA.S32  Q12,Q5,Q7                 @2G2 2G3 2G0 2G1 * R1E02 R1E03 R2E00 R2E01, 4-cycle instruction
   1450 
   1451 
    1452     @Write the values into memory (or wait for them to be shuffled)
    1453     @These values must go to columns 0, 4, 8 and 12
   1454     LSL R10,R7,#2
   1455     VST1.S32 D8[0],[R2],R10
   1456 
   1457     VST1.S32 D9[0],[R2],R10
   1458 
   1459     VST1.S32 D8[1],[R2],R10
   1460     VPADD.S32 D18,D24,D25               @D18[0] -> 2G0*R1E00+2G1*R1E01 2G2*R2E02+2G3*R2E03
    1461                                         @D18[1] -> 2G2*R1E02+2G3*R1E03 2G0*R2E00+2G1*R2E01
   1462 
   1463     VST1.S32 D9[1],[R2],R10
    1464     VMLA.S32  Q2,Q5,Q8                  @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4]
   1465     LSL R10,R10,#2
   1466     SUB R2,R2,R10
   1467 
   1468     VLD1.S32  {D14,D15},[R11],R12       @LOAD g_ai2_ihevc_trans_16[10][0-4]
   1469 
   1470     VMUL.S32  Q6,Q3,Q7                  @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4]
   1471     VSWP      D14,D15                   @ dual issued with prev. instruction
   1472     VPADD.S32 D19,D4,D5
   1473 
   1474     VLD1.S32  {D16,D17},[R11],R12       @LOAD g_ai2_ihevc_trans_16[14][0-4]
   1475     VMUL.S32  Q2,Q3,Q8                  @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4]
   1476     VSWP      D16,D17
   1477 
    1478     VMLA.S32  Q6,Q5,Q7                  @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4]
    1479     VADD.S32 Q9,Q9,Q14                  @Round by RADD R1
    1480     VMLA.S32  Q2,Q5,Q8                  @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4]
   1481     VSHRN.S32 D8,Q9,#SHIFT              @Shift by SHIFT
   1482     VPADD.S32 D24,D12,D13
   1483     @---------------Processing O, Row 1 and Row 2--------------------------------------
   1484     @ Early start to avoid stalls
   1485     MOV R12,#COFF_STD_W
   1486     ADD R11,R9,R12                      @Get 1ST row
   1487     LSL R12,R12,#1
   1488 
   1489     LSL R10,R7,#2
   1490     ADD R2,R2,R7,LSL #1                 @move to third row
   1491     @this should go to 2  6 10 14
   1492     VST1.S32 D8[0],[R2],R10
   1493 
   1494     VST1.S32 D8[1],[R2],R10
   1495     VPADD.S32 D25,D4,D5                 @ dual issued with prev. instruction in 2nd cycle
   1496 
   1497     VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[1][0-7]
   1498     VADD.S32 Q12,Q12,Q14                @Round by RADD R2, dual issued with prev. instruction in 2nd cycle
   1499     VMUL.S32 Q6,Q2,Q0                   @g_ai2_ihevc_trans_16[1][0-3]*o[0][0-3] R2
   1500     VMLA.S32 Q6,Q3,Q1                   @g_ai2_ihevc_trans_16[1][4-7]*o[0][4-7] R2
   1501     VSHRN.S32 D9,Q12,#SHIFT             @Shift by SHIFT
   1502 
   1503     VMUL.S32 Q2,Q2,Q10                  @g_ai2_ihevc_trans_16[1][0-3]*o[0][0-3] R1
   1504     VMLA.S32 Q2,Q3,Q11                  @g_ai2_ihevc_trans_16[1][4-7]*o[0][4-7] R1
   1505     VADD.S32 D11,D12,D13                @g_ai2_ihevc_trans_16[1][k]*o[0][k]+g_ai2_ihevc_trans_16[0][7-k]*o[0][7-k] R2, dual issued with prev. instr.
   1506     VST1.S32 D9[0],[R2],R10
   1507 
   1508     VST1.S32 D9[1],[R2],R10
   1509     VADD.S32 D10,D4,D5                  @g_ai2_ihevc_trans_16[1][k]*o[0][k]+g_ai2_ihevc_trans_16[0][7-k]*o[0][7-k] R1, dual issued with prev. instr.
    1510     LSL R10,R10,#2                      @go back to origin
   1511     SUB R2,R2,R10
   1512     SUB R2,R2,R7,LSL #1
   1513 
   1514     VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[3][0-7]
   1515 
   1516     VMUL.S32 Q7,Q2,Q10                  @o[0][0-3]
   1517     VMLA.S32 Q7,Q3,Q11                  @o[0][4-7]
   1518     VMUL.S32 Q8,Q2,Q0                   @o[0][0-3]
   1519     VMLA.S32 Q8,Q3,Q1                   @o[0][4-7]
   1520 
   1521     VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[5][0-7]
   1522     VADD.S32 D18,D14,D15
   1523     VMUL.S32 Q12,Q2,Q10                 @o[0][0-3]
   1524     VMLA.S32 Q12,Q3,Q11                 @o[0][4-7]
   1525     VADD.S32 D19,D16,D17
   1526     VMUL.S32 Q4,Q2,Q0
   1527     VMLA.S32 Q4,Q3,Q1
   1528     VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[7][0-7]
   1529     VADD.S32 D26,D24,D25                @ dual issued with prev. instr.
   1530     VMUL.S32 Q6,Q2,Q10                  @o[0][0-3]
   1531     VMLA.S32 Q6,Q3,Q11                  @o[0][4-7]
   1532     VADD.S32 D27,D8,D9
   1533     VMUL.S32 Q4,Q2,Q0
   1534     VMLA.S32 Q4,Q3,Q1
   1535     VADD.S32 D12,D12,D13
   1536     @Q5 Q9 Q13 Q6
   1537     VPADD.S32 D14,D10,D11
   1538     VPADD.S32 D15,D18,D19
   1539     VPADD.S32 D16,D26,D27
   1540     VADD.S32  D13,D8,D9
   1541     VADD.S32 Q9,Q7,Q14
    1542     VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[9][0-7]
   1543     VPADD.S32 D17,D12,D13               @ dual issued with prev. instr. in 2nd cycle
   1544 
   1545     VMUL.S32 Q4,Q2,Q10                  @o[0][0-3]
   1546     VMLA.S32 Q4,Q3,Q11                  @o[0][4-7]
   1547 
   1548     VADD.S32 Q12,Q8,Q14
   1549 
   1550     VMUL.S32 Q6,Q2,Q0                   @o[0][0-3]
   1551     VMLA.S32 Q6,Q3,Q1                   @o[0][4-7]
   1552 
   1553     VSHRN.S32 D26,Q9,#SHIFT
   1554     VSHRN.S32 D27,Q12,#SHIFT
   1555     VADD.S32 D10,D8,D9
   1556     @write to memory this should go to 1 3 5 7
   1557     ADD R2,R2,R7
   1558     LSL R7,R7,#1
    1559     VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[11][0-7]
   1560     VADD.S32 D11,D12,D13                @ dual issued with prev. instr.
   1561 
   1562     VST1.S32 D26[0],[R2],R7
   1563     VMUL.S32 Q7,Q2,Q10                  @o[0][0-3]
   1564     VMLA.S32 Q7,Q3,Q11                  @o[0][4-7]
   1565     VST1.S32 D26[1],[R2],R7
   1566     VMUL.S32 Q8,Q2,Q0                   @o[0][0-3]
   1567     VMLA.S32 Q8,Q3,Q1                   @o[0][4-7]
   1568     VST1.S32 D27[0],[R2],R7
   1569     VADD.S32 D18,D14,D15
   1570     VST1.S32 D27[1],[R2],R7
   1571 
    1572     VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[13][0-7]
   1573     VADD.S32 D19,D16,D17                @ dual issued with prev. instr.
   1574 
   1575     VMUL.S32 Q12,Q2,Q10                 @o[0][0-3]
   1576     VMLA.S32 Q12,Q3,Q11                 @o[0][4-7]
   1577     VMUL.S32 Q4,Q2,Q0
   1578     VMLA.S32 Q4,Q3,Q1
   1579 
    1580     VLD1.S32 {Q2,Q3},[R11],R12          @g_ai2_ihevc_trans_16[15][0-7]
   1581     VADD.S32 D26,D24,D25
   1582 
   1583     VMUL.S32 Q6,Q2,Q10                  @o[0][0-3]
   1584     VMLA.S32 Q6,Q3,Q11                  @o[0][4-7]
   1585     VADD.S32  D27,D8,D9
   1586 
   1587     VMUL.S32 Q4,Q2,Q0
   1588     VMLA.S32 Q4,Q3,Q1
   1589     VADD.S32 D12,D12,D13
   1590     @Q5 Q9 Q13 Q6
   1591     VPADD.S32 D14,D10,D11
   1592     VPADD.S32 D15,D18,D19
   1593     VPADD.S32 D16,D26,D27
   1594     VADD.S32  D13,D8,D9
   1595     VADD.S32 Q9,Q7,Q14
    1596     @ possible 1-cycle stall
   1597     VPADD.S32 D17,D12,D13
   1598     VSHRN.S32 D22,Q9,#SHIFT
   1599     VADD.S32 Q10,Q8,Q14
    1600     @ possible 2-cycle stall
   1601     VSHRN.S32 D23,Q10,#SHIFT
   1602 
   1603     @this should go to 9 11 13 15
   1604     @LSL R11,R7,#1
   1605     VST1.S32 D22[0],[R2],R7
   1606     VST1.S32 D22[1],[R2],R7
   1607     VST1.S32 D23[0],[R2],R7
   1608     VST1.S32 D23[1],[R2],R7
   1609 
   1610     SUB R2,R2,R7,LSL #3
   1611     LSR R7,R7,#1
   1612     SUB R2,R2,R7
   1613 
    1614     ADD R2,R2,#4                        @move to the next pair of columns
   1615 
    1616     ADD R8,R8,#2                        @increment loop cntr by 2 since 2 columns are processed per iteration
    1617     CMP R8,#16                          @check loop cntr
    1618     BNE CORE_LOOP_16X16_VERT            @loop back while columns remain
   1619 
    1620     MOV R0,R3                           @return the block SAD in r0
    1621 
    1622     ADD SP,SP,#32                       @release the stack scratch area used for the shuffled coefficients
    1623     vpop {d8 - d15}
    1624     LDMFD          sp!,{r4-r12,PC}      @restore the callee-saved registers and return
   1625 
   1626