Home | History | Annotate | Download | only in src
      1 ;//
      2 ;//
      3 ;// File Name:  armVCM4P10_TransformResidual4x4_s.s
      4 ;// OpenMAX DL: v1.0.2
      5 ;// Revision:   12290
      6 ;// Date:       Wednesday, April 9, 2008
      7 ;//
      8 ;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
      9 ;//
     10 ;//
     11 ;//
     12 ;// Description:
     13 ;// Transform Residual 4x4 Coefficients
     14 ;//
     15 ;//
     16 
     17 
     18 ;// Include standard headers
     19 
     20         INCLUDE omxtypes_s.h
     21         INCLUDE armCOMM_s.h
     22 
     23         M_VARIANTS CortexA8
     24 
     25 ;// Import symbols required from other files
     26 ;// (For example tables)
     27 
     28 
     29 
     30 
     31 ;// Set debugging level
     32 ;//DEBUG_ON    SETL {TRUE}
     33 
     34 
     35 
     36 ;// Guarding implementation by the processor name
     37 
     38 
     39 
     40 
     41 
     42 
     43 
     44 
     45 ;// Guarding implementation by the processor name
     46 
     47     IF  CortexA8
     48 
     49 ;// ARM core registers: the two pointer arguments
     50 pDst                RN  0               ;// destination, written by VST1
     51 pSrc                RN  1               ;// source, read by VLD4
     52 
     53 ;// NEON aliases, grouped by physical register; every D alias is typed S16
     54 ;// D0-D3: loaded input (dIn), first-pass result (df), final result (dh)
     55 dIn0                DN  D0.S16
     56 df0                 DN  D0.S16
     57 dh0                 DN  D0.S16
     58 dIn1                DN  D1.S16
     59 df1                 DN  D1.S16
     60 dh1                 DN  D1.S16
     61 dIn2                DN  D2.S16
     62 df2                 DN  D2.S16
     63 dh2                 DN  D2.S16
     64 dIn3                DN  D3.S16
     65 df3                 DN  D3.S16
     66 dh3                 DN  D3.S16
     67 
     68 ;// Q0/Q1 overlay D0-D1 and D2-D3; used for the 32-bit transpose step
     69 qf01                QN  Q0.32
     70 qf23                QN  Q1.32
     71 
     72 ;// D4: all-zero operand for the halving adds
     73 dZero               DN  D4.S16
     74 
     75 ;// D5/D6: sum and difference terms e0/e1 (first pass), g0/g1 (second)
     76 de0                 DN  D5.S16
     77 dg0                 DN  D5.S16
     78 de1                 DN  D6.S16
     79 dg1                 DN  D6.S16
     80 
     81 ;// D7/D8: halved inputs (*RS), then e2/e3 and g2/g3 in the same register
     82 dIn1RS              DN  D7.S16
     83 de2                 DN  D7.S16
     84 df1RS               DN  D7.S16
     85 dg2                 DN  D7.S16
     86 dIn3RS              DN  D8.S16
     87 de3                 DN  D8.S16
     88 df3RS               DN  D8.S16
     89 dg3                 DN  D8.S16
     90 
     91 
     92     ;// Allocate stack memory required by the function
     93 
     94 
     95     ;// Write function header
     96         M_START armVCM4P10_TransformResidual4x4, ,d8
     97 
     98         ;// Transforms one 4x4 block of 16-bit residual coefficients:
     99         ;// 32 bytes are loaded from pSrc and 32 bytes are stored to pDst.
    100         ;//
    101         ;// Steps:
    102         ;//   1. VLD4 de-interleaves on load, so the 16-bit element at index
    103         ;//      4*k+j lands in lane k of dIn<j> (a transposed view).
    104         ;//   2. First butterfly pass: lane k combines elements 4*k..4*k+3,
    105         ;//      i.e. one row if the block is stored row-major.
    106         ;//   3. The 4x4 intermediate is transposed in registers.
    107         ;//   4. Second butterfly pass: the same arithmetic, now applied
    108         ;//      along the other direction of the block.
    109         ;//   5. Every result is rounded as (x + 32) >> 6 and the block is
    110         ;//      written out with a single VST1.
    111         ;//
    112         ;// Register notes: the aliases overlap. D0-D3 carry dIn*, df* and
    113         ;// dh* in turn; D7/D8 first hold the halved terms and are then
    114         ;// overwritten in place by e2/e3 (first pass) and g2/g3 (second).
    115         ;//
    116 
    117         ;// Zero operand: VHADD(x, 0) = (x + 0) >> 1 gives the halved terms
    118         VMOV    dZero,#0
    119 
    120         ;// Fetch the whole block in transposed form
    121         VLD4    {dIn0,dIn1,dIn2,dIn3},[pSrc]
    122 
    123 
    124         ;// ------------------------------------------------------------------
    125         ;// First pass: input d (dIn0..dIn3), intermediate e, result f
    126         ;// ------------------------------------------------------------------
    127 
    128         VHADD       dIn1RS,dZero,dIn1               ;//  d1>>1
    129         VHADD       dIn3RS,dZero,dIn3               ;//  d3>>1
    130         VADD        de0,dIn2,dIn0                   ;//  e0 = d0 + d2
    131         VSUB        de1,dIn0,dIn2                   ;//  e1 = d0 - d2
    132         VSUB        de2,dIn1RS,dIn3                 ;//  e2 = (d1>>1) - d3
    133         VADD        de3,dIn3RS,dIn1                 ;//  e3 = d1 + (d3>>1)
    134         VADD        df0,de0,de3                     ;//  f0 = e0 + e3
    135         VSUB        df3,de0,de3                     ;//  f3 = e0 - e3
    136         VADD        df1,de1,de2                     ;//  f1 = e1 + e2
    137         VSUB        df2,de1,de2                     ;//  f2 = e1 - e2
    138 
    139 
    140         ;// ------------------------------------------------------------------
    141         ;// Transpose f: swap 16-bit lanes inside each D pair, then swap
    142         ;// 32-bit lanes between Q0 (D0-D1) and Q1 (D2-D3). The two 16-bit
    143         ;// steps touch disjoint registers; the 32-bit step must come last.
    144         ;// ------------------------------------------------------------------
    145 
    146         VTRN    df2,df3
    147         VTRN    df0,df1
    148         VTRN    qf01,qf23
    149 
    150 
    151         ;// ------------------------------------------------------------------
    152         ;// Second pass: input f (transposed), intermediate g, result h
    153         ;// ------------------------------------------------------------------
    154 
    155         VHADD       df1RS,dZero,df1                 ;//  f1>>1
    156         VHADD       df3RS,dZero,df3                 ;//  f3>>1
    157         VADD        dg0,df2,df0                     ;//  g0 = f0 + f2
    158         VSUB        dg1,df0,df2                     ;//  g1 = f0 - f2
    159         VSUB        dg2,df1RS,df3                   ;//  g2 = (f1>>1) - f3
    160         VADD        dg3,df3RS,df1                   ;//  g3 = f1 + (f3>>1)
    161         VADD        dh0,dg0,dg3                     ;//  h0 = g0 + g3
    162         VSUB        dh3,dg0,dg3                     ;//  h3 = g0 - g3
    163         VADD        dh1,dg1,dg2                     ;//  h1 = g1 + g2
    164         VSUB        dh2,dg1,dg2                     ;//  h2 = g1 - g2
    165 
    166 
    167         ;// ------------------------------------------------------------------
    168         ;// Rounding right shift: h = (h + 32) >> 6
    169         ;// ------------------------------------------------------------------
    170 
    171         VRSHR       dh0,dh0,#6
    172         VRSHR       dh1,dh1,#6
    173         VRSHR       dh2,dh2,#6
    174         VRSHR       dh3,dh3,#6
    175 
    176         ;// Write the finished 4x4 block with one store
    177         VST1   {dh0,dh1,dh2,dh3},[pDst]
    178 
    179 End
    180 
    181         ;// Function tail; no core register is written by the body above
    182         M_END
    183 
    184     ENDIF                                                           ;//CortexA8
    185 
    186     END