Home | History | Annotate | Download | only in src
      1 ;//
      2 ;// Copyright (C) 2007-2008 ARM Limited
      3 ;//
      4 ;// Licensed under the Apache License, Version 2.0 (the "License");
      5 ;// you may not use this file except in compliance with the License.
      6 ;// You may obtain a copy of the License at
      7 ;//
      8 ;//      http://www.apache.org/licenses/LICENSE-2.0
      9 ;//
     10 ;// Unless required by applicable law or agreed to in writing, software
     11 ;// distributed under the License is distributed on an "AS IS" BASIS,
     12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 ;// See the License for the specific language governing permissions and
     14 ;// limitations under the License.
     15 ;//
     16 ;//
     17 ;//
     18 ;// File Name:  armVCM4P10_TransformResidual4x4_s.s
     19 ;// OpenMAX DL: v1.0.2
     20 ;// Revision:   12290
     21 ;// Date:       Wednesday, April 9, 2008
     22 ;//
     23 ;//
     24 ;//
     25 ;//
     26 ;// Description:
     27 ;// Transform Residual 4x4 Coefficients
     28 ;//
     29 ;//
     30 
     31 
     32 ;// Include standard headers
     33 
     34         INCLUDE omxtypes_s.h
     35         INCLUDE armCOMM_s.h
     36 
     37         M_VARIANTS CortexA8
     38 
     39 ;// Import symbols required from other files
     40 ;// (For example tables)
     41 
     42 
     43 
     44 
     45 ;// Set debugging level
     46 ;//DEBUG_ON    SETL {TRUE}
     47 
     48 
     49 
     50 ;// Guarding implementation by the processor name
     51 
     52 
     53 
     54 
     55 
     56 
     57 
     58 
     59 ;// Guarding implementation by the processor name
     60 
     61     IF  CortexA8
     62 
     63 ;// ARM Registers
     64 ;// (RN binds each name to a core register number: r0 and r1 here)
     65 ;// Input Registers
     66 pDst                RN  0                   ;// r0: address the result block is stored to
     67 pSrc                RN  1                   ;// r1: address the input block is loaded from
     68 
     69 
     70 ;// Neon Registers
     71 ;// Names below that map to the same D register are used one after another
     72 ;// Packed Input pixels (filled by the VLD4 load from pSrc)
     73 dIn0                DN  D0.S16
     74 dIn1                DN  D1.S16
     75 dIn2                DN  D2.S16
     76 dIn3                DN  D3.S16
     77 
     78 ;// Intermediate calculations
     79 dZero               DN  D4.S16              ;// set to 0; second operand of every VHADD
     80 de0                 DN  D5.S16              ;// first-pass terms e0..e3 live in D5-D8
     81 de1                 DN  D6.S16
     82 de2                 DN  D7.S16
     83 de3                 DN  D8.S16
     84 dIn1RS              DN  D7.S16              ;// dIn1 halved; same register as de2
     85 dIn3RS              DN  D8.S16              ;// dIn3 halved; same register as de3
     86 df0                 DN  D0.S16              ;// first-pass results f0..f3, written over dIn0..dIn3
     87 df1                 DN  D1.S16
     88 df2                 DN  D2.S16
     89 df3                 DN  D3.S16
     90 qf01                QN  Q0.32               ;// Q0 (D0:D1) as 32-bit lanes; used only by the transpose
     91 qf23                QN  Q1.32               ;// Q1 (D2:D3) as 32-bit lanes; used only by the transpose
     92 dg0                 DN  D5.S16              ;// second-pass terms g0..g3, reusing D5-D8
     93 dg1                 DN  D6.S16
     94 dg2                 DN  D7.S16
     95 dg3                 DN  D8.S16
     96 df1RS               DN  D7.S16              ;// df1 halved; same register as dg2
     97 df3RS               DN  D8.S16              ;// df3 halved; same register as dg3
     98 
     99 ;// Output pixels (final results h0..h3, written over df0..df3 in D0-D3)
    100 dh0                 DN  D0.S16
    101 dh1                 DN  D1.S16
    102 dh2                 DN  D2.S16
    103 dh3                 DN  D3.S16
    104 
    105 
    106     ;// Function: loads sixteen S16 values from pSrc, applies the two butterfly
    107     ;// passes below, rounds each result with (x + 32) >> 6 and stores sixteen
    108     ;// S16 values at pDst. No stack allocation directive is used in this body.
    109     ;// Write function header (NOTE(review): d8 presumably marks D8, which the body writes, for save/restore by the macro - confirm in armCOMM_s.h)
    110         M_START armVCM4P10_TransformResidual4x4, ,d8
    111 
    112         ;******************************************************************
    113         ;// The strategy used in implementing the transform is as follows:*
    114         ;// Load the 4x4 block de-interleaved (transposed) into 4 D regs  *
    115         ;// Perform the row operations (on columns) using SIMD            *
    116         ;// Transpose the 4x4 result matrix                               *
    117         ;// Perform the column operations                                 *
    118         ;// Round and shift each result: (x + 32) >> 6                    *
    119         ;// Store the 4x4 block at one go                                 *
    120         ;******************************************************************
    121 
    122         ;// Load all the 4x4 pixels in transposed form:
    123         ;// VLD4 de-interleaves, so dInN receives elements N, N+4, N+8 and N+12
    124         VLD4    {dIn0,dIn1,dIn2,dIn3},[pSrc]
    125 
    126         VMOV    dZero,#0                                    ;// All-zero vector; VHADD x,dZero yields x>>1
    127 
    128 
    129         ;****************************************
    130         ;// Row Operations (Performed on columns)
    131         ;****************************************
    132         ;// d0..d3 in the comments below are dIn0..dIn3; each instruction
    133         ;// processes all four S16 lanes of its D registers at once
    134         VADD        de0,dIn0,dIn2                       ;//  e0 = d0 + d2
    135         VSUB        de1,dIn0,dIn2                        ;//  e1 = d0 - d2
    136         VHADD       dIn1RS,dIn1,dZero                   ;//  d1>>1 (halving add with the zero vector dZero)
    137         VHADD       dIn3RS,dIn3,dZero                   ;//  d3>>1 (halving add with the zero vector dZero)
    138         VSUB        de2,dIn1RS,dIn3                     ;//  e2 = (d1>>1) - d3 ; de2 overwrites dIn1RS (both D7)
    139         VADD        de3,dIn1,dIn3RS                        ;//  e3 = d1 + (d3>>1) ; de3 overwrites dIn3RS (both D8)
    140         VADD        df0,de0,de3                         ;//  f0 = e0 + e3 ; df0..df3 overwrite dIn0..dIn3 (D0-D3)
    141         VADD        df1,de1,de2                            ;//  f1 = e1 + e2
    142         VSUB        df2,de1,de2                            ;//  f2 = e1 - e2
    143         VSUB        df3,de0,de3                            ;//  f3 = e0 - e3
    144 
    145 
    146 
    147         ;*****************************************************************
    148         ;// Transpose the resultant matrix
    149         ;*****************************************************************
    150         ;// Two 16-bit VTRNs on D pairs, then one 32-bit VTRN across Q0/Q1 (D0-D3)
    151         VTRN    df0,df1
    152         VTRN    df2,df3
    153         VTRN    qf01,qf23
    154 
    155 
    156         ;*******************************
    157         ;// Column Operations
    158         ;*******************************
    159         ;// f0..f3 in the comments below are df0..df3 after the transpose;
    160         ;// dg0..dg3 reuse the registers that held de0..de3 (D5-D8)
    161         VADD        dg0,df0,df2                         ;//  g0 = f0 + f2
    162         VSUB        dg1,df0,df2                            ;//  g1 = f0 - f2
    163         VHADD       df1RS,df1,dZero                     ;//  f1>>1 (halving add with the zero vector dZero)
    164         VHADD       df3RS,df3,dZero                     ;//  f3>>1 (halving add with the zero vector dZero)
    165         VSUB        dg2,df1RS,df3                       ;//  g2 = (f1>>1) - f3 ; dg2 overwrites df1RS (both D7)
    166         VADD        dg3,df1,df3RS                        ;//  g3 = f1 + (f3>>1) ; dg3 overwrites df3RS (both D8)
    167         VADD        dh0,dg0,dg3                         ;//  h0 = g0 + g3 ; dh0..dh3 overwrite df0..df3 (D0-D3)
    168         VADD        dh1,dg1,dg2                            ;//  h1 = g1 + g2
    169         VSUB        dh2,dg1,dg2                            ;//  h2 = g1 - g2
    170         VSUB        dh3,dg0,dg3                            ;//  h3 = g0 - g3
    171 
    172 
    173         ;************************************************
    174         ;// Calculate final value (colOp[i][j] + 32)>>6
    175         ;************************************************
    176         ;// VRSHR is a rounding shift, so the +32 needs no separate add
    177         VRSHR       dh0,#6
    178         VRSHR       dh1,#6
    179         VRSHR       dh2,#6
    180         VRSHR       dh3,#6
    181 
    182 
    183         ;***************************
    184         ;// Store all the 4x4 pixels
    185         ;***************************
    186         ;// dh0..dh3 are written to consecutive memory starting at pDst
    187         VST1   {dh0,dh1,dh2,dh3},[pDst]
    188 
    189 
    190         ;// Set return value (no instruction in this body writes one)
    191 
    192 End
    193 
    194 
    195         ;// Write function tail
    196         M_END
    197 
    198     ENDIF                                                           ;//CortexA8
    199 
    200     END
    201