Home | History | Annotate | Download | only in src
      1 ;//
      2 ;//
      3 ;// File Name:  armVCM4P10_TransformResidual4x4_s.s
      4 ;// OpenMAX DL: v1.0.2
      5 ;// Revision:   9641
      6 ;// Date:       Thursday, February 7, 2008
      7 ;//
      8 ;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
      9 ;//
     10 ;//
     11 ;//
     12 ;// Description:
     13 ;// Transform Residual 4x4 Coefficients
     14 ;//
     15 ;//
     16 
     17 
     18 ;// Include standard headers
     19 
     20         INCLUDE omxtypes_s.h
     21         INCLUDE armCOMM_s.h
     22 
     23         M_VARIANTS ARM1136JS
     24 
     25 ;// Import symbols required from other files
     26 ;// (For example tables)
     27 
     28 
     29 
     30 
     31 ;// Set debugging level
     32 ;//DEBUG_ON    SETL {TRUE}
     33 
     34 
     35 
     36 ;// Guarding implementation by the processor name
     37 
     38     IF  ARM1136JS
     39 
     40 ;// Input registers
     41 pDst                RN  0                   ;// Base address for the final STMIA of the eight result words
     42 pSrc                RN  1                   ;// Base address for the initial LDMIA of the eight input words
     43 
     44 ;// Output registers (none declared; results are written through pDst)
     45 
     46 
     47 ;// Local scratch registers
     48 ;// The alias groups below reuse the same physical registers (r2-r12, r14); each group names them for one stage of the transform
     49 ;// Packed input values: two per register
     50 in00                RN  2                   ;// Src[0] & Src[1]
     51 in02                RN  3                   ;// Src[2] & Src[3]
     52 in10                RN  4                   ;// Src[4] & Src[5]
     53 in12                RN  5                   ;// Src[6] & Src[7]
     54 in20                RN  6                   ;// Src[8] & Src[9]
     55 in22                RN  7                   ;// Src[10] & Src[11]
     56 in30                RN  8                   ;// Src[12] & Src[13]
     57 in32                RN  9                   ;// Src[14] & Src[15]
     58 
     59 ;// Transposed input, consumed by the row operations (rows become columns)
     60 trRow00             RN  2
     61 trRow10             RN  10
     62 trRow02             RN  3
     63 trRow12             RN  5
     64 trRow20             RN  11
     65 trRow30             RN  12
     66 trRow32             RN  14
     67 trRow22             RN  7
     68 
     69 ;// Intermediate results of the row operations
     70 e0                  RN  4
     71 e1                  RN  6
     72 e2                  RN  8
     73 e3                  RN  9
     74 constZero           RN  1                   ;// Shares r1 with pSrc; it is set to 0 only after the input block has been loaded
     75 
     76 ;// Results of the row operations
     77 rowOp00             RN  2
     78 rowOp10             RN  10
     79 rowOp20             RN  11
     80 rowOp30             RN  12
     81 rowOp02             RN  3
     82 rowOp12             RN  5
     83 rowOp22             RN  7
     84 rowOp32             RN  14
     85 
     86 ;// Transposed row results, consumed by the column operations
     87 trCol00             RN  2
     88 trCol02             RN  3
     89 trCol10             RN  4
     90 trCol12             RN  5
     91 trCol20             RN  6
     92 trCol22             RN  7
     93 trCol30             RN  8
     94 trCol32             RN  9
     95 
     96 ;// Intermediate results of the column operations
     97 g0                  RN  10
     98 g1                  RN  11
     99 g2                  RN  12
    100 g3                  RN  14
    101 
    102 ;// Results of the column operations
    103 colOp00             RN  2
    104 colOp02             RN  3
    105 colOp10             RN  4
    106 colOp12             RN  5
    107 colOp20             RN  6
    108 colOp22             RN  7
    109 colOp30             RN  8
    110 colOp32             RN  9
    111 
    112 
    113 temp1               RN  10                  ;// Temporary scratch variable (not referenced by the visible function body)
    114 const1              RN  11                  ;// Loaded with 0x00208020 for the final rounding stage (shares r11 with g1)
    115 const2              RN  12                  ;// Set to 0x200 for the final rounding stage (shares r12 with g2)
    116 mask                RN  14                  ;// Loaded with 0xffff03ff for the final rounding stage (shares r14 with g3)
    117 
    118 ;// Output values, packed two per register in the same registers as colOp00 .. colOp32
    119 out00               RN  2
    120 out02               RN  3
    121 out10               RN  4
    122 out12               RN  5
    123 out20               RN  6
    124 out22               RN  7
    125 out30               RN  8
    126 out32               RN  9
    127 
    128 
    129 
    130     ;// Allocate stack memory required by the function (none is allocated here)
    131     ;// armVCM4P10_TransformResidual4x4: loads eight words (sixteen packed 16-bit values) from pSrc, applies the row pass and the column pass below, rounds each value with (x + 32) >> 6 and stores eight words to pDst
    132     ;// Every input word is loaded before anything is stored, and pSrc (r1) is reused as constZero right after the load
    133     ;// Write function header (the r11 argument presumably tells M_START which registers to preserve - confirm in armCOMM_s.h)
    134         M_START armVCM4P10_TransformResidual4x4,r11
    135 
    136         ;******************************************************************
    137         ;// The strategy used in implementing the transform is as follows:*
    138         ;// Load the 4x4 block into 8 registers                           *
    139         ;// Transpose the 4x4 matrix                                      *
    140         ;// Perform the row operations (on columns) using SIMD            *
    141         ;// Transpose the 4x4 result matrix                               *
    142         ;// Perform the column operations                                 *
    143         ;// Store the 4x4 block at one go                                 *
    144         ;******************************************************************
    145 
    146         ;// Load all sixteen 16-bit inputs, two per register (in00 .. in32 = r2 .. r9)
    147 
    148         LDMIA   pSrc,{in00,in02,in10,in12,in20,in22,in30,in32}
    149 
    150         MOV       constZero,#0                                     ;// SHADD16 against this zero halves each 16-bit lane (shift right by 1); overwrites pSrc (r1), which is not read again
    151         ;LDR       constZero,=0x00000000
    152 
    153         ;*****************************************************************
    154         ;// Each PKHBT/PKHTB pair below builds two transposed registers from two source registers
    155         ;// Transpose the matrix in order to perform row ops as column ops
    156         ;// Input:   in[][] = original matrix
    157         ;// Output:  trRow[][]= transposed matrix
    158         ;// Step1: Obtain the LL part of the transposed matrix
    159         ;// Step2: Obtain the HL part
    160         ;// Step3: Obtain the LH part
    161         ;// Step4: Obtain the HH part
    162         ;// Instruction order matters: several destinations alias source registers that an earlier instruction must read first
    163         ;*****************************************************************
    164 
    165         ;// LL 2x2 transposed matrix
    166         ;//   d0 d1 - -
    167         ;//   d4 d5 - -
    168         ;//   -  -  - -
    169         ;//   -  -  - -
    170         ;// PKHTB goes first: trRow00 aliases in00 (r2), which the PKHTB still has to read
    171         PKHTB   trRow10,in10,in00,ASR #16               ;// top(in10):top(in00) -> transposed [5 4] = [f5:f1]
    172         PKHBT   trRow00,in00,in10,LSL #16               ;// bottom(in10):bottom(in00) -> transposed [1 0] = [f4:f0]
    173 
    174         ;// HL 2x2 transposed matrix
    175         ;//    -   -   - -
    176         ;//    -   -   - -
    177         ;//    d8  d9  - -
    178         ;//   d12 d13  - -
    179 
    180         ;// in02 (r3) and in12 (r5) are read here, before the LH step reuses r3/r5 as trRow02/trRow12
    181          PKHTB   trRow30,in12,in02,ASR #16              ;// top(in12):top(in02) -> transposed [13 12] = [7 3]
    182          PKHBT   trRow20,in02,in12,LSL #16              ;// bottom(in12):bottom(in02) -> transposed [9 8] = [6 2]
    183 
    184         ;// LH 2x2 transposed matrix
    185         ;//   - - d2 d3
    186         ;//   - - d6 d7
    187         ;//   - - -  -
    188         ;//   - - -  -
    189 
    190         PKHBT   trRow02,in20,in30,LSL #16               ;// bottom(in30):bottom(in20) -> transposed [3 2] = [f12:f8]
    191         PKHTB   trRow12,in30,in20,ASR #16               ;// top(in30):top(in20) -> transposed [7 6] = [f13:f9]
    192 
    193 
    194 
    195 
    196         ;// HH 2x2 transposed matrix
    197         ;//    - -   -   -
    198         ;//    - -   -   -
    199         ;//    - -  d10 d11
    200         ;//    - -  d14 d15
    201         ;// PKHTB goes first: trRow22 aliases in22 (r7), which the PKHTB still has to read
    202         PKHTB   trRow32,in32,in22,ASR #16               ;// top(in32):top(in22) -> transposed [15 14] = [15 11]
    203         PKHBT   trRow22,in22,in32,LSL #16               ;// bottom(in32):bottom(in22) -> transposed [11 10] = [14 10]
    204 
    205 
    206         ;****************************************
    207         ;// Row Operations (Performed on columns)
    208         ;****************************************
    209 
    210         ;// In each trRow register the two 16-bit lanes hold the same column position of two different original rows, so one SIMD instruction processes two rows
    211         ;// SIMD operations on first two columns(two rows of the original matrix)
    212         ;// d0..d3 below are trRow00, trRow10, trRow20, trRow30
    213 
    214         SADD16      e0, trRow00,trRow20                   ;//  e0 = d0 + d2
    215         SSUB16    e1, trRow00,trRow20                   ;//  e1 = d0 - d2
    216         SHADD16   e2, trRow10,constZero                 ;// (d1>>1): halving add with constZero, a register holding 0
    217         SHADD16   e3, trRow30,constZero                 ;// (d3>>1): issued before e2 is consumed to avoid pipeline stalls for e2 and e3
    218         SSUB16    e2, e2, trRow30                       ;//  e2 = (d1>>1) - d3
    219         SADD16    e3, e3, trRow10                       ;//  e3 = d1 + (d3>>1)
    220         SADD16    rowOp00, e0, e3                       ;//  f0 = e0 + e3
    221         SADD16    rowOp10, e1, e2                       ;//  f1 = e1 + e2
    222         SSUB16    rowOp20, e1, e2                       ;//  f2 = e1 - e2
    223         SSUB16    rowOp30, e0, e3                       ;//  f3 = e0 - e3
    224 
    225         ;// SIMD operations on next two columns(next two rows of the original matrix); d0..d3 are trRow02, trRow12, trRow22, trRow32
    226 
    227         SADD16      e0, trRow02,trRow22                   ;//  e0 = d0 + d2
    228         SSUB16    e1, trRow02,trRow22                   ;//  e1 = d0 - d2
    229         SHADD16   e2, trRow12,constZero                 ;// (d1>>1): halving add with constZero, a register holding 0
    230         SHADD16   e3, trRow32,constZero                 ;// (d3>>1)
    231         SSUB16    e2, e2, trRow32                       ;//  e2 = (d1>>1) - d3
    232         SADD16    e3, e3, trRow12                       ;//  e3 = d1 + (d3>>1)
    233         SADD16    rowOp02, e0, e3                       ;//  f0 = e0 + e3
    234         SADD16    rowOp12, e1, e2                       ;//  f1 = e1 + e2
    235         SSUB16    rowOp22, e1, e2                       ;//  f2 = e1 - e2
    236         SSUB16    rowOp32, e0, e3                       ;//  f3 = e0 - e3
    237 
    238 
    239         ;*****************************************************************
    240         ;// Transpose the resultant matrix (same PKHTB/PKHBT pattern and ordering constraints as the first transpose)
    241         ;// Input:  rowOp[][]
    242         ;// Output: trCol[][]
    243         ;*****************************************************************
    244 
    245         ;// LL 2x2 transposed matrix
    246         ;//   d0 d1 - -
    247         ;//   d4 d5 - -
    248         ;//   -  -  - -
    249         ;//   -  -  - -
    250         ;// PKHTB goes first: trCol00 aliases rowOp00 (r2), which the PKHTB still has to read
    251         PKHTB   trCol10,rowOp10,rowOp00,ASR #16           ;// top(rowOp10):top(rowOp00) -> transposed [5 4] = [f5:f1]
    252         PKHBT   trCol00,rowOp00,rowOp10,LSL #16           ;// bottom(rowOp10):bottom(rowOp00) -> transposed [1 0] = [f4:f0]
    253 
    254         ;// HL 2x2 transposed matrix
    255         ;//    -   -   - -
    256         ;//    -   -   - -
    257         ;//    d8  d9  - -
    258         ;//   d12 d13  - -
    259 
    260         ;// rowOp02 (r3) and rowOp12 (r5) are read here, before the LH step reuses r3/r5 as trCol02/trCol12
    261          PKHTB   trCol30,rowOp12,rowOp02,ASR #16          ;// top(rowOp12):top(rowOp02) -> transposed [13 12] = [7 3]
    262          PKHBT   trCol20,rowOp02,rowOp12,LSL #16          ;// bottom(rowOp12):bottom(rowOp02) -> transposed [9 8] = [6 2]
    263 
    264         ;// LH 2x2 transposed matrix
    265         ;//   - - d2 d3
    266         ;//   - - d6 d7
    267         ;//   - - -  -
    268         ;//   - - -  -
    269 
    270         PKHBT   trCol02,rowOp20,rowOp30,LSL #16           ;// bottom(rowOp30):bottom(rowOp20) -> transposed [3 2] = [f12:f8]
    271         PKHTB   trCol12,rowOp30,rowOp20,ASR #16           ;// top(rowOp30):top(rowOp20) -> transposed [7 6] = [f13:f9]
    272 
    273 
    274 
    275 
    276         ;// HH 2x2 transposed matrix
    277         ;//    - -   -   -
    278         ;//    - -   -   -
    279         ;//    - -  d10 d11
    280         ;//    - -  d14 d15
    281         ;// PKHTB goes first: trCol22 aliases rowOp22 (r7), which the PKHTB still has to read
    282         PKHTB   trCol32,rowOp32,rowOp22,ASR #16            ;// top(rowOp32):top(rowOp22) -> transposed [15 14] = [15 11]
    283         PKHBT   trCol22,rowOp22,rowOp32,LSL #16            ;// bottom(rowOp32):bottom(rowOp22) -> transposed [11 10] = [14 10]
    284 
    285 
    286         ;*******************************
    287         ;// Column Operations
    288         ;*******************************
    289 
    290         ;// Same butterfly as the row pass; each 16-bit lane of a register is processed independently
    291         ;// SIMD operations on first two columns
    292         ;// d0..d3 below are trCol00, trCol10, trCol20, trCol30
    293 
    294         SADD16      g0, trCol00,trCol20                       ;//  g0 = d0 + d2
    295         SSUB16    g1, trCol00,trCol20                       ;//  g1 = d0 - d2
    296         SHADD16   g2, trCol10,constZero                     ;// (d1>>1): halving add with constZero, a register holding 0
    297         SHADD16   g3, trCol30,constZero                     ;// (d3>>1)
    298         SSUB16    g2, g2, trCol30                           ;//  g2 = (d1>>1) - d3
    299         SADD16    g3, g3, trCol10                           ;//  g3 = d1 + (d3>>1)
    300         SADD16    colOp00, g0, g3                           ;//  row 0 = g0 + g3
    301         SADD16    colOp10, g1, g2                           ;//  row 1 = g1 + g2
    302         SSUB16    colOp20, g1, g2                           ;//  row 2 = g1 - g2
    303         SSUB16    colOp30, g0, g3                           ;//  row 3 = g0 - g3
    304 
    305         ;// SIMD operations on next two columns; d0..d3 are trCol02, trCol12, trCol22, trCol32
    306 
    307         SADD16      g0, trCol02,trCol22                       ;//  g0 = d0 + d2
    308         SSUB16    g1, trCol02,trCol22                       ;//  g1 = d0 - d2
    309         SHADD16   g2, trCol12,constZero                     ;// (d1>>1): halving add with constZero, a register holding 0
    310         SHADD16   g3, trCol32,constZero                     ;// (d3>>1)
    311         SSUB16    g2, g2, trCol32                           ;//  g2 = (d1>>1) - d3
    312         SADD16    g3, g3, trCol12                           ;//  g3 = d1 + (d3>>1)
    313         SADD16    colOp02, g0, g3                           ;//  row 0 = g0 + g3
    314         SADD16    colOp12, g1, g2                           ;//  row 1 = g1 + g2
    315         SSUB16    colOp22, g1, g2                           ;//  row 2 = g1 - g2
    316         SSUB16    colOp32, g0, g3                           ;//  row 3 = g0 - g3
    317 
    318 
    319 
    320 
    321 
    322         ;************************************************
    323         ;// Calculate final value (colOp[i][j] + 32)>>6
    324         ;************************************************
    325         ;// NOTE(review): SADD16/SSUB16 wrap modulo 2^16, so this stage relies on every column result + 32 fitting in a signed 16-bit lane - confirm against the caller's input range
    326         ;// const1: Serves dual purpose
    327         ;// (1) Add #32 to both the lower and higher 16bits of the SIMD result
    328         ;// (2) Convert the lower 16 bit value to an unsigned number (Add 32768)
    329         ;// const1, const2 and mask reuse r11, r12 and r14 (g1, g2, g3), which are no longer needed after the column pass
    330         LDR     const1, =0x00208020                     ;// high lane: +0x0020 (32); low lane: +0x8020 (32 + 32768)
    331 
    332         LDR     mask, =0xffff03ff                       ;// After ASR #6 of the whole word, clears bits 15:10 of the low lane, which receive the low 6 bits of the high lane
    333 
    334         ;// const2(#512): used to convert the lower 16bit number back to signed value
    335 
    336         MOV     const2,#0x200                           ;// const2 = 2^9 = 32768 >> 6; the high lane of const2 is 0, so only the low lane is adjusted
    337 
    338         ;// First Row
    339 
    340         SADD16    colOp00, colOp00, const1                ;// add the rounding term 32 to both lanes and bias the low lane by 32768
    341         SADD16    colOp02, colOp02, const1
    342         AND     colOp00, mask, colOp00, ASR #6            ;// shift both lanes right by 6 in one 32-bit ASR, then drop the bits that leaked into the low lane
    343         AND     colOp02, mask, colOp02, ASR #6
    344         SSUB16  out00,colOp00,const2                      ;// remove the low-lane bias (512 after the shift)
    345         SSUB16  out02,colOp02,const2
    346 
    347 
    348         ;// Second Row (same add / shift-and-mask / un-bias sequence as the first row)
    349 
    350         SADD16    colOp10, colOp10, const1
    351         SADD16    colOp12, colOp12, const1
    352         AND     colOp10, mask, colOp10, ASR #6
    353         AND     colOp12, mask, colOp12, ASR #6
    354         SSUB16  out10,colOp10,const2
    355         SSUB16  out12,colOp12,const2
    356 
    357 
    358         ;// Third Row (same sequence)
    359 
    360         SADD16    colOp20, colOp20, const1
    361         SADD16    colOp22, colOp22, const1
    362         AND     colOp20, mask, colOp20, ASR #6
    363         AND     colOp22, mask, colOp22, ASR #6
    364         SSUB16  out20,colOp20,const2
    365         SSUB16  out22,colOp22,const2
    366 
    367 
    368         ;// Fourth Row (same sequence)
    369 
    370         SADD16    colOp30, colOp30, const1
    371         SADD16    colOp32, colOp32, const1
    372         AND     colOp30, mask, colOp30, ASR #6
    373         AND     colOp32, mask, colOp32, ASR #6
    374         SSUB16  out30,colOp30,const2
    375         SSUB16  out32,colOp32,const2
    376 
    377 
    378 
    379 
    380         ;***************************
    381         ;// Store all sixteen 16-bit results (eight words) with a single store-multiple
    382         ;***************************
    383 
    384         STMIA   pDst,{out00,out02,out10,out12,out20,out22,out30,out32}
    385 
    386 
    387 
    388         ;// Set return value (no instruction here sets one)
    389 
    390 End                                                     ;// Label only; no instruction in this function branches to it
    391 
    392 
    393         ;// Write function tail
    394         M_END
    395 
    396     ENDIF                                                           ;//ARM1136JS
    397 
    398 
    399 
    400 
    401 
    402 
    403 
    404 ;// Guarding implementation by the processor name
    405 
    406 
    407     END