      1 ;//
      2 ;// Copyright (C) 2007-2008 ARM Limited
      3 ;//
      4 ;// Licensed under the Apache License, Version 2.0 (the "License");
      5 ;// you may not use this file except in compliance with the License.
      6 ;// You may obtain a copy of the License at
      7 ;//
      8 ;//      http://www.apache.org/licenses/LICENSE-2.0
      9 ;//
     10 ;// Unless required by applicable law or agreed to in writing, software
     11 ;// distributed under the License is distributed on an "AS IS" BASIS,
     12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 ;// See the License for the specific language governing permissions and
     14 ;// limitations under the License.
     15 ;//
     16 ;//
     17 ;//
     18 ;// File Name:  omxVCM4P2_MCReconBlock_s.s
     19 ;// OpenMAX DL: v1.0.2
     20 ;// Revision:   9641
     21 ;// Date:       Thursday, February 7, 2008
     22 ;//
     23 ;//
     24 ;//
     25 ;//
     26 ;// Description:
     27 ;//
     28 ;//
     29 
     30 ;// Include standard headers
     31     INCLUDE omxtypes_s.h
     32     INCLUDE armCOMM_s.h
     33 
     34 ;// Import symbols required from other files
     35 
     36     M_VARIANTS ARM1136JS
     37 
     38 ;// ***************************************************************************
     39 ;// ARM1136JS implementation
     40 ;// ***************************************************************************
     41     IF  ARM1136JS
     42 
     43 ;// ***************************************************************************
     44 ;// MACRO DEFINITIONS
     45 ;// ***************************************************************************
     46     ;// Description:
     47     ;//
     48     ;//   dest[j] = (x[j] + y[j] + round) >> 1,   j=0..3
     49     ;//
      50 ;// Similar to the UHADD8 instruction but, when $round is 1, a rounding
      51 ;// value of 1 is added to each sum before dividing by two.
     52     ;//
     53     ;// Syntax:
     54     ;// M_UHADD8R   $dest, $x, $y, $round, $mask
     55     ;//
     56     ;// Inputs:
     57     ;// $x        four packed bytes,   x[3] :  x[2]  :  x[1]  :  x[0]
     58     ;// $y        four packed bytes,   y[3] :  y[2]  :  y[1]  :  y[0]
     59     ;// $round    0 if no rounding to be added, 1 if rounding to be done
     60     ;// $mask     some register set to 0x80808080
     61     ;//
     62     ;// Outputs:
     63     ;// $dest     four packed bytes,   z[3] :  z[2]  :  z[1]  :  z[0]
     64 
     65     MACRO
     66     M_UHADD8R   $dest, $x, $y, $round, $mask
     67     IF $round = 1
     68         IF  $dest /= $y
     69             MVN         $dest, $x
     70             UHSUB8      $dest, $y, $dest
     71             EOR         $dest, $dest, $mask
     72         ELSE
     73             MVN         $dest, $y
     74             UHSUB8      $dest, $x, $dest
     75             EOR         $dest, $dest, $mask
     76         ENDIF
     77     ELSE
     78         UHADD8      $dest, $x, $y
     79     ENDIF
     80     MEND
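             ;// Note (illustrative C model, not part of the original source): the
             ;// $round = 1 path relies on the per-byte-lane identity
             ;//     (x + y + 1) >> 1  ==  (((y - (0xFF - x)) >> 1) ^ 0x80) & 0xFF
             ;// where the halving subtraction is what UHSUB8 computes. Assuming an
             ;// arithmetic right shift on signed int, a sketch of one lane is:
             ;//
             ;//     unsigned uhadd8r_lane(unsigned x, unsigned y)  /* 0 <= x,y <= 255 */
             ;//     {
             ;//         int diff = (int)y - (int)(0xFF - x);  /* MVN + UHSUB8 operand */
             ;//         int half = diff >> 1;                 /* halving subtract     */
             ;//         return (half ^ 0x80) & 0xFF;          /* EOR with 0x80808080  */
             ;//     }
             ;//
             ;// which returns (x + y + 1) >> 1 for all byte inputs.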
     81 ;// ***************************************************************************
     82     ;// Description:
     83     ;// Load 8 bytes from $pSrc (aligned or unaligned locations)
     84     ;//
     85     ;// Syntax:
     86     ;// M_LOAD_X    $pSrc, $srcStep, $out0, $out1, $scratch, $offset
     87     ;//
     88     ;// Inputs:
      89 ;// $pSrc       4-byte-aligned source pointer, pointing at or just below
      90 ;//             the actual data location
     91     ;// $srcStep    The stride on source
     92     ;// $scratch    A scratch register, used internally for temp calculations
      93 ;// $offset     Difference between the source data location and the
      94 ;//             (truncated) source pointer; non-zero for an unaligned load
     95     ;//
     96     ;// Outputs:
      97 ;// $pSrc       If the macro is given a stride, pSrc is incremented by
      98 ;//             that value; otherwise it is left unchanged
     99     ;// $out0       four packed bytes,   z[3] :  z[2]  :  z[1]  :  z[0]
    100     ;// $out1       four packed bytes,   z[7] :  z[6]  :  z[5]  :  z[4]
    101     ;//
    102     ;// Note: {$out0, $out1, $scratch} should be registers with ascending
    103     ;// register numbering. In case offset is 0, $scratch is not modified.
    104 
    105     MACRO
    106     M_LOAD_X    $pSrc, $srcStep, $out0, $out1, $scratch, $offset
    107         IF $offset = 0
    108             LDM         $pSrc, {$out0, $out1}
    109             ADD         $pSrc, $pSrc, $srcStep
    110         ELSE
    111             LDM         $pSrc, {$out0, $out1, $scratch}
    112             ADD         $pSrc, $pSrc, $srcStep
    113 
    114             MOV         $out0, $out0, LSR #8 * $offset
    115             ORR         $out0, $out0, $out1, LSL #(32 - 8 * ($offset))
    116             MOV         $out1, $out1, LSR #8 * $offset
    117             ORR         $out1, $out1, $scratch, LSL #(32 - 8 * ($offset))
    118         ENDIF
    119     MEND
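             ;// Note (illustrative sketch, not part of the original source): for a
             ;// non-zero $offset on a little-endian target, the macro behaves roughly
             ;// like the following C, where w0..w2 are the three words fetched by LDM
             ;// from the aligned pointer:
             ;//
             ;//     uint32_t out0 = (w0 >> (8 * offset)) | (w1 << (32 - 8 * offset));
             ;//     uint32_t out1 = (w1 >> (8 * offset)) | (w2 << (32 - 8 * offset));
             ;//
             ;// i.e. the eight bytes starting at pSrc + offset end up packed in two
             ;// registers.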
    120 
    121 ;// ***************************************************************************
    122     ;// Description:
     123     ;// Loads three words for X interpolation and updates the pointer to the
     124     ;// next row. For X interpolation, given a source pointer truncated to
     125     ;// 4-byte alignment, three contiguous words are invariably required from
     126     ;// there to obtain the nine source bytes needed for filtering.
    127     ;//
    128     ;// Syntax:
    129     ;// M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
    130     ;//
    131     ;// Inputs:
    132     ;// $pSrc       4 byte aligned source pointer to an address just less than
    133     ;//             or equal to the data location
    134     ;//
    135     ;// $srcStep    The stride on source
    136     ;//
     137     ;// $offset     Difference between the source data location and the
     138     ;//             (truncated) source pointer; non-zero for an unaligned load
    139     ;//
    140     ;// Outputs:
    141     ;// $pSrc       Incremented by $srcStep
    142     ;//
    143     ;// $word0, $word1, $word2, $word3
     144     ;//             Three of these are outputs, selected by the $offset parameter.
     145     ;//             The outputs are generated specifically to be processed by
     146     ;//             the M_EXT_XINT macro. The illustration below shows how the
     147     ;//             nine bytes are spread over the words for different offsets
     148     ;//             from the original (non-truncated) source pointer.
    149     ;//
    150     ;//              ------------------------------------------------------
    151     ;//             | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
    152     ;//             |------------------------------------------------------|
    153     ;//             |    0   |       0     | 0123  | 4567  | 8xxx  |       |
    154     ;//             |    1   |      -1     | x012  | 3456  | 78xx  |       |
    155     ;//             |    2   |      -2     | xx01  | 2345  | 678x  |       |
    156     ;//             |    3   |      -3     | xxx0  |       | 1234  | 5678  |
    157     ;//              ------------------------------------------------------
    158     ;//
     159     ;//             where the numbering (0-8) designates the 9 bytes from the
     160     ;//             start of a particular row. The illustration does not take
     161     ;//             into account the positioning of bytes within a word, and the
     162     ;//             combination of this macro with M_EXT_XINT works only in
     163     ;//             little-endian environments.
    164     ;//
    165     ;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending
    166     ;// register numbering
    167 
    168     MACRO
    169     M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
    170         IF $offset /= 3
    171             LDM         $pSrc, {$word0, $word1, $word2}
    172         ELSE
    173             LDM         $pSrc, {$word0, $word2, $word3}
    174         ENDIF
    175         ADD         $pSrc, $pSrc, $srcStep
    176     MEND
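             ;// Note (descriptive, not part of the original source): for $offset = 3
             ;// only one useful byte lies in the first aligned word, so the remaining
             ;// eight bytes are loaded into $word2 and $word3, exactly where the
             ;// $offset = 3 path of M_EXT_XINT expects them; for the other offsets the
             ;// three words are loaded contiguously into $word0..$word2.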
    177 
    178 ;// ***************************************************************************
    179     ;// Description:
    180     ;// Extract four registers of four pixels for X interpolation
    181     ;//
    182     ;// Syntax:
    183     ;// M_EXT_XINT $offset, $word0, $word1, $word2, $word3
    184     ;//
    185     ;// Inputs:
     186     ;// $offset     Difference between the source data location and the
     187     ;//             (truncated) source pointer; non-zero for an unaligned load
    188     ;//
    189     ;// $word0, $word1, $word2, $word3
     190     ;//             Three of these are inputs, selected by the $offset parameter.
     191     ;//             The inputs are laid out exactly as produced by the
     192     ;//             M_LOAD_XINT macro.
    193     ;//
    194     ;//              ------------------------------------------------------
    195     ;//             | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
    196     ;//             |------------------------------------------------------|
    197     ;//             |    0   |       0     | 0123  | 4567  | 8xxx  | yyyy  |
    198     ;//             |    1   |      -1     | x012  | 3456  | 78xx  | yyyy  |
    199     ;//             |    2   |      -2     | xx01  | 2345  | 678x  | yyyy  |
    200     ;//             |    3   |      -3     | xxx0  | yyyy  | 1234  | 5678  |
    201     ;//              ------------------------------------------------------
    202     ;//
    203     ;// Outputs:
    204     ;// $word0, $word1, $word2, $word3
    205     ;//             Bytes from the original source pointer (not truncated for
    206     ;//             4 byte alignment) as shown in the table.
    207     ;//              -------------------------------
    208     ;//             | word0 | word1 | word2 | word3 |
    209     ;//             |-------------------------------|
    210     ;//             | 0123  | 4567  | 1234  | 5678  |
    211     ;//              -------------------------------
    212     ;//
    213     ;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending
    214     ;// register numbering
    215 
    216     MACRO
    217     M_EXT_XINT $offset, $word0, $word1, $word2, $word3
    218         IF $offset = 0
    219             ; $word0 and $word1 are ok
    220             ; $word2, $word3 are just 8 shifted versions
    221             MOV         $word3, $word1, LSR #8
    222             ORR         $word3, $word3, $word2, LSL #24
    223             MOV         $word2, $word0, LSR #8
    224             ORR         $word2, $word2, $word1, LSL #24
    225         ELIF $offset = 3
    226             ; $word2 and $word3 are ok (taken care while loading itself)
    227             ; set $word0 & $word1
    228             MOV         $word0, $word0, LSR #24
    229             ORR         $word0, $word0, $word2, LSL #8
    230             MOV         $word1, $word2, LSR #24
    231             ORR         $word1, $word1, $word3, LSL #8
    232         ELSE
    233             MOV         $word0, $word0, LSR #8 * $offset
    234             ORR         $word0, $word0, $word1, LSL #(32 - 8 * ($offset))
    235             MOV         $word1, $word1, LSR #8 * $offset
    236             ORR         $word1, $word1, $word2, LSL #(32 - 8 * ($offset))
    237 
    238             MOV         $word3, $word1, LSR #8
    239             ORR         $word3, $word3, $word2, LSL #(32 - 8 * (($offset)+1))
    240             MOV         $word2, $word0, LSR #8
    241             ORR         $word2, $word2, $word1, LSL #24
    242         ENDIF
    243     MEND
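             ;// Worked example (illustrative, not part of the original source): for
             ;// $offset = 1 on a little-endian target the loaded words hold, from the
             ;// lowest byte upwards,
             ;//     word0 = {x,0,1,2}   word1 = {3,4,5,6}   word2 = {7,8,x,x}
             ;// and the ELSE branch of this macro produces
             ;//     word0 = {0,1,2,3}   word1 = {4,5,6,7}   (row pixels 0..7)
             ;//     word2 = {1,2,3,4}   word3 = {5,6,7,8}   (row shifted by one pixel)
             ;// which form the two packed operand pairs that M_UHADD8R then averages.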
    244 
    245 ;// ***************************************************************************
    246     ;// Description:
    247     ;// Computes half-sum and xor of two inputs and puts them in the input
    248     ;// registers in that order
    249     ;//
    250     ;// Syntax:
    251     ;// M_HSUM_XOR      $v0, $v1, $tmp
    252     ;//
    253     ;// Inputs:
    254     ;// $v0         a, first input
    255     ;// $v1         b, second input
    256     ;// $tmp        scratch register
    257     ;//
    258     ;// Outputs:
    259     ;// $v0         (a + b)/2
    260     ;// $v1         a ^ b
    261 
    262     MACRO
    263     M_HSUM_XOR      $v0, $v1, $tmp
     264         UHADD8      $tmp, $v0, $v1     ;// s0 = (a + b) >> 1
    265         EOR         $v1, $v0, $v1      ;// l0 = a ^ b
    266         MOV         $v0, $tmp          ;// s0
    267     MEND
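             ;// Note (illustrative, not part of the original source): the pair
             ;// (s, l) = ((a + b) >> 1, a ^ b) preserves the exact sum, since per lane
             ;//     a + b  ==  2 * ((a + b) >> 1) + ((a ^ b) & 1)
             ;// M_AVG4 below combines the half-sums with the LSB information carried
             ;// in the xors to form an exact 4-way rounded average without needing
             ;// 9-bit intermediate precision.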
    268 ;// ***************************************************************************
    269     ;// Description:
     270     ;// Calculates the average of 4 values (a,b,c,d) for the HalfPixelXY predict
     271     ;// type in the MCReconBlock module. Very specific to the implementation of
     272     ;// M_MCRECONBLOCK_HalfPixelXY done here. Uses "tmp" as a scratch register and
     273     ;// expects "yMask" to hold the mask value 0x1010101x. The 4 LSBs of yMask are
     274     ;// not significant; the caller uses them for the row counter (y).
    275     ;//
    276     ;// Some points to note are:
     277     ;// 1. The inputs are a pair of pair-averages and xors
    278     ;// 2. $sum1 and $lsb1 are not modified and hence can be reused in another
    279     ;//    running average
    280     ;// 3. Output is in the first argument
    281     ;//
    282     ;// Syntax:
    283     ;// M_AVG4         $sum0, $lsb0, $sum1, $lsb1, $rndVal
    284     ;//
    285     ;// Inputs:
    286     ;// $sum0       (a + b) >> 1, where a and b are 1st and 2nd inputs to be averaged
    287     ;// $lsb0       (a ^ b)
    288     ;// $sum1       (c + d) >> 1. Not modified
    289     ;// $lsb1       (c ^ d)       Not modified
    290     ;// $rndVal     Assembler Variable. 0 for rounding, 1 for no rounding
    291     ;//
    292     ;// Outputs:
    293     ;// $sum0       (a + b + c + d + 1) / 4 : If no rounding
    294     ;//             (a + b + c + d + 2) / 4 : If rounding
    295 
    296     MACRO
    297     M_AVG4          $sum0, $lsb0, $sum1, $lsb1, $rndVal
    298         LCLS OP1
    299         LCLS OP2
    300         IF $rndVal = 0 ;// rounding case
    301 OP1 SETS "AND"
    302 OP2 SETS "ORR"
    303         ELSE           ;// Not rounding case
    304 OP1 SETS "ORR"
    305 OP2 SETS "AND"
    306         ENDIF
    307 
    308         LCLS lsb2
    309         LCLS sum2
    310         LCLS dest
    311 
    312 lsb2  SETS "tmp"
    313 sum2  SETS "$lsb0"
    314 dest  SETS "$sum0"
    315 
    316         $OP1        $lsb0, $lsb0, $lsb1          ;// e0 = e0 & e1
    317         EOR         $lsb2, $sum0, $sum1          ;// e2 = s0 ^ s1
    318         $OP2        $lsb2, $lsb2, $lsb0          ;// e2 = e2 | e0
    319         AND         $lsb2, $lsb2, yMask, LSR # 4 ;// e2 = e2 & mask
    320         UHADD8      $sum2, $sum0, $sum1          ;// s2 = (s0 + s1)/2
    321         UADD8       $dest, $sum2, $lsb2          ;// dest =  s2 + e2
    322     MEND
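             ;// Note (illustrative C model, not part of the original source): per byte
             ;// lane, with rnd = 0 selecting the rounding variant like $rndVal, the
             ;// macro computes
             ;//
             ;//     unsigned avg4_lane(unsigned a, unsigned b,
             ;//                        unsigned c, unsigned d, int rnd)
             ;//     {
             ;//         unsigned s0 = (a + b) >> 1, l0 = a ^ b;    /* from M_HSUM_XOR */
             ;//         unsigned s1 = (c + d) >> 1, l1 = c ^ d;
             ;//         unsigned e0 = rnd ? (l0 | l1) : (l0 & l1);           /* $OP1 */
             ;//         unsigned e2 = (rnd ? ((s0 ^ s1) & e0)                /* $OP2 */
             ;//                            : ((s0 ^ s1) | e0)) & 1;   /* AND yMask>>4 */
             ;//         return ((s0 + s1) >> 1) + e2;        /* UHADD8 then UADD8    */
             ;//     }
             ;//
             ;// which equals (a + b + c + d + 2) >> 2 for rnd == 0 and
             ;// (a + b + c + d + 1) >> 2 for rnd == 1, for all byte inputs.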
    323 ;// ***************************************************************************
    324 ;// Motion compensation handler macros
    325 ;// ***************************************************************************
    326     ;// Description:
     327     ;// Implement the motion compensation routines using the registers named in
     328     ;// the enclosing function. Each of the following 4 macros implements one of
     329     ;// the 4 predict types and handles 8 cases, i.e. all combinations of the 4
     330     ;// source alignment offsets and the 2 rounding flag values.
    331     ;//
    332     ;// Syntax:
    333     ;// M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
    334     ;// M_MCRECONBLOCK_HalfPixelX   $rndVal, $offset
    335     ;// M_MCRECONBLOCK_HalfPixelY   $rndVal, $offset
    336     ;// M_MCRECONBLOCK_HalfPixelXY  $rndVal, $offset
    337     ;//
    338     ;// Inputs:
    339     ;// $rndVal     Assembler Variable. 0 for rounding, 1 for no rounding
    340     ;// $offset     $pSrc MOD 4 value. Offset from 4 byte aligned location.
    341     ;//
    342     ;// Outputs:
     343     ;// Outputs are produced in the registers named in the enclosing function.
     344     ;// The macro loads the data from the source pointer, processes it and
     345     ;// stores it at the destination pointer, performing the whole prediction
     346     ;// cycle of the motion compensation routine for a particular predictType.
     347     ;// After this, only the addition of the residue to the predicted values remains.
    348 
    349     MACRO
    350     M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
    351     ;// Algorithmic Description:
    352     ;// This handles motion compensation for IntegerPixel predictType. Both
    353     ;// rounding cases are handled by the same code base. It is just a copy
     354     ;// from source to destination. Two lines are done per loop iteration to
     355     ;// reduce stalls, and the loop has been software pipelined for the same purpose.
     356     ;//
     357     ;// M_LOAD_X loads a whole row into two registers, which are then stored.
    358 
    359 CaseIntegerPixelRnd0Offset$offset
    360 CaseIntegerPixelRnd1Offset$offset
    361     M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp3, $offset
    362     M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
    363 YloopIntegerPixelOffset$offset
    364     SUBS        y, y, #2
    365     STRD        tmp1, tmp2, [pDst], dstStep
    366     STRD        tmp3, tmp4, [pDst], dstStep
    367     M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp3, $offset
    368     M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
    369     BGT         YloopIntegerPixelOffset$offset
    370 
    371     B           SwitchPredictTypeEnd
    372     MEND
    373 ;// ***************************************************************************
    374     MACRO
    375     M_MCRECONBLOCK_HalfPixelX $rndVal, $offset
    376     ;// Algorithmic Description:
    377     ;// This handles motion compensation for HalfPixelX predictType. The two
     378     ;// rounding cases are handled by different code bases generated from
     379     ;// different macro calls. The loop has been software pipelined to reduce
     380     ;// stalls.
     381     ;//
     382     ;// Filtering involves averaging a pixel with the next horizontal pixel.
     383     ;// The M_LOAD_XINT and M_EXT_XINT combination generates 4 registers: 2
     384     ;// holding a whole row with 4 pixels per register, and another 2 holding
     385     ;// the pixels of the same row shifted horizontally by one pixel with
     386     ;// respect to the initial row pixels. These are sets of packed
     387     ;// registers appropriate for 4-lane SIMD.
     388     ;// After that the M_UHADD8R macro does the averaging, taking care of the
     389     ;// rounding as required.
    390 
    391 CaseHalfPixelXRnd$rndVal.Offset$offset
    392     IF $rndVal = 0
    393         LDR mask, =0x80808080
    394     ENDIF
    395 
    396     M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4
    397 YloopHalfPixelXRnd$rndVal.Offset$offset
    398     SUBS        y, y, #1
    399     M_EXT_XINT  $offset, tmp1, tmp2, tmp3, tmp4
    400     M_UHADD8R   tmp5, tmp1, tmp3, (1-$rndVal), mask
    401     M_UHADD8R   tmp6, tmp2, tmp4, (1-$rndVal), mask
    402     STRD        tmp5, tmp6, [pDst], dstStep
    403     M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4
    404     BGT         YloopHalfPixelXRnd$rndVal.Offset$offset
    405 
    406     B           SwitchPredictTypeEnd
    407     MEND
    408 ;// ***************************************************************************
    409     MACRO
    410     M_MCRECONBLOCK_HalfPixelY $rndVal, $offset
    411     ;// Algorithmic Description:
    412     ;// This handles motion compensation for HalfPixelY predictType. The two
     413     ;// rounding cases are handled by different code bases generated from
     414     ;// different macro calls. Pre-loading is used to avoid reloading the same data.
     415     ;//
     416     ;// Filtering involves averaging a pixel with the next vertical pixel.
     417     ;// M_LOAD_X generates 2 registers holding a whole row with 4 pixels per
     418     ;// register. These are sets of packed registers appropriate for 4-lane
     419     ;// SIMD. After that the M_UHADD8R macro does the averaging, taking care
     420     ;// of the rounding as required.
    421 
    422 CaseHalfPixelYRnd$rndVal.Offset$offset
    423     IF $rndVal = 0
    424         LDR mask, =0x80808080
    425     ENDIF
    426 
    427     M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp5, $offset ;// Pre-load
    428 YloopHalfPixelYRnd$rndVal.Offset$offset
    429     SUBS        y, y, #2
    430     ;// Processing one line
    431     M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
    432     M_UHADD8R   tmp1, tmp1, tmp3, (1-$rndVal), mask
    433     M_UHADD8R   tmp2, tmp2, tmp4, (1-$rndVal), mask
    434     STRD        tmp1, tmp2, [pDst], dstStep
    435     ;// Processing another line
    436     M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp5, $offset
    437     M_UHADD8R   tmp3, tmp3, tmp1, (1-$rndVal), mask
    438     M_UHADD8R   tmp4, tmp4, tmp2, (1-$rndVal), mask
    439     STRD        tmp3, tmp4, [pDst], dstStep
    440 
    441     BGT         YloopHalfPixelYRnd$rndVal.Offset$offset
    442 
    443     B           SwitchPredictTypeEnd
    444     MEND
    445 ;// ***************************************************************************
    446     MACRO
    447     M_MCRECONBLOCK_HalfPixelXY $rndVal, $offset
    448     ;// Algorithmic Description:
    449     ;// This handles motion compensation for HalfPixelXY predictType. The two
     450     ;// rounding cases are handled by different code bases generated from
     451     ;// different macro calls. Pre-loading is used to avoid reloading the same data.
    452     ;//
    453     ;// Filtering involves averaging a pixel with the next vertical, horizontal
     454     ;// and right-down diagonal pixels. Just as in the HalfPixelX case, the
     455     ;// M_LOAD_XINT and M_EXT_XINT combination generates 4 registers holding a
     456     ;// row and its 1-pixel right-shifted version, with 4 pixels per register.
     457     ;// Another call of that macro combination fetches the next row. M_HSUM_XOR
     458     ;// is then called to get the half-sum and xor of a row with its shifted
     459     ;// version, as these are the inputs to the M_AVG4 macro, which computes
     460     ;// the 4-element average with rounding. Note that it is the half-sum/xor
     461     ;// values that are preserved for the next row, as they can be reused in
     462     ;// the next call to M_AVG4, saving recomputation.
     463     ;// Due to a lack of registers, the row counter and the masking value used
     464     ;// by M_AVG4 are packed into the single register yMask: the last nibble
     465     ;// holds the row counter value and the rest holds the masking value left
     466     ;// shifted by 4.
    467 
    468 CaseHalfPixelXYRnd$rndVal.Offset$offset
    469     LDR         yMask, =((0x01010101 << 4) + 8)
    470 
    471     M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
    472     M_EXT_XINT  $offset, t00, t01, t10, t11
    473     M_HSUM_XOR  t00, t10, tmp               ;// s0, l0
    474     M_HSUM_XOR  t01, t11, tmp               ;// s0', l0'
    475 
    476 YloopHalfPixelXYRnd$rndVal.Offset$offset
     477     ;// Processing one line
    478     ;// t00, t01, t10, t11 required from previous loop
    479     M_LOAD_XINT pSrc, srcStep, $offset, t20, t21, t30, t31 ;// Load c, c', d, d'
    480     SUB         yMask, yMask, #2
    481     M_EXT_XINT  $offset, t20, t21, t30, t31
    482     M_HSUM_XOR  t20, t30, tmp               ;// s1, l1
    483     M_HSUM_XOR  t21, t31, tmp               ;// s1', l1'
    484     M_AVG4      t00, t10, t20, t30, $rndVal ;// s0, l0, s1, l1
    485     M_AVG4      t01, t11, t21, t31, $rndVal ;// s0', l0', s1', l1'
    486     STRD        t00, t01, [pDst], dstStep   ;// store the average
    487 
     488     ;// Processing another line
    489     ;// t20, t21, t30, t31 required from above
    490     M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
    491     TST         yMask, #7
    492     M_EXT_XINT  $offset, t00, t01, t10, t11
    493     M_HSUM_XOR  t00, t10, tmp
    494     M_HSUM_XOR  t01, t11, tmp
    495     M_AVG4      t20, t30, t00, t10, $rndVal
    496     M_AVG4      t21, t31, t01, t11, $rndVal
    497     STRD        t20, t21, [pDst], dstStep
    498 
    499     BGT         YloopHalfPixelXYRnd$rndVal.Offset$offset
    500 
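             ;// (Descriptive note, not part of the original source.) The branch to
             ;// SwitchPredictTypeEnd below is omitted for the Rnd1/Offset3 case only,
             ;// because that case is the last code segment generated before the
             ;// SwitchPredictTypeEnd label and can simply fall through to it.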
    501     IF $offset/=3 :LOR: $rndVal/=1
    502         B           SwitchPredictTypeEnd
    503     ENDIF
    504     MEND
    505 ;// ***************************************************************************
    506 ;// Motion compensation handler macros end here
    507 ;// ***************************************************************************
    508     ;// Description:
     509     ;// Populates all 4 offset "cases" for a given predictType and rndVal
     510     ;// combination in the "switch" into the prediction processing code segments
    511     ;//
    512     ;// Syntax:
    513     ;// M_CASE_OFFSET $rnd, $predictType
    514     ;//
    515     ;// Inputs:
    516     ;// $rnd            0 for rounding, 1 for no rounding
    517     ;// $predictType    The prediction mode
    518     ;//
    519     ;// Outputs:
    520     ;// Populated list of "M_CASE"s for the "M_SWITCH" macro
    521 
    522     MACRO
    523     M_CASE_OFFSET $rnd, $predictType
    524         M_CASE      Case$predictType.Rnd$rnd.Offset0
    525         M_CASE      Case$predictType.Rnd$rnd.Offset1
    526         M_CASE      Case$predictType.Rnd$rnd.Offset2
    527         M_CASE      Case$predictType.Rnd$rnd.Offset3
    528     MEND
    529 ;// ***************************************************************************
    530     ;// Description:
     531     ;// Populates both rounding "cases" for a given predictType in the
     532     ;// "switch" into the prediction processing code segments
     533     ;//
     534     ;// Syntax:
     535     ;// M_CASE_MCRECONBLOCK $predictType
    536     ;//
    537     ;// Inputs:
    538     ;// $predictType    The prediction mode
    539     ;//
    540     ;// Outputs:
    541     ;// Populated list of "M_CASE_OFFSET" macros
    542 
    543     MACRO
    544     M_CASE_MCRECONBLOCK $predictType
    545         M_CASE_OFFSET  0, $predictType ;// 0 for rounding
    546         M_CASE_OFFSET  1, $predictType ;// 1 for no rounding
    547     MEND
    548 ;// ***************************************************************************
    549     ;// Description:
     550     ;// Populates the handler code for all 8 rounding and offset combinations
     551     ;// of the specified predictType. In case of the "IntegerPixel" predictType,
     552     ;// rounding is not required, so the same code segment handles both cases.
    553     ;//
    554     ;// Syntax:
    555     ;// M_MCRECONBLOCK    $predictType
    556     ;//
    557     ;// Inputs:
    558     ;// $predictType    The prediction mode
    559     ;//
    560     ;// Outputs:
    561     ;// Populated list of "M_MCRECONBLOCK_<predictType>" macros for specified
    562     ;// predictType. Each
    563     ;//                 M_MCRECONBLOCK_<predictType> $rnd, $offset
     564     ;// is a code segment (starting with a label indicating the predictType,
     565     ;// rounding and offset combination).
    566     ;// Four calls of this macro with the 4 prediction modes populate all the 32
    567     ;// handlers
    568 
    569     MACRO
    570     M_MCRECONBLOCK $predictType
    571         M_MCRECONBLOCK_$predictType 0, 0
    572         M_MCRECONBLOCK_$predictType 0, 1
    573         M_MCRECONBLOCK_$predictType 0, 2
    574         M_MCRECONBLOCK_$predictType 0, 3
    575     IF "$predictType" /= "IntegerPixel" ;// If not IntegerPixel then rounding makes a difference
    576         M_MCRECONBLOCK_$predictType 1, 0
    577         M_MCRECONBLOCK_$predictType 1, 1
    578         M_MCRECONBLOCK_$predictType 1, 2
    579         M_MCRECONBLOCK_$predictType 1, 3
    580     ENDIF
    581     MEND
    582 ;// ***************************************************************************
    583 ;// Input/Output Registers
    584 pSrc                  RN 0
    585 srcStep               RN 1
    586 arg_pSrcResidue       RN 2
    587 pSrcResidue           RN 12
    588 pDst                  RN 3
    589 dstStep               RN 2
    590 predictType           RN 10
    591 rndVal                RN 11
    592 mask                  RN 11
    593 
    594 ;// Local Scratch Registers
    595 zero                  RN 12
    596 y                     RN 14
    597 
    598 tmp1                  RN 4
    599 tmp2                  RN 5
    600 tmp3                  RN 6
    601 tmp4                  RN 7
    602 tmp5                  RN 8
    603 tmp6                  RN 9
    604 tmp7                  RN 10
    605 tmp8                  RN 11
    606 tmp9                  RN 12
    607 
    608 t00                   RN 4
    609 t01                   RN 5
    610 t10                   RN 6
    611 t11                   RN 7
    612 t20                   RN 8
    613 t21                   RN 9
    614 t30                   RN 10
    615 t31                   RN 11
    616 tmp                   RN 12
    617 
    618 yMask                 RN 14
    619 
    620 dst                   RN 1
    621 return                RN 0
    622 
    623     ;// Allocate memory on stack
    624     M_ALLOC4    Stk_pDst,           4
    625     M_ALLOC4    Stk_pSrcResidue,    4
    626     ;// Function header
    627     M_START     omxVCM4P2_MCReconBlock, r11
    628     ;// Define stack arguments
    629     M_ARG       Arg_dstStep,        4
    630     M_ARG       Arg_predictType,    4
    631     M_ARG       Arg_rndVal,         4
    632     ;// Save on stack
    633     M_STR       pDst, Stk_pDst
    634     M_STR       arg_pSrcResidue, Stk_pSrcResidue
     635     ;// Load arguments from the stack
    636     M_LDR       dstStep, Arg_dstStep
    637     M_LDR       predictType, Arg_predictType
    638     M_LDR       rndVal, Arg_rndVal
    639 
    640     MOV         y, #8
    641 
    642     AND         tmp1, pSrc, #3
    643     ORR         predictType, tmp1, predictType, LSL #3
    644     ORR         predictType, predictType, rndVal, LSL #2
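             ;// (Descriptive note, not part of the original source.) The switch index
             ;// is packed as
             ;//     index = (predictType << 3) | (rndVal << 2) | (pSrc & 3)
             ;// giving 8 consecutive cases per prediction type: 4 offsets for
             ;// rndVal = 0 followed by 4 offsets for rndVal = 1, matching the order of
             ;// the M_CASE entries emitted by M_CASE_MCRECONBLOCK below.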
    645     ;// Truncating source pointer to align to 4 byte location
    646     BIC         pSrc, pSrc, #3
    647 
     648     ;// The implementation handles every combination of predictType, rounding
     649     ;// case and source pointer offset from 4-byte alignment in a separate
     650     ;// code base, except where one of these parameters makes no difference
     651     ;// to the implementation. The M_CASE_MCRECONBLOCK macros below branch
     652     ;// into 8 M_CASE entries each, covering all combinations of the 2
     653     ;// rounding cases and the 4 offsets of the pSrc pointer from 4-byte
     654     ;// alignment.
    655     M_SWITCH    predictType
    656         M_CASE_MCRECONBLOCK IntegerPixel
    657         M_CASE_MCRECONBLOCK HalfPixelX
    658         M_CASE_MCRECONBLOCK HalfPixelY
    659         M_CASE_MCRECONBLOCK HalfPixelXY
    660     M_ENDSWITCH
    661 
     662     ;// The M_MCRECONBLOCK macros populate the code bases by calling all 8
     663     ;// per-case macros (4 in the case of IntegerPixel, as rounding makes no
     664     ;// difference there) to generate the code for every combination of rounding
     665     ;// and offset. LTORG is used to split the code because its size grows
     666     ;// beyond 4KB.
    667     M_MCRECONBLOCK IntegerPixel
    668     M_MCRECONBLOCK HalfPixelX
    669     LTORG
    670     M_MCRECONBLOCK HalfPixelY
    671     M_MCRECONBLOCK HalfPixelXY
    672 SwitchPredictTypeEnd
    673 
    674     ;// Residue Addition
     675     ;// This is done with 2-lane SIMD, though the loads are further optimized:
     676     ;// 4 bytes at a time are loaded from the destination buffer. Algorithmic
     677     ;// details are in the inlined comments.
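             ;// Note (illustrative C model, not part of the original source): per
             ;// pixel, the loop below computes
             ;//
             ;//     /* residue is a signed 16-bit value, dst an 8-bit pixel */
             ;//     int sum = dst + residue;                      /* QADD16, 2 lanes */
             ;//     dst = sum < 0 ? 0 : (sum > 255 ? 255 : sum);  /* USAT16 #8       */
             ;//
             ;// with four result bytes repacked by the ORR before each store.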
    678     M_LDR       pSrcResidue, Stk_pSrcResidue
    679     CMP         pSrcResidue, #0
    680     BEQ         pSrcResidueConditionEnd
    681 pSrcResidueNotNull
    682     M_LDR       pDst, Stk_pDst
    683     MOV         y, #8
    684     SUB         dstStep, dstStep, #4
    685 Yloop_pSrcResidueNotNull
    686     SUBS        y, y, #1
    687     LDR         dst, [pDst]                ;// dst = [dcba]
    688     LDMIA       pSrcResidue!, {tmp1, tmp2} ;// tmp1=[DC] tmp2=[BA]
    689     PKHBT       tmp3, tmp1, tmp2, LSL #16  ;// Deltaval1 = [C A]
    690     PKHTB       tmp4, tmp2, tmp1, ASR #16  ;// DeltaVal2 = [D B]
    691     UXTB16      tmp1, dst                  ;// tmp1 = [0c0a]
    692     UXTB16      tmp2, dst, ROR #8          ;// tmp2 = [0d0b]
    693     QADD16      tmp1, tmp1, tmp3           ;// Add and saturate to 16 bits
    694     QADD16      tmp2, tmp2, tmp4
    695     USAT16      tmp1, #8, tmp1
    696     USAT16      tmp2, #8, tmp2             ;// armClip(0, 255, tmp2)
    697     ORR         tmp1, tmp1, tmp2, LSL #8   ;// tmp1 = [dcba]
    698     STR         tmp1, [pDst], #4
    699 
    700     LDR         dst, [pDst]
    701     LDMIA       pSrcResidue!, {tmp1, tmp2}
    702     PKHBT       tmp3, tmp1, tmp2, LSL #16
    703     PKHTB       tmp4, tmp2, tmp1, ASR #16
    704     UXTB16      tmp1, dst
    705     UXTB16      tmp2, dst, ROR #8
    706     QADD16      tmp1, tmp1, tmp3
    707     QADD16      tmp2, tmp2, tmp4
    708     USAT16      tmp1, #8, tmp1
    709     USAT16      tmp2, #8, tmp2
    710     ORR         tmp1, tmp1, tmp2, LSL #8
    711     STR         tmp1, [pDst], dstStep
    712 
    713     BGT         Yloop_pSrcResidueNotNull
    714 pSrcResidueConditionEnd
    715 
    716     MOV         return, #OMX_Sts_NoErr
    717 
    718     M_END
    719     ENDIF ;// ARM1136JS
    720 
    721 ;// ***************************************************************************
    722 ;// CortexA8 implementation
    723 ;// ***************************************************************************
    724     END
    725 ;// ***************************************************************************
    726 ;// omxVCM4P2_MCReconBlock ends
    727 ;// ***************************************************************************
    728