Home | History | Annotate | Download | only in src
      1 ;//
      2 ;//
      3 ;// File Name:  omxVCM4P2_MCReconBlock_s.s
      4 ;// OpenMAX DL: v1.0.2
      5 ;// Revision:   9641
      6 ;// Date:       Thursday, February 7, 2008
      7 ;//
      8 ;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
      9 ;//
     10 ;//
     11 ;//
     12 ;// Description:
     13 ;//
     14 ;//
     15 
     16 ;// Include standard headers
     17     INCLUDE omxtypes_s.h
     18     INCLUDE armCOMM_s.h
     19 
     20 ;// Import symbols required from other files
     21 
     22     M_VARIANTS ARM1136JS
     23 
     24 ;// ***************************************************************************
     25 ;// ARM1136JS implementation
     26 ;// ***************************************************************************
     27     IF  ARM1136JS
     28 
     29 ;// ***************************************************************************
     30 ;// MACRO DEFINITIONS
     31 ;// ***************************************************************************
     32     ;// Description:
     33     ;//
     34     ;//   dest[j] = (x[j] + y[j] + round) >> 1,   j=0..3
     35     ;//
     36     ;// Similar to UHADD8 instruction, but with a rounding value of 1 added to
     37     ;// each sum before dividing by two, if round is 1
     38     ;//
     39     ;// Syntax:
     40     ;// M_UHADD8R   $dest, $x, $y, $round, $mask
     41     ;//
     42     ;// Inputs:
     43     ;// $x        four packed bytes,   x[3] :  x[2]  :  x[1]  :  x[0]
     44     ;// $y        four packed bytes,   y[3] :  y[2]  :  y[1]  :  y[0]
     45     ;// $round    0 if no rounding to be added, 1 if rounding to be done
     46     ;// $mask     some register set to 0x80808080
     47     ;//
     48     ;// Outputs:
     49     ;// $dest     four packed bytes,   z[3] :  z[2]  :  z[1]  :  z[0]
     50 
     51     MACRO
     52     M_UHADD8R   $dest, $x, $y, $round, $mask
            ;// Rounded case: (x+y+1)>>1 == ((y - ~x) >> 1) ^ 0x80 per byte,
            ;// since y - ~x = x + y - 255 and EOR with 0x80 adds 128 mod 256.
            ;// The inner IF only picks operand order so that $dest may alias
            ;// $x or $y without the MVN destroying a still-needed input.
     53     IF $round = 1
     54         IF  $dest /= $y
     55             MVN         $dest, $x                  ;// $dest = ~x (255 - x per byte)
     56             UHSUB8      $dest, $y, $dest           ;// $dest = (y - ~x) >> 1 per byte
     57             EOR         $dest, $dest, $mask        ;// add 128 per byte -> (x+y+1)>>1
     58         ELSE
     59             MVN         $dest, $y                  ;// same trick with x/y swapped
     60             UHSUB8      $dest, $x, $dest
     61             EOR         $dest, $dest, $mask
     62         ENDIF
     63     ELSE
     64         UHADD8      $dest, $x, $y                  ;// truncating average, no rounding
     65     ENDIF
     66     MEND
     67 ;// ***************************************************************************
     68     ;// Description:
     69     ;// Load 8 bytes from $pSrc (aligned or unaligned locations)
     70     ;//
     71     ;// Syntax:
     72     ;// M_LOAD_X    $pSrc, $srcStep, $out0, $out1, $scratch, $offset
     73     ;//
     74     ;// Inputs:
     75     ;// $pSrc       4 byte aligned source pointer to an address just less than
     76     ;//             or equal to the data location
     77     ;// $srcStep    The stride on source
     78     ;// $scratch    A scratch register, used internally for temp calculations
     79     ;// $offset     Difference of source data location to the source pointer
     80     ;//             Use when $offset != 0 (unaligned load)
     81     ;//
     82     ;// Outputs:
     83     ;// $pSrc       In case the macro accepts stride, it increments the pSrc by
     84     ;//             that value, else unchanged
     85     ;// $out0       four packed bytes,   z[3] :  z[2]  :  z[1]  :  z[0]
     86     ;// $out1       four packed bytes,   z[7] :  z[6]  :  z[5]  :  z[4]
     87     ;//
     88     ;// Note: {$out0, $out1, $scratch} should be registers with ascending
     89     ;// register numbering. In case offset is 0, $scratch is not modified.
     90 
     91     MACRO
     92     M_LOAD_X    $pSrc, $srcStep, $out0, $out1, $scratch, $offset
     93         IF $offset = 0
     94             LDM         $pSrc, {$out0, $out1}      ;// aligned: 8 bytes directly
     95             ADD         $pSrc, $pSrc, $srcStep     ;// advance to next row
     96         ELSE
     97             LDM         $pSrc, {$out0, $out1, $scratch} ;// 12 bytes cover the 8 wanted
     98             ADD         $pSrc, $pSrc, $srcStep
     99
            ;// Little-endian funnel shift: drop the $offset leading bytes of
            ;// each word and pull the replacement bytes in from the word above.
    100             MOV         $out0, $out0, LSR #8 * $offset
    101             ORR         $out0, $out0, $out1, LSL #(32 - 8 * ($offset))
    102             MOV         $out1, $out1, LSR #8 * $offset
    103             ORR         $out1, $out1, $scratch, LSL #(32 - 8 * ($offset))
    104         ENDIF
    105     MEND
    106 
    107 ;// ***************************************************************************
    108     ;// Description:
    109     ;// Loads three words for X interpolation, update pointer to next row. For
    110     ;// X interpolation, given a truncated-4byteAligned source pointer,
    111     ;// invariably three continuous words are required from there to get the
    112     ;// nine bytes from the source pointer for filtering.
    113     ;//
    114     ;// Syntax:
    115     ;// M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
    116     ;//
    117     ;// Inputs:
    118     ;// $pSrc       4 byte aligned source pointer to an address just less than
    119     ;//             or equal to the data location
    120     ;//
    121     ;// $srcStep    The stride on source
    122     ;//
    123     ;// $offset     Difference of source data location to the source pointer
    124     ;//             Use when $offset != 0 (unaligned load)
    125     ;//
    126     ;// Outputs:
    127     ;// $pSrc       Incremented by $srcStep
    128     ;//
    129     ;// $word0, $word1, $word2, $word3
    130     ;//             Three of these are outputs based on the $offset parameter.
    131     ;//             The outputs are specifically generated to be processed by
    132     ;//             the M_EXT_XINT macro. Following is the illustration to show
    133     ;//             how the nine bytes are spanned for different offsets from
    134     ;//             notTruncatedForAlignmentSourcePointer.
    135     ;//
    136     ;//              ------------------------------------------------------
    137     ;//             | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
    138     ;//             |------------------------------------------------------|
    139     ;//             |    0   |       0     | 0123  | 4567  | 8xxx  |       |
    140     ;//             |    1   |      -1     | x012  | 3456  | 78xx  |       |
    141     ;//             |    2   |      -2     | xx01  | 2345  | 678x  |       |
    142     ;//             |    3   |      -3     | xxx0  |       | 1234  | 5678  |
    143     ;//              ------------------------------------------------------
    144     ;//
    145     ;//             where the numbering (0-8) is to designate the 9 bytes from
    146     ;//             start of a particular row. The illustration doesn't take in
    147     ;//             account the positioning of bytes with in the word and the
    148     ;//             macro combination with M_EXT_XINT will work only in little
    149     ;//             endian environs
    150     ;//
    151     ;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending
    152     ;// register numbering
    153 
    154     MACRO
    155     M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
            ;// For offset 3 the nine source bytes straddle the words such that
            ;// M_EXT_XINT wants them in $word0, $word2, $word3 (see table
            ;// above); for offsets 0-2 the three consecutive words go to
            ;// $word0..$word2. Only three words are ever loaded.
    156         IF $offset /= 3
    157             LDM         $pSrc, {$word0, $word1, $word2}
    158         ELSE
    159             LDM         $pSrc, {$word0, $word2, $word3}
    160         ENDIF
    161         ADD         $pSrc, $pSrc, $srcStep         ;// advance to next row
    162     MEND
    163 
    164 ;// ***************************************************************************
    165     ;// Description:
    166     ;// Extract four registers of four pixels for X interpolation
    167     ;//
    168     ;// Syntax:
    169     ;// M_EXT_XINT $offset, $word0, $word1, $word2, $word3
    170     ;//
    171     ;// Inputs:
    172     ;// $offset     Difference of source data location to the source pointer
    173     ;//             Use when $offset != 0 (unaligned load)
    174     ;//
    175     ;// $word0, $word1, $word2, $word3
    176     ;//             Three of these are inputs based on the $offset parameter.
    177     ;//             The inputs are specifically selected to be processed by
    178     ;//             the M_EXT_XINT macro.
    179     ;//
    180     ;//              ------------------------------------------------------
    181     ;//             | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
    182     ;//             |------------------------------------------------------|
    183     ;//             |    0   |       0     | 0123  | 4567  | 8xxx  | yyyy  |
    184     ;//             |    1   |      -1     | x012  | 3456  | 78xx  | yyyy  |
    185     ;//             |    2   |      -2     | xx01  | 2345  | 678x  | yyyy  |
    186     ;//             |    3   |      -3     | xxx0  | yyyy  | 1234  | 5678  |
    187     ;//              ------------------------------------------------------
    188     ;//
    189     ;// Outputs:
    190     ;// $word0, $word1, $word2, $word3
    191     ;//             Bytes from the original source pointer (not truncated for
    192     ;//             4 byte alignment) as shown in the table.
    193     ;//              -------------------------------
    194     ;//             | word0 | word1 | word2 | word3 |
    195     ;//             |-------------------------------|
    196     ;//             | 0123  | 4567  | 1234  | 5678  |
    197     ;//              -------------------------------
    198     ;//
    199     ;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending
    200     ;// register numbering
    201 
    202     MACRO
    203     M_EXT_XINT $offset, $word0, $word1, $word2, $word3
    204         IF $offset = 0
    205             ; $word0 and $word1 are ok
    206             ; $word2, $word3 are just 8 shifted versions
    207             MOV         $word3, $word1, LSR #8
    208             ORR         $word3, $word3, $word2, LSL #24
    209             MOV         $word2, $word0, LSR #8
    210             ORR         $word2, $word2, $word1, LSL #24
    211         ELIF $offset = 3
    212             ; $word2 and $word3 are ok (taken care while loading itself)
    213             ; set $word0 & $word1
    214             MOV         $word0, $word0, LSR #24
    215             ORR         $word0, $word0, $word2, LSL #8
    216             MOV         $word1, $word2, LSR #24
    217             ORR         $word1, $word1, $word3, LSL #8
    218         ELSE
            ; offset 1 or 2: first funnel-shift bytes 0..7 into $word0, $word1
    219             MOV         $word0, $word0, LSR #8 * $offset
    220             ORR         $word0, $word0, $word1, LSL #(32 - 8 * ($offset))
    221             MOV         $word1, $word1, LSR #8 * $offset
    222             ORR         $word1, $word1, $word2, LSL #(32 - 8 * ($offset))
    223
            ; then build the 1-byte-shifted pair (bytes 1..8). The ORR from
            ; $word2 below can overlap byte lanes already set from $word1,
            ; but the overlapping lanes carry identical byte values, so the
            ; OR is harmless.
    224             MOV         $word3, $word1, LSR #8
    225             ORR         $word3, $word3, $word2, LSL #(32 - 8 * (($offset)+1))
    226             MOV         $word2, $word0, LSR #8
    227             ORR         $word2, $word2, $word1, LSL #24
    228         ENDIF
    229     MEND
    230 
    231 ;// ***************************************************************************
    232     ;// Description:
    233     ;// Computes half-sum and xor of two inputs and puts them in the input
    234     ;// registers in that order
    235     ;//
    236     ;// Syntax:
    237     ;// M_HSUM_XOR      $v0, $v1, $tmp
    238     ;//
    239     ;// Inputs:
    240     ;// $v0         a, first input
    241     ;// $v1         b, second input
    242     ;// $tmp        scratch register
    243     ;//
    244     ;// Outputs:
    245     ;// $v0         (a + b)/2
    246     ;// $v1         a ^ b
    247 
    248     MACRO
    249     M_HSUM_XOR      $v0, $v1, $tmp
    250         UHADD8      $tmp, $v0, $v1     ;// s0 = (a + b) >> 1 per byte
    251         EOR         $v1, $v0, $v1      ;// l0 = a ^ b (bit 0 keeps the lsb lost by the halving)
    252         MOV         $v0, $tmp          ;// s0
    253     MEND
    254 ;// ***************************************************************************
    255     ;// Description:
    256     ;// Calculates average of 4 values (a,b,c,d) for HalfPixelXY predict type in
    257     ;// mcReconBlock module. Very specific to the implementation of
    258     ;// M_MCRECONBLOCK_HalfPixelXY done here. Uses "tmp" as scratch register and
    259     ;// "yMask" for mask variable "0x1010101x" set in it. In yMask 4 lsbs are
    260     ;// not significant and are used by the callee for row counter (y)
    261     ;//
    262     ;// Some points to note are:
    263     ;// 1. Input is pair of pair-averages and Xors
    264     ;// 2. $sum1 and $lsb1 are not modified and hence can be reused in another
    265     ;//    running average
    266     ;// 3. Output is in the first argument
    267     ;//
    268     ;// Syntax:
    269     ;// M_AVG4         $sum0, $lsb0, $sum1, $lsb1, $rndVal
    270     ;//
    271     ;// Inputs:
    272     ;// $sum0       (a + b) >> 1, where a and b are 1st and 2nd inputs to be averaged
    273     ;// $lsb0       (a ^ b)
    274     ;// $sum1       (c + d) >> 1. Not modified
    275     ;// $lsb1       (c ^ d)       Not modified
    276     ;// $rndVal     Assembler Variable. 0 for rounding, 1 for no rounding
    277     ;//
    278     ;// Outputs:
    279     ;// $sum0       (a + b + c + d + 1) / 4 : If no rounding
    280     ;//             (a + b + c + d + 2) / 4 : If rounding
    281 
    282     MACRO
    283     M_AVG4          $sum0, $lsb0, $sum1, $lsb1, $rndVal
    284         LCLS OP1
    285         LCLS OP2
            ;// Average-of-averages needs a per-byte carry correction built from
            ;// the saved lsb (xor) terms; the AND/ORR pairing below selects the
            ;// correction for the rounding vs. non-rounding variant.
    286         IF $rndVal = 0 ;// rounding case
    287 OP1 SETS "AND"
    288 OP2 SETS "ORR"
    289         ELSE           ;// Not rounding case
    290 OP1 SETS "ORR"
    291 OP2 SETS "AND"
    292         ENDIF
    293
    294         LCLS lsb2
    295         LCLS sum2
    296         LCLS dest
    297
    298 lsb2  SETS "tmp"
    299 sum2  SETS "$lsb0"
    300 dest  SETS "$sum0"
    301
    302         $OP1        $lsb0, $lsb0, $lsb1          ;// e0 = e0 & e1 (or |, per rounding)
    303         EOR         $lsb2, $sum0, $sum1          ;// e2 = s0 ^ s1
    304         $OP2        $lsb2, $lsb2, $lsb0          ;// e2 = e2 | e0 (or &, per rounding)
    305         AND         $lsb2, $lsb2, yMask, LSR # 4 ;// e2 &= 0x01010101 (row-counter nibble shifted out)
    306         UHADD8      $sum2, $sum0, $sum1          ;// s2 = (s0 + s1)/2
    307         UADD8       $dest, $sum2, $lsb2          ;// dest = s2 + e2 (carry-corrected 4-way average)
    308     MEND
    309 ;// ***************************************************************************
    310 ;// Motion compensation handler macros
    311 ;// ***************************************************************************
    312     ;// Description:
    313     ;// Implement motion compensation routines using the named registers in
    314     ;// callee function. Each of the following 4 implement the 4 predict type
    315     ;// Each handles 8 cases each ie all the combinations of 4 types of source
    316     ;// alignment offsets and 2 types of rounding flag
    317     ;//
    318     ;// Syntax:
    319     ;// M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
    320     ;// M_MCRECONBLOCK_HalfPixelX   $rndVal, $offset
    321     ;// M_MCRECONBLOCK_HalfPixelY   $rndVal, $offset
    322     ;// M_MCRECONBLOCK_HalfPixelXY  $rndVal, $offset
    323     ;//
    324     ;// Inputs:
    325     ;// $rndVal     Assembler Variable. 0 for rounding, 1 for no rounding
    326     ;// $offset     $pSrc MOD 4 value. Offset from 4 byte aligned location.
    327     ;//
    328     ;// Outputs:
    329     ;// Outputs come in the named registers of the callee functions
    330     ;// The macro loads the data from the source pointer, processes it and
    331     ;// stores in the destination pointer. Does the whole prediction cycle
    332     ;// of Motion Compensation routine for a particular predictType
    333     ;// After this only residue addition to the predicted values remain
    334 
    335     MACRO
    336     M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
    337     ;// Algorithmic Description:
    338     ;// This handles motion compensation for IntegerPixel predictType. Both
    339     ;// rounding cases are handled by the same code base. It is just a copy
    340     ;// from source to destination. Two lines are done per loop to reduce
    341     ;// stalls. Loop has been software pipelined as well for that purpose.
    342     ;//
    343     ;// M_LOAD_X loads a whole row in two registers and then they are stored
    ;//
    ;// NOTE(review): because of the software pipelining, the loads at the loop
    ;// bottom also fetch two rows beyond the 8th source row on the final
    ;// iteration - confirm callers guarantee that over-read is safe.
    344
    345 CaseIntegerPixelRnd0Offset$offset
    346 CaseIntegerPixelRnd1Offset$offset
    347     M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp3, $offset
    348     M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
    349 YloopIntegerPixelOffset$offset
    350     SUBS        y, y, #2                    ;// y = rows remaining (starts at 8)
    351     STRD        tmp1, tmp2, [pDst], dstStep
    352     STRD        tmp3, tmp4, [pDst], dstStep
    353     M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp3, $offset ;// pre-load next 2 rows
    354     M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
    355     BGT         YloopIntegerPixelOffset$offset
    356
    357     B           SwitchPredictTypeEnd
    358     MEND
    359 ;// ***************************************************************************
    360     MACRO
    361     M_MCRECONBLOCK_HalfPixelX $rndVal, $offset
    362     ;// Algorithmic Description:
    363     ;// This handles motion compensation for HalfPixelX predictType. The two
    364     ;// rounding cases are handled by the different code base and spanned by
    365     ;// different macro calls. Loop has been software pipelined to reduce
    366     ;// stalls.
    367     ;//
    368     ;// Filtering involves averaging a pixel with the next horizontal pixel.
    369     ;// M_LOAD_XINT and M_EXT_XINT combination generate 4 registers, 2 with
    370     ;// all pixels in a row with 4 pixel in each register and another 2
    371     ;// registers with pixels corresponding to one horizontally shifted pixel
    372     ;// corresponding to the initial row pixels. These are set of packed
    373     ;// registers appropriate to do 4 lane SIMD.
    374     ;// After that M_UHADD8R macro does the averaging taking care of the
    375     ;// rounding as required
    376
    377 CaseHalfPixelXRnd$rndVal.Offset$offset
    378     IF $rndVal = 0
    379         LDR mask, =0x80808080              ;// byte mask used only by the rounded M_UHADD8R
    380     ENDIF
    381
    382     M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4
    383 YloopHalfPixelXRnd$rndVal.Offset$offset
    384     SUBS        y, y, #1                   ;// one row per iteration
    385     M_EXT_XINT  $offset, tmp1, tmp2, tmp3, tmp4
    386     M_UHADD8R   tmp5, tmp1, tmp3, (1-$rndVal), mask ;// avg(pixel, right neighbour); rndVal=0 -> rounded
    387     M_UHADD8R   tmp6, tmp2, tmp4, (1-$rndVal), mask
    388     STRD        tmp5, tmp6, [pDst], dstStep
    389     M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4 ;// pipelined pre-load of next row
    390     BGT         YloopHalfPixelXRnd$rndVal.Offset$offset
    391
    392     B           SwitchPredictTypeEnd
    393     MEND
    394 ;// ***************************************************************************
    395     MACRO
    396     M_MCRECONBLOCK_HalfPixelY $rndVal, $offset
    397     ;// Algorithmic Description:
    398     ;// This handles motion compensation for HalfPixelY predictType. The two
    399     ;// rounding cases are handled by the different code base and spanned by
    400     ;// different macro calls. PreLoading is used to avoid reload of same data.
    401     ;//
    402     ;// Filtering involves averaging a pixel with the next vertical pixel.
    403     ;// M_LOAD_X generates 2 registers with all pixels in a row with 4 pixel in
    404     ;// each register. These are set of packed registers appropriate to do
    405     ;// 4 lane SIMD. After that M_UHADD8R macro does the averaging taking care
    406     ;// of the rounding as required
    407
    408 CaseHalfPixelYRnd$rndVal.Offset$offset
    409     IF $rndVal = 0
    410         LDR mask, =0x80808080              ;// byte mask used only by the rounded M_UHADD8R
    411     ENDIF
    412
    413     M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp5, $offset ;// Pre-load
    414 YloopHalfPixelYRnd$rndVal.Offset$offset
    415     SUBS        y, y, #2                   ;// two rows per iteration
    416     ;// Processing one line
    417     M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
    418     M_UHADD8R   tmp1, tmp1, tmp3, (1-$rndVal), mask ;// avg(row n, row n+1); rndVal=0 -> rounded
    419     M_UHADD8R   tmp2, tmp2, tmp4, (1-$rndVal), mask
    420     STRD        tmp1, tmp2, [pDst], dstStep
    421     ;// Processing another line
    422     M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp5, $offset ;// also the pre-load for next iteration
    423     M_UHADD8R   tmp3, tmp3, tmp1, (1-$rndVal), mask
    424     M_UHADD8R   tmp4, tmp4, tmp2, (1-$rndVal), mask
    425     STRD        tmp3, tmp4, [pDst], dstStep
    426
    427     BGT         YloopHalfPixelYRnd$rndVal.Offset$offset
    428
    429     B           SwitchPredictTypeEnd
    430     MEND
    431 ;// ***************************************************************************
    432     MACRO
    433     M_MCRECONBLOCK_HalfPixelXY $rndVal, $offset
    434     ;// Algorithmic Description:
    435     ;// This handles motion compensation for HalfPixelXY predictType. The two
    436     ;// rounding cases are handled by the different code base and spanned by
    437     ;// different macro calls. PreLoading is used to avoid reload of same data.
    438     ;//
    439     ;// Filtering involves averaging a pixel with the next vertical, horizontal
    440     ;// and right-down diagonal pixels. Just as in HalfPixelX case, M_LOAD_XINT
    441     ;// and M_EXT_XINT combination generates 4 registers with a row and its
    442     ;// 1 pixel right shifted version, with 4 pixels in one register. Another
    443     ;// call of that macro-combination gets another row. Then M_HSUM_XOR is
    444     ;// called to get mutual half-sum and xor combinations of a row with its
    445     ;// shifted version as they are inputs to the M_AVG4 macro which computes
    446     ;// the 4 element average with rounding. Note that it is the half-sum/xor
    447     ;// values that are preserved for next row as they can be re-used in the
    448     ;// next call to the M_AVG4 and saves recomputation.
    449     ;// Due to lack of register, the row counter and a masking value required
    450     ;// in M_AVG4 are packed into a single register yMask where the last nibble
    451     ;// holds the row counter values and rest holds the masking variable left
    452     ;// shifted by 4
    453
    454 CaseHalfPixelXYRnd$rndVal.Offset$offset
    455     LDR         yMask, =((0x01010101 << 4) + 8) ;// mask<<4 in high bits, row counter 8 in low nibble
    456
    457     M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
    458     M_EXT_XINT  $offset, t00, t01, t10, t11
    459     M_HSUM_XOR  t00, t10, tmp               ;// s0, l0
    460     M_HSUM_XOR  t01, t11, tmp               ;// s0', l0'
    461
    462 YloopHalfPixelXYRnd$rndVal.Offset$offset
    463     ;// Processing one line
    464     ;// t00, t01, t10, t11 required from previous loop
    465     M_LOAD_XINT pSrc, srcStep, $offset, t20, t21, t30, t31 ;// Load c, c', d, d'
    466     SUB         yMask, yMask, #2            ;// decrement packed row counter (flags untouched)
    467     M_EXT_XINT  $offset, t20, t21, t30, t31
    468     M_HSUM_XOR  t20, t30, tmp               ;// s1, l1
    469     M_HSUM_XOR  t21, t31, tmp               ;// s1', l1'
    470     M_AVG4      t00, t10, t20, t30, $rndVal ;// s0, l0, s1, l1
    471     M_AVG4      t01, t11, t21, t31, $rndVal ;// s0', l0', s1', l1'
    472     STRD        t00, t01, [pDst], dstStep   ;// store the average
    473
    474     ;// Processing another line
    475     ;// t20, t21, t30, t31 required from above
    476     M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
    477     TST         yMask, #7                   ;// test counter nibble; Z set when it hits 0, drives BGT below
    478     M_EXT_XINT  $offset, t00, t01, t10, t11
    479     M_HSUM_XOR  t00, t10, tmp
    480     M_HSUM_XOR  t01, t11, tmp
    481     M_AVG4      t20, t30, t00, t10, $rndVal
    482     M_AVG4      t21, t31, t01, t11, $rndVal
    483     STRD        t20, t21, [pDst], dstStep
    484
    485     BGT         YloopHalfPixelXYRnd$rndVal.Offset$offset
    486
    487     IF $offset/=3 :LOR: $rndVal/=1
    488         B           SwitchPredictTypeEnd
    489     ENDIF
            ;// The last expansion (rndVal=1, offset=3) is placed immediately
            ;// before SwitchPredictTypeEnd, so it falls through with no branch.
    490     MEND
    491 ;// ***************************************************************************
    492 ;// Motion compensation handler macros end here
    493 ;// ***************************************************************************
    494     ;// Description:
    495     ;// Populates all 4 kinds of offsets "cases" for each predictType and rndVal
    496     ;// combination in the "switch" to prediction processing code segment
    497     ;//
    498     ;// Syntax:
    499     ;// M_CASE_OFFSET $rnd, $predictType
    500     ;//
    501     ;// Inputs:
    502     ;// $rnd            0 for rounding, 1 for no rounding
    503     ;// $predictType    The prediction mode
    504     ;//
    505     ;// Outputs:
    506     ;// Populated list of "M_CASE"s for the "M_SWITCH" macro
    507 
    508     MACRO
    509     M_CASE_OFFSET $rnd, $predictType
            ;// Emits the 4 jump-table entries (one per pSrc alignment offset)
            ;// for this predictType/rounding combination, in offset order.
    510         M_CASE      Case$predictType.Rnd$rnd.Offset0
    511         M_CASE      Case$predictType.Rnd$rnd.Offset1
    512         M_CASE      Case$predictType.Rnd$rnd.Offset2
    513         M_CASE      Case$predictType.Rnd$rnd.Offset3
    514     MEND
    515 ;// ***************************************************************************
    516     ;// Description:
    517     ;// Populates all 2 kinds of rounding "cases" for each predictType in the
    518     ;// "switch" to prediction processing code segment
    519     ;//
    520     ;// Syntax:
    521     ;// M_CASE_MCRECONBLOCK $predictType
    522     ;//
    523     ;// Inputs:
    524     ;// $predictType    The prediction mode
    525     ;//
    526     ;// Outputs:
    527     ;// Populated list of "M_CASE_OFFSET" macros
    528 
    529     MACRO
    530     M_CASE_MCRECONBLOCK $predictType
            ;// Emits all 8 jump-table entries (2 rounding x 4 offsets) for
            ;// one predictType; rounding-0 entries come first to match the
            ;// switch index built in the function body below.
    531         M_CASE_OFFSET  0, $predictType ;// 0 for rounding
    532         M_CASE_OFFSET  1, $predictType ;// 1 for no rounding
    533     MEND
    534 ;// ***************************************************************************
    535     ;// Description:
    536     ;// Populates all 8 kinds of rounding and offset combinations handling macros
    537     ;// for the specified predictType. In case of "IntegerPixel" predictType,
    538     ;// rounding is not required so same code segment handles both cases
    539     ;//
    540     ;// Syntax:
    541     ;// M_MCRECONBLOCK    $predictType
    542     ;//
    543     ;// Inputs:
    544     ;// $predictType    The prediction mode
    545     ;//
    546     ;// Outputs:
    547     ;// Populated list of "M_MCRECONBLOCK_<predictType>" macros for specified
    548     ;// predictType. Each
    549     ;//                 M_MCRECONBLOCK_<predictType> $rnd, $offset
    550     ;// is an code segment (starting with a label indicating the predictType,
    551     ;// rounding and offset combination)
    552     ;// Four calls of this macro with the 4 prediction modes populate all the 32
    553     ;// handlers
    554 
    555     MACRO
    556     M_MCRECONBLOCK $predictType
            ;// Instantiates the 4 (or 8) handler code segments for this
            ;// predictType, one per rounding/offset combination.
    557         M_MCRECONBLOCK_$predictType 0, 0
    558         M_MCRECONBLOCK_$predictType 0, 1
    559         M_MCRECONBLOCK_$predictType 0, 2
    560         M_MCRECONBLOCK_$predictType 0, 3
    561     IF "$predictType" /= "IntegerPixel" ;// If not IntegerPixel then rounding makes a difference
    562         M_MCRECONBLOCK_$predictType 1, 0
    563         M_MCRECONBLOCK_$predictType 1, 1
    564         M_MCRECONBLOCK_$predictType 1, 2
    565         M_MCRECONBLOCK_$predictType 1, 3
    566     ENDIF
    567     MEND
    568 ;// ***************************************************************************
    569 ;// Input/Output Registers
    570 pSrc                  RN 0
    571 srcStep               RN 1
    572 arg_pSrcResidue       RN 2
    573 pSrcResidue           RN 12
    574 pDst                  RN 3
    575 dstStep               RN 2  ;// aliases arg_pSrcResidue (r2): the residue pointer is spilled to the stack before dstStep is loaded
    576 predictType           RN 10
    577 rndVal                RN 11
    578 mask                  RN 11 ;// aliases rndVal (r11): rndVal is folded into the switch index before mask is loaded
    579
    580 ;// Local Scratch Registers
    581 zero                  RN 12
    582 y                     RN 14 ;// row counter; shares r14 with yMask (HalfPixelXY packs both into one register)
    583
    584 tmp1                  RN 4
    585 tmp2                  RN 5
    586 tmp3                  RN 6
    587 tmp4                  RN 7
    588 tmp5                  RN 8
    589 tmp6                  RN 9
    590 tmp7                  RN 10
    591 tmp8                  RN 11
    592 tmp9                  RN 12
    593
    594 t00                   RN 4
    595 t01                   RN 5
    596 t10                   RN 6
    597 t11                   RN 7
    598 t20                   RN 8
    599 t21                   RN 9
    600 t30                   RN 10
    601 t31                   RN 11
    602 tmp                   RN 12
    603
    604 yMask                 RN 14
    605
    606 dst                   RN 1  ;// srcStep is dead once prediction is done, so r1 is reused
    607 return                RN 0
    608 
    609     ;// Allocate memory on stack
    610     M_ALLOC4    Stk_pDst,           4
    611     M_ALLOC4    Stk_pSrcResidue,    4
    612     ;// Function header
    613     M_START     omxVCM4P2_MCReconBlock, r11
    614     ;// Define stack arguments
    615     M_ARG       Arg_dstStep,        4
    616     M_ARG       Arg_predictType,    4
    617     M_ARG       Arg_rndVal,         4
    618     ;// Save on stack
    619     M_STR       pDst, Stk_pDst
    620     M_STR       arg_pSrcResidue, Stk_pSrcResidue   ;// spill before dstStep overwrites r2
    621     ;// Load argument from the stack
    622     M_LDR       dstStep, Arg_dstStep
    623     M_LDR       predictType, Arg_predictType
    624     M_LDR       rndVal, Arg_rndVal
    625
    626     MOV         y, #8                              ;// 8 rows in the 8x8 block
    627
            ;// Build the switch index: (predictType << 3) | (rndVal << 2) | (pSrc & 3)
    628     AND         tmp1, pSrc, #3
    629     ORR         predictType, tmp1, predictType, LSL #3
    630     ORR         predictType, predictType, rndVal, LSL #2
    631     ;// Truncating source pointer to align to 4 byte location
    632     BIC         pSrc, pSrc, #3
    633
    634     ;// Implementation takes care of all combinations of different
    635     ;// predictTypes, rounding cases and source pointer offsets to alignment
    636     ;// of 4 bytes in different code bases unless one of these parameter wasn't
    637     ;// making any difference to the implementation. Below M_CASE_MCRECONBLOCK
    638     ;// macros branch into 8 M_CASE macros for all combinations of the 2
    639     ;// rounding cases and 4 offsets of the pSrc pointer to the 4 byte
    640     ;// alignment.
    641     M_SWITCH    predictType
    642         M_CASE_MCRECONBLOCK IntegerPixel
    643         M_CASE_MCRECONBLOCK HalfPixelX
    644         M_CASE_MCRECONBLOCK HalfPixelY
    645         M_CASE_MCRECONBLOCK HalfPixelXY
    646     M_ENDSWITCH
    647
    648     ;// The M_MCRECONBLOCK macros populate the code bases by calling all 8
    649     ;// particular macros (4 in case of IntegerPixel as rounding makes no
    650     ;// difference there) to generate the code for all cases of rounding and
    651     ;// offsets. LTORG is used to segment the code as code size bloated beyond
    652     ;// 4KB.
    653     M_MCRECONBLOCK IntegerPixel
    654     M_MCRECONBLOCK HalfPixelX
    655     LTORG
    656     M_MCRECONBLOCK HalfPixelY
    657     M_MCRECONBLOCK HalfPixelXY
    658 SwitchPredictTypeEnd
    659
    660     ;// Residue Addition
    661     ;// This is done in 2 lane SIMD though loads are further optimized and
    662     ;// 4 bytes are loaded in case of destination buffer. Algorithmic
    663     ;// details are in inlined comments
    664     M_LDR       pSrcResidue, Stk_pSrcResidue
    665     CMP         pSrcResidue, #0                    ;// NULL residue pointer => prediction only
    666     BEQ         pSrcResidueConditionEnd
    667 pSrcResidueNotNull
    668     M_LDR       pDst, Stk_pDst
    669     MOV         y, #8                              ;// 8 rows again, one per loop iteration
    670     SUB         dstStep, dstStep, #4               ;// compensate the mid-row [pDst], #4 post-increment
    671 Yloop_pSrcResidueNotNull
    672     SUBS        y, y, #1
            ;// First 4 pixels of the row: dst bytes are widened to two 16-bit
            ;// lanes each, the 16-bit residues added with saturation, then the
            ;// results saturated to 0..255 and re-packed.
    673     LDR         dst, [pDst]                ;// dst = [dcba]
    674     LDMIA       pSrcResidue!, {tmp1, tmp2} ;// tmp1=[DC] tmp2=[BA]
    675     PKHBT       tmp3, tmp1, tmp2, LSL #16  ;// Deltaval1 = [C A]
    676     PKHTB       tmp4, tmp2, tmp1, ASR #16  ;// DeltaVal2 = [D B]
    677     UXTB16      tmp1, dst                  ;// tmp1 = [0c0a]
    678     UXTB16      tmp2, dst, ROR #8          ;// tmp2 = [0d0b]
    679     QADD16      tmp1, tmp1, tmp3           ;// Add and saturate to 16 bits
    680     QADD16      tmp2, tmp2, tmp4
    681     USAT16      tmp1, #8, tmp1
    682     USAT16      tmp2, #8, tmp2             ;// armClip(0, 255, tmp2)
    683     ORR         tmp1, tmp1, tmp2, LSL #8   ;// tmp1 = [dcba]
    684     STR         tmp1, [pDst], #4
    685
            ;// Second 4 pixels of the row: identical sequence as above.
    686     LDR         dst, [pDst]
    687     LDMIA       pSrcResidue!, {tmp1, tmp2}
    688     PKHBT       tmp3, tmp1, tmp2, LSL #16
    689     PKHTB       tmp4, tmp2, tmp1, ASR #16
    690     UXTB16      tmp1, dst
    691     UXTB16      tmp2, dst, ROR #8
    692     QADD16      tmp1, tmp1, tmp3
    693     QADD16      tmp2, tmp2, tmp4
    694     USAT16      tmp1, #8, tmp1
    695     USAT16      tmp2, #8, tmp2
    696     ORR         tmp1, tmp1, tmp2, LSL #8
    697     STR         tmp1, [pDst], dstStep      ;// advance to the next destination row
    698
    699     BGT         Yloop_pSrcResidueNotNull
    700 pSrcResidueConditionEnd
    701
    702     MOV         return, #OMX_Sts_NoErr
    703
    704     M_END
    705     ENDIF ;// ARM1136JS
    706 
    707 ;// ***************************************************************************
    708 ;// CortexA8 implementation
    709 ;// ***************************************************************************
    710     END
    711 ;// ***************************************************************************
    712 ;// omxVCM4P2_MCReconBlock ends
    713 ;// ***************************************************************************
    714