;// Home | History | Annotate | Download | only in src
      1 ;//
      2 ;//
      3 ;// File Name:  armVCM4P10_DeblockingLuma_unsafe_s.s
      4 ;// OpenMAX DL: v1.0.2
      5 ;// Revision:   9641
      6 ;// Date:       Thursday, February 7, 2008
      7 ;//
      8 ;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
      9 ;//
     10 ;//
     11 ;//
     12 
        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS ARM1136JS



    IF  ARM1136JS

;// Byte mask with a 1 in every lane. Used as the per-pixel constant 1
;// for the 4-pixels-at-a-time SIMD (USUB8/SEL/SHADD8 etc.) arithmetic.
MASK_1  EQU 0x01010101

;// Declare input registers

pQ0        RN 0
StepArg    RN 1
tC0Arg     RN 2
alpha      RN 6

;// NOTE: beta, bS and tC0 all alias r14 (and ptC0 aliases r1).
;// The RN names below are phase-dependent views of the same physical
;// registers; the aliased values are never live at the same time.
beta       RN 14
bS         RN 14
tC0        RN 14
ptC0       RN 1

;// Declare Local/Temporary variables

;// Pixels: each register holds 4 adjacent pixels, one per byte lane.
;// p_x are the pixels on one side of the edge, q_x on the other.
p_0     RN 3
p_1     RN 5
p_2     RN 4
p_3     RN 2
q_0     RN 8
q_1     RN 9
q_2     RN 10
q_3     RN 12


;// Filtering

ap0q0   RN 1
filt    RN 2

m00     RN 7                    ;// holds constant 0x00000000
m01     RN 11                   ;// holds constant MASK_1 (0x01010101)

apflg   RN 0
aqflg   RN 6

tC      RN 1


;//Declarations for bSLT4 kernel

pos     RN 7
neg     RN 12

P0a     RN 1
P1a     RN 8
Q0a     RN 7
Q1a     RN 4

u1      RN 3
max     RN 12
min     RN 2



;//Declarations for bSGE4 kernel

q_3b    RN 9
p_3b    RN 0
apqflg  RN 12

P0b     RN 6
P1b     RN 7
P2b     RN 1

Q0b     RN 9
Q1b     RN 0
Q2b     RN 2

;// Miscellanous
;// (scratch names; note t3 aliases m01 (r11) and t9 aliases tC0 (r14))

a       RN 0
t0      RN 3
t1      RN 12
t2      RN 7
t3      RN 11
t4      RN 4
t5      RN 1
t8      RN 6
t9      RN 14
t10     RN 5
t11     RN 9
    106 
;// Register usage for - armVCM4P10_DeblockingLumabSLT4_unsafe()
;//
;// Normal (bS < 4) H.264 luma deblocking filter for one edge of
;// 4 pixels, one pixel per byte lane, using ARMv6 SIMD.
;//
;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
;//        - 2 - filt, 0 - apflg, 6 - aqflg
;//        - 11 - m01, 7 - tC0
;//
;// Outputs - 1,8,7,11 - Output Pixels(P0a,P1a,Q0a,Q1a)
;//
;// Registers Corrupted - 0-3,5-12,14


        M_START armVCM4P10_DeblockingLumabSLT4_unsafe, lr

        ;// Since beta <= 18 and alpha <= 255 we know
        ;// -254 <= p0-q0 <= 254
        ;//  -17 <= q1-q0 <= 17
        ;//  -17 <= p1-p0 <= 17

        ;// delta = Clip3( -tC, tC, ((((q0-p0)<<2) + (p1-q1) + 4)>>3))
        ;//
        ;//    Calculate A = (((q0-p0)<<2) + (p1-q1) + 4)>>3
        ;//                = (4*q0 - 4*p0 + p1 - q1 + 4)>>3
        ;//                = ((p1-p0) - (q1-q0) - 3*(p0-q0) + 4)>>3

        USUB8   t1, p_1, p_0            ;// t1 = p1-p0 per lane
        MUL     tC0, t2, m01            ;// replicate input tC0 byte (r7) into all 4 lanes

        USUB8   t2, q_1, q_0            ;// t2 = q1-q0 per lane
        SSUB8   t1, t1, t2              ;// t1 = (p1-p0) - (q1-q0)

        ;// Subtract 3*(p0-q0) using halving ops; the AND with m01 keeps
        ;// the low bit lost by the halvings so the final >>3 is exact.
        USUB8   t2, p_0, q_0            ;// t2 = p0-q0 per lane
        AND     t2, t2, m01             ;// LSB of (p0-q0) per lane
        SHSUB8  t1, t1, t2
        UHSUB8  t5, p_0, q_0            ;// t5 = (p0-q0)>>1
        SSUB8   t1, t1, t2
        SHSUB8  t1, t1, t5
        MOV     m00, #0
        SADD8   t1, t1, m01             ;// +1 per lane: rounding before final halving
        SHSUB8  t1, t1, t5              ;// t1 = delta, not yet clipped to +/-tC

        ;// tC = tC0
        ;// if (ap < beta) tC++;
        ;// if (aq < beta) tC++;
        USUB8   t5, filt, m01           ;// GE[i] set iff filt[i] >= 1 (lane is filtered)
        SEL     tC0, tC0, m00           ;// zero tC0 in unfiltered lanes
        UQADD8  tC, tC0, apflg          ;// apflg/aqflg lanes hold 0 or 1
        SSUB8   t1, t1, m00             ;// GE[i] = (delta[i] >= 0), signed
        UQADD8  tC, tC, aqflg

        ;// Split into positive and negative part and clip
        SEL     pos, t1, m00            ;// pos = max(delta, 0)
        USUB8   neg, pos, t1            ;// neg = max(-delta, 0)
        USUB8   t3, pos, tC             ;// NB: t3 aliases m01, which is clobbered here
        SEL     pos, tC, pos            ;// pos = min(pos, tC)
        USUB8   t3, neg, tC
        SEL     neg, tC, neg            ;// neg = min(neg, tC)

        ;//Reload m01 (clobbered above through its alias t3)
        LDR     m01,=MASK_1

        ;// P0 = p0 + delta, Q0 = q0 - delta (per-lane saturating)
        UQADD8  P0a, p_0, pos
        UQSUB8  Q0a, q_0, pos
        UQSUB8  P0a, P0a, neg
        UQADD8  Q0a, Q0a, neg

        ;// Choose to store the filtered
        ;// value or the original pixel
        USUB8   t1, filt, m01           ;// GE[i] set iff filt[i] >= 1
        SEL     P0a, P0a, p_0
        SEL     Q0a, Q0a, q_0

        ;// delta = (p2 + ((p0+q0+1)>>1) - (p1<<1))>>1;
        ;// u1 = (p0 + q0 + 1)>>1
        ;// u1 = ( (q_0 - p_0')>>1 ) ^ 0x80   with p_0' = ~p_0 = 255-p0
        MVN     p_0, p_0
        UHSUB8  u1, q_0, p_0
        UQADD8  max, p_1, tC0           ;// upper clip bound for P1: p1 + tC0
        EOR     u1, u1, m01 ,LSL #7     ;// flip bit 7 of each lane: u1 = (p0+q0+1)>>1

        ;// Calculate A = (p2+u1)>>1
        ;// Then delta = Clip3( -tC0, tC0, A - p1)

        ;// Clip P1: (p2+u1)>>1 limited to [p1-tC0, p1+tC0]
        UHADD8  P1a, p_2, u1
        UQSUB8  min, p_1, tC0           ;// lower clip bound for P1: p1 - tC0
        USUB8   t4, P1a, max
        SEL     P1a, max, P1a
        USUB8   t4, P1a, min
        SEL     P1a, P1a, min

        ;// Clip Q1: (q2+u1)>>1 limited to [q1-tC0, q1+tC0]
        UHADD8  Q1a, q_2, u1
        UQADD8  max, q_1, tC0
        UQSUB8  min, q_1, tC0
        USUB8   t0, Q1a, max
        SEL     Q1a, max, Q1a
        USUB8   t0, Q1a, min
        SEL     Q1a, Q1a, min

        ;// Choose to store the filtered
        ;// value or the original pixel
        USUB8   t0, apflg, m01          ;// P1 is only filtered in lanes with ap < beta
        SEL     P1a, P1a, p_1
        USUB8   t0, aqflg, m01          ;// Q1 is only filtered in lanes with aq < beta
        SEL     t3, Q1a, q_1            ;// final Q1 lands in r11 (t3), per output contract

        M_END
    214 
;// Register usage for - armVCM4P10_DeblockingLumabSGE4_unsafe()
;//
;// Strong (bS == 4) H.264 luma deblocking filter for one edge of
;// 4 pixels, one pixel per byte lane, using ARMv6 SIMD.
;//
;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
;//        - 2 - filt, 0 - apflg,aqflg
;//        - 1 - ap0q0, 6 - alpha
;//        - 7 - m00, 11 - m01
;//
;// Outputs - 6,7,1,9,0,2 - Output Pixels(P0b,P1b,P2b, Q0b,Q1b,Q2b)
;//
;// Registers Corrupted - 0-3,5-12,14

        M_START armVCM4P10_DeblockingLumabSGE4_unsafe, lr

        ;// apflg = apflg && |p0-q0|<((alpha>>2)+2)
        ;// apflg = aqflg && |p0-q0|<((alpha>>2)+2)

        M_ARG   pDummy,4
        M_ARG   pQ_3,4                  ;// stack arg: q3 pixels
        M_ARG   pP_3,4                  ;// stack arg: p3 pixels

        UHADD8  alpha, alpha, m00       ;// two halving adds with 0: alpha >>= 2 per lane
        USUB8   t9, p_2, p_0    ;//t9 = dp2p0
        UHADD8  alpha, alpha, m00
        ADD     alpha, alpha, m01, LSL #1 ;// alpha = (alpha>>2)+2 per lane
        USUB8   ap0q0, ap0q0, alpha     ;// GE[i] set iff |p0-q0| >= (alpha>>2)+2
        SEL     apqflg, m00, apflg      ;// drop ap/aq flags where |p0-q0| is too large
                                        ;// NOTE(review): apqflg lanes appear to pack
                                        ;// ap in bit 0 and aq in bit 1 (see the halving
                                        ;// of apqflg below) - confirm against caller.

        ;// P0 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
        ;//    = ((p2-p0) + 2*(p1-p0) + (q1-q0) + 3*(q0-p0) + 8*p0 + 4)>>3
        ;//    = p0 + (((p2-p0) + 2*(p1-p0) + (q1-q0) - 3*(p0-q0) + 4)>>3)

        ;// P1 = (p2 + p1 + q0 + p0 + 2)>>2
        ;//    = p0 + (((p2-p0) + (p1-p0) - (p0-q0) + 2)>>2)

        ;// P2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3
        ;//    = (2*(p3-p0) + 3*(p2-p0) + (p1-p0) - (p0-q0) + 8*p0 + 4)>>3
        ;//    = p0 + (((p3-p0) + (p2-p0) + t2 + 2)>>2)

        ;// Compute P0b
        USUB8   t2, p_0, q_0            ;// t2 = p0-q0
        SSUB8   t5, t9, t2              ;// t5 = (p2-p0) - (p0-q0)

        USUB8   t8, q_1, q_0            ;// t8 = q1-q0
        SHADD8  t8, t5, t8

        USUB8   t9, p_1, p_0            ;// t9 = p1-p0
        SADD8   t8, t8, t9
        SHSUB8  t8, t8, t2
        SHADD8  t5, t5, t9              ;// t5 = P1 delta term (before rounding)
        SHADD8  t8, t8, m01             ;// rounding for P0
        SHADD8  t9, t5, m01             ;// rounding for P1
        SADD8   P0b, p_0, t8
        ;// P0b ready

        ;// Compute P1b
        M_LDR   p_3b, pP_3              ;// load p3 from the stack argument
        SADD8   P1b, p_0, t9
        ;// P1b ready

        ;// Compute P2b
        USUB8   t9, p_2, p_0
        SADD8   t5, t5, t9
        UHSUB8  t9, p_3b, p_0           ;// (p3-p0)>>1
        EOR     a, p_3b, p_0
        AND     a, a, m01               ;// low bit lost by halving (p3-p0)
        SHADD8  t5, t5, a
        UHADD8  a, p_0, q_1             ;// weak-P0 fallback starts: (p0+q1)>>1
        SADD8   t5, t5, m01             ;// rounding for P2
        SHADD8  t5, t5, t9
        MVN     t9, p_1                 ;// ~p1 = 255-p1, for the halving-add trick below
        SADD8   P2b, p_0, t5
        ;// P2b ready

        ;// Weak fallback P0 = (2*p1 + p0 + q1 + 2)>>2, built as
        ;// (((p0+q1)>>1 - ~p1)>>1) ^ 0x80
        UHSUB8  a, a, t9
        ORR     t9, apqflg, m01
        USUB8   t9, apqflg, t9          ;// GE[i] set iff apqflg[i] odd (ap flag lanes)

        EOR     a, a, m01, LSL #7
        SEL     P0b, P0b, a             ;// strong P0 where ap set, weak P0 otherwise
        SEL     P1b, P1b, p_1           ;// P1/P2 filtered only where ap set
        SEL     P2b, P2b, p_2

        USUB8   t4, filt, m01           ;// GE[i] set iff filt[i] >= 1
        SEL     P0b, P0b, p_0           ;// unfiltered lanes keep the original p0


        ;// Q0 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4)>>3
        ;//    = ((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 8*q0 + 4)>>3
        ;//    = q0 + (((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 4)>>3)

        ;// Q1 = (q2 + q1 + p0 + q0 + 2)>>2
        ;//    = q0 + (((q2-q0) + (q1-q0) + (p0-q0) + 2)>>2)

        ;// Q2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4)>>3
        ;//    = (2*(q3-q0) + 3*(q2-q0) + (q1-q0) + (p0-q0) + 8*q0 + 4)>>3
        ;//    = q0 + (((q3-q0) + (q2-q0) + t2 + 2)>>2)


        ;// Compute Q0b Q1b
        USUB8   t4, q_2, q_0            ;// q2-q0
        USUB8   a, p_0, q_0             ;// p0-q0
        USUB8   t9, p_1, p_0            ;// p1-p0
        SADD8   t0, t4, a               ;// t0 = (q2-q0) + (p0-q0)
        SHADD8  t9, t0, t9
        UHADD8  t10, q_0, p_1           ;// weak-Q0 fallback starts: (q0+p1)>>1
        SADD8   t9, t9, a
        USUB8   a, q_1, q_0             ;// q1-q0
        SHADD8  t9, t9, a
        SHADD8  t0, t0, a               ;// t0 = Q1 delta term (before rounding)
        SHADD8  t9, t9, m01             ;// rounding for Q0
        SHADD8  a, t0, m01              ;// rounding for Q1
        SADD8   t9, q_0, t9
        ;// Q0b ready - t9

        MOV     t4, #0
        UHADD8  apqflg, apqflg, t4      ;// halve flag lanes: aq bit moves into bit 0

        SADD8   Q1b, q_0, a
        ;// Q1b ready

        USUB8   t4, apqflg, m01         ;// GE[i] set iff aq flag set for lane i
        SEL     Q1b, Q1b, q_1
        MVN     t11, q_1                ;// ~q1 for the weak-Q0 halving-add trick
        UHSUB8  t10, t10, t11
        M_LDR   q_3b, pQ_3              ;// load q3 from the stack argument
        EOR     t10, t10, m01, LSL #7   ;// t10 = weak Q0 = (2*q1 + q0 + p1 + 2)>>2
        SEL     t9, t9, t10             ;// strong Q0 where aq set, weak Q0 otherwise

        ;// Compute Q2b
        USUB8   t4, q_2, q_0
        SADD8   t4, t0, t4
        EOR     t0, q_3b, q_0
        AND     t0, t0, m01             ;// low bit lost by halving (q3-q0)
        SHADD8  t4, t4, t0
        UHSUB8  t10, q_3b, q_0          ;// (q3-q0)>>1
        SADD8   t4, t4, m01             ;// rounding for Q2
        SHADD8  t4, t4, t10

        USUB8   t10, filt, m01          ;// GE[i] set iff filt[i] >= 1
        SEL     Q0b, t9, q_0            ;// unfiltered lanes keep the original q0

        SADD8   t4, q_0, t4
        ;// Q2b ready - t4

        USUB8   t10, apqflg, m01        ;// Q2 filtered only where aq flag set
        SEL     Q2b, t4, q_2

        M_END
    363 
    364     ENDIF
    365 
    366         END