;// Home | History | Annotate | Download | only in src  (web-viewer navigation residue, commented out)
      1 ;//
      2 ;// Copyright (C) 2007-2008 ARM Limited
      3 ;//
      4 ;// Licensed under the Apache License, Version 2.0 (the "License");
      5 ;// you may not use this file except in compliance with the License.
      6 ;// You may obtain a copy of the License at
      7 ;//
      8 ;//      http://www.apache.org/licenses/LICENSE-2.0
      9 ;//
     10 ;// Unless required by applicable law or agreed to in writing, software
     11 ;// distributed under the License is distributed on an "AS IS" BASIS,
     12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 ;// See the License for the specific language governing permissions and
     14 ;// limitations under the License.
     15 ;//
     16 ;//
     17 ;//
     18 ;// File Name:  armVCM4P10_DeblockingLuma_unsafe_s.s
     19 ;// OpenMAX DL: v1.0.2
     20 ;// Revision:   9641
     21 ;// Date:       Thursday, February 7, 2008
     22 ;//
     23 ;//
     24 ;//
     25 ;//
     26 
      27         INCLUDE omxtypes_s.h
      28         INCLUDE armCOMM_s.h
      29 
      30         M_VARIANTS ARM1136JS
      31 
      32 
      33 
      34     IF  ARM1136JS
      35 
         ;// Per-byte mask 0x01 replicated into all four lanes; used throughout as the
         ;// constant "1 per pixel" for the 4x8-bit SIMD-within-register arithmetic.
      36 MASK_1  EQU 0x01010101
      37 
      38 ;// Declare input registers
      39 
         ;// NOTE(review): register aliases below are deliberately overlapped (several
         ;// names map to the same physical register, e.g. r14 = beta/bS/tC0/t9 and
         ;// r11 = m01/t3).  The live ranges do not conflict in the kernels as written;
         ;// do not rename or reassign these without re-auditing both kernels.
      40 pQ0        RN 0
      41 StepArg    RN 1
      42 tC0Arg     RN 2
      43 alpha      RN 6
      44 
      45 beta       RN 14
      46 bS         RN 14
      47 tC0        RN 14
      48 ptC0       RN 1
      49 
      50 ;// Declare Local/Temporary variables
      51 
      52 ;// Pixels
         ;// Each p_n/q_n register holds four packed 8-bit pixels (one per lane),
         ;// p = left/above side of the edge, q = right/below side.
      53 p_0     RN 3
      54 p_1     RN 5
      55 p_2     RN 4
      56 p_3     RN 2
      57 q_0     RN 8
      58 q_1     RN 9
      59 q_2     RN 10
      60 q_3     RN 12
      61 
      62 
      63 ;// Filtering
      64 
      65 ap0q0   RN 1
      66 filt    RN 2
      67 
         ;// m00 = 0x00000000, m01 = 0x01010101 (MASK_1) — SIMD constants.
      68 m00     RN 7
      69 m01     RN 11
      70 
      71 apflg   RN 0
      72 aqflg   RN 6
      73 
      74 tC      RN 1
      75 
      76 
      77 ;//Declarations for bSLT4 kernel
      78 
      79 pos     RN 7
      80 neg     RN 12
      81 
      82 P0a     RN 1
      83 P1a     RN 8
      84 Q0a     RN 7
      85 Q1a     RN 4
      86 
      87 u1      RN 3
      88 max     RN 12
      89 min     RN 2
      90 
      91 
      92 
      93 ;//Declarations for bSGE4 kernel
      94 
      95 q_3b    RN 9
      96 p_3b    RN 0
      97 apqflg  RN 12
      98 
      99 P0b     RN 6
     100 P1b     RN 7
     101 P2b     RN 1
     102 
     103 Q0b     RN 9
     104 Q1b     RN 0
     105 Q2b     RN 2
     106 
     107 ;// Miscellanous
     108 
     109 a       RN 0
     110 t0      RN 3
     111 t1      RN 12
     112 t2      RN 7
     113 t3      RN 11
     114 t4      RN 4
     115 t5      RN 1
     116 t8      RN 6
     117 t9      RN 14
     118 t10     RN 5
     119 t11     RN 9
    120 
     121 ;// Register usage for - armVCM4P10_DeblockingLumabSLT4_unsafe()
     122 ;//
     123 ;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
     124 ;//        - 2 - filt, 0 - apflg, 6 - aqflg
     125 ;//        - 11 - m01, 7 - tC0
     126 ;//
     127 ;// Outputs - 1,8,7,11 - Output Pixels(P0a,P1a,Q0a,Q1a)
     128 ;//
     129 ;// Registers Corrupted - 0-3,5-12,14
     130 
;// Weak-filter kernel of the H.264 luma deblocking filter (the bS < 4 case).
;// Processes four pixel positions at once with ARMv6 SIMD-within-register
;// instructions; each register holds four packed 8-bit pixels, and per-lane
;// decisions flow through the GE flags set by USUB8/SSUB8 into SEL.
     131 
     132         M_START armVCM4P10_DeblockingLumabSLT4_unsafe, lr
     133 
     134         ;// Since beta <= 18 and alpha <= 255 we know
     135         ;// -254 <= p0-q0 <= 254
     136         ;//  -17 <= q1-q0 <= 17
     137         ;//  -17 <= p1-p0 <= 17
     138 
     139         ;// delta = Clip3( -tC, tC, ((((q0-p0)<<2) + (p1-q1) + 4)>>3))
     140         ;//
     141         ;//    Calculate A = (((q0-p0)<<2) + (p1-q1) + 4)>>3
     142         ;//                = (4*q0 - 4*p0 + p1 - q1 + 4)>>3
     143         ;//                = ((p1-p0) - (q1-q0) - 3*(p0-q0) + 4)>>3
         ;// The numerator is accumulated with halving ops (SHSUB8) so every
         ;// intermediate stays within a signed byte per lane.
     144 
     145         USUB8   t1, p_1, p_0            ;// t1 = p1 - p0 per lane
     146         MUL     tC0, t2, m01            ;// broadcast tC0 (input in r7 = t2) to all 4 lanes
     147 
     148         USUB8   t2, q_1, q_0            ;// t2 = q1 - q0
     149         SSUB8   t1, t1, t2              ;// t1 = (p1-p0) - (q1-q0)
     150 
     151         USUB8   t2, p_0, q_0            ;// t2 = p0 - q0
     152         AND     t2, t2, m01             ;// keep LSB of (p0-q0): rounding correction for halving
     153         SHSUB8  t1, t1, t2
     154         UHSUB8  t5, p_0, q_0            ;// t5 = (p0-q0)>>1
     155         SSUB8   t1, t1, t2
     156         SHSUB8  t1, t1, t5
     157         MOV     m00, #0                 ;// zero constant for SELs below
     158         SADD8   t1, t1, m01             ;// add the (pre-scaled) "+4" rounding term
     159         SHSUB8  t1, t1, t5              ;// t1 = delta = A per lane (before clipping)
     160 
     161         ;// tC = tC0
     162         ;// if (ap < beta) tC++;
     163         ;// if (aq < beta) tC++;
     164         USUB8   t5, filt, m01           ;// GE set in lanes where filt >= 1 (edge filtered)
     165         SEL     tC0, tC0, m00           ;// zero tC0 in unfiltered lanes
     166         UQADD8  tC, tC0, apflg          ;// tC += apflg (1 where ap < beta)
     167         SSUB8   t1, t1, m00             ;// subtract 0 purely to set GE = sign(delta)
     168         UQADD8  tC, tC, aqflg           ;// tC += aqflg (1 where aq < beta)
     169 
     170         ;// Split into positive and negative part and clip
     171         SEL     pos, t1, m00            ;// pos = max(delta, 0)
     172         USUB8   neg, pos, t1            ;// neg = pos - delta = max(-delta, 0)
     173         USUB8   t3, pos, tC             ;// GE where pos >= tC
     174         SEL     pos, tC, pos            ;// pos = min(pos, tC)
     175         USUB8   t3, neg, tC             ;// GE where neg >= tC
     176         SEL     neg, tC, neg            ;// neg = min(neg, tC)
     177 
     178         ;//Reload m01
         ;// (needed because t3 above is r11, the same register as m01)
     179         LDR     m01,=MASK_1
     180 
     181         UQADD8  P0a, p_0, pos           ;// P0a = sat(p0 + pos)
     182         UQSUB8  Q0a, q_0, pos           ;// Q0a = sat(q0 - pos)
     183         UQSUB8  P0a, P0a, neg           ;// net: P0a = p0 + clipped delta
     184         UQADD8  Q0a, Q0a, neg           ;// net: Q0a = q0 - clipped delta
     185 
     186         ;// Choose to store the filtered
     187         ;// value or the original pixel
     188         USUB8   t1, filt, m01           ;// GE set in lanes where filt >= 1
     189         SEL     P0a, P0a, p_0
     190         SEL     Q0a, Q0a, q_0
     191 
     192         ;// delta = (p2 + ((p0+q0+1)>>1) - (p1<<1))>>1;
     193         ;// u1 = (p0 + q0 + 1)>>1
     194         ;// u1 = ( (q_0 - p_0')>>1 ) ^ 0x80
     195         MVN     p_0, p_0                ;// p_0' = ~p0 (one's-complement averaging trick)
     196         UHSUB8  u1, q_0, p_0            ;// (q0 - ~p0)>>1 = (p0+q0+1)>>1 biased by -128
     197         UQADD8  max, p_1, tC0           ;// upper clip bound p1 + tC0 (scheduled early)
     198         EOR     u1, u1, m01 ,LSL #7     ;// ^0x80808080 removes the bias
     199 
     200         ;// Calculate A = (p2+u1)>>1
     201         ;// Then delta = Clip3( -tC0, tC0, A - p1)
     202 
     203         ;// Clip P1
     204         UHADD8  P1a, p_2, u1            ;// candidate P1 = (p2 + u1)>>1
     205         UQSUB8  min, p_1, tC0           ;// lower clip bound p1 - tC0
     206         USUB8   t4, P1a, max            ;// GE where candidate >= max
     207         SEL     P1a, max, P1a           ;// P1a = min(P1a, p1+tC0)
     208         USUB8   t4, P1a, min            ;// GE where candidate >= min
     209         SEL     P1a, P1a, min           ;// P1a = max(P1a, p1-tC0)
     210 
     211         ;// Clip Q1
     212         UHADD8  Q1a, q_2, u1            ;// candidate Q1 = (q2 + u1)>>1
     213         UQADD8  max, q_1, tC0
     214         UQSUB8  min, q_1, tC0
     215         USUB8   t0, Q1a, max
     216         SEL     Q1a, max, Q1a           ;// Q1a = min(Q1a, q1+tC0)
     217         USUB8   t0, Q1a, min
     218         SEL     Q1a, Q1a, min           ;// Q1a = max(Q1a, q1-tC0)
     219 
     220         ;// Choose to store the filtered
     221         ;// value or the original pixel
     222         USUB8   t0, apflg, m01          ;// GE where ap flag set
     223         SEL     P1a, P1a, p_1           ;// p1 only filtered where ap
     224         USUB8   t0, aqflg, m01          ;// GE where aq flag set
     225         SEL     t3, Q1a, q_1            ;// q1 result lands in t3 (r11) - the Q1a output reg
     226 
     227         M_END
    228 
     229 ;// Register usage for - armVCM4P10_DeblockingLumabSGE4_unsafe()
     230 ;//
     231 ;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
     232 ;//        - 2 - filt, 0 - apflg,aqflg
     233 ;//        - 1 - ap0q0, 6 - alpha
     234 ;//        - 7 - m00, 11 - m01
     235 ;//
     236 ;// Outputs - 6,7,1,9,0,2 - Output Pixels(P0b,P1b,P2b, Q0b,Q1b,Q2b)
     237 ;//
     238 ;// Registers Corrupted - 0-3,5-12,14
     239 
;// Strong-filter kernel of the H.264 luma deblocking filter (the bS == 4
;// case).  Four pixel positions at once; per-lane enables travel through the
;// GE flags.  p3/q3 pixel words arrive on the stack (M_ARG below).
;// NOTE(review): r0 appears to pack both ap and aq flags per lane (ap tested
;// via the odd-byte trick at "ORR/USUB8" below, aq exposed by halving the
;// flag bytes) - confirm the encoding against the caller.
     240         M_START armVCM4P10_DeblockingLumabSGE4_unsafe, lr
     241 
     242         ;// apflg = apflg && |p0-q0|<((alpha>>2)+2)
     243         ;// aqflg = aqflg && |p0-q0|<((alpha>>2)+2)
     244 
     245         M_ARG   pDummy,4
     246         M_ARG   pQ_3,4                  ;// stack arg: packed q3 pixels (loaded via M_LDR)
     247         M_ARG   pP_3,4                  ;// stack arg: packed p3 pixels
     248 
     249         UHADD8  alpha, alpha, m00       ;// alpha >>= 1 per lane (m00 = 0)
     250         USUB8   t9, p_2, p_0    ;//t9 = dp2p0
     251         UHADD8  alpha, alpha, m00       ;// alpha >>= 1 again -> alpha>>2
     252         ADD     alpha, alpha, m01, LSL #1 ;// +2 per lane: threshold (alpha>>2)+2
     253         USUB8   ap0q0, ap0q0, alpha     ;// GE where |p0-q0| >= threshold
     254         SEL     apqflg, m00, apflg      ;// clear flags in those lanes
     255 
     256         ;// P0 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
     257         ;//    = ((p2-p0) + 2*(p1-p0) + (q1-q0) + 3*(q0-p0) + 8*p0 + 4)>>3
     258         ;//    = p0 + (((p2-p0) + 2*(p1-p0) + (q1-q0) - 3*(p0-q0) + 4)>>3)
     259 
     260         ;// P1 = (p2 + p1 + q0 + p0 + 2)>>2
     261         ;//    = p0 + (((p2-p0) + (p1-p0) - (p0-q0) + 2)>>2)
     262 
     263         ;// P2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3
     264         ;//    = (2*(p3-p0) + 3*(p2-p0) + (p1-p0) - (p0-q0) + 8*p0 + 4)>>3
     265         ;//    = p0 + (((p3-p0) + (p2-p0) + t2 + 2)>>2)
     266 
     267         ;// Compute P0b
     268         USUB8   t2, p_0, q_0            ;// t2 = p0 - q0
     269         SSUB8   t5, t9, t2              ;// t5 = (p2-p0) - (p0-q0)
     270 
     271         USUB8   t8, q_1, q_0
     272         SHADD8  t8, t5, t8              ;// (t5 + (q1-q0))>>1
     273 
     274         USUB8   t9, p_1, p_0
     275         SADD8   t8, t8, t9
     276         SHSUB8  t8, t8, t2
     277         SHADD8  t5, t5, t9              ;// t5 becomes the P1 numerator (algebra above)
     278         SHADD8  t8, t8, m01             ;// rounding
     279         SHADD8  t9, t5, m01             ;// rounding for P1
     280         SADD8   P0b, p_0, t8
     281         ;// P0b ready
     282 
     283         ;// Compute P1b
     284         M_LDR   p_3b, pP_3              ;// fetch p3 pixels from the stack
     285         SADD8   P1b, p_0, t9
     286         ;// P1b ready
     287 
     288         ;// Compute P2b
     289         USUB8   t9, p_2, p_0
     290         SADD8   t5, t5, t9
     291         UHSUB8  t9, p_3b, p_0           ;// (p3-p0)>>1
     292         EOR     a, p_3b, p_0
     293         AND     a, a, m01               ;// LSB lost by the halving: parity correction
     294         SHADD8  t5, t5, a
     295         UHADD8  a, p_0, q_1             ;// start of fallback P0 = (2*p1+p0+q1+2)>>2
     296         SADD8   t5, t5, m01             ;// rounding
     297         SHADD8  t5, t5, t9
     298         MVN     t9, p_1                 ;// ~p1, for a rounded average below
     299         SADD8   P2b, p_0, t5
     300         ;// P2b ready
     301 
     302         UHSUB8  a, a, t9                ;// ((p0+q1)>>1 + p1 + 1)>>1, biased by -128
     303         ORR     t9, apqflg, m01
     304         USUB8   t9, apqflg, t9          ;// GE only where apqflg lane is odd (ap set)
     305 
     306         EOR     a, a, m01, LSL #7       ;// remove bias: a = (2*p1+p0+q1+2)>>2
     307         SEL     P0b, P0b, a             ;// strong P0 where ap, else fallback
     308         SEL     P1b, P1b, p_1           ;// p1/p2 only filtered where ap
     309         SEL     P2b, P2b, p_2
     310 
     311         USUB8   t4, filt, m01           ;// GE where filt >= 1
     312         SEL     P0b, P0b, p_0           ;// unfiltered lanes keep p0
     313 
     314 
     315         ;// Q0 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4)>>3
     316         ;//    = ((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 8*q0 + 4)>>3
     317         ;//    = q0 + (((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 4)>>3)
     318 
     319         ;// Q1 = (q2 + q1 + p0 + q0 + 2)>>2
     320         ;//    = q0 + (((q2-q0) + (q1-q0) + (p0-q0) + 2)>>2)
     321 
     322         ;// Q2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4)>>3
     323         ;//    = (2*(q3-q0) + 3*(q2-q0) + (q1-q0) + (p0-q0) + 8*q0 + 4)>>3
     324         ;//    = q0 + (((q3-q0) + (q2-q0) + t2 + 2)>>2)
     325 
     326 
     327         ;// Compute Q0b Q1b
     328         USUB8   t4, q_2, q_0
     329         USUB8   a, p_0, q_0
     330         USUB8   t9, p_1, p_0
     331         SADD8   t0, t4, a               ;// t0 = (q2-q0) + (p0-q0)
     332         SHADD8  t9, t0, t9
     333         UHADD8  t10, q_0, p_1           ;// start of fallback Q0 = (2*q1+q0+p1+2)>>2
     334         SADD8   t9, t9, a
     335         USUB8   a, q_1, q_0
     336         SHADD8  t9, t9, a
     337         SHADD8  t0, t0, a               ;// t0 becomes the Q1 numerator
     338         SHADD8  t9, t9, m01             ;// rounding
     339         SHADD8  a, t0, m01              ;// rounding for Q1
     340         SADD8   t9, q_0, t9
     341         ;// Q0b ready - t9
     342 
     343         MOV     t4, #0
     344         UHADD8  apqflg, apqflg, t4      ;// halve flag lanes: exposes the aq flag for >=1 tests
     345 
     346         SADD8   Q1b, q_0, a
     347         ;// Q1b ready
     348 
     349         USUB8   t4, apqflg, m01         ;// GE where aq flag set
     350         SEL     Q1b, Q1b, q_1           ;// q1 only filtered where aq
     351         MVN     t11, q_1                ;// ~q1 for a rounded average
     352         UHSUB8  t10, t10, t11           ;// ((q0+p1)>>1 + q1 + 1)>>1, biased by -128
     353         M_LDR   q_3b, pQ_3              ;// fetch q3 pixels from the stack
     354         EOR     t10, t10, m01, LSL #7   ;// remove bias: t10 = (2*q1+q0+p1+2)>>2 (fallback Q0)
     355         SEL     t9, t9, t10             ;// strong Q0 where aq, else fallback
     356 
     357         ;// Compute Q2b
     358         USUB8   t4, q_2, q_0
     359         SADD8   t4, t0, t4
     360         EOR     t0, q_3b, q_0
     361         AND     t0, t0, m01             ;// parity correction for halved (q3-q0)
     362         SHADD8  t4, t4, t0
     363         UHSUB8  t10, q_3b, q_0          ;// (q3-q0)>>1
     364         SADD8   t4, t4, m01             ;// rounding
     365         SHADD8  t4, t4, t10
     366 
     367         USUB8   t10, filt, m01          ;// GE where filt >= 1
     368         SEL     Q0b, t9, q_0            ;// unfiltered lanes keep q0
     369 
     370         SADD8   t4, q_0, t4
     371         ;// Q2b ready - t4
     372 
     373         USUB8   t10, apqflg, m01        ;// GE where aq flag set
     374         SEL     Q2b, t4, q_2            ;// q2 only filtered where aq
     375 
     376         M_END
    377 
    378     ENDIF
    379 
    380         END
    381