;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    .globl sixtap_predict_ppc
    .globl sixtap_predict8x4_ppc
    .globl sixtap_predict8x8_ppc
    .globl sixtap_predict16x16_ppc

.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha
    la      \R1, \LABEL@l(\R0)
    lvx     \V, \OFF, \R1
.endm
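
;# load_c loads the 16-byte vector at LABEL+OFF into V.  The @ha/@l pair
;#  materializes the address of LABEL, using R0 and R1 as scratch; OFF is
;#  a register holding the byte offset, applied by lvx (which also masks
;#  the final address down to 16-byte alignment).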

.macro load_hfilter V0, V1
    load_c \V0, HFilter, r5, r9, r10

    addi    r5,  r5, 16
    lvx     \V1, r5, r10
.endm
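
;# Each HFilter entry is 32 bytes (two vectors), indexed by x_offset * 32;
;#  every caller has already shifted x_offset left by 5 into r5 before
;#  invoking this macro.  V0 receives taps 0-3 replicated four times,
;#  V1 receives taps 4-5.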

;# Vertical filtering
.macro Vprolog
    load_c v0, VFilter, r6, r3, r10

    vspltish v5, 8
    vspltish v6, 3
    vslh    v6, v5, v6      ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb  v1, v0, 1
    vspltb  v2, v0, 2
    vspltb  v3, v0, 3
    vspltb  v4, v0, 4
    vspltb  v5, v0, 5
    vspltb  v0, v0, 0
.endm
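
;# Vprolog leaves v6 holding 64 (8 << 3) in every halfword, the rounding
;#  term added before the final shift right by 7, and v0-v5 holding the
;#  six filter taps, each splatted across a full vector.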

.macro vpre_load
    Vprolog
    li      r10,  16
    lvx     v10,   0, r9    ;# v10..v14 = first 5 rows
    lvx     v11, r10, r9
    addi    r9,   r9, 32
    lvx     v12,   0, r9
    lvx     v13, r10, r9
    addi    r9,   r9, 32
    lvx     v14,   0, r9
.endm

.macro Msum Re, Ro, V, T, TMP
                                ;# (Re,Ro) += (V*T)
    vmuleub \TMP, \V, \T        ;# trashes \TMP
    vadduhm \Re, \Re, \TMP      ;# Re = evens, saturation unnecessary
    vmuloub \TMP, \V, \T
    vadduhm \Ro, \Ro, \TMP      ;# Ro = odds
.endm
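
;# Msum multiplies the unsigned bytes of V by the splatted tap T and
;#  accumulates the 16-bit products into two running sums: Re collects the
;#  products from even byte lanes, Ro from odd lanes.  As the comment above
;#  notes, modulo adds suffice because the positive and negative partial
;#  sums each fit in 16 bits.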

.macro vinterp_no_store P0 P1 P2 P3 P4 P5
    vmuleub  v8, \P0, v0        ;# 64 + 4 positive taps
    vadduhm v16, v6, v8
    vmuloub  v8, \P0, v0
    vadduhm v17, v6, v8
    Msum v16, v17, \P2, v2, v8
    Msum v16, v17, \P3, v3, v8
    Msum v16, v17, \P5, v5, v8

    vmuleub v18, \P1, v1        ;# 2 negative taps
    vmuloub v19, \P1, v1
    Msum v18, v19, \P4, v4, v8

    vsubuhs v16, v16, v18       ;# subtract neg from pos
    vsubuhs v17, v17, v19
    vsrh    v16, v16, v7        ;# divide by 128
    vsrh    v17, v17, v7        ;# v16 v17 = evens, odds
    vmrghh  v18, v16, v17       ;# v18 v19 = 16-bit result in order
    vmrglh  v19, v16, v17
    vpkuhus  \P0, v18, v19      ;# P0 = 8-bit result
.endm
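
;# Per pixel this computes the VP8 six-tap filter,
;#    out = clamp((64 + t0*P0 + t2*P2 + t3*P3 + t5*P5 - t1*P1 - t4*P4) >> 7)
;#  where t1 and t4, the negative taps, are stored as magnitudes in VFilter
;#  and handled by the saturating subtract.  Even and odd byte lanes are
;#  filtered separately and re-interleaved by the merge/pack at the end.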

.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
    vmuleub v24, \P0, v13       ;# 64 + 4 positive taps
    vadduhm v21, v20, v24
    vmuloub v24, \P0, v13
    vadduhm v22, v20, v24
    Msum v21, v22, \P2, v15, v25
    Msum v21, v22, \P3, v16, v25
    Msum v21, v22, \P5, v18, v25

    vmuleub v23, \P1, v14       ;# 2 negative taps
    vmuloub v24, \P1, v14
    Msum v23, v24, \P4, v17, v25

    vsubuhs v21, v21, v23       ;# subtract neg from pos
    vsubuhs v22, v22, v24
    vsrh    v21, v21, v19       ;# divide by 128
    vsrh    v22, v22, v19       ;# v21 v22 = evens, odds
    vmrghh  v23, v21, v22       ;# v23 v24 = 16-bit result in order
    vmrglh  v24, v21, v22
    vpkuhus \P0, v23, v24       ;# P0 = 8-bit result
.endm


.macro Vinterp P0 P1 P2 P3 P4 P5
    vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
    stvx    \P0, 0, r7
    add     r7, r7, r8      ;# 33 ops per 16 pels
.endm


.macro luma_v P0, P1, P2, P3, P4, P5
    addi    r9,   r9, 16        ;# P5 = newest input row
    lvx     \P5,   0, r9
    Vinterp \P0, \P1, \P2, \P3, \P4, \P5
.endm

.macro luma_vtwo
    luma_v v10, v11, v12, v13, v14, v15
    luma_v v11, v12, v13, v14, v15, v10
.endm

.macro luma_vfour
    luma_vtwo
    luma_v v12, v13, v14, v15, v10, v11
    luma_v v13, v14, v15, v10, v11, v12
.endm

.macro luma_vsix
    luma_vfour
    luma_v v14, v15, v10, v11, v12, v13
    luma_v v15, v10, v11, v12, v13, v14
.endm
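
;# The luma_v* macros treat v10-v15 as a six-entry ring of input rows:
;#  each step loads one new row into the slot holding the oldest data and
;#  rotates the argument list, so six rows are always live without any
;#  copying between registers.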

.macro Interp4 R I I4
    vmsummbm \R, v13, \I, v15
    vmsummbm \R, v14, \I4, \R
.endm
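
;# Interp4 relies on vmsummbm: signed tap bytes (v13, then v14) times
;#  unsigned pixel bytes, summed four at a time into word lanes, with the
;#  third operand accumulated in.  v15 seeds the sum with the rounding
;#  value 64, so each word lane ends up holding one filtered pixel.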

.macro Read8x8 VD, RS, RP, increment_counter
    lvsl    v21,  0, \RS        ;# permute vector for alignment

    ;# input to the filter is 13 bytes wide, output is 8 bytes.
    ;#  input can span two vectors if not aligned correctly.
    lvx     \VD,   0, \RS
    lvx     v20, r10, \RS

.if \increment_counter
    add     \RS, \RS, \RP
.endif

    vperm   \VD, \VD, v20, v21
.endm
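
;# This is the classic AltiVec unaligned-load idiom: lvsl derives a
;#  permute control from the low bits of the address, the two lvx loads
;#  fetch the aligned vectors on either side of the data, and vperm
;#  shifts the wanted bytes into a left-justified result.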

.macro interp_8x8 R
    vperm   v20, \R, \R, v16    ;# v20 = 0123 1234 2345 3456
    vperm   v21, \R, \R, v17    ;# v21 = 4567 5678 6789 789A
    Interp4 v20, v20,  v21      ;# v20 = result 0 1 2 3
    vperm   \R, \R, \R, v18     ;# R   = 89AB 9ABC ABCx BCxx
    Interp4 v21, v21, \R        ;# v21 = result 4 5 6 7

    vpkswus \R, v20, v21        ;#  R = 0 1 2 3 4 5 6 7
    vsrh    \R, \R, v19

    vpkuhus \R, \R, \R          ;# saturate and pack
.endm
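
;# The B_0123/B_4567/B_89AB permute constants expand the row into
;#  overlapping 4-byte windows, one per word lane, so a pair of Interp4
;#  calls covers taps 0-3 and taps 4-5 for eight output pixels.  vpkswus
;#  then narrows the word sums to halfwords before the shift by 7 (v19)
;#  and the final unsigned-byte pack.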

.macro Read4x4 VD, RS, RP, increment_counter
    lvsl    v21,  0, \RS        ;# permute vector for alignment

    ;# only a single vector is read here, so the bytes that will be used
    ;#  must not span a 16-byte alignment boundary.
    lvx     v20,   0, \RS

.if \increment_counter
    add     \RS, \RS, \RP
.endif

    vperm   \VD, v20, v20, v21
.endm
    .text

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
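;# Despite the generic name, this entry point handles the 4x4 case: it
;#  filters with the 8-wide helpers and stores 4 bytes per row for 4 rows.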
sixtap_predict_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xff87
    ori     r12, r12, 0xffc0
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    slwi.   r5, r5, 5           ;# index into horizontal filter array

    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq-    vertical_only_4x4

    ;# load up horizontal filter
    load_hfilter v13, v14

    ;# rounding added in on the multiply
    vspltisw v16, 8
    vspltisw v15, 3
    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040

    ;# Load up permutation constants
    load_c v16, B_0123, 0, r9, r10
    load_c v17, B_4567, 0, r9, r10
    load_c v18, B_89AB, 0, r9, r10

    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
    addi    r3, r3, -2

    addi    r9, r3, 0
    li      r10, 16
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1

    slwi.   r6, r6, 4           ;# index into vertical filter array

    ;# filter a line
    interp_8x8 v2
    interp_8x8 v3
    interp_8x8 v4
    interp_8x8 v5

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional 5 lines that are needed
    ;#  for the vertical filter.
    beq-    store_4x4

    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r9, r9, r4
    sub     r9, r9, r4

    Read8x8 v0, r9, r4, 1
    Read8x8 v1, r9, r4, 0
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 0

    interp_8x8 v0
    interp_8x8 v1
    interp_8x8 v6
    interp_8x8 v7
    interp_8x8 v8

    b       second_pass_4x4

vertical_only_4x4:
    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r3, r3, r4
    sub     r3, r3, r4
    li      r10, 16

    Read8x8 v0, r3, r4, 1
    Read8x8 v1, r3, r4, 1
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 0

    slwi    r6, r6, 4           ;# index into vertical filter array

second_pass_4x4:
    load_c   v20, b_hilo_4x4, 0, r9, r10
    load_c   v21, b_hilo, 0, r9, r10

    ;# reposition input so that it can go through the
    ;# filtering phase with one pass.
    vperm   v0, v0, v1, v20     ;# 0 1 x x
    vperm   v2, v2, v3, v20     ;# 2 3 x x
    vperm   v4, v4, v5, v20     ;# 4 5 x x
    vperm   v6, v6, v7, v20     ;# 6 7 x x

    vperm   v0, v0, v2, v21     ;# 0 1 2 3
    vperm   v4, v4, v6, v21     ;# 4 5 6 7

    vsldoi  v1, v0, v4, 4
    vsldoi  v2, v0, v4, 8
    vsldoi  v3, v0, v4, 12

    vsldoi  v5, v4, v8, 4

    load_c   v13, VFilter, r6, r9, r10

    vspltish v15, 8
    vspltish v20, 3
    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb  v14, v13, 1
    vspltb  v15, v13, 2
    vspltb  v16, v13, 3
    vspltb  v17, v13, 4
    vspltb  v18, v13, 5
    vspltb  v13, v13, 0

    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5

    stvx    v0, 0, r1

    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    lwz     r0, 4(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    lwz     r0, 8(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    lwz     r0, 12(r1)
    stw     r0, 0(r7)

    b       exit_4x4

store_4x4:

    stvx    v2, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v3, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v4, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v5, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)

exit_4x4:

    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr

.macro w_8x8 V, D, R, P
    stvx    \V, 0, r1
    lwz     \R, 0(r1)
    stw     \R, 0(r7)
    lwz     \R, 4(r1)
    stw     \R, 4(r7)
    add     \D, \D, \P
.endm
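
;# w_8x8 writes just the low 8 bytes of a result row: the vector is
;#  bounced through the stack and copied to the destination with two
;#  scalar stores, then the destination pointer advances by the pitch.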

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch

sixtap_predict8x4_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xffc0
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    slwi.   r5, r5, 5           ;# index into horizontal filter array

    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq-    second_pass_pre_copy_8x4

    load_hfilter v13, v14

    ;# rounding added in on the multiply
    vspltisw v16, 8
    vspltisw v15, 3
    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040

    ;# Load up permutation constants
    load_c v16, B_0123, 0, r9, r10
    load_c v17, B_4567, 0, r9, r10
    load_c v18, B_89AB, 0, r9, r10

    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
    addi    r3, r3, -2

    addi    r9, r3, 0
    li      r10, 16
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1

    slwi.   r6, r6, 4           ;# index into vertical filter array

    ;# filter a line
    interp_8x8 v2
    interp_8x8 v3
    interp_8x8 v4
    interp_8x8 v5

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional 5 lines that are needed
    ;#  for the vertical filter.
    beq-    store_8x4

    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r9, r9, r4
    sub     r9, r9, r4

    Read8x8 v0, r9, r4, 1
    Read8x8 v1, r9, r4, 0
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 0

    interp_8x8 v0
    interp_8x8 v1
    interp_8x8 v6
    interp_8x8 v7
    interp_8x8 v8

    b       second_pass_8x4

second_pass_pre_copy_8x4:
    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r3, r3, r4
    sub     r3, r3, r4
    li      r10, 16

    Read8x8 v0,  r3, r4, 1
    Read8x8 v1,  r3, r4, 1
    Read8x8 v2,  r3, r4, 1
    Read8x8 v3,  r3, r4, 1
    Read8x8 v4,  r3, r4, 1
    Read8x8 v5,  r3, r4, 1
    Read8x8 v6,  r3, r4, 1
    Read8x8 v7,  r3, r4, 1
    Read8x8 v8,  r3, r4, 1

    slwi    r6, r6, 4           ;# index into vertical filter array

second_pass_8x4:
    load_c v13, VFilter, r6, r9, r10

    vspltish v15, 8
    vspltish v20, 3
    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb  v14, v13, 1
    vspltb  v15, v13, 2
    vspltb  v16, v13, 3
    vspltb  v17, v13, 4
    vspltb  v18, v13, 5
    vspltb  v13, v13, 0

    vinterp_no_store_8x8 v0, v1, v2, v3,  v4,  v5
    vinterp_no_store_8x8 v1, v2, v3, v4,  v5,  v6
    vinterp_no_store_8x8 v2, v3, v4, v5,  v6,  v7
    vinterp_no_store_8x8 v3, v4, v5, v6,  v7,  v8

    cmpi    cr0, r8, 8
    beq     cr0, store_aligned_8x4

    w_8x8   v0, r7, r0, r8
    w_8x8   v1, r7, r0, r8
    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8

    b       exit_8x4

store_aligned_8x4:

    load_c v10, b_hilo, 0, r9, r10

    vperm   v0, v0, v1, v10
    vperm   v2, v2, v3, v10

    stvx    v0, 0, r7
    addi    r7, r7, 16
    stvx    v2, 0, r7

    b       exit_8x4

store_8x4:
    cmpi    cr0, r8, 8
    beq     cr0, store_aligned2_8x4

    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8
    w_8x8   v4, r7, r0, r8
    w_8x8   v5, r7, r0, r8

    b       exit_8x4

store_aligned2_8x4:
    load_c v10, b_hilo, 0, r9, r10

    vperm   v2, v2, v3, v10
    vperm   v4, v4, v5, v10

    stvx    v2, 0, r7
    addi    r7, r7, 16
    stvx    v4, 0, r7

exit_8x4:

    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch

;# Because the width that needs to be filtered will fit in a single AltiVec
;#  register there is no need to loop.  Everything can stay in registers.
sixtap_predict8x8_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xffc0
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    slwi.   r5, r5, 5           ;# index into horizontal filter array

    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq-    second_pass_pre_copy_8x8

    load_hfilter v13, v14

    ;# rounding added in on the multiply
    vspltisw v16, 8
    vspltisw v15, 3
    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040

    ;# Load up permutation constants
    load_c v16, B_0123, 0, r9, r10
    load_c v17, B_4567, 0, r9, r10
    load_c v18, B_89AB, 0, r9, r10

    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
    addi    r3, r3, -2

    addi    r9, r3, 0
    li      r10, 16
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 1
    Read8x8 v9, r3, r4, 1

    slwi.   r6, r6, 4           ;# index into vertical filter array

    ;# filter a line
    interp_8x8 v2
    interp_8x8 v3
    interp_8x8 v4
    interp_8x8 v5
    interp_8x8 v6
    interp_8x8 v7
    interp_8x8 v8
    interp_8x8 v9

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional 5 lines that are needed
    ;#  for the vertical filter.
    beq-    store_8x8

    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r9, r9, r4
    sub     r9, r9, r4

    Read8x8 v0,  r9, r4, 1
    Read8x8 v1,  r9, r4, 0
    Read8x8 v10, r3, r4, 1
    Read8x8 v11, r3, r4, 1
    Read8x8 v12, r3, r4, 0

    interp_8x8 v0
    interp_8x8 v1
    interp_8x8 v10
    interp_8x8 v11
    interp_8x8 v12

    b       second_pass_8x8

second_pass_pre_copy_8x8:
    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r3, r3, r4
    sub     r3, r3, r4
    li      r10, 16

    Read8x8 v0,  r3, r4, 1
    Read8x8 v1,  r3, r4, 1
    Read8x8 v2,  r3, r4, 1
    Read8x8 v3,  r3, r4, 1
    Read8x8 v4,  r3, r4, 1
    Read8x8 v5,  r3, r4, 1
    Read8x8 v6,  r3, r4, 1
    Read8x8 v7,  r3, r4, 1
    Read8x8 v8,  r3, r4, 1
    Read8x8 v9,  r3, r4, 1
    Read8x8 v10, r3, r4, 1
    Read8x8 v11, r3, r4, 1
    Read8x8 v12, r3, r4, 0

    slwi    r6, r6, 4           ;# index into vertical filter array

second_pass_8x8:
    load_c v13, VFilter, r6, r9, r10

    vspltish v15, 8
    vspltish v20, 3
    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb  v14, v13, 1
    vspltb  v15, v13, 2
    vspltb  v16, v13, 3
    vspltb  v17, v13, 4
    vspltb  v18, v13, 5
    vspltb  v13, v13, 0

    vinterp_no_store_8x8 v0, v1, v2, v3,  v4,  v5
    vinterp_no_store_8x8 v1, v2, v3, v4,  v5,  v6
    vinterp_no_store_8x8 v2, v3, v4, v5,  v6,  v7
    vinterp_no_store_8x8 v3, v4, v5, v6,  v7,  v8
    vinterp_no_store_8x8 v4, v5, v6, v7,  v8,  v9
    vinterp_no_store_8x8 v5, v6, v7, v8,  v9,  v10
    vinterp_no_store_8x8 v6, v7, v8, v9,  v10, v11
    vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12

    cmpi    cr0, r8, 8
    beq     cr0, store_aligned_8x8

    w_8x8   v0, r7, r0, r8
    w_8x8   v1, r7, r0, r8
    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8
    w_8x8   v4, r7, r0, r8
    w_8x8   v5, r7, r0, r8
    w_8x8   v6, r7, r0, r8
    w_8x8   v7, r7, r0, r8

    b       exit_8x8

store_aligned_8x8:

    load_c v10, b_hilo, 0, r9, r10

    vperm   v0, v0, v1, v10
    vperm   v2, v2, v3, v10
    vperm   v4, v4, v5, v10
    vperm   v6, v6, v7, v10

    stvx    v0, 0, r7
    addi    r7, r7, 16
    stvx    v2, 0, r7
    addi    r7, r7, 16
    stvx    v4, 0, r7
    addi    r7, r7, 16
    stvx    v6, 0, r7

    b       exit_8x8

store_8x8:
    cmpi    cr0, r8, 8
    beq     cr0, store_aligned2_8x8

    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8
    w_8x8   v4, r7, r0, r8
    w_8x8   v5, r7, r0, r8
    w_8x8   v6, r7, r0, r8
    w_8x8   v7, r7, r0, r8
    w_8x8   v8, r7, r0, r8
    w_8x8   v9, r7, r0, r8

    b       exit_8x8

store_aligned2_8x8:
    load_c v10, b_hilo, 0, r9, r10

    vperm   v2, v2, v3, v10
    vperm   v4, v4, v5, v10
    vperm   v6, v6, v7, v10
    vperm   v8, v8, v9, v10

    stvx    v2, 0, r7
    addi    r7, r7, 16
    stvx    v4, 0, r7
    addi    r7, r7, 16
    stvx    v6, 0, r7
    addi    r7, r7, 16
    stvx    v8, 0, r7

exit_8x8:

    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch

;# Two pass filtering.  First pass is horizontal edges, second pass is vertical
;#  edges.  One of the filters can be null, but both won't be.  Needs to use a
;#  temporary buffer because the source buffer can't be modified and the buffer
;#  for the destination is not large enough to hold the temporary data.
sixtap_predict16x16_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xf000
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-416(r1)         ;# create space on the stack

    ;# Three possibilities
    ;#  1. First filter is null.  Don't use a temp buffer.
    ;#  2. Second filter is null.  Don't use a temp buffer.
    ;#  3. Neither is null, use temp buffer.

    ;# First Pass (horizontal edge)
    ;#  set up pointers for src
    ;#  if possibility (1), set the src pointer to the original and jump
    ;#  to the second pass.  This is based on whether x_offset is 0.

    ;# load up horizontal filter
    slwi.   r5, r5, 5           ;# index into horizontal filter array

    load_hfilter v4, v5

    beq-    copy_horizontal_16x21

    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
    addi    r3, r3, -2

    slwi.   r6, r6, 4           ;# index into vertical filter array

    ;# setup constants
    ;# v14 permutation vector for reordering the packed output
    load_c v14, b_hperm, 0, r9, r10

    ;# These statements assume there won't be a second pass; if there is
    ;#  one, they are overwritten before the fall-through into the bypass
    ;#  label below.
    li      r0, 16              ;# prepare for no vertical filter

    ;# Change the output pointer and pitch to be the actual
    ;#  destination instead of a temporary buffer.
    addi    r9, r7, 0
    addi    r5, r8, 0

    ;# no vertical filter, so write the output from the first pass
    ;#  directly into the output buffer.
    beq-    no_vertical_filter_bypass

    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r3, r3, r4
    sub     r3, r3, r4

    ;# setup counter for the number of lines that are going to be filtered
    li      r0, 21

    ;# use the stack as temporary storage
    la      r9, 48(r1)
    li      r5, 16

no_vertical_filter_bypass:

    mtctr   r0

    ;# rounding added in on the multiply
    vspltisw v10, 8
    vspltisw v12, 3
    vslw    v12, v10, v12       ;# 0x00000040000000400000004000000040

    ;# downshift by 7 ( divide by 128 ) at the end
    vspltish v13, 7

    ;# index to the next set of vectors in the row.
    li      r10, 16
    li      r12, 32

horizontal_loop_16x16:

    lvsl    v15,  0, r3         ;# permute vector for alignment

    ;# input to the filter is 21 bytes wide, output is 16 bytes.
    ;#  input can span three vectors if not aligned correctly.
    lvx     v1,   0, r3
    lvx     v2, r10, r3
    lvx     v3, r12, r3

    vperm   v8, v1, v2, v15
    vperm   v9, v2, v3, v15     ;# v8 v9 = 21 input pixels left-justified

    vsldoi  v11, v8, v9, 4

    ;# set 0
    vmsummbm v6, v4, v8, v12    ;# taps times elements
    vmsummbm v0, v5, v11, v6

    ;# set 1
    vsldoi  v10, v8, v9, 1
    vsldoi  v11, v8, v9, 5

    vmsummbm v6, v4, v10, v12
    vmsummbm v1, v5, v11, v6

    ;# set 2
    vsldoi  v10, v8, v9, 2
    vsldoi  v11, v8, v9, 6

    vmsummbm v6, v4, v10, v12
    vmsummbm v2, v5, v11, v6

    ;# set 3
    vsldoi  v10, v8, v9, 3
    vsldoi  v11, v8, v9, 7

    vmsummbm v6, v4, v10, v12
    vmsummbm v3, v5, v11, v6

    vpkswus v0, v0, v1          ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
    vpkswus v1, v2, v3          ;# v1 = 2 6 A E 3 7 B F

    vsrh    v0, v0, v13         ;# divide v0, v1 by 128
    vsrh    v1, v1, v13

    vpkuhus v0, v0, v1          ;# v0 = scrambled 8-bit result
    vperm   v0, v0, v0, v14     ;# v0 = correctly-ordered result

    stvx    v0,  0, r9
    add     r9, r9, r5

    add     r3, r3, r4

    bdnz    horizontal_loop_16x16

    ;# check again to see if vertical filter needs to be done.
    cmpi    cr0, r6, 0
    beq     cr0, end_16x16

    ;# yes there is, so go to the second pass
    b       second_pass_16x16

copy_horizontal_16x21:
    li      r10, 21
    mtctr   r10

    li      r10, 16

    sub     r3, r3, r4
    sub     r3, r3, r4

    ;# this is done above if there is a horizontal filter,
    ;#  if not it needs to be done down here.
    slwi    r6, r6, 4           ;# index into vertical filter array

    ;# always write to the stack when doing a horizontal copy
    la      r9, 48(r1)

copy_horizontal_loop_16x21:
    lvsl    v15,  0, r3         ;# permute vector for alignment

    lvx     v1,   0, r3
    lvx     v2, r10, r3

    vperm   v8, v1, v2, v15

    stvx    v8,  0, r9
    addi    r9, r9, 16

    add     r3, r3, r4

    bdnz    copy_horizontal_loop_16x21

second_pass_16x16:

    ;# always read from the stack when doing a vertical filter
    la      r9, 48(r1)

    ;# downshift by 7 ( divide by 128 ) at the end
    vspltish v7, 7

    vpre_load

    luma_vsix
    luma_vsix
    luma_vfour

end_16x16:

    addi    r1, r1, 416         ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .data

    .align 4
HFilter:
    .byte     0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0
    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte     0, -6,123, 12,  0, -6,123, 12,  0, -6,123, 12,  0, -6,123, 12
    .byte    -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0
    .byte     2,-11,108, 36,  2,-11,108, 36,  2,-11,108, 36,  2,-11,108, 36
    .byte    -8,  1,  0,  0, -8,  1,  0,  0, -8,  1,  0,  0, -8,  1,  0,  0
    .byte     0, -9, 93, 50,  0, -9, 93, 50,  0, -9, 93, 50,  0, -9, 93, 50
    .byte    -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0
    .byte     3,-16, 77, 77,  3,-16, 77, 77,  3,-16, 77, 77,  3,-16, 77, 77
    .byte   -16,  3,  0,  0,-16,  3,  0,  0,-16,  3,  0,  0,-16,  3,  0,  0
    .byte     0, -6, 50, 93,  0, -6, 50, 93,  0, -6, 50, 93,  0, -6, 50, 93
    .byte    -9,  0,  0,  0, -9,  0,  0,  0, -9,  0,  0,  0, -9,  0,  0,  0
    .byte     1, -8, 36,108,  1, -8, 36,108,  1, -8, 36,108,  1, -8, 36,108
    .byte   -11,  2,  0,  0,-11,  2,  0,  0,-11,  2,  0,  0,-11,  2,  0,  0
    .byte     0, -1, 12,123,  0, -1, 12,123,  0, -1, 12,123,  0, -1, 12,123
    .byte    -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0

    .align 4
VFilter:
    .byte     0,  0,128,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte     0,  6,123, 12,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte     2, 11,108, 36,  8,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte     0,  9, 93, 50,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte     3, 16, 77, 77, 16,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte     0,  6, 50, 93,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte     1,  8, 36,108, 11,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte     0,  1, 12,123,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
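
;# Note that VFilter stores the two negative taps (positions 1 and 4) as
;#  magnitudes; the vinterp macros accumulate them separately and subtract,
;#  whereas HFilter keeps signed tap bytes for the vmsummbm path.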

    .align 4
b_hperm:
    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15

    .align 4
B_0123:
    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6

    .align 4
B_4567:
    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10

    .align 4
B_89AB:
    .byte     8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14

    .align 4
b_hilo:
    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23

    .align 4
b_hilo_4x4:
    .byte     0,  1,  2,  3, 16, 17, 18, 19,  0,  0,  0,  0,  0,  0,  0,  0
   1014