;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    .globl vp8_sub_pixel_variance4x4_ppc
    .globl vp8_sub_pixel_variance8x8_ppc
    .globl vp8_sub_pixel_variance8x16_ppc
    .globl vp8_sub_pixel_variance16x8_ppc
    .globl vp8_sub_pixel_variance16x16_ppc

.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha
    la      \R1, \LABEL@l(\R0)
    lvx     \V, \OFF, \R1
.endm
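
;# load_c loads the 16-byte constant at LABEL+OFF into V, using R0 and R1 as
;#  scratch for the address math (roughly: V = *(LABEL + OFF)).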

.macro load_vfilter V0, V1
    load_c \V0, vfilter_b, r6, r12, r10

    addi    r6,  r6, 16
    lvx     \V1, r6, r10
.endm
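
;# load_vfilter expects r6 = yoffset * 32 (set up in HProlog) and loads the
;#  two 16-byte vertical tap vectors for that offset from vfilter_b into
;#  V0 (first tap) and V1 (second tap).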

.macro HProlog jump_label
    ;# load up horizontal filter
    slwi.   r5, r5, 4           ;# index into horizontal filter array

    ;# index to the next set of vectors in the row.
    li      r10, 16

    ;# downshift by 7 ( divide by 128 ) at the end
    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq     \jump_label

    load_c v20, hfilter_b, r5, r12, r0

    ;# setup constants
    ;# v28 permutation value for output reordering
    load_c v28, b_hperm_b, 0, r12, r0

    ;# index to the next set of vectors in the row.
    li      r12, 32

    ;# rounding added in on the multiply
    vspltisw v21, 8
    vspltisw v18, 3
    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040

    slwi.   r6, r6, 5           ;# index into vertical filter array
.endm
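
;# The constants set up above give round-to-nearest fixed-point filtering:
;#  v18 = 8 << 3 = 64 in every lane (the rounding bias) and v19 = 7 (the
;#  shift), so each filtered sample is in effect (tap0*a + tap1*b + 64) >> 7.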

;# Filters a horizontal line
;# expects:
;#  r3  src_ptr
;#  r4  pitch
;#  r10 16
;#  r12 32
;#  v17 perm input
;#  v18 rounding
;#  v19 shift
;#  v20 filter taps
;#  v21 tmp
;#  v22 tmp
;#  v23 tmp
;#  v24 tmp
;#  v25 tmp
;#  v26 tmp
;#  v27 tmp
;#  v28 perm output
;#

.macro hfilter_8 V, hp, lp, increment_counter
    lvsl    v17,  0, r3         ;# permute value for alignment

    ;# input to filter is 9 bytes wide, output is 8 bytes.
    lvx     v21,   0, r3
    lvx     v22, r10, r3

.if \increment_counter
    add     r3, r3, r4
.endif
    vperm   v21, v21, v22, v17

    vperm   v24, v21, v21, \hp  ;# v24 = 0123 1234 2345 3456
    vperm   v25, v21, v21, \lp  ;# v25 = 4567 5678 6789 789A

    vmsummbm v24, v20, v24, v18
    vmsummbm v25, v20, v25, v18

    vpkswus v24, v24, v25       ;# v24 = 0 1 2 3 4 5 6 7 (16-bit)

    vsrh    v24, v24, v19       ;# divide by 128

    vpkuhus \V, v24, v24        ;# \V = 8-bit result (pixels 0-7 in both halves)
.endm
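
;# hfilter_8 is the first-pass bilinear filter for an 8-pixel-wide row: with
;#  the horizontal taps packed in v20 it computes, roughly,
;#    out[i] = (tap0*src[i] + tap1*src[i+1] + 64) >> 7
;#  and leaves the eight 8-bit results in \V.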

.macro vfilter_16 P0 P1
    vmuleub v22, \P0, v20       ;# 64 + 4 positive taps
    vadduhm v22, v18, v22
    vmuloub v23, \P0, v20
    vadduhm v23, v18, v23

    vmuleub v24, \P1, v21
    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
    vmuloub v25, \P1, v21
    vadduhm v23, v23, v25       ;# Ro = odds

    vsrh    v22, v22, v19       ;# divide by 128
    vsrh    v23, v23, v19       ;# v22, v23 = evens, odds
    vmrghh  \P0, v22, v23       ;# merge to 16-bit results in order
    vmrglh  v23, v22, v23
    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
.endm
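
;# vfilter_16 is the second-pass bilinear filter applied between two rows:
;#  with the vertical taps splatted across v20 and v21 it computes, roughly,
;#    out[i] = (tap0*row0[i] + tap1*row1[i] + 64) >> 7
;#  handling even and odd byte lanes separately (vmuleub/vmuloub) and then
;#  re-merging them.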

.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
    ;# Compute the sum first.  Unpack so that a signed subtract
    ;#  can be used; only a halfword signed subtract is
    ;#  available.  Do high, then low.
    vmrghb  \t1, \z0, \src
    vmrghb  \t2, \z0, \ref
    vsubshs \t1, \t1, \t2
    vsum4shs \sum, \t1, \sum

    vmrglb  \t1, \z0, \src
    vmrglb  \t2, \z0, \ref
    vsubshs \t1, \t1, \t2
    vsum4shs \sum, \t1, \sum

    ;# Now compute sse.
    vsububs \t1, \src, \ref
    vsububs \t2, \ref, \src
    vor     \t1, \t1, \t2

    vmsumubm \sse, \t1, \t1, \sse
.endm
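
;# Per 16-byte block, compute_sum_sse accumulates roughly:
;#    sum += src[i] - ref[i]        (signed difference of zero-extended bytes)
;#    sse += (src[i] - ref[i])^2    (|diff| from the two saturating subtracts,
;#                                    squared and summed by vmsumubm)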

.macro variance_final sum, sse, z0, DS
    vsumsws \sum, \sum, \z0
    vsumsws \sse, \sse, \z0

    stvx    \sum, 0, r1
    lwz     r3, 12(r1)

    stvx    \sse, 0, r1
    lwz     r4, 12(r1)

    stw     r4, 0(r9)           ;# sse

    mullw   r3, r3, r3          ;# sum*sum
    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
.endm
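
;# variance_final reduces the packed accumulators and returns, in effect,
;#    *sse = sse;  return sse - ((sum * sum) >> DS);
;#  where DS is log2 of the block's pixel count (4x4 -> 4, 8x8 -> 6,
;#  8x16 and 16x8 -> 7, 16x16 -> 8).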

.macro compute_sum_sse_16 V, increment_counter
    load_and_align_16  v16, r7, r8, \increment_counter
    compute_sum_sse \V, v16, v18, v19, v20, v21, v23
.endm

.macro load_and_align_16 V, R, P, increment_counter
    lvsl    v17,  0, \R         ;# permute value for alignment

    ;# load 16 bytes; an unaligned input can span two vectors.
    lvx     v21,   0, \R
    lvx     v22, r10, \R

.if \increment_counter
    add     \R, \R, \P
.endif

    vperm   \V, v21, v22, v17
.endm
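
;# load_and_align_16 is the standard AltiVec unaligned load: lvsl builds the
;#  permute control for the address, the two lvx loads cover the straddled
;#  quadwords, and vperm left-justifies the 16 source bytes into \V.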

    .align 2
;# r3 unsigned char  *src_ptr
;# r4 int  src_pixels_per_line
;# r5 int  xoffset
;# r6 int  yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp8_sub_pixel_variance4x4_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xf830
    ori     r12, r12, 0xfff8
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    HProlog second_pass_4x4_pre_copy_b

    ;# Load up permutation constants
    load_c v10, b_0123_b, 0, r12, r0
    load_c v11, b_4567_b, 0, r12, r0

    hfilter_8 v0, v10, v11, 1
    hfilter_8 v1, v10, v11, 1
    hfilter_8 v2, v10, v11, 1
    hfilter_8 v3, v10, v11, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to computing the sum and sse.
    ;#  Otherwise load up and filter the additional line that is
    ;#  needed for the vertical filter.
    beq     compute_sum_sse_4x4_b

    hfilter_8 v4, v10, v11, 0

    b   second_pass_4x4_b

second_pass_4x4_pre_copy_b:
    slwi    r6, r6, 5           ;# index into vertical filter array

    load_and_align_16 v0, r3, r4, 1
    load_and_align_16 v1, r3, r4, 1
    load_and_align_16 v2, r3, r4, 1
    load_and_align_16 v3, r3, r4, 1
    load_and_align_16 v4, r3, r4, 0

second_pass_4x4_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4

compute_sum_sse_4x4_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    load_and_align_16 v4, r7, r8, 1
    load_and_align_16 v5, r7, r8, 1
    load_and_align_16 v6, r7, r8, 1
    load_and_align_16 v7, r7, r8, 1

    vmrghb  v0, v0, v1
    vmrghb  v1, v2, v3

    vmrghb  v2, v4, v5
    vmrghb  v3, v6, v7

    load_c v10, b_hilo_b, 0, r12, r0

    vperm   v0, v0, v1, v10
    vperm   v1, v2, v3, v10

    compute_sum_sse v0, v1, v18, v19, v20, v21, v23

    variance_final v18, v19, v23, 4

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char  *src_ptr
;# r4 int  src_pixels_per_line
;# r5 int  xoffset
;# r6 int  yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp8_sub_pixel_variance8x8_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xfff0
    ori     r12, r12, 0xffff
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    HProlog second_pass_8x8_pre_copy_b

    ;# Load up permutation constants
    load_c v10, b_0123_b, 0, r12, r0
    load_c v11, b_4567_b, 0, r12, r0

    hfilter_8 v0, v10, v11, 1
    hfilter_8 v1, v10, v11, 1
    hfilter_8 v2, v10, v11, 1
    hfilter_8 v3, v10, v11, 1
    hfilter_8 v4, v10, v11, 1
    hfilter_8 v5, v10, v11, 1
    hfilter_8 v6, v10, v11, 1
    hfilter_8 v7, v10, v11, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to computing the sum and sse.
    ;#  Otherwise load up and filter the additional line that is
    ;#  needed for the vertical filter.
    beq     compute_sum_sse_8x8_b

    hfilter_8 v8, v10, v11, 0

    b   second_pass_8x8_b

second_pass_8x8_pre_copy_b:
    slwi.   r6, r6, 5           ;# index into vertical filter array

    load_and_align_16 v0, r3, r4, 1
    load_and_align_16 v1, r3, r4, 1
    load_and_align_16 v2, r3, r4, 1
    load_and_align_16 v3, r3, r4, 1
    load_and_align_16 v4, r3, r4, 1
    load_and_align_16 v5, r3, r4, 1
    load_and_align_16 v6, r3, r4, 1
    load_and_align_16 v7, r3, r4, 1
    load_and_align_16 v8, r3, r4, 0

    beq     compute_sum_sse_8x8_b

second_pass_8x8_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0, v1
    vfilter_16 v1, v2
    vfilter_16 v2, v3
    vfilter_16 v3, v4
    vfilter_16 v4, v5
    vfilter_16 v5, v6
    vfilter_16 v6, v7
    vfilter_16 v7, v8

compute_sum_sse_8x8_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    vmrghb  v0, v0, v1
    vmrghb  v1, v2, v3
    vmrghb  v2, v4, v5
    vmrghb  v3, v6, v7

    load_and_align_16 v4,  r7, r8, 1
    load_and_align_16 v5,  r7, r8, 1
    load_and_align_16 v6,  r7, r8, 1
    load_and_align_16 v7,  r7, r8, 1
    load_and_align_16 v8,  r7, r8, 1
    load_and_align_16 v9,  r7, r8, 1
    load_and_align_16 v10, r7, r8, 1
    load_and_align_16 v11, r7, r8, 0

    vmrghb  v4, v4,  v5
    vmrghb  v5, v6,  v7
    vmrghb  v6, v8,  v9
    vmrghb  v7, v10, v11

    compute_sum_sse v0, v4, v18, v19, v20, v21, v23
    compute_sum_sse v1, v5, v18, v19, v20, v21, v23
    compute_sum_sse v2, v6, v18, v19, v20, v21, v23
    compute_sum_sse v3, v7, v18, v19, v20, v21, v23

    variance_final v18, v19, v23, 6

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE
    blr

    .align 2
;# r3 unsigned char  *src_ptr
;# r4 int  src_pixels_per_line
;# r5 int  xoffset
;# r6 int  yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp8_sub_pixel_variance8x16_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xfffc
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    HProlog second_pass_8x16_pre_copy_b

    ;# Load up permutation constants
    load_c v29, b_0123_b, 0, r12, r0
    load_c v30, b_4567_b, 0, r12, r0

    hfilter_8 v0,  v29, v30, 1
    hfilter_8 v1,  v29, v30, 1
    hfilter_8 v2,  v29, v30, 1
    hfilter_8 v3,  v29, v30, 1
    hfilter_8 v4,  v29, v30, 1
    hfilter_8 v5,  v29, v30, 1
    hfilter_8 v6,  v29, v30, 1
    hfilter_8 v7,  v29, v30, 1
    hfilter_8 v8,  v29, v30, 1
    hfilter_8 v9,  v29, v30, 1
    hfilter_8 v10, v29, v30, 1
    hfilter_8 v11, v29, v30, 1
    hfilter_8 v12, v29, v30, 1
    hfilter_8 v13, v29, v30, 1
    hfilter_8 v14, v29, v30, 1
    hfilter_8 v15, v29, v30, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to computing the sum and sse.
    ;#  Otherwise load up and filter the additional line that is
    ;#  needed for the vertical filter.
    beq     compute_sum_sse_8x16_b

    hfilter_8 v16, v29, v30, 0

    b   second_pass_8x16_b

second_pass_8x16_pre_copy_b:
    slwi.   r6, r6, 5           ;# index into vertical filter array

    load_and_align_16 v0,  r3, r4, 1
    load_and_align_16 v1,  r3, r4, 1
    load_and_align_16 v2,  r3, r4, 1
    load_and_align_16 v3,  r3, r4, 1
    load_and_align_16 v4,  r3, r4, 1
    load_and_align_16 v5,  r3, r4, 1
    load_and_align_16 v6,  r3, r4, 1
    load_and_align_16 v7,  r3, r4, 1
    load_and_align_16 v8,  r3, r4, 1
    load_and_align_16 v9,  r3, r4, 1
    load_and_align_16 v10, r3, r4, 1
    load_and_align_16 v11, r3, r4, 1
    load_and_align_16 v12, r3, r4, 1
    load_and_align_16 v13, r3, r4, 1
    load_and_align_16 v14, r3, r4, 1
    load_and_align_16 v15, r3, r4, 1
    load_and_align_16 v16, r3, r4, 0

    beq     compute_sum_sse_8x16_b

second_pass_8x16_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4
    vfilter_16 v4,  v5
    vfilter_16 v5,  v6
    vfilter_16 v6,  v7
    vfilter_16 v7,  v8
    vfilter_16 v8,  v9
    vfilter_16 v9,  v10
    vfilter_16 v10, v11
    vfilter_16 v11, v12
    vfilter_16 v12, v13
    vfilter_16 v13, v14
    vfilter_16 v14, v15
    vfilter_16 v15, v16

compute_sum_sse_8x16_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    vmrghb  v0, v0,  v1
    vmrghb  v1, v2,  v3
    vmrghb  v2, v4,  v5
    vmrghb  v3, v6,  v7
    vmrghb  v4, v8,  v9
    vmrghb  v5, v10, v11
    vmrghb  v6, v12, v13
    vmrghb  v7, v14, v15

    load_and_align_16 v8,  r7, r8, 1
    load_and_align_16 v9,  r7, r8, 1
    load_and_align_16 v10, r7, r8, 1
    load_and_align_16 v11, r7, r8, 1
    load_and_align_16 v12, r7, r8, 1
    load_and_align_16 v13, r7, r8, 1
    load_and_align_16 v14, r7, r8, 1
    load_and_align_16 v15, r7, r8, 1

    vmrghb  v8,  v8,  v9
    vmrghb  v9,  v10, v11
    vmrghb  v10, v12, v13
    vmrghb  v11, v14, v15

    compute_sum_sse v0, v8,  v18, v19, v20, v21, v23
    compute_sum_sse v1, v9,  v18, v19, v20, v21, v23
    compute_sum_sse v2, v10, v18, v19, v20, v21, v23
    compute_sum_sse v3, v11, v18, v19, v20, v21, v23

    load_and_align_16 v8,  r7, r8, 1
    load_and_align_16 v9,  r7, r8, 1
    load_and_align_16 v10, r7, r8, 1
    load_and_align_16 v11, r7, r8, 1
    load_and_align_16 v12, r7, r8, 1
    load_and_align_16 v13, r7, r8, 1
    load_and_align_16 v14, r7, r8, 1
    load_and_align_16 v15, r7, r8, 0

    vmrghb  v8,  v8,  v9
    vmrghb  v9,  v10, v11
    vmrghb  v10, v12, v13
    vmrghb  v11, v14, v15

    compute_sum_sse v4, v8,  v18, v19, v20, v21, v23
    compute_sum_sse v5, v9,  v18, v19, v20, v21, v23
    compute_sum_sse v6, v10, v18, v19, v20, v21, v23
    compute_sum_sse v7, v11, v18, v19, v20, v21, v23

    variance_final v18, v19, v23, 7

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE
    blr

;# Filters a horizontal line
;# expects:
;#  r3  src_ptr
;#  r4  pitch
;#  r10 16
;#  r12 32
;#  v17 perm input
;#  v18 rounding
;#  v19 shift
;#  v20 filter taps
;#  v21 tmp
;#  v22 tmp
;#  v23 tmp
;#  v24 tmp
;#  v25 tmp
;#  v26 tmp
;#  v27 tmp
;#  v28 perm output
;#
.macro hfilter_16 V, increment_counter

    lvsl    v17,  0, r3         ;# permute value for alignment

    ;# input to filter is 21 bytes wide, output is 16 bytes.
    ;#  input can span three vectors if not aligned correctly.
    lvx     v21,   0, r3
    lvx     v22, r10, r3
    lvx     v23, r12, r3

.if \increment_counter
    add     r3, r3, r4
.endif
    vperm   v21, v21, v22, v17
    vperm   v22, v22, v23, v17  ;# v21, v22 = 21 input pixels left-justified

    ;# set 0
    vmsummbm v24, v20, v21, v18 ;# taps times elements

    ;# set 1
    vsldoi  v23, v21, v22, 1
    vmsummbm v25, v20, v23, v18

    ;# set 2
    vsldoi  v23, v21, v22, 2
    vmsummbm v26, v20, v23, v18

    ;# set 3
    vsldoi  v23, v21, v22, 3
    vmsummbm v27, v20, v23, v18

    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F

    vsrh    v24, v24, v19       ;# divide by 128
    vsrh    v25, v25, v19

    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
.endm
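
;# hfilter_16 is the 16-wide first-pass filter: the four vsldoi/vmsummbm sets
;#  produce pixels (0,4,8,C), (1,5,9,D), (2,6,A,E) and (3,7,B,F), and the
;#  b_hperm_b permute in v28 restores linear order, so each output byte is
;#  again roughly (tap0*src[i] + tap1*src[i+1] + 64) >> 7.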

    .align 2
;# r3 unsigned char  *src_ptr
;# r4 int  src_pixels_per_line
;# r5 int  xoffset
;# r6 int  yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp8_sub_pixel_variance16x8_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xfff8
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    HProlog second_pass_16x8_pre_copy_b

    hfilter_16 v0, 1
    hfilter_16 v1, 1
    hfilter_16 v2, 1
    hfilter_16 v3, 1
    hfilter_16 v4, 1
    hfilter_16 v5, 1
    hfilter_16 v6, 1
    hfilter_16 v7, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to computing the sum and sse.
    ;#  Otherwise load up and filter the additional line that is
    ;#  needed for the vertical filter.
    beq     compute_sum_sse_16x8_b

    hfilter_16 v8, 0

    b   second_pass_16x8_b

second_pass_16x8_pre_copy_b:
    slwi.   r6, r6, 5           ;# index into vertical filter array

    load_and_align_16  v0,  r3, r4, 1
    load_and_align_16  v1,  r3, r4, 1
    load_and_align_16  v2,  r3, r4, 1
    load_and_align_16  v3,  r3, r4, 1
    load_and_align_16  v4,  r3, r4, 1
    load_and_align_16  v5,  r3, r4, 1
    load_and_align_16  v6,  r3, r4, 1
    load_and_align_16  v7,  r3, r4, 1
    load_and_align_16  v8,  r3, r4, 1

    beq     compute_sum_sse_16x8_b

second_pass_16x8_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4
    vfilter_16 v4,  v5
    vfilter_16 v5,  v6
    vfilter_16 v6,  v7
    vfilter_16 v7,  v8

compute_sum_sse_16x8_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    compute_sum_sse_16 v0, 1
    compute_sum_sse_16 v1, 1
    compute_sum_sse_16 v2, 1
    compute_sum_sse_16 v3, 1
    compute_sum_sse_16 v4, 1
    compute_sum_sse_16 v5, 1
    compute_sum_sse_16 v6, 1
    compute_sum_sse_16 v7, 0

    variance_final v18, v19, v23, 7

    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char  *src_ptr
;# r4 int  src_pixels_per_line
;# r5 int  xoffset
;# r6 int  yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp8_sub_pixel_variance16x16_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xfff8
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    HProlog second_pass_16x16_pre_copy_b

    hfilter_16 v0,  1
    hfilter_16 v1,  1
    hfilter_16 v2,  1
    hfilter_16 v3,  1
    hfilter_16 v4,  1
    hfilter_16 v5,  1
    hfilter_16 v6,  1
    hfilter_16 v7,  1
    hfilter_16 v8,  1
    hfilter_16 v9,  1
    hfilter_16 v10, 1
    hfilter_16 v11, 1
    hfilter_16 v12, 1
    hfilter_16 v13, 1
    hfilter_16 v14, 1
    hfilter_16 v15, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to computing the sum and sse.
    ;#  Otherwise load up and filter the additional line that is
    ;#  needed for the vertical filter.
    beq     compute_sum_sse_16x16_b

    hfilter_16 v16, 0

    b   second_pass_16x16_b

second_pass_16x16_pre_copy_b:
    slwi.   r6, r6, 5           ;# index into vertical filter array

    load_and_align_16  v0,  r3, r4, 1
    load_and_align_16  v1,  r3, r4, 1
    load_and_align_16  v2,  r3, r4, 1
    load_and_align_16  v3,  r3, r4, 1
    load_and_align_16  v4,  r3, r4, 1
    load_and_align_16  v5,  r3, r4, 1
    load_and_align_16  v6,  r3, r4, 1
    load_and_align_16  v7,  r3, r4, 1
    load_and_align_16  v8,  r3, r4, 1
    load_and_align_16  v9,  r3, r4, 1
    load_and_align_16  v10, r3, r4, 1
    load_and_align_16  v11, r3, r4, 1
    load_and_align_16  v12, r3, r4, 1
    load_and_align_16  v13, r3, r4, 1
    load_and_align_16  v14, r3, r4, 1
    load_and_align_16  v15, r3, r4, 1
    load_and_align_16  v16, r3, r4, 0

    beq     compute_sum_sse_16x16_b

second_pass_16x16_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4
    vfilter_16 v4,  v5
    vfilter_16 v5,  v6
    vfilter_16 v6,  v7
    vfilter_16 v7,  v8
    vfilter_16 v8,  v9
    vfilter_16 v9,  v10
    vfilter_16 v10, v11
    vfilter_16 v11, v12
    vfilter_16 v12, v13
    vfilter_16 v13, v14
    vfilter_16 v14, v15
    vfilter_16 v15, v16

compute_sum_sse_16x16_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    compute_sum_sse_16 v0,  1
    compute_sum_sse_16 v1,  1
    compute_sum_sse_16 v2,  1
    compute_sum_sse_16 v3,  1
    compute_sum_sse_16 v4,  1
    compute_sum_sse_16 v5,  1
    compute_sum_sse_16 v6,  1
    compute_sum_sse_16 v7,  1
    compute_sum_sse_16 v8,  1
    compute_sum_sse_16 v9,  1
    compute_sum_sse_16 v10, 1
    compute_sum_sse_16 v11, 1
    compute_sum_sse_16 v12, 1
    compute_sum_sse_16 v13, 1
    compute_sum_sse_16 v14, 1
    compute_sum_sse_16 v15, 0

    variance_final v18, v19, v23, 8

    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .data

    .align 4
hfilter_b:
    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0
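
;# hfilter_b holds eight 16-byte rows of horizontal taps, one per xoffset in
;#  eighth-pel steps; each (tap0, tap1, 0, 0) group sums to 128, and HProlog
;#  indexes the table with r5 = xoffset * 16.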

    .align 4
vfilter_b:
    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
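
;# vfilter_b holds eight pairs of splatted 16-byte rows (first tap, then
;#  second tap) for the vertical filter; each pair sums to 128 and
;#  load_vfilter indexes the table with r6 = yoffset * 32.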

    .align 4
b_hperm_b:
    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15

    .align 4
b_0123_b:
    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6

    .align 4
b_4567_b:
    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10

b_hilo_b:
    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23