;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    .globl mbloop_filter_horizontal_edge_y_ppc
    .globl loop_filter_horizontal_edge_y_ppc
    .globl mbloop_filter_vertical_edge_y_ppc
    .globl loop_filter_vertical_edge_y_ppc

    .globl mbloop_filter_horizontal_edge_uv_ppc
    .globl loop_filter_horizontal_edge_uv_ppc
    .globl mbloop_filter_vertical_edge_uv_ppc
    .globl loop_filter_vertical_edge_uv_ppc

    .globl loop_filter_simple_horizontal_edge_ppc
    .globl loop_filter_simple_vertical_edge_ppc

    .text
;# We often need to perform transposes (and other transpose-like operations)
;#   on matrices of data.  This is simplified by the fact that we usually
;#   operate on hunks of data whose dimensions are powers of 2, or at least
;#   divisible by highish powers of 2.
;#
;#   These operations can be very confusing.  They become more straightforward
;#   when we think of them as permutations of address bits: Concatenate a
;#   group of vector registers and think of it as occupying a block of
;#   memory beginning at address zero.  The low four bits 0...3 of the
;#   address then correspond to position within a register, the higher-order
;#   address bits select the register.
;#
;#   Although register selection, at the code level, is arbitrary, things
;#   are simpler if we use contiguous ranges of register numbers, simpler
;#   still if the low-order bits of the register number correspond to
;#   conceptual address bits.  We do this whenever reasonable.
;#
;#   A 16x16 transpose can then be thought of as an operation on
;#   a 256-element block of memory.  It takes 8 bits 0...7 to address this
;#   memory and the effect of a transpose is to interchange address bit
;#   0 with 4, 1 with 5, 2 with 6, and 3 with 7.  Bits 0...3 index the
;#   column, which is interchanged with the row addressed by bits 4..7.
;#
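;#   As a concrete illustration (a hypothetical C sketch, not part of
;#   this file), transposing a 16x16 byte matrix amounts to swapping
;#   the low and high nibbles of each 8-bit element index:
;#
;#      #include <stdint.h>
;#      /* out[j] = in[i], where j swaps index bits 0..3 with 4..7 */
;#      void transpose16x16(const uint8_t in[256], uint8_t out[256]) {
;#          for (int i = 0; i < 256; i++) {
;#              int j = ((i & 0x0f) << 4) | (i >> 4);
;#              out[j] = in[i];
;#          }
;#      }
;#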
;#   The altivec merge instructions provide a rapid means of effecting
;#   many of these transforms.  They operate at three widths (8,16,32).
;#   Writing V(x) for vector register #x, paired merges permute address
;#   indices as follows.
;#
;#   0->1  1->2  2->3  3->(4+d)  (4+s)->0:
;#
;#      vmrghb  V( x),          V( y), V( y + (1<<s))
;#      vmrglb  V( x + (1<<d)), V( y), V( y + (1<<s))
;#
;#
;#   =0=   1->2  2->3  3->(4+d)  (4+s)->1:
;#
;#      vmrghh  V( x),          V( y), V( y + (1<<s))
;#      vmrglh  V( x + (1<<d)), V( y), V( y + (1<<s))
;#
;#
;#   =0=   =1=   2->3  3->(4+d)  (4+s)->2:
;#
;#      vmrghw  V( x),          V( y), V( y + (1<<s))
;#      vmrglw  V( x + (1<<d)), V( y), V( y + (1<<s))
;#
;#
;#   Unfortunately, there is no doubleword merge instruction.
;#   The following sequence uses "vperm" as a substitute.
;#   Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c)
;#   are in registers Vhihi and Vlolo, we can also effect the permutation
;#
;#   =0=   =1=   =2=   3->(4+d)  (4+s)->3   by the sequence:
;#
;#      vperm   V( x),          V( y), V( y + (1<<s)), Vhihi
;#      vperm   V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo
;#
;#
;#   Except for bits s and d, the other relationships between register
;#   number (= high-order part of address) bits are at the disposal of
;#   the programmer.
;#

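;# As an illustration (a hypothetical C model, not part of this file),
;# the byte merges interleave their two sources; the old position bits
;# move up one place and the choice of source register (old address
;# bit 4+s) becomes the new bit 0.  Using AltiVec element order:
;#
;#      #include <stdint.h>
;#      /* vmrghb d,a,b : d = a0,b0,a1,b1, ... ,a7,b7 */
;#      void vmrghb(uint8_t d[16], const uint8_t a[16], const uint8_t b[16]) {
;#          for (int i = 0; i < 8; i++) { d[2*i] = a[i]; d[2*i+1] = b[i]; }
;#      }
;#      /* vmrglb d,a,b : d = a8,b8,a9,b9, ... ,a15,b15 */
;#      void vmrglb(uint8_t d[16], const uint8_t a[16], const uint8_t b[16]) {
;#          for (int i = 0; i < 8; i++) { d[2*i] = a[8+i]; d[2*i+1] = b[8+i]; }
;#      }
;#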
;# To avoid excess transposes, we filter all 3 vertical luma subblock
;#   edges together.  This requires a single 16x16 transpose, which, in
;#   the above language, amounts to the following permutation of address
;#   indices:  0<->4   1<->5  2<->6  3<->7, which we accomplish by
;#   4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
;#
;#   Except for the fact that the destination registers get written
;#   before we are done referencing the old contents, the cyclic transform
;#   is effected by
;#
;#      x = 0;  do {
;#          vmrghb V(2x),   V(x), V(x+8);
;#          vmrglb V(2x+1), V(x), V(x+8);
;#      } while( ++x < 8);
;#
;#   For clarity, and because we can afford it, we do this transpose
;#   using all 32 registers, alternating the banks 0..15  and  16 .. 31,
;#   leaving the final result in 16 .. 31, as the lower registers are
;#   used in the filtering itself.
;#
.macro Tpair A, B, X, Y
    vmrghb  \A, \X, \Y
    vmrglb  \B, \X, \Y
.endm

;# Each step takes 8*2 = 16 instructions

.macro t16_even
    Tpair v16,v17,  v0,v8
    Tpair v18,v19,  v1,v9
    Tpair v20,v21,  v2,v10
    Tpair v22,v23,  v3,v11
    Tpair v24,v25,  v4,v12
    Tpair v26,v27,  v5,v13
    Tpair v28,v29,  v6,v14
    Tpair v30,v31,  v7,v15
.endm

.macro t16_odd
    Tpair v0,v1, v16,v24
    Tpair v2,v3, v17,v25
    Tpair v4,v5, v18,v26
    Tpair v6,v7, v19,v27
    Tpair v8,v9, v20,v28
    Tpair v10,v11, v21,v29
    Tpair v12,v13, v22,v30
    Tpair v14,v15, v23,v31
.endm

;# Whole transpose takes 4*16 = 64 instructions

.macro t16_full
    t16_odd
    t16_even
    t16_odd
    t16_even
.endm
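
;# A hypothetical C check of the claim above (not part of this file):
;# each t16 pass sends the element at index i to index rotl8(i, 1), a
;# one-bit left rotation of the 8 address bits, so four passes give
;# rotl8(i, 4), exactly the nibble swap a 16x16 transpose performs:
;#
;#      #include <assert.h>
;#      #include <stdint.h>
;#      static uint8_t rotl8(uint8_t i, int n) {
;#          return (uint8_t)((i << n) | (i >> (8 - n)));
;#      }
;#      void check_transpose(void) {
;#          for (int i = 0; i < 256; i++)
;#              assert(rotl8((uint8_t)i, 4) ==
;#                     (uint8_t)(((i & 0x0f) << 4) | (i >> 4)));
;#      }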

;# Vertical edge filtering requires transposes.  For the simple filter,
;#   we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
;#   each.  Writing 0 ... 63 for the pixel indices, the desired result is:
;#
;#  v0 =  0  1 ... 14 15
;#  v1 = 16 17 ... 30 31
;#  v2 = 32 33 ... 46 47
;#  v3 = 48 49 ... 62 63
;#
;#  In frame-buffer memory, the layout is:
;#
;#     0  16  32  48
;#     1  17  33  49
;#     ...
;#    15  31  47  63.
;#
;#  We begin by reading the data 32 bits at a time (using scalar operations)
;#  into a temporary array, then reading the rows of the array into vector
;#  registers, with the following layout:
;#
;#  v0 =  0 16 32 48  4 20 36 52  8 24 40 56  12 28 44 60
;#  v1 =  1 17 33 49  5 21 ...                      45 61
;#  v2 =  2 18 ...                                  46 62
;#  v3 =  3 19 ...                                  47 63
;#
;#  From the "address-bit" perspective discussed above, we simply need to
;#  interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone.
;#  In other words, we transpose each of the four 4x4 submatrices.
;#
;#  This transformation is its own inverse, and we need to perform it
;#  again before writing the pixels back into the frame buffer.
;#
;#  It acts in place on registers v0...v3, uses v4...v7 as temporaries,
;#  and assumes that v14/v15 contain the b_hihi/b_lolo selectors
;#  defined above.  We think of both groups of 4 registers as having
;#  "addresses" {0,1,2,3} * 16.
;#
.macro Transpose4times4x4 Vlo, Vhi

    ;# d=s=0        0->1  1->2  2->3  3->4  4->0  =5=

    vmrghb  v4, v0, v1
    vmrglb  v5, v0, v1
    vmrghb  v6, v2, v3
    vmrglb  v7, v2, v3

    ;# d=0 s=1      =0=   1->2  2->3  3->4  4->5  5->1

    vmrghh  v0, v4, v6
    vmrglh  v1, v4, v6
    vmrghh  v2, v5, v7
    vmrglh  v3, v5, v7

    ;# d=s=0        =0=   =1=   2->3  3->4  4->2  =5=

    vmrghw  v4, v0, v1
    vmrglw  v5, v0, v1
    vmrghw  v6, v2, v3
    vmrglw  v7, v2, v3

    ;# d=0  s=1     =0=   =1=   =2=   3->4  4->5  5->3

    vperm   v0, v4, v6, \Vlo
    vperm   v1, v4, v6, \Vhi
    vperm   v2, v5, v7, \Vlo
    vperm   v3, v5, v7, \Vhi
.endm
;# end Transpose4times4x4
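
;# In scalar terms (a hypothetical C model, not part of this file),
;# Transpose4times4x4 permutes the 6-bit pel index by swapping bits
;# 0<->4 and 1<->5, transposing each 4x4 submatrix:
;#
;#      #include <stdint.h>
;#      void t4x4x4(const uint8_t in[64], uint8_t out[64]) {
;#          for (int i = 0; i < 64; i++) {
;#              int j = (i & 0x0c)            /* bits 2,3 unchanged */
;#                    | ((i & 0x03) << 4)     /* bits 0,1 -> 4,5    */
;#                    | ((i >> 4) & 0x03);    /* bits 4,5 -> 0,1    */
;#              out[j] = in[i];
;#          }
;#      }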


;# Normal mb vertical edge filter transpose.
;#
;#   We read 8 columns of data, initially in the following pattern:
;#
;#  (0,0)  (1,0) ... (7,0)  (0,1)  (1,1) ... (7,1)
;#  (0,2)  (1,2) ... (7,2)  (0,3)  (1,3) ... (7,3)
;#  ...
;#  (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15)
;#
;#   and wish to convert to:
;#
;#  (0,0) ... (0,15)
;#  (1,0) ... (1,15)
;#  ...
;#  (7,0) ... (7,15).
;#
;#  In "address bit" language, we wish to map
;#
;#  0->4  1->5  2->6  3->0  4->1  5->2  6->3, i.e., I -> (I+4) mod 7.
;#
;#  This can be accomplished by 4 iterations of the cyclic transform
;#
;#  I -> (I+1) mod 7;
;#
;#  each iteration can be realized by (d=0, s=2):
;#
;#  x = 0;  do  Tpair( V(2x),V(2x+1),  V(x),V(x+4))  while( ++x < 4);
;#
;#  The input/output is in registers v0...v7.  We use v10...v17 as mirrors;
;#  preserving v8 = sign converter.
;#
;#  Inverse transpose is similar, except here I -> (I+3) mod 7 and the
;#  result lands in the "mirror" registers v10...v17
;#
.macro t8x16_odd
    Tpair v10, v11,  v0, v4
    Tpair v12, v13,  v1, v5
    Tpair v14, v15,  v2, v6
    Tpair v16, v17,  v3, v7
.endm

.macro t8x16_even
    Tpair v0, v1,  v10, v14
    Tpair v2, v3,  v11, v15
    Tpair v4, v5,  v12, v16
    Tpair v6, v7,  v13, v17
.endm

.macro transpose8x16_fwd
    t8x16_odd
    t8x16_even
    t8x16_odd
    t8x16_even
.endm

.macro transpose8x16_inv
    t8x16_odd
    t8x16_even
    t8x16_odd
.endm

.macro Transpose16x16
    vmrghb  v0, v16, v24
    vmrglb  v1, v16, v24
    vmrghb  v2, v17, v25
    vmrglb  v3, v17, v25
    vmrghb  v4, v18, v26
    vmrglb  v5, v18, v26
    vmrghb  v6, v19, v27
    vmrglb  v7, v19, v27
    vmrghb  v8, v20, v28
    vmrglb  v9, v20, v28
    vmrghb  v10, v21, v29
    vmrglb  v11, v21, v29
    vmrghb  v12, v22, v30
    vmrglb  v13, v22, v30
    vmrghb  v14, v23, v31
    vmrglb  v15, v23, v31
    vmrghb  v16, v0, v8
    vmrglb  v17, v0, v8
    vmrghb  v18, v1, v9
    vmrglb  v19, v1, v9
    vmrghb  v20, v2, v10
    vmrglb  v21, v2, v10
    vmrghb  v22, v3, v11
    vmrglb  v23, v3, v11
    vmrghb  v24, v4, v12
    vmrglb  v25, v4, v12
    vmrghb  v26, v5, v13
    vmrglb  v27, v5, v13
    vmrghb  v28, v6, v14
    vmrglb  v29, v6, v14
    vmrghb  v30, v7, v15
    vmrglb  v31, v7, v15
    vmrghb  v0, v16, v24
    vmrglb  v1, v16, v24
    vmrghb  v2, v17, v25
    vmrglb  v3, v17, v25
    vmrghb  v4, v18, v26
    vmrglb  v5, v18, v26
    vmrghb  v6, v19, v27
    vmrglb  v7, v19, v27
    vmrghb  v8, v20, v28
    vmrglb  v9, v20, v28
    vmrghb  v10, v21, v29
    vmrglb  v11, v21, v29
    vmrghb  v12, v22, v30
    vmrglb  v13, v22, v30
    vmrghb  v14, v23, v31
    vmrglb  v15, v23, v31
    vmrghb  v16, v0, v8
    vmrglb  v17, v0, v8
    vmrghb  v18, v1, v9
    vmrglb  v19, v1, v9
    vmrghb  v20, v2, v10
    vmrglb  v21, v2, v10
    vmrghb  v22, v3, v11
    vmrglb  v23, v3, v11
    vmrghb  v24, v4, v12
    vmrglb  v25, v4, v12
    vmrghb  v26, v5, v13
    vmrglb  v27, v5, v13
    vmrghb  v28, v6, v14
    vmrglb  v29, v6, v14
    vmrghb  v30, v7, v15
    vmrglb  v31, v7, v15
.endm

;# load_g loads a global vector (whose address is in the local variable Gptr)
;#   into vector register Vreg.  Trashes r0
.macro load_g Vreg, Gptr
    lwz     r0, \Gptr
    lvx     \Vreg, 0, r0
.endm

;# Exploit the saturation here: if the difference is negative it is
;# clamped to 0, and ORing 0 with the positive difference yields the
;# absolute value (abs).
;# RES = abs( A-B), trashes TMP
.macro Abs RES, TMP, A, B
    vsububs \RES, \A, \B
    vsububs \TMP, \B, \A
    vor     \RES, \RES, \TMP
.endm
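
;# A hypothetical scalar model of the trick (not part of this file):
;#
;#      #include <stdint.h>
;#      /* unsigned saturating subtract clamps the negative direction
;#         to 0, so ORing both directions yields |a - b| */
;#      static uint8_t subus(uint8_t a, uint8_t b) {
;#          return a > b ? (uint8_t)(a - b) : 0;
;#      }
;#      static uint8_t abs_diff(uint8_t a, uint8_t b) {
;#          return subus(a, b) | subus(b, a);
;#      }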

;# RES = Max( RES, abs( A-B)), trashes TMP
.macro max_abs RES, TMP, A, B
    vsububs \TMP, \A, \B
    vmaxub  \RES, \RES, \TMP
    vsububs \TMP, \B, \A
    vmaxub  \RES, \RES, \TMP
.endm

.macro Masks
    ;# build masks
    ;# The input is all 8-bit unsigned (0-255).  We need to test
    ;# abs(vala-valb) > limit, but there is no need to compare each
    ;# difference to the limit: find the max of the absolute
    ;# differences and compare that to the limit.
    ;# First hev
    Abs     v14, v13, v2, v3    ;# |P1 - P0|
    max_abs  v14, v13, v5, v4    ;# |Q1 - Q0|

    vcmpgtub v10, v14, v10      ;# HEV = true if thresh exceeded

    ;# Next limit
    max_abs  v14, v13, v0, v1    ;# |P3 - P2|
    max_abs  v14, v13, v1, v2    ;# |P2 - P1|
    max_abs  v14, v13, v6, v5    ;# |Q2 - Q1|
    max_abs  v14, v13, v7, v6    ;# |Q3 - Q2|

    vcmpgtub v9, v14, v9        ;# R = true if limit exceeded

    ;# flimit
    Abs     v14, v13, v3, v4    ;# |P0 - Q0|

    vcmpgtub v8, v14, v8        ;# X = true if flimit exceeded

    vor     v8, v8, v9          ;# R = true if flimit or limit exceeded
    ;# done building masks
.endm
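
;# A hypothetical scalar model of Masks for one pel position (not part
;# of this file; abs_diff as in the sketch above).  The filter is
;# later applied only where *skip is false; note that the limit test
;# reuses the running max, so it also covers |P1-P0| and |Q1-Q0|:
;#
;#      static uint8_t max8(uint8_t a, uint8_t b) { return a > b ? a : b; }
;#      static void masks(uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
;#                        uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3,
;#                        uint8_t flimit, uint8_t limit, uint8_t thresh,
;#                        int *hev, int *skip) {
;#          uint8_t m = max8(abs_diff(p1, p0), abs_diff(q1, q0));
;#          *hev = m > thresh;
;#          m = max8(m, abs_diff(p3, p2));
;#          m = max8(m, abs_diff(p2, p1));
;#          m = max8(m, abs_diff(q2, q1));
;#          m = max8(m, abs_diff(q3, q2));
;#          *skip = (m > limit) || (abs_diff(p0, q0) > flimit);
;#      }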

.macro build_constants RFL, RLI, RTH, FL, LI, TH
    ;# build constants
    lvx     \FL, 0, \RFL        ;# flimit
    lvx     \LI, 0, \RLI        ;# limit
    lvx     \TH, 0, \RTH        ;# thresh

    vspltisb v11, 8
    vspltisb v12, 4
    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
.endm

.macro load_data_y
    ;# setup strides/pointers to be able to access
    ;# all of the data
    add     r5, r4, r4          ;# r5 = 2 * stride
    sub     r6, r3, r5          ;# r6 -> 2 rows back
    neg     r7, r4              ;# r7 = -stride

    ;# load 16 pixels worth of data to work on
    sub     r0, r6, r5          ;# r0 -> 4 rows back (temp)
    lvx     v0,  0, r0          ;# P3  (read only)
    lvx     v1, r7, r6          ;# P2
    lvx     v2,  0, r6          ;# P1
    lvx     v3, r7, r3          ;# P0
    lvx     v4,  0, r3          ;# Q0
    lvx     v5, r4, r3          ;# Q1
    lvx     v6, r5, r3          ;# Q2
    add     r0, r3, r5          ;# r0 -> 2 rows fwd (temp)
    lvx     v7, r4, r0          ;# Q3  (read only)
.endm

;# Expects
;#  v10 == HEV
;#  v13 == tmp
;#  v14 == tmp
.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
    vxor    \P1, \P1, v11       ;# SP1
    vxor    \P0, \P0, v11       ;# SP0
    vxor    \Q0, \Q0, v11       ;# SQ0
    vxor    \Q1, \Q1, v11       ;# SQ1

    vsubsbs v13, \P1, \Q1       ;# f  = c (P1 - Q1)
.if \HEV_PRESENT
    vand    v13, v13, v10       ;# f &= hev
.endif
    vsubsbs v14, \Q0, \P0       ;# -126 <=  X = Q0-P0  <= +126
    vaddsbs v13, v13, v14
    vaddsbs v13, v13, v14
    vaddsbs v13, v13, v14       ;# A = c( c(P1-Q1) + 3*(Q0-P0))

    vandc   v13, v13, v8        ;# f &= mask

    vspltisb v8, 3
    vspltisb v9, 4

    vaddsbs v14, v13, v9        ;# f1 = c (f+4)
    vaddsbs v15, v13, v8        ;# f2 = c (f+3)

    vsrab   v13, v14, v8        ;# f1 >>= 3
    vsrab   v15, v15, v8        ;# f2 >>= 3

    vsubsbs \Q0, \Q0, v13       ;# u1 = c (SQ0 - f1)
    vaddsbs \P0, \P0, v15       ;# u2 = c (SP0 + f2)
.endm
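
;# A hypothetical scalar model of common_adjust for one pel column
;# (not part of this file).  The macro sign-converts the pels
;# (x ^ 0x80) on entry; here p1/q1 arrive already converted.  csb()
;# is signed-byte saturation, and '>>' on a negative value is assumed
;# arithmetic, matching vsrab:
;#
;#      #include <stdint.h>
;#      static int8_t csb(int v) {
;#          return (int8_t)(v > 127 ? 127 : v < -128 ? -128 : v);
;#      }
;#      static void common_adjust(int8_t *p0, int8_t *q0,
;#                                int8_t p1, int8_t q1,
;#                                int hev_present, int hev, int skip) {
;#          int8_t f = csb(p1 - q1);          /* f  = c (P1 - Q1)       */
;#          if (hev_present && !hev) f = 0;   /* f &= hev               */
;#          int8_t d = csb(*q0 - *p0);
;#          f = csb(f + d);
;#          f = csb(f + d);
;#          f = csb(f + d);                   /* c( c(P1-Q1)+3*(Q0-P0)) */
;#          if (skip) f = 0;                  /* f &= mask              */
;#          int8_t f1 = (int8_t)(csb(f + 4) >> 3);
;#          int8_t f2 = (int8_t)(csb(f + 3) >> 3);
;#          *q0 = csb(*q0 - f1);
;#          *p0 = csb(*p0 + f2);
;#      }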

.macro vp8_mbfilter
    Masks

    ;# start the filtering here
    vxor    v1, v1, v11         ;# SP2
    vxor    v2, v2, v11         ;# SP1
    vxor    v3, v3, v11         ;# SP0
    vxor    v4, v4, v11         ;# SQ0
    vxor    v5, v5, v11         ;# SQ1
    vxor    v6, v6, v11         ;# SQ2

    ;# add outer taps if we have high edge variance
    vsubsbs v13, v2, v5         ;# f  = c (SP1-SQ1)

    vsubsbs v14, v4, v3         ;# SQ0-SP0
    vaddsbs v13, v13, v14
    vaddsbs v13, v13, v14
    vaddsbs v13, v13, v14       ;# f  = c( c(SP1-SQ1) + 3*(SQ0-SP0))

    vandc   v13, v13, v8        ;# f &= mask
    vand    v15, v13, v10       ;# f2 = f & hev

    ;# save bottom 3 bits so that we round one side +4 and the other +3
    vspltisb v8, 3
    vspltisb v9, 4

    vaddsbs v14, v15, v9        ;# f1 = c (f+4)
    vaddsbs v15, v15, v8        ;# f2 = c (f+3)

    vsrab   v14, v14, v8        ;# f1 >>= 3
    vsrab   v15, v15, v8        ;# f2 >>= 3

    vsubsbs v4, v4, v14         ;# u1 = c (SQ0 - f1)
    vaddsbs v3, v3, v15         ;# u2 = c (SP0 + f2)

    ;# only apply wider filter if not high edge variance
    vandc   v13, v13, v10       ;# f &= ~hev

    vspltisb v9, 2
    vnor    v8, v8, v8
    vsrb    v9, v8, v9          ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
    vupkhsb v9, v9              ;# 0x003f003f003f003f003f003f003f003f
    vspltisb v8, 9

    ;# roughly 1/7th difference across boundary
    vspltish v10, 7
    vmulosb v14, v8, v13        ;# 9 * f, odd bytes -> halfwords
    vmulesb v15, v8, v13        ;# 9 * f, even bytes -> halfwords
    vaddshs v14, v14, v9        ;# +=  63
    vaddshs v15, v15, v9
    vsrah   v14, v14, v10       ;# >>= 7
    vsrah   v15, v15, v10
    vmrglh  v10, v15, v14
    vmrghh  v15, v15, v14

    vpkshss v10, v15, v10       ;# X = saturated down to bytes

    vsubsbs v6, v6, v10         ;# subtract from Q and add to P
    vaddsbs v1, v1, v10

    vxor    v6, v6, v11
    vxor    v1, v1, v11

    ;# roughly 2/7th difference across boundary
    vspltish v10, 7
    vaddubm v12, v8, v8
    vmulosb v14, v12, v13       ;# 18 * f, odd bytes -> halfwords
    vmulesb v15, v12, v13       ;# 18 * f, even bytes -> halfwords
    vaddshs v14, v14, v9
    vaddshs v15, v15, v9
    vsrah   v14, v14, v10       ;# >>= 7
    vsrah   v15, v15, v10
    vmrglh  v10, v15, v14
    vmrghh  v15, v15, v14

    vpkshss v10, v15, v10       ;# X = saturated down to bytes

    vsubsbs v5, v5, v10         ;# subtract from Q and add to P
    vaddsbs v2, v2, v10

    vxor    v5, v5, v11
    vxor    v2, v2, v11

    ;# roughly 3/7th difference across boundary
    vspltish v10, 7
    vaddubm v12, v12, v8
    vmulosb v14, v12, v13       ;# 27 * f, odd bytes -> halfwords
    vmulesb v15, v12, v13       ;# 27 * f, even bytes -> halfwords
    vaddshs v14, v14, v9
    vaddshs v15, v15, v9
    vsrah   v14, v14, v10       ;# >>= 7
    vsrah   v15, v15, v10
    vmrglh  v10, v15, v14
    vmrghh  v15, v15, v14

    vpkshss v10, v15, v10       ;# X = saturated down to bytes

    vsubsbs v4, v4, v10         ;# subtract from Q and add to P
    vaddsbs v3, v3, v10

    vxor    v4, v4, v11
    vxor    v3, v3, v11
.endm
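
;# A hypothetical scalar model of the three wide taps above (not part
;# of this file; csb as in the common_adjust sketch).  With f the
;# clamped, non-hev filter value, the pairs (P2,Q2), (P1,Q1), (P0,Q0)
;# move toward each other by u(k) for k = 1, 2, 3:
;#
;#      static int8_t wide_tap(int k, int8_t f) {   /* k = 1, 2, 3 */
;#          /* 9k*f is formed in 16-bit halfwords (vmulosb/vmulesb),
;#             biased by 63, arithmetically shifted, then saturated
;#             back down to bytes (vpkshss) */
;#          return csb((9 * k * f + 63) >> 7);
;#      }
;#
;#      /* e.g. for k = 1:  q2 = csb(q2 - wide_tap(1, f));
;#                          p2 = csb(p2 + wide_tap(1, f));  */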

.macro SBFilter
    Masks

    common_adjust v3, v4, v2, v5, 1

    ;# outer tap adjustments
    vspltisb v8, 1

    vaddubm v13, v13, v8        ;# f  += 1
    vsrab   v13, v13, v8        ;# f >>= 1

    vandc   v13, v13, v10       ;# f &= ~hev

    vsubsbs v5, v5, v13         ;# u1 = c (SQ1 - f)
    vaddsbs v2, v2, v13         ;# u2 = c (SP1 + f)

    vxor    v2, v2, v11
    vxor    v3, v3, v11
    vxor    v4, v4, v11
    vxor    v5, v5, v11
.endm
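
;# A hypothetical scalar model of the outer-tap step (not part of this
;# file; csb as above).  common_adjust leaves f1 = c(f+4)>>3 in v13;
;# P1/Q1 are then moved by (f1 + 1) >> 1 wherever hev is false:
;#
;#      static void outer_taps(int8_t *p1, int8_t *q1, int8_t f1, int hev) {
;#          int8_t u = (int8_t)((f1 + 1) >> 1);   /* vaddubm, vsrab by 1 */
;#          if (hev) u = 0;                       /* f &= ~hev           */
;#          *q1 = csb(*q1 - u);
;#          *p1 = csb(*p1 + u);
;#      }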

    .align 2
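;#  r3 unsigned char *s
;#  r4 int p
;#  r5 const signed char *flimit
;#  r6 const signed char *limit
;#  r7 const signed char *thresh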
mbloop_filter_horizontal_edge_y_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    mtspr   256, r12            ;# set VRSAVE

    build_constants r5, r6, r7, v8, v9, v10

    load_data_y

    vp8_mbfilter

    stvx     v1, r7, r6         ;# P2
    stvx     v2,  0, r6         ;# P1
    stvx     v3, r7, r3         ;# P0
    stvx     v4,  0, r3         ;# Q0
    stvx     v5, r4, r3         ;# Q1
    stvx     v6, r5, r3         ;# Q2

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;#  r3 unsigned char *s
;#  r4 int p
;#  r5 const signed char *flimit
;#  r6 const signed char *limit
;#  r7 const signed char *thresh
loop_filter_horizontal_edge_y_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    mtspr   256, r12            ;# set VRSAVE

    build_constants r5, r6, r7, v8, v9, v10

    load_data_y

    SBFilter

    stvx     v2,  0, r6         ;# P1
    stvx     v3, r7, r3         ;# P0
    stvx     v4,  0, r3         ;# Q0
    stvx     v5, r4, r3         ;# Q1

    mtspr   256, r11            ;# reset old VRSAVE

    blr

;# Filtering a vertical mb.  Each mb is aligned on a 16 byte boundary,
;#  so we can read in an entire mb aligned.  However, if we want to filter the mb
;#  edge we run into problems.  For the loopfilter we require 4 bytes before the mb
;#  and 4 after, for a total of 8 bytes.  Reading 16 bytes in order to get 4 is a bit
;#  of a waste.  So this is an even uglier way to get around that.
;# Using the regular register file, words are read in and then saved back out to
;#  memory to align and order them up.  Then they are read in using the
;#  vector register file.
.macro RLVmb V, R
    lwzux   r0, r3, r4
    stw     r0, 4(\R)
    lwz     r0,-4(r3)
    stw     r0, 0(\R)
    lwzux   r0, r3, r4
    stw     r0,12(\R)
    lwz     r0,-4(r3)
    stw     r0, 8(\R)
    lvx     \V, 0, \R
.endm

.macro WLVmb V, R
    stvx    \V, 0, \R
    lwz     r0,12(\R)
    stwux   r0, r3, r4
    lwz     r0, 8(\R)
    stw     r0,-4(r3)
    lwz     r0, 4(\R)
    stwux   r0, r3, r4
    lwz     r0, 0(\R)
    stw     r0,-4(r3)
.endm

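;# A hypothetical C sketch of the staging trick (not part of this
;# file): each RLVmb gathers two rows of the 8 pels straddling the mb
;# edge (4 before, 4 after) into an aligned 16-byte buffer with scalar
;# word loads/stores, so a single aligned lvx can pick them up:
;#
;#      #include <stdint.h>
;#      #include <string.h>
;#      static void rlvmb(uint8_t *buf16,        /* 16-byte aligned     */
;#                        const uint8_t *row0,   /* -> mb edge, row n   */
;#                        const uint8_t *row1) { /* -> mb edge, row n+1 */
;#          memcpy(buf16 + 0, row0 - 4, 8);
;#          memcpy(buf16 + 8, row1 - 4, 8);
;#      }
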
    .align 2
;#  r3 unsigned char *s
;#  r4 int p
;#  r5 const signed char *flimit
;#  r6 const signed char *limit
;#  r7 const signed char *thresh
mbloop_filter_vertical_edge_y_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xc000
    mtspr   256, r12            ;# set VRSAVE

    la      r9, -48(r1)         ;# temporary space for reading in vectors
    sub     r3, r3, r4

    RLVmb v0, r9
    RLVmb v1, r9
    RLVmb v2, r9
    RLVmb v3, r9
    RLVmb v4, r9
    RLVmb v5, r9
    RLVmb v6, r9
    RLVmb v7, r9

    transpose8x16_fwd

    build_constants r5, r6, r7, v8, v9, v10

    vp8_mbfilter

    transpose8x16_inv

    add r3, r3, r4
    neg r4, r4

    WLVmb v17, r9
    WLVmb v16, r9
    WLVmb v15, r9
    WLVmb v14, r9
    WLVmb v13, r9
    WLVmb v12, r9
    WLVmb v11, r9
    WLVmb v10, r9

    mtspr   256, r11            ;# reset old VRSAVE

    blr

.macro RL V, R, P
    lvx     \V, 0,  \R
    add     \R, \R, \P
.endm

.macro WL V, R, P
    stvx    \V, 0,  \R
    add     \R, \R, \P
.endm

.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3
                                ;# K = |P0-P1| already
    Abs     v14, v13, \Q0, \Q1  ;# M = |Q0-Q1|
    vmaxub  v14, v14, v4        ;# M = max( |P0-P1|, |Q0-Q1|)
    vcmpgtub v10, v14, v0

    Abs     v4, v5, \Q2, \Q3    ;# K = |Q2-Q3| = next |P0-P1|

    max_abs  v14, v13, \Q1, \Q2  ;# M = max( M, |Q1-Q2|)
    max_abs  v14, v13, \P1, \P2  ;# M = max( M, |P1-P2|)
    max_abs  v14, v13, \P2, \P3  ;# M = max( M, |P2-P3|)

    vmaxub   v14, v14, v4       ;# M = max interior abs diff
    vcmpgtub v9, v14, v2        ;# M = true if int_l exceeded

    Abs     v14, v13, \P0, \Q0  ;# X = Abs( P0-Q0)
    vcmpgtub v8, v14, v3        ;# X = true if edge_l exceeded
    vor     v8, v8, v9          ;# M = true if edge_l or int_l exceeded

    ;# replace P1,Q1 w/signed versions
    common_adjust \P0, \Q0, \P1, \Q1, 1

    vaddubm v13, v13, v1        ;# -16 <= M <= 15, saturation irrelevant
    vsrab   v13, v13, v1
    vandc   v13, v13, v10       ;# adjust P1,Q1 by (M+1)>>1  if ! hev
    vsubsbs \Q1, \Q1, v13
    vaddsbs \P1, \P1, v13

    vxor    \P1, \P1, v11       ;# P1
    vxor    \P0, \P0, v11       ;# P0
    vxor    \Q0, \Q0, v11       ;# Q0
    vxor    \Q1, \Q1, v11       ;# Q1
.endm


    .align 2
;#  r3 unsigned char *s
;#  r4 int p
;#  r5 const signed char *flimit
;#  r6 const signed char *limit
;#  r7 const signed char *thresh
loop_filter_vertical_edge_y_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xffff
    mtspr   256, r12            ;# set VRSAVE

    addi    r9, r3, 0
    RL      v16, r9, r4
    RL      v17, r9, r4
    RL      v18, r9, r4
    RL      v19, r9, r4
    RL      v20, r9, r4
    RL      v21, r9, r4
    RL      v22, r9, r4
    RL      v23, r9, r4
    RL      v24, r9, r4
    RL      v25, r9, r4
    RL      v26, r9, r4
    RL      v27, r9, r4
    RL      v28, r9, r4
    RL      v29, r9, r4
    RL      v30, r9, r4
    lvx     v31, 0, r9

    Transpose16x16

    vspltisb v1, 1

    build_constants r5, r6, r7, v3, v2, v0

    Abs v4, v5, v19, v18                            ;# K(v4) = first |P0-P1|

    Fil v16, v17, v18, v19,  v20, v21, v22, v23
    Fil v20, v21, v22, v23,  v24, v25, v26, v27
    Fil v24, v25, v26, v27,  v28, v29, v30, v31

    Transpose16x16

    addi    r9, r3, 0
    WL      v16, r9, r4
    WL      v17, r9, r4
    WL      v18, r9, r4
    WL      v19, r9, r4
    WL      v20, r9, r4
    WL      v21, r9, r4
    WL      v22, r9, r4
    WL      v23, r9, r4
    WL      v24, r9, r4
    WL      v25, r9, r4
    WL      v26, r9, r4
    WL      v27, r9, r4
    WL      v28, r9, r4
    WL      v29, r9, r4
    WL      v30, r9, r4
    stvx    v31, 0, r9

    mtspr   256, r11            ;# reset old VRSAVE

    blr

;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
.macro active_chroma_sel V
    andi.   r7, r3, 8       ;# row origin modulo 16
    add     r7, r7, r7      ;# selects selectors
    lis     r12, _chromaSelectors@ha
    la      r0,  _chromaSelectors@l(r12)
    lwzux   r0, r7, r0      ;# leave selector addr in r7

    lvx     \V, 0, r0       ;# mask to concatenate active U,V pels
.endm

.macro hread_uv Dest, U, V, Offs, VMask
    lvx     \U, \Offs, r3
    lvx     \V, \Offs, r4
    vperm   \Dest, \U, \V, \VMask   ;# Dest = active part of U then V
.endm

.macro hwrite_uv New, U, V, Offs, Umask, Vmask
    vperm   \U, \New, \U, \Umask    ;# Combine new pels with siblings
    vperm   \V, \New, \V, \Vmask
    stvx    \U, \Offs, r3           ;# Write to frame buffer
    stvx    \V, \Offs, r4
.endm

;# Process U,V in parallel.
.macro load_chroma_h
    neg     r9, r5          ;# r9 = -1 * stride
    add     r8, r9, r9      ;# r8 = -2 * stride
    add     r10, r5, r5     ;# r10 = 2 * stride

    active_chroma_sel v12

    ;# P3, Q3 are read-only; need not save addresses or sibling pels
    add     r6, r8, r8      ;# r6 = -4 * stride
    hread_uv v0, v14, v15, r6, v12
    add     r6, r10, r5     ;# r6 =  3 * stride
    hread_uv v7, v14, v15, r6, v12

    ;# Others are read/write; save addresses and sibling pels

    add     r6, r8, r9      ;# r6 = -3 * stride
    hread_uv v1, v16, v17, r6,  v12
    hread_uv v2, v18, v19, r8,  v12
    hread_uv v3, v20, v21, r9,  v12
    hread_uv v4, v22, v23, 0,   v12
    hread_uv v5, v24, v25, r5,  v12
    hread_uv v6, v26, v27, r10, v12
.endm
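
;# A hypothetical C model of the U/V interleaving (not part of this
;# file): hread_uv permutes the 8 active U pels and 8 active V pels of
;# one row into a single 16-byte lane, so one filter pass covers both
;# planes.  active_chroma_sel picks the selector set from bit 3 of the
;# row origin, choosing which half of each aligned vector is active:
;#
;#      #include <stdint.h>
;#      #include <string.h>
;#      static void hread_uv(uint8_t dest[16],
;#                           const uint8_t u_row[16],  /* aligned row */
;#                           const uint8_t v_row[16],
;#                           int high_half) {      /* (origin & 8) != 0 */
;#          int off = high_half ? 8 : 0;
;#          memcpy(dest + 0, u_row + off, 8);      /* active U pels */
;#          memcpy(dest + 8, v_row + off, 8);      /* active V pels */
;#      }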

.macro uresult_sel V
    load_g   \V, 4(r7)
.endm

.macro vresult_sel V
    load_g   \V, 8(r7)
.endm

;# always write P1,P0,Q0,Q1
.macro store_chroma_h
    uresult_sel v11
    vresult_sel v12
    hwrite_uv v2, v18, v19, r8, v11, v12
    hwrite_uv v3, v20, v21, r9, v11, v12
    hwrite_uv v4, v22, v23, 0,  v11, v12
    hwrite_uv v5, v24, v25, r5, v11, v12
.endm

    .align 2
;#  r3 unsigned char *u
;#  r4 unsigned char *v
;#  r5 int p
;#  r6 const signed char *flimit
;#  r7 const signed char *limit
;#  r8 const signed char *thresh
mbloop_filter_horizontal_edge_uv_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xffff
    mtspr   256, r12            ;# set VRSAVE

    build_constants r6, r7, r8, v8, v9, v10

    load_chroma_h

    vp8_mbfilter

    store_chroma_h

    hwrite_uv v1, v16, v17, r6,  v11, v12    ;# v1 == P2
    hwrite_uv v6, v26, v27, r10, v11, v12    ;# v6 == Q2

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;#  r3 unsigned char *u
;#  r4 unsigned char *v
;#  r5 int p
;#  r6 const signed char *flimit
;#  r7 const signed char *limit
;#  r8 const signed char *thresh
loop_filter_horizontal_edge_uv_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xffff
    mtspr   256, r12            ;# set VRSAVE

    build_constants r6, r7, r8, v8, v9, v10

    load_chroma_h

    SBFilter

    store_chroma_h

    mtspr   256, r11            ;# reset old VRSAVE

    blr

.macro R V, R
    lwzux   r0, r3, r5
    stw     r0, 4(\R)
    lwz     r0,-4(r3)
    stw     r0, 0(\R)
    lwzux   r0, r4, r5
    stw     r0,12(\R)
    lwz     r0,-4(r4)
    stw     r0, 8(\R)
    lvx     \V, 0, \R
.endm


.macro W V, R
    stvx    \V, 0, \R
    lwz     r0,12(\R)
    stwux   r0, r4, r5
    lwz     r0, 8(\R)
    stw     r0,-4(r4)
    lwz     r0, 4(\R)
    stwux   r0, r3, r5
    lwz     r0, 0(\R)
    stw     r0,-4(r3)
.endm

.macro chroma_vread R
    sub r3, r3, r5          ;# back up one line for simplicity
    sub r4, r4, r5

    R v0, \R
    R v1, \R
    R v2, \R
    R v3, \R
    R v4, \R
    R v5, \R
    R v6, \R
    R v7, \R

    transpose8x16_fwd
.endm

.macro chroma_vwrite R

    transpose8x16_inv

    add     r3, r3, r5
    add     r4, r4, r5
    neg     r5, r5          ;# Write rows back in reverse order

    W v17, \R
    W v16, \R
    W v15, \R
    W v14, \R
    W v13, \R
    W v12, \R
    W v11, \R
    W v10, \R
.endm

    .align 2
;#  r3 unsigned char *u
;#  r4 unsigned char *v
;#  r5 int p
;#  r6 const signed char *flimit
;#  r7 const signed char *limit
;#  r8 const signed char *thresh
mbloop_filter_vertical_edge_uv_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xc000
    mtspr   256, r12            ;# set VRSAVE

    la      r9, -48(r1)         ;# temporary space for reading in vectors

    chroma_vread r9

    build_constants r6, r7, r8, v8, v9, v10

    vp8_mbfilter

    chroma_vwrite r9

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;#  r3 unsigned char *u
;#  r4 unsigned char *v
;#  r5 int p
;#  r6 const signed char *flimit
;#  r7 const signed char *limit
;#  r8 const signed char *thresh
loop_filter_vertical_edge_uv_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xc000
    mtspr   256, r12            ;# set VRSAVE

    la      r9, -48(r1)         ;# temporary space for reading in vectors

    chroma_vread r9

    build_constants r6, r7, r8, v8, v9, v10

    SBFilter

    chroma_vwrite r9

    mtspr   256, r11            ;# reset old VRSAVE

    blr

;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=-

.macro vp8_simple_filter
    Abs v14, v13, v1, v2    ;# M = abs( P0 - Q0)
    vcmpgtub v8, v14, v8    ;# v8 = true if _over_ limit

    ;# preserve unsigned v0 and v3
    common_adjust v1, v2, v0, v3, 0

    vxor v1, v1, v11
    vxor v2, v2, v11        ;# cvt Q0, P0 back to pels
.endm
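
;# A hypothetical scalar model (not part of this file; csb, abs_diff
;# and common_adjust as in the sketches above).  The simple filter's
;# mask is just |P0 - Q0| > flimit, with no hev gating:
;#
;#      static void simple_filter(uint8_t *p0, uint8_t *q0,
;#                                uint8_t p1, uint8_t q1, uint8_t flimit) {
;#          int skip = abs_diff(*p0, *q0) > flimit;
;#          int8_t sp1 = (int8_t)(p1 ^ 0x80), sq1 = (int8_t)(q1 ^ 0x80);
;#          int8_t sp0 = (int8_t)(*p0 ^ 0x80), sq0 = (int8_t)(*q0 ^ 0x80);
;#          common_adjust(&sp0, &sq0, sp1, sq1, 0, 0, skip);
;#          *p0 = (uint8_t)(sp0 ^ 0x80);
;#          *q0 = (uint8_t)(sq0 ^ 0x80);
;#      }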

.macro simple_vertical
    addi    r8,  0, 16
    addi    r7, r5, 32

    lvx     v0,  0, r5
    lvx     v1, r8, r5
    lvx     v2,  0, r7
    lvx     v3, r8, r7

    lis     r12, _B_hihi@ha
    la      r0,  _B_hihi@l(r12)
    lvx     v16, 0, r0

    lis     r12, _B_lolo@ha
    la      r0,  _B_lolo@l(r12)
    lvx     v17, 0, r0

    Transpose4times4x4 v16, v17
    vp8_simple_filter

    vxor v0, v0, v11
    vxor v3, v3, v11        ;# cvt Q0, P0 back to pels

    Transpose4times4x4 v16, v17

    stvx    v0,  0, r5
    stvx    v1, r8, r5
    stvx    v2,  0, r7
    stvx    v3, r8, r7
.endm

    .align 2
;#  r3 unsigned char *s
;#  r4 int p
;#  r5 const signed char *flimit
loop_filter_simple_horizontal_edge_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    mtspr   256, r12            ;# set VRSAVE

    ;# build constants
    lvx     v8, 0, r5           ;# flimit

    vspltisb v11, 8
    vspltisb v12, 4
    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080

    neg     r5, r4              ;# r5 = -1 * stride
    add     r6, r5, r5          ;# r6 = -2 * stride

    lvx     v0, r6, r3          ;# v0 = P1 = 16 pels two rows above edge
    lvx     v1, r5, r3          ;# v1 = P0 = 16 pels one row  above edge
    lvx     v2,  0, r3          ;# v2 = Q0 = 16 pels one row  below edge
    lvx     v3, r4, r3          ;# v3 = Q1 = 16 pels two rows below edge

    vp8_simple_filter

    stvx    v1, r5, r3          ;# store P0
    stvx    v2,  0, r3          ;# store Q0

    mtspr   256, r11            ;# reset old VRSAVE

    blr

.macro RLV Offs
    stw     r0, (\Offs*4)(r5)
    lwzux   r0, r7, r4
.endm

.macro WLV Offs
    lwz     r0, (\Offs*4)(r5)
    stwux   r0, r7, r4
.endm

    .align 2
;#  r3 unsigned char *s
;#  r4 int p
;#  r5 const signed char *flimit
loop_filter_simple_vertical_edge_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xc000
    mtspr   256, r12            ;# set VRSAVE

    ;# build constants
    lvx     v8, 0, r5           ;# flimit

    vspltisb v11, 8
    vspltisb v12, 4
    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080

    la r5, -96(r1)              ;# temporary space for reading in vectors

    ;# Store 4 pels at word "Offs" in temp array, then advance r7
    ;#   to next row and read another 4 pels from the frame buffer.

    subi    r7, r3,  2          ;# r7 -> 2 pels before start
    lwzx    r0,  0, r7          ;# read first 4 pels

    ;# 16 unaligned word accesses
    RLV 0
    RLV 4
    RLV 8
    RLV 12
    RLV 1
    RLV 5
    RLV 9
    RLV 13
    RLV 2
    RLV 6
    RLV 10
    RLV 14
    RLV 3
    RLV 7
    RLV 11

    stw     r0, (15*4)(r5)      ;# write last 4 pels

    simple_vertical

    ;# Read temp array, write frame buffer.
    subi    r7, r3,  2          ;# r7 -> 2 pels before start
    lwzx    r0,  0, r5          ;# read/write first 4 pels
    stwx    r0,  0, r7

    WLV 4
    WLV 8
    WLV 12
    WLV 1
    WLV 5
    WLV 9
    WLV 13
    WLV 2
    WLV 6
    WLV 10
    WLV 14
    WLV 3
    WLV 7
    WLV 11
    WLV 15

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .data

_chromaSelectors:
    .long   _B_hihi
    .long   _B_Ures0
    .long   _B_Vres0
    .long   0
    .long   _B_lolo
    .long   _B_Ures8
    .long   _B_Vres8
    .long   0

    .align 4
_B_Vres8:
    .byte   16, 17, 18, 19, 20, 21, 22, 23,  8,  9, 10, 11, 12, 13, 14, 15

    .align 4
_B_Ures8:
    .byte   16, 17, 18, 19, 20, 21, 22, 23,  0,  1,  2,  3,  4,  5,  6,  7

    .align 4
_B_lolo:
    .byte    8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31

    .align 4
_B_Vres0:
    .byte    8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
    .align 4
_B_Ures0:
    .byte    0,  1,  2,  3,  4,  5,  6,  7, 24, 25, 26, 27, 28, 29, 30, 31

    .align 4
_B_hihi:
    .byte    0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23