;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    .globl bilinear_predict4x4_ppc
    .globl bilinear_predict8x4_ppc
    .globl bilinear_predict8x8_ppc
    .globl bilinear_predict16x16_ppc

;# Load vector \V with the 16 bytes at \LABEL + \OFF,
;#  using \R0 and \R1 as address scratch registers.
.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha
    la      \R1, \LABEL@l(\R0)
    lvx     \V, \OFF, \R1
.endm

;# Load the pair of vertical filter tap vectors selected by r6,
;#  which already holds a 32-byte index into vfilter_b.
.macro load_vfilter V0, V1
    load_c \V0, vfilter_b, r6, r9, r10

    addi    r6,  r6, 16
    lvx     \V1, r6, r10
.endm

.macro HProlog jump_label
    ;# load up horizontal filter
    slwi.   r5, r5, 4           ;# index into horizontal filter array

    ;# index to the next set of vectors in the row.
    li      r10, 16
    li      r12, 32

    ;# downshift by 7 (divide by 128) at the end
    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq     \jump_label

    load_c v20, hfilter_b, r5, r9, r0

    ;# setup constants
    ;# v28 = permutation value to reorder the filter output
    load_c v28, b_hperm_b, 0, r9, r0

    ;# rounding added in on the multiply
    vspltisw v21, 8
    vspltisw v18, 3
    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040

    slwi.   r6, r6, 5           ;# index into vertical filter array
.endm
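
;# Both passes apply the same 2-tap bilinear kernel; in scalar terms,
;#  each output pixel is (a sketch of what the vector code computes):
;#
;#    out[i] = (tap0 * in[i] + tap1 * in[i + 1] + 64) >> 7;
;#
;#  where (tap0, tap1) come from hfilter_b / vfilter_b and sum to 128.
;#  The horizontal pass steps in[] along a row; the vertical pass feeds
;#  two adjacent rows through the same formula.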

;# Filters a horizontal line
;# expects:
;#  r3  src_ptr
;#  r4  pitch
;#  r10 16
;#  r12 32
;#  v17 perm input
;#  v18 rounding
;#  v19 shift
;#  v20 filter taps
;#  v21 tmp
;#  v22 tmp
;#  v23 tmp
;#  v24 tmp
;#  v25 tmp
;#  v26 tmp
;#  v27 tmp
;#  v28 perm output
;#
.macro HFilter V
    vperm   v24, v21, v21, v10  ;# v24 = 0123 1234 2345 3456
    vperm   v25, v21, v21, v11  ;# v25 = 4567 5678 6789 789A

    vmsummbm v24, v20, v24, v18
    vmsummbm v25, v20, v25, v18

    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)

    vsrh    v24, v24, v19       ;# divide v24 by 128

    vpkuhus \V, v24, v24        ;# \V = scrambled 8-bit result
.endm
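
;# Note: vmsummbm multiply-sums groups of four bytes, so the 2-tap filter
;#  is stored in hfilter_b as (tap0, tap1, 0, 0) word groups; each 32-bit
;#  lane above effectively computes (v18 supplies the rounding term):
;#
;#    lane = tap0 * p[i] + tap1 * p[i + 1] + 64;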

.macro hfilter_8 V, increment_counter
    lvsl    v17,  0, r3         ;# permute vector for alignment

    ;# input to filter is 9 bytes wide, output is 8 bytes.
    lvx     v21,   0, r3
    lvx     v22, r10, r3

.if \increment_counter
    add     r3, r3, r4
.endif
    vperm   v21, v21, v22, v17

    HFilter \V
.endm


.macro load_and_align_8 V, increment_counter
    lvsl    v17,  0, r3         ;# permute vector for alignment

    ;# input is 8 bytes wide and can span two vectors
    ;#  if r3 is not 16-byte aligned.
    lvx     v21,   0, r3
    lvx     v22, r10, r3

.if \increment_counter
    add     r3, r3, r4
.endif

    vperm   \V, v21, v22, v17
.endm

.macro write_aligned_8 V, increment_counter
    stvx    \V,  0, r7

.if \increment_counter
    add     r7, r7, r8
.endif
.endm

.macro vfilter_16 P0, P1
    vmuleub v22, \P0, v20       ;# 64 + 4 positive taps
    vadduhm v22, v18, v22
    vmuloub v23, \P0, v20
    vadduhm v23, v18, v23

    vmuleub v24, \P1, v21
    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
    vmuloub v25, \P1, v21
    vadduhm v23, v23, v25       ;# Ro = odds

    vsrh    v22, v22, v19       ;# divide by 128
    vsrh    v23, v23, v19       ;# v22 v23 = evens, odds
    vmrghh  \P0, v22, v23       ;# \P0 = 16-bit results 0..7 in order
    vmrglh  v23, v22, v23       ;# v23 = 16-bit results 8..15 in order
    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
.endm
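
;# Per byte, vfilter_16 computes (scalar sketch of the lanes above):
;#
;#    p0[i] = (vtap0 * p0[i] + vtap1 * p1[i] + 64) >> 7;
;#
;#  The even/odd multiplies widen to 16 bits, so the sums cannot
;#  overflow before the shift (max 112*255 + 16*255 + 64 = 32704).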


;# Write the low 8 bytes of \V to the row at r7 via the stack;
;#  \R is a scalar scratch register, \D is advanced by the pitch \P.
.macro w_8x8 V, D, R, P
    stvx    \V, 0, r1
    lwz     \R, 0(r1)
    stw     \R, 0(r7)
    lwz     \R, 4(r1)
    stw     \R, 4(r7)
    add     \D, \D, \P
.endm
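
;# The stack bounce is needed because stvx always writes a full 16
;#  bytes to a 16-byte-aligned address; spilling to r1 and copying the
;#  first two words out with scalar loads/stores handles arbitrary
;#  destination alignment.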


    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
bilinear_predict4x4_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xf830
    ori     r12, r12, 0xfff8
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    HProlog second_pass_4x4_pre_copy_b

    ;# Load up permutation constants
    load_c v10, b_0123_b, 0, r9, r12
    load_c v11, b_4567_b, 0, r9, r12

    hfilter_8 v0, 1
    hfilter_8 v1, 1
    hfilter_8 v2, 1
    hfilter_8 v3, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.
    beq     store_out_4x4_b

    hfilter_8 v4, 0

    b   second_pass_4x4_b

second_pass_4x4_pre_copy_b:
    slwi    r6, r6, 5           ;# index into vertical filter array

    load_and_align_8  v0, 1
    load_and_align_8  v1, 1
    load_and_align_8  v2, 1
    load_and_align_8  v3, 1
    load_and_align_8  v4, 1

second_pass_4x4_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4

store_out_4x4_b:

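    ;# Each 4-pixel row sits in the low word of its result vector:
    ;#  spill the vector to the stack, then store just that word.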
    stvx    v0, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v1, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v2, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v3, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)

exit_4x4:

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
bilinear_predict8x4_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xf830
    ori     r12, r12, 0xfff8
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    HProlog second_pass_8x4_pre_copy_b

    ;# Load up permutation constants
    load_c v10, b_0123_b, 0, r9, r12
    load_c v11, b_4567_b, 0, r9, r12

    hfilter_8 v0, 1
    hfilter_8 v1, 1
    hfilter_8 v2, 1
    hfilter_8 v3, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.
    beq     store_out_8x4_b

    hfilter_8 v4, 0

    b   second_pass_8x4_b

second_pass_8x4_pre_copy_b:
    slwi    r6, r6, 5           ;# index into vertical filter array

    load_and_align_8  v0, 1
    load_and_align_8  v1, 1
    load_and_align_8  v2, 1
    load_and_align_8  v3, 1
    load_and_align_8  v4, 1

second_pass_8x4_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4

store_out_8x4_b:

    cmpi    cr0, r8, 8
    beq     cr0, store_aligned_8x4_b

    w_8x8   v0, r7, r0, r8
    w_8x8   v1, r7, r0, r8
    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8

    b       exit_8x4

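;# When dst_pitch is exactly 8 the output rows are contiguous, so pairs
;#  of 8-byte rows can be merged with b_hilo_b and written as single
;#  aligned 16-byte stores instead of going through w_8x8.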
store_aligned_8x4_b:
    load_c v10, b_hilo_b, 0, r9, r10

    vperm   v0, v0, v1, v10
    vperm   v2, v2, v3, v10

    stvx    v0, 0, r7
    addi    r7, r7, 16
    stvx    v2, 0, r7

exit_8x4:

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
bilinear_predict8x8_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xfff0
    ori     r12, r12, 0xffff
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    HProlog second_pass_8x8_pre_copy_b

    ;# Load up permutation constants
    load_c v10, b_0123_b, 0, r9, r12
    load_c v11, b_4567_b, 0, r9, r12

    hfilter_8 v0, 1
    hfilter_8 v1, 1
    hfilter_8 v2, 1
    hfilter_8 v3, 1
    hfilter_8 v4, 1
    hfilter_8 v5, 1
    hfilter_8 v6, 1
    hfilter_8 v7, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.
    beq     store_out_8x8_b

    hfilter_8 v8, 0

    b   second_pass_8x8_b

second_pass_8x8_pre_copy_b:
    slwi    r6, r6, 5           ;# index into vertical filter array

    load_and_align_8  v0, 1
    load_and_align_8  v1, 1
    load_and_align_8  v2, 1
    load_and_align_8  v3, 1
    load_and_align_8  v4, 1
    load_and_align_8  v5, 1
    load_and_align_8  v6, 1
    load_and_align_8  v7, 1
    load_and_align_8  v8, 0

second_pass_8x8_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4
    vfilter_16 v4,  v5
    vfilter_16 v5,  v6
    vfilter_16 v6,  v7
    vfilter_16 v7,  v8

store_out_8x8_b:

    cmpi    cr0, r8, 8
    beq     cr0, store_aligned_8x8_b

    w_8x8   v0, r7, r0, r8
    w_8x8   v1, r7, r0, r8
    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8
    w_8x8   v4, r7, r0, r8
    w_8x8   v5, r7, r0, r8
    w_8x8   v6, r7, r0, r8
    w_8x8   v7, r7, r0, r8

    b       exit_8x8

store_aligned_8x8_b:
    load_c v10, b_hilo_b, 0, r9, r10

    vperm   v0, v0, v1, v10
    vperm   v2, v2, v3, v10
    vperm   v4, v4, v5, v10
    vperm   v6, v6, v7, v10

    stvx    v0, 0, r7
    addi    r7, r7, 16
    stvx    v2, 0, r7
    addi    r7, r7, 16
    stvx    v4, 0, r7
    addi    r7, r7, 16
    stvx    v6, 0, r7

exit_8x8:

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE

    blr


;# Filters a horizontal line
;# expects:
;#  r3  src_ptr
;#  r4  pitch
;#  r10 16
;#  r12 32
;#  v17 perm input
;#  v18 rounding
;#  v19 shift
;#  v20 filter taps
;#  v21 tmp
;#  v22 tmp
;#  v23 tmp
;#  v24 tmp
;#  v25 tmp
;#  v26 tmp
;#  v27 tmp
;#  v28 perm output
;#
.macro hfilter_16 V, increment_counter

    lvsl    v17,  0, r3         ;# permute vector for alignment

    ;# input to filter is 21 bytes wide, output is 16 bytes.
    ;#  input can span three vectors if not aligned correctly.
    lvx     v21,   0, r3
    lvx     v22, r10, r3
    lvx     v23, r12, r3

.if \increment_counter
    add     r3, r3, r4
.endif
    vperm   v21, v21, v22, v17
    vperm   v22, v22, v23, v17  ;# v21 v22 = 21 input pixels left-justified

    ;# set 0
    vmsummbm v24, v20, v21, v18 ;# taps times elements

    ;# set 1
    vsldoi  v23, v21, v22, 1
    vmsummbm v25, v20, v23, v18

    ;# set 2
    vsldoi  v23, v21, v22, 2
    vmsummbm v26, v20, v23, v18

    ;# set 3
    vsldoi  v23, v21, v22, 3
    vmsummbm v27, v20, v23, v18

    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F

    vsrh    v24, v24, v19       ;# divide v24, v25 by 128
    vsrh    v25, v25, v19

    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
.endm
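
;# The four vsldoi "sets" offset the 21 input bytes by 0..3, so vmsummbm
;#  produces output pixels in the interleaved order 048C 159D 26AE 37BF;
;#  the final vperm with v28 (b_hperm_b) restores the natural byte order.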
    498 
    499 .macro load_and_align_16 V, increment_counter
    500     lvsl    v17,  0, r3         ;# permutate value for alignment
    501 
    502     ;# input to filter is 21 bytes wide, output is 16 bytes.
    503     ;#  input will can span three vectors if not aligned correctly.
    504     lvx     v21,   0, r3
    505     lvx     v22, r10, r3
    506 
    507 .if \increment_counter
    508     add     r3, r3, r4
    509 .endif
    510 
    511     vperm   \V, v21, v22, v17
    512 .endm
    513 
    514 .macro write_16 V, increment_counter
    515     stvx    \V,  0, r7
    516 
    517 .if \increment_counter
    518     add     r7, r7, r8
    519 .endif
    520 .endm

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
bilinear_predict16x16_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xfff8
    mtspr   256, r12            ;# set VRSAVE

    HProlog second_pass_16x16_pre_copy_b

    hfilter_16 v0,  1
    hfilter_16 v1,  1
    hfilter_16 v2,  1
    hfilter_16 v3,  1
    hfilter_16 v4,  1
    hfilter_16 v5,  1
    hfilter_16 v6,  1
    hfilter_16 v7,  1
    hfilter_16 v8,  1
    hfilter_16 v9,  1
    hfilter_16 v10, 1
    hfilter_16 v11, 1
    hfilter_16 v12, 1
    hfilter_16 v13, 1
    hfilter_16 v14, 1
    hfilter_16 v15, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.
    beq     store_out_16x16_b

    hfilter_16 v16, 0

    b   second_pass_16x16_b

second_pass_16x16_pre_copy_b:
    slwi    r6, r6, 5           ;# index into vertical filter array

    load_and_align_16  v0,  1
    load_and_align_16  v1,  1
    load_and_align_16  v2,  1
    load_and_align_16  v3,  1
    load_and_align_16  v4,  1
    load_and_align_16  v5,  1
    load_and_align_16  v6,  1
    load_and_align_16  v7,  1
    load_and_align_16  v8,  1
    load_and_align_16  v9,  1
    load_and_align_16  v10, 1
    load_and_align_16  v11, 1
    load_and_align_16  v12, 1
    load_and_align_16  v13, 1
    load_and_align_16  v14, 1
    load_and_align_16  v15, 1
    load_and_align_16  v16, 0

second_pass_16x16_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4
    vfilter_16 v4,  v5
    vfilter_16 v5,  v6
    vfilter_16 v6,  v7
    vfilter_16 v7,  v8
    vfilter_16 v8,  v9
    vfilter_16 v9,  v10
    vfilter_16 v10, v11
    vfilter_16 v11, v12
    vfilter_16 v12, v13
    vfilter_16 v13, v14
    vfilter_16 v14, v15
    vfilter_16 v15, v16

store_out_16x16_b:

    write_16 v0,  1
    write_16 v1,  1
    write_16 v2,  1
    write_16 v3,  1
    write_16 v4,  1
    write_16 v5,  1
    write_16 v6,  1
    write_16 v7,  1
    write_16 v8,  1
    write_16 v9,  1
    write_16 v10, 1
    write_16 v11, 1
    write_16 v12, 1
    write_16 v13, 1
    write_16 v14, 1
    write_16 v15, 0

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .data

    .align 4
hfilter_b:
    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0
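
;# Row i of hfilter_b holds the tap pair (128 - 16*i, 16*i), padded to
;#  (tap0, tap1, 0, 0) word groups for vmsummbm; each pair sums to 128,
;#  matching the final shift right by 7.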

    .align 4
vfilter_b:
    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
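
;# vfilter_b stores the taps for offset i as two whole vectors, rows 2*i
;#  and 2*i + 1, splatted per byte with (128 - 16*i) and (16*i)
;#  respectively, ready for the byte-wise vmuleub/vmuloub multiplies.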

    .align 4
b_hperm_b:
    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15

    .align 4
b_0123_b:
    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6

    .align 4
b_4567_b:
    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10

    .align 4
b_hilo_b:
    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
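
;# b_hilo_b selects the low 8 bytes of each of two source vectors, so a
;#  single vperm packs two 8-byte rows into one 16-byte store for the
;#  aligned (dst_pitch == 8) paths.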