/*
 *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"

// This module is for GCC MSA
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#include "libyuv/macros_msa.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

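// Interleave the low (ilvr) and high (ilvl) halves of two input pairs:
// out0/out1 come from in0 and in1, out2/out3 from in2 and in3. ILVRL_B
// operates on bytes; the _H, _W and _D variants do the same on halfwords,
// words and doublewords.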
#define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                         \
    out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0);     \
    out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0);     \
    out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2);     \
    out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2);     \
  }

#define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                         \
    out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0);     \
    out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0);     \
    out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2);     \
    out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2);     \
  }

#define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                         \
    out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0);     \
    out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0);     \
    out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2);     \
    out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2);     \
  }

#define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                         \
    out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0);     \
    out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0);     \
    out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2);     \
    out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2);     \
  }

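// Transpose a width x 16 block as two 8-row halves; source rows 8-15
// become bytes 8-15 of each destination row, hence the dst + 8 offset.
// Scalar sketch of the operation (illustrative only):
//   for (int i = 0; i < width; ++i)
//     for (int j = 0; j < 16; ++j)
//       dst[i * dst_stride + j] = src[j * src_stride + i];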
void TransposeWx16_C(const uint8* src,
                     int src_stride,
                     uint8* dst,
                     int dst_stride,
                     int width) {
  TransposeWx8_C(src, src_stride, dst, dst_stride, width);
  TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride,
                 width);
}

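// UV variant of the above: source rows hold interleaved U/V pairs, which
// TransposeUVWx8_C transposes into the separate dst_a (U) and dst_b (V)
// planes, again as two 8-row halves.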
void TransposeUVWx16_C(const uint8* src,
                       int src_stride,
                       uint8* dst_a,
                       int dst_stride_a,
                       uint8* dst_b,
                       int dst_stride_b,
                       int width) {
  TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
                   width);
  TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8),
                   dst_stride_a, (dst_b + 8), dst_stride_b, width);
}

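// MSA version: transposes one 16x16 tile per loop iteration by interleaving
// at successively wider granularity (bytes, halfwords, words, doublewords)
// and storing four transposed rows after each doubleword stage.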
void TransposeWx16_MSA(const uint8* src,
                       int src_stride,
                       uint8* dst,
                       int dst_stride,
                       int width) {
  int x;
  const uint8* s;
  v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;

  for (x = 0; x < width; x += 16) {
    s = src;
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
    ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
    ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
    ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
    ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
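    // Rows 0-7 are now interleaved through the word stage in res0-res7;
    // run the byte and halfword stages again for rows 8-15.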
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
    ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
    ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
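    // Word-interleave rows 8-15 and merge with res0-res7 at doubleword
    // width; each ILVRL_D yields four transposed destination rows.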
    res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0);
    res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0);
    ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
    ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
    dst += dst_stride * 4;
    res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1);
    res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1);
    ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
    ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
    dst += dst_stride * 4;
    res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2);
    res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2);
    ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
    ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
    dst += dst_stride * 4;
    res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3);
    res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3);
    ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
    ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
    src += 16;
    dst += dst_stride * 4;
  }
}

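// UV variant: the same interleave network both transposes the tile and
// separates the planes; even outputs (dst0, dst2) are stored to dst_a (U)
// and odd outputs (dst1, dst3) to dst_b (V).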
void TransposeUVWx16_MSA(const uint8* src,
                         int src_stride,
                         uint8* dst_a,
                         int dst_stride_a,
                         uint8* dst_b,
                         int dst_stride_b,
                         int width) {
  int x;
  const uint8* s;
  v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;

  for (x = 0; x < width; x += 8) {
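    // width is measured in UV pairs; each iteration consumes 16 source
    // bytes (8 pairs).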
    s = src;
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
    ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
    ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
    ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
    ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
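    // Rows 0-7 interleaved through the word stage in res0-res7; repeat the
    // byte and halfword stages for rows 8-15.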
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
    ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    s += src_stride;
    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
    ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
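    // Merge rows 8-15 with res0-res7 at doubleword width and store the
    // transposed rows, two at a time to each plane.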
    res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0);
    res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0);
    ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
    ST_UB2(dst0, dst2, dst_a, dst_stride_a);
    ST_UB2(dst1, dst3, dst_b, dst_stride_b);
    dst_a += dst_stride_a * 2;
    dst_b += dst_stride_b * 2;
    res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1);
    res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1);
    ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
    ST_UB2(dst0, dst2, dst_a, dst_stride_a);
    ST_UB2(dst1, dst3, dst_b, dst_stride_b);
    dst_a += dst_stride_a * 2;
    dst_b += dst_stride_b * 2;
    res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2);
    res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2);
    ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
    ST_UB2(dst0, dst2, dst_a, dst_stride_a);
    ST_UB2(dst1, dst3, dst_b, dst_stride_b);
    dst_a += dst_stride_a * 2;
    dst_b += dst_stride_b * 2;
    res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3);
    res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3);
    ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
    ST_UB2(dst0, dst2, dst_a, dst_stride_a);
    ST_UB2(dst1, dst3, dst_b, dst_stride_b);
    src += 16;
    dst_a += dst_stride_a * 2;
    dst_b += dst_stride_b * 2;
  }
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif

#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)