/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/variance.h"
#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/asmdefs_mmi.h"

static const uint8_t bilinear_filters[8][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
};
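
/* Each tap pair above sums to 128 (1 << FILTER_BITS, with FILTER_BITS == 7),
   which matches the ff_ph_40 rounding constant used below (presumably 0x40,
   i.e. 1 << (FILTER_BITS - 1), in each 16-bit lane) and the right shift by 7
   in the filter passes. */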

/* Use VARIANCE_SSE_SUM_8_FOR_W64 in vpx_variance64x64, vpx_variance64x32 and
   vpx_variance32x64. VARIANCE_SSE_SUM_8 accumulates the sums in 16-bit lanes
   and would overflow for these block sizes. */
#define VARIANCE_SSE_SUM_8_FOR_W64                                  \
  /* sse */                                                         \
  "pasubub    %[ftmp3],   %[ftmp1],       %[ftmp2]            \n\t" \
  "punpcklbh  %[ftmp4],   %[ftmp3],       %[ftmp0]            \n\t" \
  "punpckhbh  %[ftmp5],   %[ftmp3],       %[ftmp0]            \n\t" \
  "pmaddhw    %[ftmp6],   %[ftmp4],       %[ftmp4]            \n\t" \
  "pmaddhw    %[ftmp7],   %[ftmp5],       %[ftmp5]            \n\t" \
  "paddw      %[ftmp10],  %[ftmp10],      %[ftmp6]            \n\t" \
  "paddw      %[ftmp10],  %[ftmp10],      %[ftmp7]            \n\t" \
                                                                    \
  /* sum */                                                         \
  "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t" \
  "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t" \
  "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]            \n\t" \
  "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]            \n\t" \
  "punpcklhw  %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t" \
  "punpckhhw  %[ftmp2],   %[ftmp3],       %[ftmp0]            \n\t" \
  "punpcklhw  %[ftmp7],   %[ftmp5],       %[ftmp0]            \n\t" \
  "punpckhhw  %[ftmp8],   %[ftmp5],       %[ftmp0]            \n\t" \
  "psubw      %[ftmp3],   %[ftmp1],       %[ftmp7]            \n\t" \
  "psubw      %[ftmp5],   %[ftmp2],       %[ftmp8]            \n\t" \
  "punpcklhw  %[ftmp1],   %[ftmp4],       %[ftmp0]            \n\t" \
  "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp0]            \n\t" \
  "punpcklhw  %[ftmp7],   %[ftmp6],       %[ftmp0]            \n\t" \
  "punpckhhw  %[ftmp8],   %[ftmp6],       %[ftmp0]            \n\t" \
  "psubw      %[ftmp4],   %[ftmp1],       %[ftmp7]            \n\t" \
  "psubw      %[ftmp6],   %[ftmp2],       %[ftmp8]            \n\t" \
  "paddw      %[ftmp9],   %[ftmp9],       %[ftmp3]            \n\t" \
  "paddw      %[ftmp9],   %[ftmp9],       %[ftmp4]            \n\t" \
  "paddw      %[ftmp9],   %[ftmp9],       %[ftmp5]            \n\t" \
  "paddw      %[ftmp9],   %[ftmp9],       %[ftmp6]            \n\t"
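
/* Rough worst-case bound behind the comment above: with 16-bit (paddh)
   accumulation each lane receives two byte values per 8-byte load, so over a
   64x64 block a lane would collect 2 * 8 * 64 = 1024 values of up to 255,
   i.e. up to 261120, far beyond the 16-bit range. The paddw form keeps the
   running sums in 32-bit lanes instead. */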

#define VARIANCE_SSE_SUM_4                                          \
  /* sse */                                                         \
  "pasubub    %[ftmp3],   %[ftmp1],       %[ftmp2]            \n\t" \
  "punpcklbh  %[ftmp4],   %[ftmp3],       %[ftmp0]            \n\t" \
  "pmaddhw    %[ftmp5],   %[ftmp4],       %[ftmp4]            \n\t" \
  "paddw      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t" \
                                                                    \
  /* sum */                                                         \
  "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t" \
  "punpcklbh  %[ftmp4],   %[ftmp2],       %[ftmp0]            \n\t" \
  "paddh      %[ftmp7],   %[ftmp7],       %[ftmp3]            \n\t" \
  "paddh      %[ftmp8],   %[ftmp8],       %[ftmp4]            \n\t"

#define VARIANCE_SSE_SUM_8                                          \
  /* sse */                                                         \
  "pasubub    %[ftmp3],   %[ftmp1],       %[ftmp2]            \n\t" \
  "punpcklbh  %[ftmp4],   %[ftmp3],       %[ftmp0]            \n\t" \
  "punpckhbh  %[ftmp5],   %[ftmp3],       %[ftmp0]            \n\t" \
  "pmaddhw    %[ftmp6],   %[ftmp4],       %[ftmp4]            \n\t" \
  "pmaddhw    %[ftmp7],   %[ftmp5],       %[ftmp5]            \n\t" \
  "paddw      %[ftmp8],   %[ftmp8],       %[ftmp6]            \n\t" \
  "paddw      %[ftmp8],   %[ftmp8],       %[ftmp7]            \n\t" \
                                                                    \
  /* sum */                                                         \
  "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t" \
  "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t" \
  "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]            \n\t" \
  "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]            \n\t" \
  "paddh      %[ftmp10],  %[ftmp10],      %[ftmp3]            \n\t" \
  "paddh      %[ftmp10],  %[ftmp10],      %[ftmp4]            \n\t" \
  "paddh      %[ftmp12],  %[ftmp12],      %[ftmp5]            \n\t" \
  "paddh      %[ftmp12],  %[ftmp12],      %[ftmp6]            \n\t"

#define VARIANCE_SSE_8                                              \
  "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t" \
  "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t" \
  "gsldlc1    %[ftmp2],   0x07(%[b])                          \n\t" \
  "gsldrc1    %[ftmp2],   0x00(%[b])                          \n\t" \
  "pasubub    %[ftmp3],   %[ftmp1],       %[ftmp2]            \n\t" \
  "punpcklbh  %[ftmp4],   %[ftmp3],       %[ftmp0]            \n\t" \
  "punpckhbh  %[ftmp5],   %[ftmp3],       %[ftmp0]            \n\t" \
  "pmaddhw    %[ftmp6],   %[ftmp4],       %[ftmp4]            \n\t" \
  "pmaddhw    %[ftmp7],   %[ftmp5],       %[ftmp5]            \n\t" \
  "paddw      %[ftmp8],   %[ftmp8],       %[ftmp6]            \n\t" \
  "paddw      %[ftmp8],   %[ftmp8],       %[ftmp7]            \n\t"

#define VARIANCE_SSE_16                                             \
  VARIANCE_SSE_8                                                    \
  "gsldlc1    %[ftmp1],   0x0f(%[a])                          \n\t" \
  "gsldrc1    %[ftmp1],   0x08(%[a])                          \n\t" \
  "gsldlc1    %[ftmp2],   0x0f(%[b])                          \n\t" \
  "gsldrc1    %[ftmp2],   0x08(%[b])                          \n\t" \
  "pasubub    %[ftmp3],   %[ftmp1],       %[ftmp2]            \n\t" \
  "punpcklbh  %[ftmp4],   %[ftmp3],       %[ftmp0]            \n\t" \
  "punpckhbh  %[ftmp5],   %[ftmp3],       %[ftmp0]            \n\t" \
  "pmaddhw    %[ftmp6],   %[ftmp4],       %[ftmp4]            \n\t" \
  "pmaddhw    %[ftmp7],   %[ftmp5],       %[ftmp5]            \n\t" \
  "paddw      %[ftmp8],   %[ftmp8],       %[ftmp6]            \n\t" \
  "paddw      %[ftmp8],   %[ftmp8],       %[ftmp7]            \n\t"

#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A                       \
  /* calculate fdata3[0]~fdata3[3], store in ftmp2 */               \
  "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t" \
  "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t" \
  "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t" \
  "gsldlc1    %[ftmp1],   0x08(%[a])                          \n\t" \
  "gsldrc1    %[ftmp1],   0x01(%[a])                          \n\t" \
  "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t" \
  "pmullh     %[ftmp2],   %[ftmp2],       %[filter_x0]        \n\t" \
  "paddh      %[ftmp2],   %[ftmp2],       %[ff_ph_40]         \n\t" \
  "pmullh     %[ftmp3],   %[ftmp3],       %[filter_x1]        \n\t" \
  "paddh      %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t" \
  "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"

#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B                       \
  /* calculate fdata3[0]~fdata3[3], store in ftmp4 */               \
  "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t" \
  "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t" \
  "punpcklbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t" \
  "gsldlc1    %[ftmp1],   0x08(%[a])                          \n\t" \
  "gsldrc1    %[ftmp1],   0x01(%[a])                          \n\t" \
  "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t" \
  "pmullh     %[ftmp4],   %[ftmp4],       %[filter_x0]        \n\t" \
  "paddh      %[ftmp4],   %[ftmp4],       %[ff_ph_40]         \n\t" \
  "pmullh     %[ftmp5],   %[ftmp5],       %[filter_x1]        \n\t" \
  "paddh      %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t" \
  "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp6]            \n\t"

#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A                      \
  /* calculate: temp2[0] ~ temp2[3] */                              \
  "pmullh     %[ftmp2],   %[ftmp2],       %[filter_y0]        \n\t" \
  "paddh      %[ftmp2],   %[ftmp2],       %[ff_ph_40]         \n\t" \
  "pmullh     %[ftmp1],   %[ftmp4],       %[filter_y1]        \n\t" \
  "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t" \
  "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t" \
                                                                    \
  /* store: temp2[0] ~ temp2[3] */                                  \
  "and        %[ftmp2],   %[ftmp2],       %[mask]             \n\t" \
  "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t" \
  "gssdrc1    %[ftmp2],   0x00(%[temp2_ptr])                  \n\t"

#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B                      \
  /* calculate: temp2[0] ~ temp2[3] */                              \
  "pmullh     %[ftmp4],   %[ftmp4],       %[filter_y0]        \n\t" \
  "paddh      %[ftmp4],   %[ftmp4],       %[ff_ph_40]         \n\t" \
  "pmullh     %[ftmp1],   %[ftmp2],       %[filter_y1]        \n\t" \
  "paddh      %[ftmp4],   %[ftmp4],       %[ftmp1]            \n\t" \
  "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp6]            \n\t" \
                                                                    \
  /* store: temp2[0] ~ temp2[3] */                                  \
  "and        %[ftmp4],   %[ftmp4],       %[mask]             \n\t" \
  "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t" \
  "gssdrc1    %[ftmp4],   0x00(%[temp2_ptr])                  \n\t"
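
/* The _A/_B variants ping-pong between two register sets: pass A writes its
   filtered row to ftmp2 while ftmp4 still holds the previous row from pass B
   (and vice versa), so the vertical pass can combine two consecutive rows
   without reloading them. */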

#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A                       \
  /* calculate fdata3[0]~fdata3[7], store in ftmp2 and ftmp3 */     \
  "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t" \
  "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t" \
  "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t" \
  "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t" \
  "gsldlc1    %[ftmp1],   0x08(%[a])                          \n\t" \
  "gsldrc1    %[ftmp1],   0x01(%[a])                          \n\t" \
  "punpcklbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t" \
  "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t" \
  "pmullh     %[ftmp2],   %[ftmp2],       %[filter_x0]        \n\t" \
  "pmullh     %[ftmp3],   %[ftmp3],       %[filter_x0]        \n\t" \
  "paddh      %[ftmp2],   %[ftmp2],       %[ff_ph_40]         \n\t" \
  "paddh      %[ftmp3],   %[ftmp3],       %[ff_ph_40]         \n\t" \
  "pmullh     %[ftmp4],   %[ftmp4],       %[filter_x1]        \n\t" \
  "pmullh     %[ftmp5],   %[ftmp5],       %[filter_x1]        \n\t" \
  "paddh      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
  "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t" \
  "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp14]           \n\t" \
  "psrlh      %[ftmp3],   %[ftmp3],       %[ftmp14]           \n\t"

#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B                       \
  /* calculate fdata3[0]~fdata3[7], store in ftmp8 and ftmp9 */     \
  "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t" \
  "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t" \
  "punpcklbh  %[ftmp8],   %[ftmp1],       %[ftmp0]            \n\t" \
  "punpckhbh  %[ftmp9],   %[ftmp1],       %[ftmp0]            \n\t" \
  "gsldlc1    %[ftmp1],   0x08(%[a])                          \n\t" \
  "gsldrc1    %[ftmp1],   0x01(%[a])                          \n\t" \
  "punpcklbh  %[ftmp10],  %[ftmp1],       %[ftmp0]            \n\t" \
  "punpckhbh  %[ftmp11],  %[ftmp1],       %[ftmp0]            \n\t" \
  "pmullh     %[ftmp8],   %[ftmp8],       %[filter_x0]        \n\t" \
  "pmullh     %[ftmp9],   %[ftmp9],       %[filter_x0]        \n\t" \
  "paddh      %[ftmp8],   %[ftmp8],       %[ff_ph_40]         \n\t" \
  "paddh      %[ftmp9],   %[ftmp9],       %[ff_ph_40]         \n\t" \
  "pmullh     %[ftmp10],  %[ftmp10],      %[filter_x1]        \n\t" \
  "pmullh     %[ftmp11],  %[ftmp11],      %[filter_x1]        \n\t" \
  "paddh      %[ftmp8],   %[ftmp8],       %[ftmp10]           \n\t" \
  "paddh      %[ftmp9],   %[ftmp9],       %[ftmp11]           \n\t" \
  "psrlh      %[ftmp8],   %[ftmp8],       %[ftmp14]           \n\t" \
  "psrlh      %[ftmp9],   %[ftmp9],       %[ftmp14]           \n\t"

#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A                      \
  /* calculate: temp2[0] ~ temp2[3] */                              \
  "pmullh     %[ftmp2],   %[ftmp2],       %[filter_y0]        \n\t" \
  "paddh      %[ftmp2],   %[ftmp2],       %[ff_ph_40]         \n\t" \
  "pmullh     %[ftmp1],   %[ftmp8],       %[filter_y1]        \n\t" \
  "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t" \
  "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp14]           \n\t" \
                                                                    \
  /* calculate: temp2[4] ~ temp2[7] */                              \
  "pmullh     %[ftmp3],   %[ftmp3],       %[filter_y0]        \n\t" \
  "paddh      %[ftmp3],   %[ftmp3],       %[ff_ph_40]         \n\t" \
  "pmullh     %[ftmp1],   %[ftmp9],       %[filter_y1]        \n\t" \
  "paddh      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t" \
  "psrlh      %[ftmp3],   %[ftmp3],       %[ftmp14]           \n\t" \
                                                                    \
  /* store: temp2[0] ~ temp2[7] */                                  \
  "and        %[ftmp2],   %[ftmp2],       %[mask]             \n\t" \
  "and        %[ftmp3],   %[ftmp3],       %[mask]             \n\t" \
  "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t" \
  "gssdlc1    %[ftmp2],   0x07(%[temp2_ptr])                  \n\t" \
  "gssdrc1    %[ftmp2],   0x00(%[temp2_ptr])                  \n\t"

#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B                      \
  /* calculate: temp2[0] ~ temp2[3] */                              \
  "pmullh     %[ftmp8],   %[ftmp8],       %[filter_y0]        \n\t" \
  "paddh      %[ftmp8],   %[ftmp8],       %[ff_ph_40]         \n\t" \
  "pmullh     %[ftmp1],   %[ftmp2],       %[filter_y1]        \n\t" \
  "paddh      %[ftmp8],   %[ftmp8],       %[ftmp1]            \n\t" \
  "psrlh      %[ftmp8],   %[ftmp8],       %[ftmp14]           \n\t" \
                                                                    \
  /* calculate: temp2[4] ~ temp2[7] */                              \
  "pmullh     %[ftmp9],   %[ftmp9],       %[filter_y0]        \n\t" \
  "paddh      %[ftmp9],   %[ftmp9],       %[ff_ph_40]         \n\t" \
  "pmullh     %[ftmp1],   %[ftmp3],       %[filter_y1]        \n\t" \
  "paddh      %[ftmp9],   %[ftmp9],       %[ftmp1]            \n\t" \
  "psrlh      %[ftmp9],   %[ftmp9],       %[ftmp14]           \n\t" \
                                                                    \
  /* store: temp2[0] ~ temp2[7] */                                  \
  "and        %[ftmp8],   %[ftmp8],       %[mask]             \n\t" \
  "and        %[ftmp9],   %[ftmp9],       %[mask]             \n\t" \
  "packushb   %[ftmp8],   %[ftmp8],       %[ftmp9]            \n\t" \
  "gssdlc1    %[ftmp8],   0x07(%[temp2_ptr])                  \n\t" \
  "gssdrc1    %[ftmp8],   0x00(%[temp2_ptr])                  \n\t"

#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A                      \
  /* calculate fdata3[0]~fdata3[7], store in ftmp2 and ftmp3 */     \
  VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A                             \
                                                                    \
  /* calculate fdata3[8]~fdata3[15], store in ftmp4 and ftmp5 */    \
  "gsldlc1    %[ftmp1],   0x0f(%[a])                          \n\t" \
  "gsldrc1    %[ftmp1],   0x08(%[a])                          \n\t" \
  "punpcklbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t" \
  "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t" \
  "gsldlc1    %[ftmp1],   0x10(%[a])                          \n\t" \
  "gsldrc1    %[ftmp1],   0x09(%[a])                          \n\t" \
  "punpcklbh  %[ftmp6],   %[ftmp1],       %[ftmp0]            \n\t" \
  "punpckhbh  %[ftmp7],   %[ftmp1],       %[ftmp0]            \n\t" \
  "pmullh     %[ftmp4],   %[ftmp4],       %[filter_x0]        \n\t" \
  "pmullh     %[ftmp5],   %[ftmp5],       %[filter_x0]        \n\t" \
  "paddh      %[ftmp4],   %[ftmp4],       %[ff_ph_40]         \n\t" \
  "paddh      %[ftmp5],   %[ftmp5],       %[ff_ph_40]         \n\t" \
  "pmullh     %[ftmp6],   %[ftmp6],       %[filter_x1]        \n\t" \
  "pmullh     %[ftmp7],   %[ftmp7],       %[filter_x1]        \n\t" \
  "paddh      %[ftmp4],   %[ftmp4],       %[ftmp6]            \n\t" \
  "paddh      %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t" \
  "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp14]           \n\t" \
  "psrlh      %[ftmp5],   %[ftmp5],       %[ftmp14]           \n\t"

#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B                      \
  /* calculate fdata3[0]~fdata3[7], store in ftmp8 and ftmp9 */     \
  VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B                             \
                                                                    \
  /* calculate fdata3[8]~fdata3[15], store in ftmp10 and ftmp11 */  \
  "gsldlc1    %[ftmp1],   0x0f(%[a])                          \n\t" \
  "gsldrc1    %[ftmp1],   0x08(%[a])                          \n\t" \
  "punpcklbh  %[ftmp10],  %[ftmp1],       %[ftmp0]            \n\t" \
  "punpckhbh  %[ftmp11],  %[ftmp1],       %[ftmp0]            \n\t" \
  "gsldlc1    %[ftmp1],   0x10(%[a])                          \n\t" \
  "gsldrc1    %[ftmp1],   0x09(%[a])                          \n\t" \
  "punpcklbh  %[ftmp12],  %[ftmp1],       %[ftmp0]            \n\t" \
  "punpckhbh  %[ftmp13],  %[ftmp1],       %[ftmp0]            \n\t" \
  "pmullh     %[ftmp10],  %[ftmp10],      %[filter_x0]        \n\t" \
  "pmullh     %[ftmp11],  %[ftmp11],      %[filter_x0]        \n\t" \
  "paddh      %[ftmp10],  %[ftmp10],      %[ff_ph_40]         \n\t" \
  "paddh      %[ftmp11],  %[ftmp11],      %[ff_ph_40]         \n\t" \
  "pmullh     %[ftmp12],  %[ftmp12],      %[filter_x1]        \n\t" \
  "pmullh     %[ftmp13],  %[ftmp13],      %[filter_x1]        \n\t" \
  "paddh      %[ftmp10],  %[ftmp10],      %[ftmp12]           \n\t" \
  "paddh      %[ftmp11],  %[ftmp11],      %[ftmp13]           \n\t" \
  "psrlh      %[ftmp10],  %[ftmp10],      %[ftmp14]           \n\t" \
  "psrlh      %[ftmp11],  %[ftmp11],      %[ftmp14]           \n\t"

#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A                     \
  VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A                            \
                                                                    \
  /* calculate: temp2[8] ~ temp2[11] */                             \
  "pmullh     %[ftmp4],   %[ftmp4],       %[filter_y0]        \n\t" \
  "paddh      %[ftmp4],   %[ftmp4],       %[ff_ph_40]         \n\t" \
  "pmullh     %[ftmp1],   %[ftmp10],      %[filter_y1]        \n\t" \
  "paddh      %[ftmp4],   %[ftmp4],       %[ftmp1]            \n\t" \
  "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp14]           \n\t" \
                                                                    \
  /* calculate: temp2[12] ~ temp2[15] */                            \
  "pmullh     %[ftmp5],   %[ftmp5],       %[filter_y0]        \n\t" \
  "paddh      %[ftmp5],   %[ftmp5],       %[ff_ph_40]         \n\t" \
  "pmullh     %[ftmp1],   %[ftmp11],      %[filter_y1]        \n\t" \
  "paddh      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
  "psrlh      %[ftmp5],   %[ftmp5],       %[ftmp14]           \n\t" \
                                                                    \
  /* store: temp2[8] ~ temp2[15] */                                 \
  "and        %[ftmp4],   %[ftmp4],       %[mask]             \n\t" \
  "and        %[ftmp5],   %[ftmp5],       %[mask]             \n\t" \
  "packushb   %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t" \
  "gssdlc1    %[ftmp4],   0x0f(%[temp2_ptr])                  \n\t" \
  "gssdrc1    %[ftmp4],   0x08(%[temp2_ptr])                  \n\t"

#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B                     \
  VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B                            \
                                                                    \
  /* calculate: temp2[8] ~ temp2[11] */                             \
  "pmullh     %[ftmp10],  %[ftmp10],      %[filter_y0]        \n\t" \
  "paddh      %[ftmp10],  %[ftmp10],      %[ff_ph_40]         \n\t" \
  "pmullh     %[ftmp1],   %[ftmp4],       %[filter_y1]        \n\t" \
  "paddh      %[ftmp10],  %[ftmp10],      %[ftmp1]            \n\t" \
  "psrlh      %[ftmp10],  %[ftmp10],      %[ftmp14]           \n\t" \
                                                                    \
  /* calculate: temp2[12] ~ temp2[15] */                            \
  "pmullh     %[ftmp11],  %[ftmp11],      %[filter_y0]        \n\t" \
  "paddh      %[ftmp11],  %[ftmp11],      %[ff_ph_40]         \n\t" \
  "pmullh     %[ftmp1],   %[ftmp5],       %[filter_y1]        \n\t" \
  "paddh      %[ftmp11],  %[ftmp11],      %[ftmp1]            \n\t" \
  "psrlh      %[ftmp11],  %[ftmp11],      %[ftmp14]           \n\t" \
                                                                    \
  /* store: temp2[8] ~ temp2[15] */                                 \
  "and        %[ftmp10],  %[ftmp10],      %[mask]             \n\t" \
  "and        %[ftmp11],  %[ftmp11],      %[mask]             \n\t" \
  "packushb   %[ftmp10],  %[ftmp10],      %[ftmp11]           \n\t" \
  "gssdlc1    %[ftmp10],  0x0f(%[temp2_ptr])                  \n\t" \
  "gssdrc1    %[ftmp10],  0x08(%[temp2_ptr])                  \n\t"

// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to
// implement the first pass of a 2-D separable filter.
//
// Produces uint16_t output to retain precision for the next pass. The two
// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
// filter is applied horizontally (pixel_step = 1) or vertically
// (pixel_step = stride), i.e. the offset required to move from one input
// pixel to the next.
static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
                                              unsigned int src_pixels_per_line,
                                              int pixel_step,
                                              unsigned int output_height,
                                              unsigned int output_width,
                                              const uint8_t *filter) {
  unsigned int i, j;

  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      b[j] = ROUND_POWER_OF_TWO(
          (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);

      ++a;
    }

    a += src_pixels_per_line - output_width;
    b += output_width;
  }
}

// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to
// implement the second pass of a 2-D separable filter.
//
// Requires 16-bit input as produced by var_filter_block2d_bil_first_pass. The
// two filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
// filter is applied horizontally (pixel_step = 1) or vertically
// (pixel_step = stride), i.e. the offset required to move from one input to
// the next. Output is 8-bit.
static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
                                               unsigned int src_pixels_per_line,
                                               unsigned int pixel_step,
                                               unsigned int output_height,
                                               unsigned int output_width,
                                               const uint8_t *filter) {
  unsigned int i, j;

  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      b[j] = ROUND_POWER_OF_TWO(
          (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
      ++a;
    }

    a += src_pixels_per_line - output_width;
    b += output_width;
  }
}

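/* Illustrative composition of the two passes, as used by the SUBPIX_VAR
   kernels later in this file (a scalar sketch for an 8x8 block; the buffer
   sizes here are hypothetical, chosen for the example only):

     uint16_t fdata3[(8 + 1) * 8];  // H + 1 intermediate rows
     uint8_t temp2[8 * 8];
     var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, 8 + 1, 8,
                                       bilinear_filters[xoffset]);
     var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8,
                                        bilinear_filters[yoffset]);
     return vpx_variance8x8_mmi(temp2, 8, b, b_stride, sse);
*/
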
static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride,
                                       const uint8_t *b, int b_stride,
                                       uint32_t *sse, int high) {
  int sum;
  double ftmp[12];
  uint32_t tmp[3];

  *sse = 0;

  __asm__ volatile (
    "li         %[tmp0],    0x20                                \n\t"
    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
    MMI_L(%[tmp0], %[high], 0x00)
    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
    "xor        %[ftmp9],   %[ftmp9],       %[ftmp9]            \n\t"
    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
    "1:                                                         \n\t"
    "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t"
    "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t"
    "gsldlc1    %[ftmp2],   0x07(%[b])                          \n\t"
    "gsldrc1    %[ftmp2],   0x00(%[b])                          \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64

    "gsldlc1    %[ftmp1],   0x0f(%[a])                          \n\t"
    "gsldrc1    %[ftmp1],   0x08(%[a])                          \n\t"
    "gsldlc1    %[ftmp2],   0x0f(%[b])                          \n\t"
    "gsldrc1    %[ftmp2],   0x08(%[b])                          \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64

    "gsldlc1    %[ftmp1],   0x17(%[a])                          \n\t"
    "gsldrc1    %[ftmp1],   0x10(%[a])                          \n\t"
    "gsldlc1    %[ftmp2],   0x17(%[b])                          \n\t"
    "gsldrc1    %[ftmp2],   0x10(%[b])                          \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64

    "gsldlc1    %[ftmp1],   0x1f(%[a])                          \n\t"
    "gsldrc1    %[ftmp1],   0x18(%[a])                          \n\t"
    "gsldlc1    %[ftmp2],   0x1f(%[b])                          \n\t"
    "gsldrc1    %[ftmp2],   0x18(%[b])                          \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64

    "gsldlc1    %[ftmp1],   0x27(%[a])                          \n\t"
    "gsldrc1    %[ftmp1],   0x20(%[a])                          \n\t"
    "gsldlc1    %[ftmp2],   0x27(%[b])                          \n\t"
    "gsldrc1    %[ftmp2],   0x20(%[b])                          \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64

    "gsldlc1    %[ftmp1],   0x2f(%[a])                          \n\t"
    "gsldrc1    %[ftmp1],   0x28(%[a])                          \n\t"
    "gsldlc1    %[ftmp2],   0x2f(%[b])                          \n\t"
    "gsldrc1    %[ftmp2],   0x28(%[b])                          \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64

    "gsldlc1    %[ftmp1],   0x37(%[a])                          \n\t"
    "gsldrc1    %[ftmp1],   0x30(%[a])                          \n\t"
    "gsldlc1    %[ftmp2],   0x37(%[b])                          \n\t"
    "gsldrc1    %[ftmp2],   0x30(%[b])                          \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64

    "gsldlc1    %[ftmp1],   0x3f(%[a])                          \n\t"
    "gsldrc1    %[ftmp1],   0x38(%[a])                          \n\t"
    "gsldlc1    %[ftmp2],   0x3f(%[b])                          \n\t"
    "gsldrc1    %[ftmp2],   0x38(%[b])                          \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64

    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
    MMI_ADDU(%[a], %[a], %[a_stride])
    MMI_ADDU(%[b], %[b], %[b_stride])
    "bnez       %[tmp0],    1b                                  \n\t"

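    /* fold the two 32-bit halves of the sum and sse accumulators */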
    "mfc1       %[tmp1],    %[ftmp9]                            \n\t"
    "mfhc1      %[tmp2],    %[ftmp9]                            \n\t"
    "addu       %[sum],     %[tmp1],        %[tmp2]             \n\t"
    "dsrl       %[ftmp1],   %[ftmp10],      %[ftmp11]           \n\t"
    "paddw      %[ftmp1],   %[ftmp1],       %[ftmp10]           \n\t"
    "swc1       %[ftmp1],   0x00(%[sse])                        \n\t"
    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
      [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
      [tmp2]"=&r"(tmp[2]),
      [a]"+&r"(a),                      [b]"+&r"(b),
      [sum]"=&r"(sum)
    : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
      [high]"r"(&high), [sse]"r"(sse)
    : "memory"
  );

  return *sse - (((int64_t)sum * sum) / (64 * high));
}
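
/* The return value uses the usual single-pass identity
   variance = sse - sum^2 / N, with N = 64 * high pixels; sum * sum is widened
   to 64 bits before the division to avoid overflow. */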

#define VPX_VARIANCE64XN(n)                                         \
  uint32_t vpx_variance64x##n##_mmi(const uint8_t *a, int a_stride, \
                                    const uint8_t *b, int b_stride, \
                                    uint32_t *sse) {                \
    return vpx_variance64x(a, a_stride, b, b_stride, sse, n);       \
  }

VPX_VARIANCE64XN(64)
VPX_VARIANCE64XN(32)

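/* vpx_variance32x64 cannot reuse vpx_variance32x: with 64 rows the 16-bit
   paddh lanes of VARIANCE_SSE_SUM_8 could accumulate up to
   2 * 4 * 64 * 255 = 130560 per lane, so this size also needs the 32-bit
   VARIANCE_SSE_SUM_8_FOR_W64 accumulation. */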
uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b,
                               int b_stride, uint32_t *sse) {
  int sum;
  double ftmp[12];
  uint32_t tmp[3];

  *sse = 0;

  __asm__ volatile (
    "li         %[tmp0],    0x20                                \n\t"
    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
    "li         %[tmp0],    0x40                                \n\t"
    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
    "xor        %[ftmp9],   %[ftmp9],       %[ftmp9]            \n\t"
    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
    "1:                                                         \n\t"
    "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t"
    "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t"
    "gsldlc1    %[ftmp2],   0x07(%[b])                          \n\t"
    "gsldrc1    %[ftmp2],   0x00(%[b])                          \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64

    "gsldlc1    %[ftmp1],   0x0f(%[a])                          \n\t"
    "gsldrc1    %[ftmp1],   0x08(%[a])                          \n\t"
    "gsldlc1    %[ftmp2],   0x0f(%[b])                          \n\t"
    "gsldrc1    %[ftmp2],   0x08(%[b])                          \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64

    "gsldlc1    %[ftmp1],   0x17(%[a])                          \n\t"
    "gsldrc1    %[ftmp1],   0x10(%[a])                          \n\t"
    "gsldlc1    %[ftmp2],   0x17(%[b])                          \n\t"
    "gsldrc1    %[ftmp2],   0x10(%[b])                          \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64

    "gsldlc1    %[ftmp1],   0x1f(%[a])                          \n\t"
    "gsldrc1    %[ftmp1],   0x18(%[a])                          \n\t"
    "gsldlc1    %[ftmp2],   0x1f(%[b])                          \n\t"
    "gsldrc1    %[ftmp2],   0x18(%[b])                          \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64

    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
    MMI_ADDU(%[a], %[a], %[a_stride])
    MMI_ADDU(%[b], %[b], %[b_stride])
    "bnez       %[tmp0],    1b                                  \n\t"

    "mfc1       %[tmp1],    %[ftmp9]                            \n\t"
    "mfhc1      %[tmp2],    %[ftmp9]                            \n\t"
    "addu       %[sum],     %[tmp1],        %[tmp2]             \n\t"
    "dsrl       %[ftmp1],   %[ftmp10],      %[ftmp11]           \n\t"
    "paddw      %[ftmp1],   %[ftmp1],       %[ftmp10]           \n\t"
    "swc1       %[ftmp1],   0x00(%[sse])                        \n\t"
    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
      [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
      [tmp2]"=&r"(tmp[2]),
      [a]"+&r"(a),                      [b]"+&r"(b),
      [sum]"=&r"(sum)
    : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
      [sse]"r"(sse)
    : "memory"
  );

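  /* 2048 == 32 * 64, the number of pixels in a 32x64 block */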
  return *sse - (((int64_t)sum * sum) / 2048);
}

static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride,
                                       const uint8_t *b, int b_stride,
                                       uint32_t *sse, int high) {
  int sum;
  double ftmp[13];
  uint32_t tmp[3];

  *sse = 0;

  __asm__ volatile (
    "li         %[tmp0],    0x20                                \n\t"
    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
    MMI_L(%[tmp0], %[high], 0x00)
    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
    "xor        %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"
    "1:                                                         \n\t"
    "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t"
    "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t"
    "gsldlc1    %[ftmp2],   0x07(%[b])                          \n\t"
    "gsldrc1    %[ftmp2],   0x00(%[b])                          \n\t"
    VARIANCE_SSE_SUM_8
    "gsldlc1    %[ftmp1],   0x0f(%[a])                          \n\t"
    "gsldrc1    %[ftmp1],   0x08(%[a])                          \n\t"
    "gsldlc1    %[ftmp2],   0x0f(%[b])                          \n\t"
    "gsldrc1    %[ftmp2],   0x08(%[b])                          \n\t"
    VARIANCE_SSE_SUM_8
    "gsldlc1    %[ftmp1],   0x17(%[a])                          \n\t"
    "gsldrc1    %[ftmp1],   0x10(%[a])                          \n\t"
    "gsldlc1    %[ftmp2],   0x17(%[b])                          \n\t"
    "gsldrc1    %[ftmp2],   0x10(%[b])                          \n\t"
    VARIANCE_SSE_SUM_8
    "gsldlc1    %[ftmp1],   0x1f(%[a])                          \n\t"
    "gsldrc1    %[ftmp1],   0x18(%[a])                          \n\t"
    "gsldlc1    %[ftmp2],   0x1f(%[b])                          \n\t"
    "gsldrc1    %[ftmp2],   0x18(%[b])                          \n\t"
    VARIANCE_SSE_SUM_8

    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
    MMI_ADDU(%[a], %[a], %[a_stride])
    MMI_ADDU(%[b], %[b], %[b_stride])
    "bnez       %[tmp0],    1b                                  \n\t"

    "dsrl       %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
    "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
    "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"

    "punpcklhw  %[ftmp3],   %[ftmp10],      %[ftmp0]            \n\t"
    "punpckhhw  %[ftmp4],   %[ftmp10],      %[ftmp0]            \n\t"
    "punpcklhw  %[ftmp5],   %[ftmp12],      %[ftmp0]            \n\t"
    "punpckhhw  %[ftmp6],   %[ftmp12],      %[ftmp0]            \n\t"
    "paddw      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
    "psubw      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
    "psubw      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
    "dsrl       %[ftmp0],   %[ftmp3],       %[ftmp11]           \n\t"
    "paddw      %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
    "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"

    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
      [ftmp12]"=&f"(ftmp[12]),          [tmp0]"=&r"(tmp[0]),
      [a]"+&r"(a),                      [b]"+&r"(b)
    : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
      [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
    : "memory"
  );

  return *sse - (((int64_t)sum * sum) / (32 * high));
}

#define VPX_VARIANCE32XN(n)                                         \
  uint32_t vpx_variance32x##n##_mmi(const uint8_t *a, int a_stride, \
                                    const uint8_t *b, int b_stride, \
                                    uint32_t *sse) {                \
    return vpx_variance32x(a, a_stride, b, b_stride, sse, n);       \
  }

VPX_VARIANCE32XN(32)
VPX_VARIANCE32XN(16)

static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride,
                                       const uint8_t *b, int b_stride,
                                       uint32_t *sse, int high) {
  int sum;
  double ftmp[13];
  uint32_t tmp[3];

  *sse = 0;

  __asm__ volatile (
    "li         %[tmp0],    0x20                                \n\t"
    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
    MMI_L(%[tmp0], %[high], 0x00)
    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
    "xor        %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"
    "1:                                                         \n\t"
    "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t"
    "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t"
    "gsldlc1    %[ftmp2],   0x07(%[b])                          \n\t"
    "gsldrc1    %[ftmp2],   0x00(%[b])                          \n\t"
    VARIANCE_SSE_SUM_8
    "gsldlc1    %[ftmp1],   0x0f(%[a])                          \n\t"
    "gsldrc1    %[ftmp1],   0x08(%[a])                          \n\t"
    "gsldlc1    %[ftmp2],   0x0f(%[b])                          \n\t"
    "gsldrc1    %[ftmp2],   0x08(%[b])                          \n\t"
    VARIANCE_SSE_SUM_8

    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
    MMI_ADDU(%[a], %[a], %[a_stride])
    MMI_ADDU(%[b], %[b], %[b_stride])
    "bnez       %[tmp0],    1b                                  \n\t"

    "dsrl       %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
    "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
    "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"

    "punpcklhw  %[ftmp3],   %[ftmp10],      %[ftmp0]            \n\t"
    "punpckhhw  %[ftmp4],   %[ftmp10],      %[ftmp0]            \n\t"
    "punpcklhw  %[ftmp5],   %[ftmp12],      %[ftmp0]            \n\t"
    "punpckhhw  %[ftmp6],   %[ftmp12],      %[ftmp0]            \n\t"
    "paddw      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
    "psubw      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
    "psubw      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
    "dsrl       %[ftmp0],   %[ftmp3],       %[ftmp11]           \n\t"
    "paddw      %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
    "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"

    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
      [ftmp12]"=&f"(ftmp[12]),          [tmp0]"=&r"(tmp[0]),
      [a]"+&r"(a),                      [b]"+&r"(b)
    : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
      [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
    : "memory"
  );

  return *sse - (((int64_t)sum * sum) / (16 * high));
}

#define VPX_VARIANCE16XN(n)                                         \
  uint32_t vpx_variance16x##n##_mmi(const uint8_t *a, int a_stride, \
                                    const uint8_t *b, int b_stride, \
                                    uint32_t *sse) {                \
    return vpx_variance16x(a, a_stride, b, b_stride, sse, n);       \
  }

VPX_VARIANCE16XN(32)
VPX_VARIANCE16XN(16)
VPX_VARIANCE16XN(8)

static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride,
                                      const uint8_t *b, int b_stride,
                                      uint32_t *sse, int high) {
  int sum;
  double ftmp[13];
  uint32_t tmp[3];

  *sse = 0;

  __asm__ volatile (
    "li         %[tmp0],    0x20                                \n\t"
    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
    MMI_L(%[tmp0], %[high], 0x00)
    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
    "xor        %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"
    "1:                                                         \n\t"
    "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t"
    "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t"
    "gsldlc1    %[ftmp2],   0x07(%[b])                          \n\t"
    "gsldrc1    %[ftmp2],   0x00(%[b])                          \n\t"
    VARIANCE_SSE_SUM_8

    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
    MMI_ADDU(%[a], %[a], %[a_stride])
    MMI_ADDU(%[b], %[b], %[b_stride])
    "bnez       %[tmp0],    1b                                  \n\t"

    "dsrl       %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
    "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
    "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"

    "punpcklhw  %[ftmp3],   %[ftmp10],      %[ftmp0]            \n\t"
    "punpckhhw  %[ftmp4],   %[ftmp10],      %[ftmp0]            \n\t"
    "punpcklhw  %[ftmp5],   %[ftmp12],      %[ftmp0]            \n\t"
    "punpckhhw  %[ftmp6],   %[ftmp12],      %[ftmp0]            \n\t"
    "paddw      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
    "psubw      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
    "psubw      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
    "dsrl       %[ftmp0],   %[ftmp3],       %[ftmp11]           \n\t"
    "paddw      %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
    "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"

    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
      [ftmp12]"=&f"(ftmp[12]),          [tmp0]"=&r"(tmp[0]),
      [a]"+&r"(a),                      [b]"+&r"(b)
    : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
      [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
    : "memory"
  );

  return *sse - (((int64_t)sum * sum) / (8 * high));
}

#define VPX_VARIANCE8XN(n)                                         \
  uint32_t vpx_variance8x##n##_mmi(const uint8_t *a, int a_stride, \
                                   const uint8_t *b, int b_stride, \
                                   uint32_t *sse) {                \
    return vpx_variance8x(a, a_stride, b, b_stride, sse, n);       \
  }

VPX_VARIANCE8XN(16)
VPX_VARIANCE8XN(8)
VPX_VARIANCE8XN(4)

static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride,
                                      const uint8_t *b, int b_stride,
                                      uint32_t *sse, int high) {
  int sum;
  double ftmp[12];
  uint32_t tmp[3];

  *sse = 0;

  __asm__ volatile (
    "li         %[tmp0],    0x20                                \n\t"
    "mtc1       %[tmp0],    %[ftmp10]                           \n\t"
    MMI_L(%[tmp0], %[high], 0x00)
    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
    "xor        %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
    "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"
    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
    "1:                                                         \n\t"
    "gsldlc1    %[ftmp1],   0x07(%[a])                          \n\t"
    "gsldrc1    %[ftmp1],   0x00(%[a])                          \n\t"
    "gsldlc1    %[ftmp2],   0x07(%[b])                          \n\t"
    "gsldrc1    %[ftmp2],   0x00(%[b])                          \n\t"
    VARIANCE_SSE_SUM_4

    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
    MMI_ADDU(%[a], %[a], %[a_stride])
    MMI_ADDU(%[b], %[b], %[b_stride])
    "bnez       %[tmp0],    1b                                  \n\t"

    "dsrl       %[ftmp9],   %[ftmp6],       %[ftmp10]           \n\t"
    "paddw      %[ftmp9],   %[ftmp9],       %[ftmp6]            \n\t"
    "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"

    "punpcklhw  %[ftmp3],   %[ftmp7],       %[ftmp0]            \n\t"
    "punpckhhw  %[ftmp4],   %[ftmp7],       %[ftmp0]            \n\t"
    "punpcklhw  %[ftmp5],   %[ftmp8],       %[ftmp0]            \n\t"
    "punpckhhw  %[ftmp6],   %[ftmp8],       %[ftmp0]            \n\t"
    "paddw      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
    "psubw      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
    "psubw      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
    "dsrl       %[ftmp0],   %[ftmp3],       %[ftmp10]           \n\t"
    "paddw      %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
    "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"
    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]),
      [tmp0]"=&r"(tmp[0]),
      [a]"+&r"(a),                      [b]"+&r"(b)
    : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
      [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
    : "memory"
  );

  return *sse - (((int64_t)sum * sum) / (4 * high));
}

#define VPX_VARIANCE4XN(n)                                         \
  uint32_t vpx_variance4x##n##_mmi(const uint8_t *a, int a_stride, \
                                   const uint8_t *b, int b_stride, \
                                   uint32_t *sse) {                \
    return vpx_variance4x(a, a_stride, b, b_stride, sse, n);       \
  }

VPX_VARIANCE4XN(8)
VPX_VARIANCE4XN(4)

static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride,
                                  const uint8_t *b, int b_stride, uint32_t *sse,
                                  uint64_t high) {
  double ftmp[12];
  uint32_t tmp[1];

  *sse = 0;

  __asm__ volatile (
    "li         %[tmp0],    0x20                                \n\t"
    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
    MMI_L(%[tmp0], %[high], 0x00)
    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"

    "1:                                                         \n\t"
    VARIANCE_SSE_16

    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
    MMI_ADDU(%[a], %[a], %[a_stride])
    MMI_ADDU(%[b], %[b], %[b_stride])
    "bnez       %[tmp0],    1b                                  \n\t"

    "dsrl       %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
    "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
    "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
      [tmp0]"=&r"(tmp[0]),
      [a]"+&r"(a),                      [b]"+&r"(b)
    : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
      [high]"r"(&high), [sse]"r"(sse)
    : "memory"
  );

    923   return *sse;
    924 }
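
/* For MSE only the sum of squared differences is needed, so unlike the
 * variance kernels there is no sum term to subtract at the end.  The final
 * "dsrl %[ftmp9], %[ftmp8], %[ftmp11]" (shift right by 0x20 = 32 bits) plus
 * "paddw" folds the two packed 32-bit partial sums in the 64-bit
 * accumulator into a single word before the 32-bit store.  The scalar
 * equivalent of that reduction is roughly (fold_u64 is a hypothetical
 * helper, shown only to illustrate the lane fold):
 *
 *   static uint32_t fold_u64(uint64_t acc) {
 *     return (uint32_t)(acc >> 32) + (uint32_t)acc;  // high lane + low lane
 *   }
 */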

#define vpx_mse16xN(n)                                         \
  uint32_t vpx_mse16x##n##_mmi(const uint8_t *a, int a_stride, \
                               const uint8_t *b, int b_stride, \
                               uint32_t *sse) {                \
    return vpx_mse16x(a, a_stride, b, b_stride, sse, n);       \
  }

vpx_mse16xN(16);
vpx_mse16xN(8);

static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride, uint32_t *sse,
                                 uint64_t high) {
  double ftmp[12];
  uint32_t tmp[1];

  *sse = 0;

  __asm__ volatile (
    "li         %[tmp0],    0x20                                \n\t"
    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
    MMI_L(%[tmp0], %[high], 0x00)
    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"

    "1:                                                         \n\t"
    VARIANCE_SSE_8

    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
    MMI_ADDU(%[a], %[a], %[a_stride])
    MMI_ADDU(%[b], %[b], %[b_stride])
    "bnez       %[tmp0],    1b                                  \n\t"

    "dsrl       %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
    "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
    "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
      [tmp0]"=&r"(tmp[0]),
      [a]"+&r"(a),                      [b]"+&r"(b)
    : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride),
      [high]"r"(&high), [sse]"r"(sse)
    : "memory"
  );

  return *sse;
}

#define vpx_mse8xN(n)                                                          \
  uint32_t vpx_mse8x##n##_mmi(const uint8_t *a, int a_stride,                  \
                              const uint8_t *b, int b_stride, uint32_t *sse) { \
    return vpx_mse8x(a, a_stride, b, b_stride, sse, n);                        \
  }

vpx_mse8xN(16);
vpx_mse8xN(8);

#define SUBPIX_VAR(W, H)                                                \
  uint32_t vpx_sub_pixel_variance##W##x##H##_mmi(                       \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,         \
      const uint8_t *b, int b_stride, uint32_t *sse) {                  \
    uint16_t fdata3[(H + 1) * W];                                       \
    uint8_t temp2[H * W];                                               \
                                                                        \
    var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
                                      bilinear_filters[xoffset]);       \
    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,       \
                                       bilinear_filters[yoffset]);      \
                                                                        \
    return vpx_variance##W##x##H##_mmi(temp2, W, b, b_stride, sse);     \
  }
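
/* The generic sub-pixel path is a two-pass bilinear interpolation: the
 * first pass filters horizontally with bilinear_filters[xoffset] into
 * fdata3 (H + 1 rows, since the vertical pass needs one row of look-ahead),
 * the second pass filters vertically with bilinear_filters[yoffset] into
 * temp2, and the result goes through the plain variance kernel.  Per output
 * pixel each pass computes, in effect:
 *
 *   out = (src[0] * filter[0] + src[1] * filter[1] + 64) >> 7;
 *
 * where filter[0] + filter[1] == 128, so the rounded shift by 7
 * renormalizes.  This matches the 0x0040 rounding constant and the
 * shift-by-7 used by the MMI filter macros further below. */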

SUBPIX_VAR(64, 64)
SUBPIX_VAR(64, 32)
SUBPIX_VAR(32, 64)
SUBPIX_VAR(32, 32)
SUBPIX_VAR(32, 16)
SUBPIX_VAR(16, 32)

static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride,
                                              int xoffset, int yoffset,
                                              uint8_t *temp2, int counter) {
  uint8_t *temp2_ptr = temp2;
  mips_reg l_counter = counter;
  double ftmp[15];
  mips_reg tmp[2];
  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
  DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };

  const uint8_t *filter_x = bilinear_filters[xoffset];
  const uint8_t *filter_y = bilinear_filters[yoffset];

  __asm__ volatile (
    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
    MMI_LI(%[tmp0], 0x07)
    MMI_MTC1(%[tmp0], %[ftmp14])
    "pshufh     %[filter_x0], %[filter_x0], %[ftmp0]            \n\t"
    "pshufh     %[filter_x1], %[filter_x1], %[ftmp0]            \n\t"
    "pshufh     %[filter_y0], %[filter_y0], %[ftmp0]            \n\t"
    "pshufh     %[filter_y1], %[filter_y1], %[ftmp0]            \n\t"

    // fdata3: fdata3[0] ~ fdata3[15]
    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A

    // fdata3 +a_stride*1: fdata3[0] ~ fdata3[15]
    MMI_ADDU(%[a], %[a], %[a_stride])
    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B
    // temp2: temp2[0] ~ temp2[15]
    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A

    // fdata3 +a_stride*2: fdata3[0] ~ fdata3[15]
    MMI_ADDU(%[a], %[a], %[a_stride])
    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
    // temp2+16*1: temp2[0] ~ temp2[15]
    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B

    "1:                                                         \n\t"
    MMI_ADDU(%[a], %[a], %[a_stride])
    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B
    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A

    MMI_ADDU(%[a], %[a], %[a_stride])
    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B
    "addiu      %[counter], %[counter],     -0x01               \n\t"
    "bnez       %[counter], 1b                                  \n\t"
    : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
      [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
      [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
      [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
      [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
      [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
      [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr),
      [counter]"+&r"(l_counter)
    : [filter_x0] "f"((uint64_t)filter_x[0]),
      [filter_x1] "f"((uint64_t)filter_x[1]),
      [filter_y0] "f"((uint64_t)filter_y[0]),
      [filter_y1] "f"((uint64_t)filter_y[1]),
      [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40),
      [mask] "f"(mask)
    : "memory"
  );
}
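
/* Unlike the generic SUBPIX_VAR path, the fused 16-wide filter above never
 * materializes the fdata3 buffer: the FIRST_PASS_16_A/_B macros write each
 * horizontally filtered row into one of two register banks, and the
 * SECOND_PASS_16_A/_B macros combine the newest row with the previous one.
 * Two output rows are produced ahead of the loop and two more per loop
 * iteration, which is why the callers below pass (H - 2) / 2 as the
 * iteration count.  Schematically (hfilter/vfilter/emit are hypothetical
 * stand-ins for the macro pairs, not real helpers):
 *
 *   rowA = hfilter(a);                    // FIRST_PASS_16_A
 *   rowB = hfilter(a += stride);          // FIRST_PASS_16_B
 *   emit(vfilter(rowA, rowB));            // SECOND_PASS_16_A
 *   rowA = hfilter(a += stride);          // FIRST_PASS_16_A
 *   emit(vfilter(rowB, rowA));            // SECOND_PASS_16_B
 *   for (i = 0; i < (H - 2) / 2; ++i) {   // two rows per iteration
 *     rowB = hfilter(a += stride);
 *     emit(vfilter(rowA, rowB));
 *     rowA = hfilter(a += stride);
 *     emit(vfilter(rowB, rowA));
 *   }
 */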

#define SUBPIX_VAR16XN(H)                                            \
  uint32_t vpx_sub_pixel_variance16x##H##_mmi(                       \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,      \
      const uint8_t *b, int b_stride, uint32_t *sse) {               \
    uint8_t temp2[16 * H];                                           \
    var_filter_block2d_bil_16x(a, a_stride, xoffset, yoffset, temp2, \
                               (H - 2) / 2);                         \
                                                                     \
    return vpx_variance16x##H##_mmi(temp2, 16, b, b_stride, sse);    \
  }

SUBPIX_VAR16XN(16)
SUBPIX_VAR16XN(8)

static inline void var_filter_block2d_bil_8x(const uint8_t *a, int a_stride,
                                             int xoffset, int yoffset,
                                             uint8_t *temp2, int counter) {
  uint8_t *temp2_ptr = temp2;
  mips_reg l_counter = counter;
  double ftmp[15];
  mips_reg tmp[2];
  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
  DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
  const uint8_t *filter_x = bilinear_filters[xoffset];
  const uint8_t *filter_y = bilinear_filters[yoffset];

  __asm__ volatile (
    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
    MMI_LI(%[tmp0], 0x07)
    MMI_MTC1(%[tmp0], %[ftmp14])
    "pshufh     %[filter_x0], %[filter_x0], %[ftmp0]            \n\t"
    "pshufh     %[filter_x1], %[filter_x1], %[ftmp0]            \n\t"
    "pshufh     %[filter_y0], %[filter_y0], %[ftmp0]            \n\t"
    "pshufh     %[filter_y1], %[filter_y1], %[ftmp0]            \n\t"

    // fdata3: fdata3[0] ~ fdata3[7]
    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A

    // fdata3 +a_stride*1: fdata3[0] ~ fdata3[7]
    MMI_ADDU(%[a], %[a], %[a_stride])
    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B
    // temp2: temp2[0] ~ temp2[7]
    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A

    // fdata3 +a_stride*2: fdata3[0] ~ fdata3[7]
    MMI_ADDU(%[a], %[a], %[a_stride])
    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
    // temp2+8*1: temp2[0] ~ temp2[7]
    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B

    "1:                                                         \n\t"
    MMI_ADDU(%[a], %[a], %[a_stride])
    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B
    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A

    MMI_ADDU(%[a], %[a], %[a_stride])
    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B
    "addiu      %[counter], %[counter],     -0x01               \n\t"
    "bnez       %[counter], 1b                                  \n\t"
    : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
      [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
      [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
      [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
      [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
      [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
      [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr),
      [counter]"+&r"(l_counter)
    : [filter_x0] "f"((uint64_t)filter_x[0]),
      [filter_x1] "f"((uint64_t)filter_x[1]),
      [filter_y0] "f"((uint64_t)filter_y[0]),
      [filter_y1] "f"((uint64_t)filter_y[1]),
      [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40),
      [mask] "f"(mask)
    : "memory"
  );
}

#define SUBPIX_VAR8XN(H)                                            \
  uint32_t vpx_sub_pixel_variance8x##H##_mmi(                       \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,     \
      const uint8_t *b, int b_stride, uint32_t *sse) {              \
    uint8_t temp2[8 * H];                                           \
    var_filter_block2d_bil_8x(a, a_stride, xoffset, yoffset, temp2, \
                              (H - 2) / 2);                         \
                                                                    \
    return vpx_variance8x##H##_mmi(temp2, 8, b, b_stride, sse);     \
  }

SUBPIX_VAR8XN(16)
SUBPIX_VAR8XN(8)
SUBPIX_VAR8XN(4)

static inline void var_filter_block2d_bil_4x(const uint8_t *a, int a_stride,
                                             int xoffset, int yoffset,
                                             uint8_t *temp2, int counter) {
  uint8_t *temp2_ptr = temp2;
  mips_reg l_counter = counter;
  double ftmp[7];
  mips_reg tmp[2];
  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
  DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
  const uint8_t *filter_x = bilinear_filters[xoffset];
  const uint8_t *filter_y = bilinear_filters[yoffset];

  __asm__ volatile (
    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
    MMI_LI(%[tmp0], 0x07)
    MMI_MTC1(%[tmp0], %[ftmp6])
    "pshufh     %[filter_x0], %[filter_x0], %[ftmp0]            \n\t"
    "pshufh     %[filter_x1], %[filter_x1], %[ftmp0]            \n\t"
    "pshufh     %[filter_y0], %[filter_y0], %[ftmp0]            \n\t"
    "pshufh     %[filter_y1], %[filter_y1], %[ftmp0]            \n\t"
    // fdata3: fdata3[0] ~ fdata3[3]
    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A

    // fdata3 +a_stride*1: fdata3[0] ~ fdata3[3]
    MMI_ADDU(%[a], %[a], %[a_stride])
    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B
    // temp2: temp2[0] ~ temp2[3]
    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A

    // fdata3 +a_stride*2: fdata3[0] ~ fdata3[3]
    MMI_ADDU(%[a], %[a], %[a_stride])
    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
    // temp2+4*1: temp2[0] ~ temp2[3]
    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B

    "1:                                                         \n\t"
    MMI_ADDU(%[a], %[a], %[a_stride])
    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B
    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A

    MMI_ADDU(%[a], %[a], %[a_stride])
    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B
    "addiu      %[counter], %[counter],     -0x01               \n\t"
    "bnez       %[counter], 1b                                  \n\t"
    : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
      [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
      [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [a] "+&r"(a),
      [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter)
    : [filter_x0] "f"((uint64_t)filter_x[0]),
      [filter_x1] "f"((uint64_t)filter_x[1]),
      [filter_y0] "f"((uint64_t)filter_y[0]),
      [filter_y1] "f"((uint64_t)filter_y[1]),
      [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40),
      [mask] "f"(mask)
    : "memory"
  );
}

#define SUBPIX_VAR4XN(H)                                            \
  uint32_t vpx_sub_pixel_variance4x##H##_mmi(                       \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,     \
      const uint8_t *b, int b_stride, uint32_t *sse) {              \
    uint8_t temp2[4 * H];                                           \
    var_filter_block2d_bil_4x(a, a_stride, xoffset, yoffset, temp2, \
                              (H - 2) / 2);                         \
                                                                    \
    return vpx_variance4x##H##_mmi(temp2, 4, b, b_stride, sse);     \
  }

SUBPIX_VAR4XN(8)
SUBPIX_VAR4XN(4)

#define SUBPIX_AVG_VAR(W, H)                                            \
  uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi(                   \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,         \
      const uint8_t *b, int b_stride, uint32_t *sse,                    \
      const uint8_t *second_pred) {                                     \
    uint16_t fdata3[(H + 1) * W];                                       \
    uint8_t temp2[H * W];                                               \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                         \
                                                                        \
    var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
                                      bilinear_filters[xoffset]);       \
    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,       \
                                       bilinear_filters[yoffset]);      \
                                                                        \
    vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W);            \
                                                                        \
    return vpx_variance##W##x##H##_mmi(temp3, W, b, b_stride, sse);     \
  }
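
/* The avg variants additionally blend the sub-pixel filtered block with
 * second_pred before measuring variance, as used for compound prediction.
 * The plain-C vpx_comp_avg_pred_c is called directly here; per pixel it
 * computes the rounding average:
 *
 *   temp3[i] = (temp2[i] + second_pred[i] + 1) >> 1;
 *
 * temp3 is declared 16-byte aligned, presumably so the variance kernel can
 * read it with the same alignment assumptions as a normal block. */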

SUBPIX_AVG_VAR(64, 64)
SUBPIX_AVG_VAR(64, 32)
SUBPIX_AVG_VAR(32, 64)
SUBPIX_AVG_VAR(32, 32)
SUBPIX_AVG_VAR(32, 16)
SUBPIX_AVG_VAR(16, 32)
SUBPIX_AVG_VAR(16, 16)
SUBPIX_AVG_VAR(16, 8)
SUBPIX_AVG_VAR(8, 16)
SUBPIX_AVG_VAR(8, 8)
SUBPIX_AVG_VAR(8, 4)
SUBPIX_AVG_VAR(4, 8)
SUBPIX_AVG_VAR(4, 4)