;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_8: times  8 dw  8
bilin_filter_m_sse2: times  8 dw 16
                     times  8 dw  0
                     times  8 dw 14
                     times  8 dw  2
                     times  8 dw 12
                     times  8 dw  4
                     times  8 dw 10
                     times  8 dw  6
                     times 16 dw  8
                     times  8 dw  6
                     times  8 dw 10
                     times  8 dw  4
                     times  8 dw 12
                     times  8 dw  2
                     times  8 dw 14
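
; Each 32-byte entry i (i = 0..7) of bilin_filter_m_sse2 holds the broadcast
; word pair (16 - 2*i, 2*i), so entry i implements the two-tap bilinear blend
;   out = (a*(16 - 2*i) + b*(2*i) + 8) >> 4
; for subpel offset i. Offsets are scaled to byte indices below via
; filter_idx_shift (1 << 5 = 32 bytes per entry).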

SECTION .text

; int vpx_highbd_sub_pixel_varianceNxh(const uint16_t *src,
;                                      ptrdiff_t src_stride,
;                                      int x_offset, int y_offset,
;                                      const uint16_t *ref,
;                                      ptrdiff_t ref_stride,
;                                      int height, unsigned int *sse);
;
; This function returns the sum of errors (SE) and stores the sum of squared
; errors (SSE) in the given pointer.

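; As a usage sketch (illustrative C, not part of this file): a caller can turn
; the (SE, SSE) pair into a variance, which is how the C wrappers consume
; these kernels. Exact wrapper names and any 10/12-bit rounding are
; assumptions here.
;
;   uint32_t sse;
;   int se = vpx_highbd_sub_pixel_variance16xh_sse2(src, src_stride,
;                                                   x_offset, y_offset,
;                                                   ref, ref_stride, h, &sse);
;   uint32_t var = sse - (uint32_t)(((int64_t)se * se) / (16 * h));
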
%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse
  psubw                %3, %4           ; diff2 = src2 - ref2
  psubw                %1, %2           ; diff1 = src1 - ref1
  mova                 %4, %3       ; make copies to manipulate to calc sum
  mova                 %2, %1       ; use originals for calc sse
  pmaddwd              %3, %3           ; dword partial sums of diff2^2
  paddw                %4, %2           ; word sums diff1 + diff2
  pmaddwd              %1, %1           ; dword partial sums of diff1^2
  movhlps              %2, %4
  paddd                %6, %3           ; accumulate sse
  paddw                %4, %2           ; fold word sums into the low 4 lanes
  pxor                 %2, %2
  pcmpgtw              %2, %4       ; mask for 0 > %4 (sum)
  punpcklwd            %4, %2       ; sign-extend word to dword
  paddd                %6, %1           ; accumulate sse
  paddd                %5, %4           ; accumulate sum
%endmacro
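
; Per invocation, SUM_SSE is equivalent to this scalar loop over the 8 word
; lanes of each (src, ref) register pair (illustrative C, not part of the
; file):
;
;   for (int i = 0; i < 8; i++) {
;     int d1 = src1[i] - ref1[i], d2 = src2[i] - ref2[i];
;     sum += d1 + d2;            /* kept as words, then sign-extended */
;     sse += d1 * d1 + d2 * d2;
;   }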

%macro STORE_AND_RET 0
%if mmsize == 16
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputting to a dword.
  movhlps              m3, m7           ; fold the upper halves of sse/sum
  movhlps              m4, m6
  paddd                m7, m3
  paddd                m6, m4
  pshufd               m3, m7, 0x1      ; fold the remaining dword pairs
  pshufd               m4, m6, 0x1
  paddd                m7, m3
  paddd                m6, m4
  mov                  r1, ssem         ; r1 = unsigned int *sse
  movd               [r1], m7           ; store sse
  movd                eax, m6           ; store sum as return value
%endif
  RET
%endmacro

%macro INC_SRC_BY_SRC_STRIDE  0
%if ARCH_X86=1 && CONFIG_PIC=1
  ; on 32-bit PIC builds src_stride is only reachable as a memory operand
  ; (src_stridemp), so advance one row (2 bytes per sample) with two adds
  add                srcq, src_stridemp
  add                srcq, src_stridemp
%else
  lea                srcq, [srcq + src_strideq*2]
%endif
%endmacro

%macro SUBPEL_VARIANCE 1-2 0 ; %1 = W (8 or 16), %2 = 1 for the avg variant
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5

%if ARCH_X86_64
  %if %2 == 1 ; avg
    cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                      x_offset, y_offset, \
                                      ref, ref_stride, \
                                      second_pred, second_stride, height, sse
    %define second_str second_strideq
  %else
    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
                                  x_offset, y_offset, \
                                  ref, ref_stride, height, sse
  %endif
  %define block_height heightd
  %define bilin_filter sseq
%else
  %if CONFIG_PIC=1
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                        x_offset, y_offset, \
                                        ref, ref_stride, \
                                        second_pred, second_stride, height, sse
      %define block_height dword heightm
      %define second_str second_stridemp
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                    x_offset, y_offset, \
                                    ref, ref_stride, height, sse
      %define block_height heightd
    %endif

    ; reuse argument stack space
    %define g_bilin_filterm x_offsetm
    %define g_pw_8m y_offsetm

    ; Store the bilin_filter and pw_8 locations on the stack
    %if GET_GOT_DEFINED == 1
      GET_GOT eax
      add esp, 4                ; restore esp
    %endif

    lea ecx, [GLOBAL(bilin_filter_m)]
    mov g_bilin_filterm, ecx

    lea ecx, [GLOBAL(pw_8)]
    mov g_pw_8m, ecx

    LOAD_IF_USED 0, 1         ; load eax, ecx back
  %else
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                        x_offset, y_offset, \
                                        ref, ref_stride, \
                                        second_pred, second_stride, height, sse
      %define block_height dword heightm
      %define second_str second_stridemp
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                    x_offset, y_offset, \
                                    ref, ref_stride, height, sse
      %define block_height heightd
    %endif

    %define bilin_filter bilin_filter_m
  %endif
%endif
  ASSERT               %1 <= 16         ; m6 overflows if w > 16
  pxor                 m6, m6           ; sum
  pxor                 m7, m7           ; sse

%if %1 < 16
  sar                   block_height, 1 ; two rows are processed per iteration
%endif
%if %2 == 1 ; avg
  shl             second_str, 1         ; second_stride: words -> bytes
%endif

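  ; Dispatch over the nine (x_offset, y_offset) cases. Offsets are in
  ; eighth-pel units (as implied by the 8-entry filter table): 0 means no
  ; filtering, 4 is the half-pel average, anything else selects a bilinear
  ; filter pair from bilin_filter_m:
  ;   x\y        0                     4 (half)              other
  ;   0          .x_zero_y_zero_loop   .x_zero_y_half_loop   .x_zero_y_other_loop
  ;   4 (half)   .x_half_y_zero_loop   .x_half_y_half_loop   .x_half_y_other_loop
  ;   other      .x_other_y_zero_loop  .x_other_y_half_loop  .x_other_y_other_loop
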
  ; FIXME(rbultje) replace by jumptable?
  test          x_offsetd, x_offsetd
  jnz .x_nonzero
  ; x_offset == 0
  test          y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m2, [srcq + 16]
  mova                 m1, [refq]
  mova                 m3, [refq + 16]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m2, [second_predq+16]
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq + src_strideq*2]
  lea                refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq + src_strideq*2]
  mova                 m1, [refq]
  mova                 m3, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                second_predq, second_str
  pavgw                m2, [second_predq]
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq + src_strideq*4]
  lea                refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%endif
  dec                   block_height
  jg .x_zero_y_zero_loop
  STORE_AND_RET

.x_zero_y_nonzero:
  cmp           y_offsetd, 4
  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m4, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*2+16]
  mova                 m2, [refq]
  mova                 m3, [refq+16]
  pavgw                m0, m4
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*2]
  lea                refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*4]
  mova                 m2, [refq]
  mova                 m3, [refq+ref_strideq*2]
  pavgw                m0, m1
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                second_predq, second_str
  pavgw                m1, [second_predq]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*4]
  lea                refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%endif
  dec                   block_height
  jg .x_zero_y_half_loop
  STORE_AND_RET

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
%if ARCH_X86_64
  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl           y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+y_offsetq]
  mova                 m9, [bilin_filter+y_offsetq+16]
  mova                m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq + 16]
  movu                 m4, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*2+16]
  mova                 m2, [refq]
  mova                 m3, [refq+16]
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
  ; slightly faster because of pmullw latency. It would also cut our rodata
  ; tables in half for this function, and save 1-2 registers on x86-64.
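  ; For reference, the rewrite suggested above follows from the identity
  ;   (num-x)*in1 + x*in2 == num*in1 + x*(in2-in1)
  ; with num = 16 and rnd = 8 here (pw_8 and the psrlw by 4 below), so
  ;   out = in1 + ((x*(in2-in1) + rnd) >> 4)
  ; needs a single pmullw.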
  pmullw               m1, filter_y_a
  pmullw               m5, filter_y_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, filter_rnd
  paddw                m1, m5
  paddw                m0, m4
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*2]
  lea                refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*4]
  mova                 m4, m1
  mova                 m2, [refq]
  mova                 m3, [refq+ref_strideq*2]
  pmullw               m1, filter_y_a
  pmullw               m5, filter_y_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, filter_rnd
  paddw                m1, m5
  paddw                m0, m4
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                second_predq, second_str
  pavgw                m1, [second_predq]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*4]
  lea                refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%endif
  dec                   block_height
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonzero:
  cmp           x_offsetd, 4
  jne .x_nonhalf
  ; x_offset == 0.5
  test          y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq + 16]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + 18]
  mova                 m2, [refq]
  mova                 m3, [refq + 16]
  pavgw                m0, m4
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*2]
  lea                refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq + src_strideq*2]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + src_strideq*2 + 2]
  mova                 m2, [refq]
  mova                 m3, [refq + ref_strideq*2]
  pavgw                m0, m4
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                second_predq, second_str
  pavgw                m1, [second_predq]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*4]
  lea                refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%endif
  dec                   block_height
  jg .x_half_y_zero_loop
  STORE_AND_RET

.x_half_y_nonzero:
  cmp           y_offsetd, 4
  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  lea                srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
  pavgw                m1, m3
.x_half_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq + 16]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + 18]
  pavgw                m2, m4
  pavgw                m3, m5
  pavgw                m0, m2
  pavgw                m1, m3
  mova                 m4, [refq]
  mova                 m5, [refq + 16]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7
  mova                 m0, m2
  mova                 m1, m3

  lea                srcq, [srcq + src_strideq*2]
  lea                refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  lea                srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
.x_half_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq + src_strideq*2]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + src_strideq*2 + 2]
  pavgw                m2, m4
  pavgw                m3, m5
  pavgw                m0, m2
  pavgw                m2, m3
  mova                 m4, [refq]
  mova                 m5, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                second_predq, second_str
  pavgw                m2, [second_predq]
%endif
  SUM_SSE              m0, m4, m2, m5, m6, m7
  mova                 m0, m3

  lea                srcq, [srcq + src_strideq*4]
  lea                refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%endif
  dec                   block_height
  jg .x_half_y_half_loop
  STORE_AND_RET

.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%if ARCH_X86_64
  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl           y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+y_offsetq]
  mova                 m9, [bilin_filter+y_offsetq+16]
  mova                m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0.5. We can reuse the x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  lea                srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
  pavgw                m1, m3
.x_half_y_other_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+16]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+18]
  pavgw                m2, m4
  pavgw                m3, m5
  mova                 m4, m2
  mova                 m5, m3
  pmullw               m1, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m1, filter_rnd
  paddw                m1, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  psrlw                m1, 4
  paddw                m0, m2
  mova                 m2, [refq]
  psrlw                m0, 4
  mova                 m3, [refq+16]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7
  mova                 m0, m4
  mova                 m1, m5

  lea                srcq, [srcq + src_strideq*2]
  lea                refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  lea                srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
.x_half_y_other_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+src_strideq*2]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+src_strideq*2+2]
  pavgw                m2, m4
  pavgw                m3, m5
  mova                 m4, m2
  mova                 m5, m3
  pmullw               m4, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m4, filter_rnd
  paddw                m4, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  psrlw                m4, 4
  paddw                m0, m2
  mova                 m2, [refq]
  psrlw                m0, 4
  mova                 m3, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                second_predq, second_str
  pavgw                m4, [second_predq]
%endif
  SUM_SSE              m0, m2, m4, m3, m6, m7
  mova                 m0, m5

  lea                srcq, [srcq + src_strideq*4]
  lea                refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%endif
  dec                   block_height
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf:
  test          y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
%if ARCH_X86_64
  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl           x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
  mova                 m9, [bilin_filter+x_offsetq+16]
  mova                m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0. We can reuse the y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  mova                 m4, [refq]
  mova                 m5, [refq+16]
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m1, m3
  paddw                m0, m2
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                refq, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+src_strideq*2]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+src_strideq*2+2]
  mova                 m4, [refq]
  mova                 m5, [refq+ref_strideq*2]
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m1, m3
  paddw                m0, m2
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                second_predq, second_str
  pavgw                m1, [second_predq]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7

  lea                srcq, [srcq+src_strideq*4]
  lea                refq, [refq+ref_strideq*4]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%endif
  dec                   block_height
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonzero:
  cmp           y_offsetd, 4
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
%if ARCH_X86_64
  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl           x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
  mova                 m9, [bilin_filter+x_offsetq+16]
  mova                m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5. We can reuse the y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  paddw                m0, m2
  paddw                m1, m3
  psrlw                m0, 4
  psrlw                m1, 4
  lea                srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+16]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+18]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  mova                 m4, [refq]
  mova                 m5, [refq+16]
  psrlw                m2, 4
  psrlw                m3, 4
  pavgw                m0, m2
  pavgw                m1, m3
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7
  mova                 m0, m2
  mova                 m1, m3

  lea                srcq, [srcq+src_strideq*2]
  lea                refq, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m2
  psrlw                m0, 4
  lea                srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+src_strideq*2]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+src_strideq*2+2]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  mova                 m4, [refq]
  mova                 m5, [refq+ref_strideq*2]
  psrlw                m2, 4
  psrlw                m3, 4
  pavgw                m0, m2
  pavgw                m2, m3
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                second_predq, second_str
  pavgw                m2, [second_predq]
%endif
  SUM_SSE              m0, m4, m2, m5, m6, m7
  mova                 m0, m3

  lea                srcq, [srcq+src_strideq*4]
  lea                refq, [refq+ref_strideq*4]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%endif
  dec                   block_height
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonhalf:
; Load the filters; the layout is the same as in the 8-bit version.
%if ARCH_X86_64
  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl           x_offsetd, filter_idx_shift ; filter_idx_shift = 5
  shl           y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
  mova                 m9, [bilin_filter+x_offsetq+16]
  mova                m10, [bilin_filter+y_offsetq]
  mova                m11, [bilin_filter+y_offsetq+16]
  mova                m12, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
; In this case there is no unused register, so borrow the src_stride
; register; src_stride then has to be reloaded from the stack whenever it
; is needed.
%define tempq src_strideq
  mov tempq, g_bilin_filterm
  add           x_offsetq, tempq
  add           y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
  add           y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif
; end of filter load

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  movu                 m1, [srcq+16]
  movu                 m3, [srcq+18]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  paddw                m0, m2
  paddw                m1, m3
  psrlw                m0, 4
  psrlw                m1, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu                 m2, [srcq]
  movu                 m4, [srcq+2]
  movu                 m3, [srcq+16]
  movu                 m5, [srcq+18]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  psrlw                m2, 4
  psrlw                m3, 4
  mova                 m4, m2
  mova                 m5, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  pmullw               m1, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m0, m2
  paddw                m1, filter_rnd
  mova                 m2, [refq]
  paddw                m1, m3
  psrlw                m0, 4
  psrlw                m1, 4
  mova                 m3, [refq+16]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7
  mova                 m0, m4
  mova                 m1, m5

  INC_SRC_BY_SRC_STRIDE
  lea                refq, [refq + ref_strideq * 2]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m2
  psrlw                m0, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu                 m2, [srcq]
  movu                 m4, [srcq+2]
  INC_SRC_BY_SRC_STRIDE
  movu                 m3, [srcq]
  movu                 m5, [srcq+2]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  psrlw                m2, 4
  psrlw                m3, 4
  mova                 m4, m2
  mova                 m5, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  pmullw               m4, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m0, m2
  paddw                m4, filter_rnd
  mova                 m2, [refq]
  paddw                m4, m3
  psrlw                m0, 4
  psrlw                m4, 4
  mova                 m3, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                second_predq, second_str
  pavgw                m4, [second_predq]
%endif
  SUM_SSE              m0, m2, m4, m3, m6, m7
  mova                 m0, m5

  INC_SRC_BY_SRC_STRIDE
  lea                refq, [refq + ref_strideq * 4]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%endif
  dec                   block_height
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET
%endmacro

INIT_XMM sse2
SUBPEL_VARIANCE  8      ; highbd_sub_pixel_variance8xh
SUBPEL_VARIANCE 16      ; highbd_sub_pixel_variance16xh

INIT_XMM sse2
SUBPEL_VARIANCE  8, 1   ; highbd_sub_pixel_avg_variance8xh
SUBPEL_VARIANCE 16, 1   ; highbd_sub_pixel_avg_variance16xh
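
; The instantiations above emit, after cglobal applies the project's symbol
; prefix, highbd_sub_pixel_variance{8,16}xh_sse2 and the matching _avg_
; variants. A hedged sketch (not the exact upstream wrapper) of how a C
; wrapper folds the two outputs into a variance for a WxH block:
;
;   static uint32_t var_from_se_sse(int64_t se, uint64_t sse, int w, int h) {
;     return (uint32_t)(sse - (uint64_t)((se * se) / (w * h)));
;   }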