;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

%macro SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
  mov              n_rowsd, %1
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]
  paddd                 m1, m2
  paddd                 m3, m4
  add                 refq, ref_strideq
  paddd                 m0, m1
  add                 srcq, src_strideq
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2

; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
  mov              n_rowsd, %1/2
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq]
  movu                  m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+src_strideq]
  psadbw                m4, [srcq+src_strideq+16]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD32XN 64 ; sad32x64_sse2
SAD32XN 32 ; sad32x32_sse2
SAD32XN 16 ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2

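; For reference only: the plain SAD kernels in this file all compute the
; quantity sketched by the C below. This is an illustrative sketch, not a
; function this file exports; the name sad_c, and the explicit width/height
; parameters, are hypothetical.
;
;   #include <stdint.h>
;   #include <stdlib.h>
;
;   unsigned int sad_c(const uint8_t *src, int src_stride,
;                      const uint8_t *ref, int ref_stride,
;                      int width, int height) {
;     unsigned int sad = 0;
;     for (int y = 0; y < height; ++y) {
;       for (int x = 0; x < width; ++x)
;         sad += abs(src[x] - ref[x]);  /* per-pixel absolute difference */
;       src += src_stride;              /* advance one row in each plane */
;       ref += ref_stride;
;     }
;     return sad;
;   }
;
; The assembly folds the inner loop into psadbw (which sums the absolute
; differences of 8 or 16 packed bytes) and reduces the accumulator with
; movhlps/paddd/movd at the end of each kernel.
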
; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+src_strideq]
  psadbw                m3, [srcq+src_strideq*2]
  psadbw                m4, [srcq+src_stride3q]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD16XN 32 ; sad16x32_sse2
SAD16XN 16 ; sad16x16_sse2
SAD16XN 8 ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN 8, 1 ; sad16x8_avg_sse2

; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0

.loop:
  movh                  m1, [refq]
  movhps                m1, [refq+ref_strideq]
  movh                  m2, [refq+ref_strideq*2]
  movhps                m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movh                  m3, [srcq]
  movhps                m3, [srcq+src_strideq]
  movh                  m4, [srcq+src_strideq*2]
  movhps                m4, [srcq+src_stride3q]
  psadbw                m1, m3
  psadbw                m2, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m2
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD8XN 16 ; sad8x16_sse2
SAD8XN 8 ; sad8x8_sse2
SAD8XN 4 ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN 8, 1 ; sad8x8_avg_sse2
SAD8XN 4, 1 ; sad8x4_avg_sse2

; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride,
;                                  uint8_t *ref, int ref_stride);
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0

.loop:
  movd                  m1, [refq]
  movd                  m2, [refq+ref_strideq]
  movd                  m3, [refq+ref_strideq*2]
  movd                  m4, [refq+ref_stride3q]
  punpckldq             m1, m2
  punpckldq             m3, m4
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m3, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movd                  m2, [srcq]
  movd                  m5, [srcq+src_strideq]
  movd                  m4, [srcq+src_strideq*2]
  movd                  m6, [srcq+src_stride3q]
  punpckldq             m2, m5
  punpckldq             m4, m6
  psadbw                m1, m2
  psadbw                m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movd                 eax, m0
  RET
%endmacro

INIT_MMX sse
SAD4XN 8 ; sad4x8_sse
SAD4XN 4 ; sad4x4_sse
SAD4XN 8, 1 ; sad4x8_avg_sse
SAD4XN 4, 1 ; sad4x4_avg_sse
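
; For the _avg entry points above, the reference block is first averaged
; with a packed second predictor (rows contiguous, stride equal to the block
; width, as implied by the second_predq advances of mmsize*2/mmsize*4 bytes)
; using the same round-half-up rule as pavgb. A hedged C sketch, with the
; hypothetical name sad_avg_c and the same includes as the sketch above:
;
;   unsigned int sad_avg_c(const uint8_t *src, int src_stride,
;                          const uint8_t *ref, int ref_stride,
;                          const uint8_t *second_pred,
;                          int width, int height) {
;     unsigned int sad = 0;
;     for (int y = 0; y < height; ++y) {
;       for (int x = 0; x < width; ++x) {
;         const int avg = (ref[x] + second_pred[x] + 1) >> 1;  /* pavgb */
;         sad += abs(src[x] - avg);
;       }
;       src += src_stride;
;       ref += ref_stride;
;       second_pred += width;  /* packed: predictor stride == block width */
;     }
;     return sad;
;   }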