;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

; TABULATE_SSIM - fold one batch of pixels into the five running sums.
;   In:     xmm3 = eight source words (s), xmm4 = eight reference words (r)
;   Update: xmm15 += s       (word lanes, saturating)     sum_s
;           xmm14 += r       (word lanes, saturating)     sum_r
;           xmm13 += s*s     (dword lanes, via pmaddwd)   sum_sq_s
;           xmm12 += r*r     (dword lanes, via pmaddwd)   sum_sq_r
;           xmm11 += s*r     (dword lanes, via pmaddwd)   sum_sxr
;   Clobbers: xmm1 (s*s), xmm2 (r*r), xmm3 (s*r). xmm4 is left intact.
%macro TABULATE_SSIM 0
    ; take the copies first: the last pmaddwd below overwrites xmm3
    movdqa          xmm1,       xmm3
    movdqa          xmm2,       xmm4

    paddusw         xmm15,      xmm3    ; sum_s
    paddusw         xmm14,      xmm4    ; sum_r

    pmaddwd         xmm1,       xmm1    ; s * s
    pmaddwd         xmm2,       xmm2    ; r * r
    pmaddwd         xmm3,       xmm4    ; s * r

    paddd           xmm13,      xmm1    ; sum_sq_s
    paddd           xmm12,      xmm2    ; sum_sq_r
    paddd           xmm11,      xmm3    ; sum_sxr
%endmacro

; SUM_ACROSS_Q - horizontal add of the four dword lanes of register %1.
; The dwords are interleaved with xmm0 to widen them to qwords, so xmm0
; must be zero for the sum to be correct (the routines in this file clear
; it before use). The total ends up in the low qword of %1.
;   Clobbers: xmm2
%macro SUM_ACROSS_Q 1
    movdqa          xmm2,       %1
    punpckhdq       xmm2,       xmm0    ; dwords 2,3 -> two qwords
    punpckldq       %1,         xmm0    ; dwords 0,1 -> two qwords
    paddq           %1,         xmm2

    movdqa          xmm2,       %1
    punpckhqdq      xmm2,       xmm0    ; upper qword moved down
    punpcklqdq      %1,         xmm0    ; lower qword kept, upper cleared
    paddq           %1,         xmm2
%endmacro

; SUM_ACROSS_W - horizontal add of the eight word lanes of register %1.
; The words are interleaved with xmm0 (expected to be zero, as above) to
; widen them to dwords, then SUM_ACROSS_Q finishes the reduction.
;   Clobbers: xmm1, xmm2
%macro SUM_ACROSS_W 1
    movdqa          xmm1,       %1
    punpckhwd       xmm1,       xmm0    ; words 4..7 -> four dwords
    punpcklwd       %1,         xmm0    ; words 0..3 -> four dwords
    paddd           %1,         xmm1
    SUM_ACROSS_Q    %1
%endmacro

;void vp9_ssim_parms_16x16_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
;
; Register roles in vp9_ssim_parms_16x16_sse2:
;   rsi, rdi      current row of s, r
;   rcx, rax      sp, rp (row strides), sign-extended from 32-bit int
;   rdx           rows left to process
;   xmm0          zero, used to widen bytes/words in the unpacks
;   xmm5, xmm6    the 16 raw bytes of the current s / r row
;   xmm15, xmm14  sum_s, sum_r         (word lanes; at most 32 byte values
;                                       land in a lane, so no saturation)
;   xmm13..xmm11  sum_sq_s, sum_sq_r, sum_sxr (dword lanes)
; Each result is written with movd, i.e. exactly 32 bits are stored.
; NOTE(review): the prototype comment declares the outputs as
; unsigned long *; where unsigned long is 64 bits wide the upper half of
; each output is left untouched by this routine - confirm against the C
; declaration used by the callers.
global sym(vp9_ssim_parms_16x16_sse2) PRIVATE
sym(vp9_ssim_parms_16x16_sse2):
    push            rbp
    mov             rbp,        rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push            rsi
    push            rdi
    ; end prolog

    mov             rsi,        arg(0)          ;s
    ; sp and rp are C ints: only the low 32 bits of their argument slots
    ; are defined, so sign-extend instead of reading all 64 bits.
    movsxd          rcx,        dword arg(1)    ;sp
    mov             rdi,        arg(2)          ;r
    movsxd          rax,        dword arg(3)    ;rp

    pxor            xmm0,       xmm0
    pxor            xmm15,      xmm15           ;sum_s
    pxor            xmm14,      xmm14           ;sum_r
    pxor            xmm13,      xmm13           ;sum_sq_s
    pxor            xmm12,      xmm12           ;sum_sq_r
    pxor            xmm11,      xmm11           ;sum_sxr

    mov             rdx,        16              ;row counter
.NextRow:

    ;grab source and reference pixels
    movdqu          xmm5,       [rsi]
    movdqu          xmm6,       [rdi]

    ;upper eight pixels of the row, widened to words
    movdqa          xmm3,       xmm5
    movdqa          xmm4,       xmm6
    punpckhbw       xmm3,       xmm0            ; high_s
    punpckhbw       xmm4,       xmm0            ; high_r

    TABULATE_SSIM

    ;lower eight pixels of the row, widened to words
    movdqa          xmm3,       xmm5
    movdqa          xmm4,       xmm6
    punpcklbw       xmm3,       xmm0            ; low_s
    punpcklbw       xmm4,       xmm0            ; low_r

    TABULATE_SSIM

    add             rsi,        rcx             ; next s row
    add             rdi,        rax             ; next r row

    dec             rdx                         ; counter
    jnz             .NextRow

    ;collapse every accumulator to a single value in its low lane
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ;store the low 32 bits of each total through the output pointers
    mov             rdi,        arg(4)
    movd            [rdi],      xmm15           ;*sum_s
    mov             rdi,        arg(5)
    movd            [rdi],      xmm14           ;*sum_r
    mov             rdi,        arg(6)
    movd            [rdi],      xmm13           ;*sum_sq_s
    mov             rdi,        arg(7)
    movd            [rdi],      xmm12           ;*sum_sq_r
    mov             rdi,        arg(8)
    movd            [rdi],      xmm11           ;*sum_sxr

    ; begin epilog
    pop             rdi
    pop             rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop             rbp
    ret

;void vp9_ssim_parms_8x8_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 8x8
; ssim so we can play with dssim as distortion
; in mode selection code.
;
; Register roles in vp9_ssim_parms_8x8_sse2:
;   rsi, rdi      current row of s, r
;   rcx, rax      sp, rp (row strides), sign-extended from 32-bit int
;   rdx           rows left to process
;   xmm0          zero, used to widen bytes/words in the unpacks
;   xmm15, xmm14  sum_s, sum_r         (word lanes; at most 8 byte values
;                                       land in a lane, so no saturation)
;   xmm13..xmm11  sum_sq_s, sum_sq_r, sum_sxr (dword lanes)
; Each result is written with movd, i.e. exactly 32 bits are stored.
; NOTE(review): the prototype comment declares the outputs as
; unsigned long *; where unsigned long is 64 bits wide the upper half of
; each output is left untouched by this routine - confirm against the C
; declaration used by the callers.
global sym(vp9_ssim_parms_8x8_sse2) PRIVATE
sym(vp9_ssim_parms_8x8_sse2):
    push            rbp
    mov             rbp,        rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push            rsi
    push            rdi
    ; end prolog

    mov             rsi,        arg(0)          ;s
    ; sp and rp are C ints: only the low 32 bits of their argument slots
    ; are defined, so sign-extend instead of reading all 64 bits.
    movsxd          rcx,        dword arg(1)    ;sp
    mov             rdi,        arg(2)          ;r
    movsxd          rax,        dword arg(3)    ;rp

    pxor            xmm0,       xmm0
    pxor            xmm15,      xmm15           ;sum_s
    pxor            xmm14,      xmm14           ;sum_r
    pxor            xmm13,      xmm13           ;sum_sq_s
    pxor            xmm12,      xmm12           ;sum_sq_r
    pxor            xmm11,      xmm11           ;sum_sxr

    mov             rdx,        8               ;row counter
.NextRow:

    ;grab eight source and reference pixels and widen them to words
    movq            xmm3,       [rsi]
    movq            xmm4,       [rdi]
    punpcklbw       xmm3,       xmm0            ; low_s
    punpcklbw       xmm4,       xmm0            ; low_r

    TABULATE_SSIM

    add             rsi,        rcx             ; next s row
    add             rdi,        rax             ; next r row

    dec             rdx                         ; counter
    jnz             .NextRow

    ;collapse every accumulator to a single value in its low lane
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ;store the low 32 bits of each total through the output pointers
    mov             rdi,        arg(4)
    movd            [rdi],      xmm15           ;*sum_s
    mov             rdi,        arg(5)
    movd            [rdi],      xmm14           ;*sum_r
    mov             rdi,        arg(6)
    movd            [rdi],      xmm13           ;*sum_sq_s
    mov             rdi,        arg(7)
    movd            [rdi],      xmm12           ;*sum_sq_r
    mov             rdi,        arg(8)
    movd            [rdi],      xmm11           ;*sum_sxr

    ; begin epilog
    pop             rdi
    pop             rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop             rbp
    ret