1 /***************************************************************************** 2 * * 3 * Copyright (c) 2012, Intel Corporation * 4 * * 5 * All rights reserved. * 6 * * 7 * Redistribution and use in source and binary forms, with or without * 8 * modification, are permitted provided that the following conditions are * 9 * met: * 10 * * 11 * * Redistributions of source code must retain the above copyright * 12 * notice, this list of conditions and the following disclaimer. * 13 * * 14 * * Redistributions in binary form must reproduce the above copyright * 15 * notice, this list of conditions and the following disclaimer in the * 16 * documentation and/or other materials provided with the * 17 * distribution. * 18 * * 19 * * Neither the name of the Intel Corporation nor the names of its * 20 * contributors may be used to endorse or promote products derived from * 21 * this software without specific prior written permission. * 22 * * 23 * * 24 * THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY * 25 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * 27 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR * 28 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * 29 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * 30 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * 31 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 32 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * 33 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * 34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * 35 * * 36 ****************************************************************************** 37 * Developers and authors: * 38 * Shay Gueron (1, 2), and Vlad Krasnov (1) * 39 * (1) Intel Corporation, Israel Development Center, Haifa, Israel * 40 * (2) University of Haifa, Israel * 41 *****************************************************************************/ 42 43 #include <openssl/base.h> 44 45 #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) 46 47 #include "rsaz_exp.h" 48 49 #include <openssl/mem.h> 50 51 #include "../../internal.h" 52 53 54 /* 55 * See crypto/bn/asm/rsaz-avx2.pl for further details. 56 */ 57 void rsaz_1024_norm2red_avx2(void *red,const void *norm); 58 void rsaz_1024_mul_avx2(void *ret,const void *a,const void *b,const void *n,BN_ULONG k); 59 void rsaz_1024_sqr_avx2(void *ret,const void *a,const void *n,BN_ULONG k,int cnt); 60 void rsaz_1024_scatter5_avx2(void *tbl,const void *val,int i); 61 void rsaz_1024_gather5_avx2(void *val,const void *tbl,int i); 62 void rsaz_1024_red2norm_avx2(void *norm,const void *red); 63 64 alignas(64) static const BN_ULONG one[40] = 65 {1,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; 66 alignas(64) static const BN_ULONG two80[40] = 67 {0,0,1<<22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; 68 69 void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16], 70 const BN_ULONG base_norm[16], const BN_ULONG exponent[16], 71 const BN_ULONG m_norm[16], const BN_ULONG RR[16], BN_ULONG k0) 72 { 73 alignas(64) uint8_t storage[(320 * 3) + (32 * 9 * 16)]; /* 5.5KB */ 74 unsigned char *a_inv, *m, *result, 75 *table_s = storage + (320 * 3), 76 *R2 = table_s; /* borrow */ 77 int index; 78 int wvalue; 79 80 if (((((uintptr_t)storage & 4095) + 320) >> 12) != 0) { 81 result = storage; 82 a_inv = storage + 320; 83 m = storage + (320 * 2); /* should not cross page */ 84 } else { 85 m = storage; /* should not cross page */ 86 result = storage + 320; 87 a_inv = storage + (320 * 2); 88 } 89 90 rsaz_1024_norm2red_avx2(m, m_norm); 91 rsaz_1024_norm2red_avx2(a_inv, base_norm); 92 rsaz_1024_norm2red_avx2(R2, RR); 93 94 rsaz_1024_mul_avx2(R2, R2, R2, m, k0); 95 rsaz_1024_mul_avx2(R2, R2, two80, m, k0); 96 97 /* table[0] = 1 */ 98 rsaz_1024_mul_avx2(result, R2, one, m, k0); 99 /* table[1] = a_inv^1 */ 100 rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0); 101 102 rsaz_1024_scatter5_avx2(table_s,result,0); 103 rsaz_1024_scatter5_avx2(table_s,a_inv,1); 104 105 /* table[2] = a_inv^2 */ 106 rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1); 107 rsaz_1024_scatter5_avx2(table_s,result,2); 108 #if 0 109 /* this is almost 2x smaller and less than 1% slower */ 110 for (index=3; index<32; index++) { 111 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 112 rsaz_1024_scatter5_avx2(table_s,result,index); 113 } 114 #else 115 /* table[4] = a_inv^4 */ 116 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 117 rsaz_1024_scatter5_avx2(table_s,result,4); 118 /* table[8] = a_inv^8 */ 119 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 120 rsaz_1024_scatter5_avx2(table_s,result,8); 121 /* table[16] = a_inv^16 */ 122 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 123 rsaz_1024_scatter5_avx2(table_s,result,16); 124 /* table[17] = a_inv^17 */ 125 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 126 rsaz_1024_scatter5_avx2(table_s,result,17); 127 128 /* table[3] */ 129 rsaz_1024_gather5_avx2(result,table_s,2); 130 rsaz_1024_mul_avx2(result,result,a_inv,m,k0); 131 rsaz_1024_scatter5_avx2(table_s,result,3); 132 /* table[6] */ 133 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 134 rsaz_1024_scatter5_avx2(table_s,result,6); 135 /* table[12] */ 136 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 137 rsaz_1024_scatter5_avx2(table_s,result,12); 138 /* table[24] */ 139 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 140 rsaz_1024_scatter5_avx2(table_s,result,24); 141 /* table[25] */ 142 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 143 rsaz_1024_scatter5_avx2(table_s,result,25); 144 145 /* table[5] */ 146 rsaz_1024_gather5_avx2(result,table_s,4); 147 rsaz_1024_mul_avx2(result,result,a_inv,m,k0); 148 rsaz_1024_scatter5_avx2(table_s,result,5); 149 /* table[10] */ 150 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 151 rsaz_1024_scatter5_avx2(table_s,result,10); 152 /* table[20] */ 153 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 154 rsaz_1024_scatter5_avx2(table_s,result,20); 155 /* table[21] */ 156 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 157 rsaz_1024_scatter5_avx2(table_s,result,21); 158 159 /* table[7] */ 160 rsaz_1024_gather5_avx2(result,table_s,6); 161 rsaz_1024_mul_avx2(result,result,a_inv,m,k0); 162 rsaz_1024_scatter5_avx2(table_s,result,7); 163 /* table[14] */ 164 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 165 rsaz_1024_scatter5_avx2(table_s,result,14); 166 /* table[28] */ 167 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 168 rsaz_1024_scatter5_avx2(table_s,result,28); 169 /* table[29] */ 170 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 171 rsaz_1024_scatter5_avx2(table_s,result,29); 172 173 /* table[9] */ 174 rsaz_1024_gather5_avx2(result,table_s,8); 175 rsaz_1024_mul_avx2(result,result,a_inv,m,k0); 176 rsaz_1024_scatter5_avx2(table_s,result,9); 177 /* table[18] */ 178 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 179 rsaz_1024_scatter5_avx2(table_s,result,18); 180 /* table[19] */ 181 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 182 rsaz_1024_scatter5_avx2(table_s,result,19); 183 184 /* table[11] */ 185 rsaz_1024_gather5_avx2(result,table_s,10); 186 rsaz_1024_mul_avx2(result,result,a_inv,m,k0); 187 rsaz_1024_scatter5_avx2(table_s,result,11); 188 /* table[22] */ 189 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 190 rsaz_1024_scatter5_avx2(table_s,result,22); 191 /* table[23] */ 192 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 193 rsaz_1024_scatter5_avx2(table_s,result,23); 194 195 /* table[13] */ 196 rsaz_1024_gather5_avx2(result,table_s,12); 197 rsaz_1024_mul_avx2(result,result,a_inv,m,k0); 198 rsaz_1024_scatter5_avx2(table_s,result,13); 199 /* table[26] */ 200 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 201 rsaz_1024_scatter5_avx2(table_s,result,26); 202 /* table[27] */ 203 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 204 rsaz_1024_scatter5_avx2(table_s,result,27); 205 206 /* table[15] */ 207 rsaz_1024_gather5_avx2(result,table_s,14); 208 rsaz_1024_mul_avx2(result,result,a_inv,m,k0); 209 rsaz_1024_scatter5_avx2(table_s,result,15); 210 /* table[30] */ 211 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 212 rsaz_1024_scatter5_avx2(table_s,result,30); 213 /* table[31] */ 214 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 215 rsaz_1024_scatter5_avx2(table_s,result,31); 216 #endif 217 218 const uint8_t *p_str = (const uint8_t *)exponent; 219 220 /* load first window */ 221 wvalue = p_str[127] >> 3; 222 rsaz_1024_gather5_avx2(result,table_s,wvalue); 223 224 index = 1014; 225 226 while(index > -1) { /* loop for the remaining 127 windows */ 227 228 rsaz_1024_sqr_avx2(result, result, m, k0, 5); 229 230 uint16_t wvalue_16; 231 memcpy(&wvalue_16, &p_str[index / 8], sizeof(wvalue_16)); 232 wvalue = wvalue_16; 233 wvalue = (wvalue>> (index%8)) & 31; 234 index-=5; 235 236 rsaz_1024_gather5_avx2(a_inv,table_s,wvalue); /* borrow a_inv */ 237 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 238 } 239 240 /* square four times */ 241 rsaz_1024_sqr_avx2(result, result, m, k0, 4); 242 243 wvalue = p_str[0] & 15; 244 245 rsaz_1024_gather5_avx2(a_inv,table_s,wvalue); /* borrow a_inv */ 246 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 247 248 /* from Montgomery */ 249 rsaz_1024_mul_avx2(result, result, one, m, k0); 250 251 rsaz_1024_red2norm_avx2(result_norm, result); 252 253 OPENSSL_cleanse(storage,sizeof(storage)); 254 } 255 256 #endif /* OPENSSL_X86_64 */ 257