/*****************************************************************************
 * Copyright (c) 2012, Intel Corporation
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 * * Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the
 *   distribution.
 *
 * * Neither the name of the Intel Corporation nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *****************************************************************************
 * Developers and authors:
 * Shay Gueron (1, 2), and Vlad Krasnov (1)
 * (1) Intel Corporation, Israel Development Center, Haifa, Israel
 * (2) University of Haifa, Israel
 *****************************************************************************/

#include <openssl/base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)

#include "rsaz_exp.h"

#include <openssl/mem.h>

/*
 * See crypto/bn/asm/rsaz-avx2.pl for further details.
 */
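
/*
 * A note on the data layout used by these routines (following the description
 * in rsaz-avx2.pl; treat the figures here as a summary rather than a spec):
 * 1024-bit values are kept in a redundant representation of 29-bit digits,
 * one digit per 64-bit word, padded to 40 words, so that partial products can
 * be accumulated in AVX2 lanes without immediate carry propagation.
 * rsaz_1024_norm2red_avx2 and rsaz_1024_red2norm_avx2 convert between this
 * form and the usual 16-word little-endian form.  rsaz_1024_scatter5_avx2 and
 * rsaz_1024_gather5_avx2 store and load entries of the 32-entry window table
 * in an interleaved layout, which is meant to keep the memory access pattern
 * independent of the (secret) index.  The constants one[] and two80[] below
 * are 1 and 2^80 in the redundant form (2^80 is 1 << 22 placed in digit 2,
 * as 2*29 + 22 == 80).  With 36 digits of 29 bits, the Montgomery reduction
 * performed by rsaz_1024_mul_avx2/rsaz_1024_sqr_avx2 appears to be by 2^1044
 * rather than 2^1024; this matches the RR fix-up in RSAZ_1024_mod_exp_avx2,
 * which turns the caller's RR = 2^2048 mod m into (2^1044)^2 mod m:
 * RR * RR * 2^-1044 * 2^80 * 2^-1044 = 2^(4096 + 80 - 2088) = 2^2088 (mod m).
 */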
void rsaz_1024_norm2red_avx2(void *red, const void *norm);
void rsaz_1024_mul_avx2(void *ret, const void *a, const void *b, const void *n,
                        BN_ULONG k);
void rsaz_1024_sqr_avx2(void *ret, const void *a, const void *n, BN_ULONG k,
                        int cnt);
void rsaz_1024_scatter5_avx2(void *tbl, const void *val, int i);
void rsaz_1024_gather5_avx2(void *val, const void *tbl, int i);
void rsaz_1024_red2norm_avx2(void *norm, const void *red);

#if defined(__GNUC__)
# define ALIGN64 __attribute__((aligned(64)))
#elif defined(_MSC_VER)
# define ALIGN64 __declspec(align(64))
#elif defined(__SUNPRO_C)
# define ALIGN64
# pragma align 64(one,two80)
#else
# define ALIGN64 /* not fatal, might hurt performance a little */
#endif

ALIGN64 static const BN_ULONG one[40] =
    {1,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
ALIGN64 static const BN_ULONG two80[40] =
    {0,0,1<<22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};

void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16],
                            const BN_ULONG base_norm[16],
                            const BN_ULONG exponent[16],
                            const BN_ULONG m_norm[16], const BN_ULONG RR[16],
                            BN_ULONG k0)
{
  unsigned char storage[320 * 3 + 32 * 9 * 16 + 64]; /* 5.5KB */
  unsigned char *p_str = storage + (64 - ((size_t)storage % 64));
  unsigned char *a_inv, *m, *result,
                *table_s = p_str + 320 * 3,
                *R2 = table_s; /* borrow */
  int index;
  int wvalue;

  if ((((size_t)p_str & 4095) + 320) >> 12) {
    result = p_str;
    a_inv = p_str + 320;
    m = p_str + 320 * 2; /* should not cross page */
  } else {
    m = p_str; /* should not cross page */
    result = p_str + 320;
    a_inv = p_str + 320 * 2;
  }

  rsaz_1024_norm2red_avx2(m, m_norm);
  rsaz_1024_norm2red_avx2(a_inv, base_norm);
  rsaz_1024_norm2red_avx2(R2, RR);

  rsaz_1024_mul_avx2(R2, R2, R2, m, k0);
  rsaz_1024_mul_avx2(R2, R2, two80, m, k0);

  /* table[0] = 1 */
  rsaz_1024_mul_avx2(result, R2, one, m, k0);
  /* table[1] = a_inv^1 */
  rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);

  rsaz_1024_scatter5_avx2(table_s, result, 0);
  rsaz_1024_scatter5_avx2(table_s, a_inv, 1);

  /* table[2] = a_inv^2 */
  rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 2);
#if 0
  /* this is almost 2x smaller and less than 1% slower */
  for (index = 3; index < 32; index++) {
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, index);
  }
#else
  /* table[4] = a_inv^4 */
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 4);
  /* table[8] = a_inv^8 */
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 8);
  /* table[16] = a_inv^16 */
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 16);
  /* table[17] = a_inv^17 */
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 17);
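
  /* The remaining entries are computed in chains, each starting from an entry
   * that is already in the table: gather an even power, multiply by a_inv to
   * get the next odd power, then square repeatedly to reach its doubles, with
   * one more multiplication where an odd entry closes the chain.  Every one of
   * the 32 entries therefore costs a single multiplication or squaring; the
   * #if 0 loop above is the compact alternative mentioned in its comment. */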

  /* table[3] */
  rsaz_1024_gather5_avx2(result, table_s, 2);
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 3);
  /* table[6] */
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 6);
  /* table[12] */
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 12);
  /* table[24] */
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 24);
  /* table[25] */
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 25);

  /* table[5] */
  rsaz_1024_gather5_avx2(result, table_s, 4);
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 5);
  /* table[10] */
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 10);
  /* table[20] */
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 20);
  /* table[21] */
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 21);

  /* table[7] */
  rsaz_1024_gather5_avx2(result, table_s, 6);
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 7);
  /* table[14] */
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 14);
  /* table[28] */
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 28);
  /* table[29] */
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 29);

  /* table[9] */
  rsaz_1024_gather5_avx2(result, table_s, 8);
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 9);
  /* table[18] */
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 18);
  /* table[19] */
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 19);

  /* table[11] */
  rsaz_1024_gather5_avx2(result, table_s, 10);
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 11);
  /* table[22] */
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 22);
  /* table[23] */
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 23);

  /* table[13] */
  rsaz_1024_gather5_avx2(result, table_s, 12);
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 13);
  /* table[26] */
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 26);
  /* table[27] */
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 27);

  /* table[15] */
  rsaz_1024_gather5_avx2(result, table_s, 14);
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 15);
  /* table[30] */
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 30);
  /* table[31] */
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 31);
#endif

  /* load first window */
  p_str = (unsigned char *)exponent;
  wvalue = p_str[127] >> 3;
  rsaz_1024_gather5_avx2(result, table_s, wvalue);

  index = 1014;

  while (index > -1) { /* loop for the remaining 203 windows */

    rsaz_1024_sqr_avx2(result, result, m, k0, 5);

    wvalue = *((unsigned short *)&p_str[index / 8]);
    wvalue = (wvalue >> (index % 8)) & 31;
    index -= 5;

    rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  }

  /* square four times and process the final 4-bit window */
  rsaz_1024_sqr_avx2(result, result, m, k0, 4);

  wvalue = p_str[0] & 15;

  rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);

  /* from Montgomery */
  rsaz_1024_mul_avx2(result, result, one, m, k0);

  rsaz_1024_red2norm_avx2(result_norm, result);

  OPENSSL_cleanse(storage, sizeof(storage));
}
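
/*
 * RSAZ_1024_mod_exp_avx2 consumes the exponent most-significant bits first in
 * fixed 5-bit windows: one 5-bit window from the top byte, 203 further 5-bit
 * windows in the main loop (index runs 1014, 1009, ..., 4), then four
 * squarings and a final 4-bit window, i.e. 5 + 203*5 + 4 = 1024 bits in total.
 */
#if 0
/* Usage sketch, not compiled: how a caller might drive RSAZ_1024_mod_exp_avx2.
 * The operand contract here is an assumption based on the standard Montgomery
 * setup rather than something this file states: all operands are little-endian
 * arrays of 16 BN_ULONGs, m is an odd 1024-bit modulus, RR = 2^2048 mod m and
 * k0 = -m^-1 mod 2^64 (e.g. from a BN_MONT_CTX for m), and the caller has
 * checked rsaz_avx2_eligible() before taking this path. */
static void mod_exp_1024_sketch(BN_ULONG out[16], const BN_ULONG base[16],
                                const BN_ULONG exp[16], const BN_ULONG m[16],
                                const BN_ULONG RR[16], BN_ULONG k0)
{
  /* out = base^exp mod m, computed entirely by the AVX2 code path. */
  RSAZ_1024_mod_exp_avx2(out, base, exp, m, RR, k0);
}
#endif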

/*
 * See crypto/bn/rsaz-x86_64.pl for further details.
 */
void rsaz_512_mul(void *ret, const void *a, const void *b, const void *n,
                  BN_ULONG k);
void rsaz_512_mul_scatter4(void *ret, const void *a, const void *n, BN_ULONG k,
                           const void *tbl, unsigned int power);
void rsaz_512_mul_gather4(void *ret, const void *a, const void *tbl,
                          const void *n, BN_ULONG k, unsigned int power);
void rsaz_512_mul_by_one(void *ret, const void *a, const void *n, BN_ULONG k);
void rsaz_512_sqr(void *ret, const void *a, const void *n, BN_ULONG k, int cnt);
void rsaz_512_scatter4(void *tbl, const BN_ULONG *val, int power);
void rsaz_512_gather4(BN_ULONG *val, const void *tbl, int power);

void RSAZ_512_mod_exp(BN_ULONG result[8], const BN_ULONG base[8],
                      const BN_ULONG exponent[8], const BN_ULONG m[8],
                      BN_ULONG k0, const BN_ULONG RR[8])
{
  unsigned char storage[16 * 8 * 8 + 64 * 2 + 64]; /* 1.2KB */
  unsigned char *table = storage + (64 - ((size_t)storage % 64));
  BN_ULONG *a_inv = (BN_ULONG *)(table + 16 * 8 * 8),
           *temp = (BN_ULONG *)(table + 16 * 8 * 8 + 8 * 8);
  unsigned char *p_str = (unsigned char *)exponent;
  int index;
  unsigned int wvalue;

  /* table[0] = 1_inv, i.e. 1 in Montgomery form: the eight words below are
   * -m mod 2^512 = 2^512 - m, which is 2^512 mod m for a full-sized
   * (512-bit) modulus. */
  temp[0] = 0 - m[0];  temp[1] = ~m[1];
  temp[2] = ~m[2];     temp[3] = ~m[3];
  temp[4] = ~m[4];     temp[5] = ~m[5];
  temp[6] = ~m[6];     temp[7] = ~m[7];
  rsaz_512_scatter4(table, temp, 0);

  /* table[1] = a_inv^1 */
  rsaz_512_mul(a_inv, base, RR, m, k0);
  rsaz_512_scatter4(table, a_inv, 1);

  /* table[2] = a_inv^2 */
  rsaz_512_sqr(temp, a_inv, m, k0, 1);
  rsaz_512_scatter4(table, temp, 2);

  /* table[3..15] = a_inv^3 .. a_inv^15 */
  for (index = 3; index < 16; index++) {
    rsaz_512_mul_scatter4(temp, a_inv, m, k0, table, index);
  }

  /* load first window */
  wvalue = p_str[63];

  rsaz_512_gather4(temp, table, wvalue >> 4);
  rsaz_512_sqr(temp, temp, m, k0, 4);
  rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue & 0xf);

  for (index = 62; index >= 0; index--) {
    wvalue = p_str[index];

    rsaz_512_sqr(temp, temp, m, k0, 4);
    rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue >> 4);

    rsaz_512_sqr(temp, temp, m, k0, 4);
    rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue & 0x0f);
  }

  /* from Montgomery */
  rsaz_512_mul_by_one(result, temp, m, k0);

  OPENSSL_cleanse(storage, sizeof(storage));
}

#endif /* OPENSSL_X86_64 */