Home | History | Annotate | Download | only in elliptic
      1 // Copyright 2016 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 #include "textflag.h"
      6 
      // Read-only constant pools used by the P-256 vector routines in this file.
// Every table is file-local (<>) and declared RODATA (see textflag.h).

// p256ordK0: 32-bit constant that p256OrdMul loads into K0 and multiplies
// (VMLF) with the low product word to derive the per-round factor MK0.
DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f
GLOBL p256ordK0<>(SB), RODATA, $4

// p256ord: 256-bit modulus used by p256OrdMul, which loads bytes 16..31 as
// the low half (M0) and bytes 0..15 as the high half (M1).
DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000
DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x10(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x18(SB)/8, $0xf3b9cac2fc632551
GLOBL p256ord<>(SB), RODATA, $32

// p256: field prime followed by VPERM selection masks. p256FromMont reads
// offsets 0 and 16 (prime), 48 (SEL2) and 64 (SEL1).
DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0  d1 d0  0
DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0  d1 d0  0
DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
GLOBL p256<>(SB), RODATA, $80

// p256mul: field prime, a larger set of VPERM selection masks, and the
// constant (1*2^256)%P256. p256NegCond reads the prime at offsets 0 and 16.
DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256
DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256
DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256
DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0  0  0 d0
DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0  0  0 d0
DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0  0 d1 d0
DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0  0 d1 d0
DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL  0 d1 d0 d1
DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL  0 d1 d0 d1
DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL  0  0 d1 d0
DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL  0  0 d1 d0
DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0  d1 d0  0
DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0  d1 d0  0
DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256
GLOBL p256mul<>(SB), RODATA, $160
     46 
     // func hasVectorFacility() bool
//
// Reports whether the vector facility is both installed and usable.
//
// Step 1: STFLE stores three doublewords of the facility list into a zeroed
// 24-byte stack buffer. The byte at offset 16 of that buffer (-8(SP)) holds
// facility bits 128..135; mask 0x40 isolates bit 129, which is taken to be
// the vector-facility bit.
// Step 2: a vector register is written and read back; any mismatch is
// treated as "no vector support".
//
// R0 is used as the STFLE count register and is cleared again before the
// function continues. R1 and V16 are clobbered.
TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
	MOVD  $fac-24(SP), R1
	XC    $24, 0(R1), 0(R1) // clear the storage
	MOVD  $2, R0            // R0 is the number of double words stored -1
	WORD  $0xB2B01000       // STFLE 0(R1)
	XOR   R0, R0            // reset the value of R0
	MOVBZ fac2-8(SP), R1    // third doubleword, first byte: bits 128..135
	AND   $0x40, R1         // keep bit 129 only
	BEQ   novector

	// The facility is installed; check that the vector instructions
	// have been enabled by round-tripping a byte through V16.
	VLEIB  $0, $0xF, V16
	VLGVB  $0, V16, R1
	CMPBNE R1, $0xF, novector
	MOVB   $1, ret+0(FP) // have vx
	RET

novector:
	MOVB $0, ret+0(FP)   // no vx
	RET
     69 
     // ---------------------------------------
// func p256NegCond(val *p256Point, cond int)
//
// Conditionally replaces the 32 bytes at offset 32 of *val (held in Y1H:Y1L;
// presumably the point's Y coordinate) with P256 - value:
//   cond == 0  -> value left unchanged
//   cond != 0  -> value <- P256 - value
// Both candidates are always computed and the store is unconditional; the
// choice is made with a VSEL mask rather than a branch.
#define P1ptr   R1
#define CPOOL   R4

#define Y1L   V0
#define Y1H   V1
#define T1L   V2
#define T1H   V3

#define PL    V30
#define PH    V31

#define ZER   V4
#define SEL1  V5
#define CAR1  V6
TEXT ·p256NegCond(SB), NOSPLIT, $0
	MOVD val+0(FP), P1ptr

	// PH:PL = P256 (high half at offset 0, low half at offset 16).
	MOVD $p256mul<>+0x00(SB), CPOOL
	VL   16(CPOOL), PL
	VL   0(CPOOL), PH

	VL 32(P1ptr), Y1H
	VL 48(P1ptr), Y1L

	// SEL1 = all ones when cond == 0, all zeros otherwise.
	VLREPG cond+8(FP), SEL1
	VZERO  ZER
	VCEQG  SEL1, ZER, SEL1

	// T1H:T1L = PH:PL - Y1H:Y1L, with the borrow carried through CAR1.
	VSCBIQ Y1L, PL, CAR1
	VSQ    Y1L, PL, T1L
	VSBIQ  PH, Y1H, CAR1, T1H

	// Keep the original where SEL1 is set (cond == 0), else take P256 - value.
	VSEL Y1L, T1L, SEL1, Y1L
	VSEL Y1H, T1H, SEL1, Y1H

	VST Y1H, 32(P1ptr)
	VST Y1L, 48(P1ptr)
	RET

#undef P1ptr
#undef CPOOL
#undef Y1L
#undef Y1H
#undef T1L
#undef T1H
#undef PL
#undef PH
#undef ZER
#undef SEL1
#undef CAR1
    123 
    // ---------------------------------------
// func p256MovCond(res, a, b *p256Point, cond int)
//
// Branch-free conditional copy of a 96-byte point:
//   cond == 0  -> *res <- *b
//   cond != 0  -> *res <- *a
// Both inputs are always loaded in full and the result is always stored.
#define P3ptr   R1
#define P1ptr   R2
#define P2ptr   R3

#define X1L    V0
#define X1H    V1
#define Y1L    V2
#define Y1H    V3
#define Z1L    V4
#define Z1H    V5
#define X2L    V6
#define X2H    V7
#define Y2L    V8
#define Y2H    V9
#define Z2L    V10
#define Z2H    V11

#define ZER   V18
#define SEL1  V19
TEXT ·p256MovCond(SB), NOSPLIT, $0
	MOVD   res+0(FP), P3ptr
	MOVD   a+8(FP), P1ptr
	MOVD   b+16(FP), P2ptr

	// SEL1 = all ones when cond == 0, all zeros otherwise.
	VLREPG cond+24(FP), SEL1
	VZERO  ZER
	VCEQG  SEL1, ZER, SEL1

	// Registers *1* hold point a.
	VL 0(P1ptr), X1H
	VL 16(P1ptr), X1L
	VL 32(P1ptr), Y1H
	VL 48(P1ptr), Y1L
	VL 64(P1ptr), Z1H
	VL 80(P1ptr), Z1L

	// Registers *2* hold point b.
	VL 0(P2ptr), X2H
	VL 16(P2ptr), X2L
	VL 32(P2ptr), Y2H
	VL 48(P2ptr), Y2L
	VL 64(P2ptr), Z2H
	VL 80(P2ptr), Z2L

	// Where SEL1 is set (cond == 0) take b, otherwise keep a.
	VSEL X2L, X1L, SEL1, X1L
	VSEL X2H, X1H, SEL1, X1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Y2H, Y1H, SEL1, Y1H
	VSEL Z2L, Z1L, SEL1, Z1L
	VSEL Z2H, Z1H, SEL1, Z1H

	VST X1H, 0(P3ptr)
	VST X1L, 16(P3ptr)
	VST Y1H, 32(P3ptr)
	VST Y1L, 48(P3ptr)
	VST Z1H, 64(P3ptr)
	VST Z1L, 80(P3ptr)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ZER
#undef SEL1
    201 
    // ---------------------------------------
// func p256Select(point *p256Point, table []p256Point, idx int)
//
// Constant time table access over 16 entries of 96 bytes each.
// Indexed from 1 to 16, with -1 offset: idx == k copies table[k-1] to *point.
// Index 0 is implicitly the point at infinity: *point is written as all
// zero bytes, as it is for any idx whose low byte is outside 1..16.
//
// Only the least significant byte of idx is examined (byte offset 32+7 of
// the argument frame on this big-endian target). Every iteration loads one
// full entry and merges it under a mask, so all 16 entries are read
// regardless of idx.
#define P3ptr   R1
#define P1ptr   R2
#define COUNT   R4

#define X1L    V0
#define X1H    V1
#define Y1L    V2
#define Y1H    V3
#define Z1L    V4
#define Z1H    V5
#define X2L    V6
#define X2H    V7
#define Y2L    V8
#define Y2H    V9
#define Z2L    V10
#define Z2H    V11

#define ONE   V18
#define IDX   V19
#define SEL1  V20
#define SEL2  V21
TEXT ·p256Select(SB), NOSPLIT, $0
	MOVD   point+0(FP), P3ptr
	MOVD   table+8(FP), P1ptr
	VLREPB idx+(32+7)(FP), IDX // low byte of idx in every byte lane
	VREPIB $1, ONE
	VREPIB $1, SEL2            // SEL2 = 1-based number of the current entry
	MOVD   $1, COUNT

	// Accumulator starts as all zeros (the idx == 0 result).
	VZERO X1H
	VZERO X1L
	VZERO Y1H
	VZERO Y1L
	VZERO Z1H
	VZERO Z1L

loop_select:
	VL 0(P1ptr), X2H
	VL 16(P1ptr), X2L
	VL 32(P1ptr), Y2H
	VL 48(P1ptr), Y2L
	VL 64(P1ptr), Z2H
	VL 80(P1ptr), Z2L

	// SEL1 = all ones iff the current entry number equals idx.
	VCEQG SEL2, IDX, SEL1

	VSEL X2L, X1L, SEL1, X1L
	VSEL X2H, X1H, SEL1, X1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Y2H, Y1H, SEL1, Y1H
	VSEL Z2L, Z1L, SEL1, Z1L
	VSEL Z2H, Z1H, SEL1, Z1H

	VAB  SEL2, ONE, SEL2 // next entry number
	ADDW $1, COUNT
	ADD  $96, P1ptr      // next 96-byte entry
	CMPW COUNT, $17      // COUNT runs 1..16
	BLT  loop_select

	VST X1H, 0(P3ptr)
	VST X1L, 16(P3ptr)
	VST Y1H, 32(P3ptr)
	VST Y1L, 48(P3ptr)
	VST Z1H, 64(P3ptr)
	VST Z1L, 80(P3ptr)
	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2
    293 
    // ---------------------------------------
// func p256SelectBase(point *p256Point, table []p256Point, idx int)
//
// Constant time table access over 64 entries of 96 bytes each.
// Indexed from 1 to 64, with -1 offset: idx == k copies table[k-1] to *point.
// Index 0 is implicitly the point at infinity: *point is written as all
// zero bytes, as it is for any idx whose low byte is outside 1..64.
//
// Only the least significant byte of idx is examined (byte offset 32+7 of
// the argument frame on this big-endian target). Every iteration loads one
// full entry and merges it under a mask, so all 64 entries are read
// regardless of idx.
#define P3ptr   R1
#define P1ptr   R2
#define COUNT   R4

#define X1L    V0
#define X1H    V1
#define Y1L    V2
#define Y1H    V3
#define Z1L    V4
#define Z1H    V5
#define X2L    V6
#define X2H    V7
#define Y2L    V8
#define Y2H    V9
#define Z2L    V10
#define Z2H    V11

#define ONE   V18
#define IDX   V19
#define SEL1  V20
#define SEL2  V21
TEXT ·p256SelectBase(SB), NOSPLIT, $0
	MOVD   point+0(FP), P3ptr
	MOVD   table+8(FP), P1ptr
	VLREPB idx+(32+7)(FP), IDX // low byte of idx in every byte lane
	VREPIB $1, ONE
	VREPIB $1, SEL2            // SEL2 = 1-based number of the current entry
	MOVD   $1, COUNT

	// Accumulator starts as all zeros (the idx == 0 result).
	VZERO X1H
	VZERO X1L
	VZERO Y1H
	VZERO Y1L
	VZERO Z1H
	VZERO Z1L

loop_select:
	VL 0(P1ptr), X2H
	VL 16(P1ptr), X2L
	VL 32(P1ptr), Y2H
	VL 48(P1ptr), Y2L
	VL 64(P1ptr), Z2H
	VL 80(P1ptr), Z2L

	// SEL1 = all ones iff the current entry number equals idx.
	VCEQG SEL2, IDX, SEL1

	VSEL X2L, X1L, SEL1, X1L
	VSEL X2H, X1H, SEL1, X1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Y2H, Y1H, SEL1, Y1H
	VSEL Z2L, Z1L, SEL1, Z1L
	VSEL Z2H, Z1H, SEL1, Z1H

	VAB  SEL2, ONE, SEL2 // next entry number
	ADDW $1, COUNT
	ADD  $96, P1ptr      // next 96-byte entry
	CMPW COUNT, $65      // COUNT runs 1..64
	BLT  loop_select

	VST X1H, 0(P3ptr)
	VST X1L, 16(P3ptr)
	VST Y1H, 32(P3ptr)
	VST Y1L, 48(P3ptr)
	VST Z1H, 64(P3ptr)
	VST Z1L, 80(P3ptr)
	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2
    385 
    // ---------------------------------------
// func p256FromMont(res, in []byte)
//
// Reads 32 bytes from in, applies four identical reduction rounds, performs
// one final conditional subtraction of P256 and writes 32 bytes to res.
// Judging by the name this converts a field element out of Montgomery
// form; each round consumes 64 bits (VSLDB $8), so four rounds cover the
// full 256-bit value.
//
// The value is kept as T2 : T1 : T0 (carry : high 128 bits : low 128 bits).
#define res_ptr R1
#define x_ptr   R2
#define CPOOL   R4

#define T0   V0
#define T1   V1
#define T2   V2
#define TT0  V3
#define TT1  V4

#define ZER   V6
#define SEL1  V7
#define SEL2  V8
#define CAR1  V9
#define CAR2  V10
#define RED1  V11
#define RED2  V12
#define PL    V13
#define PH    V14

// One reduction round:
//   RED2 <- VPERM(T1, T0, SEL1)      d1 d0 d1 d0
//   RED1 <- VPERM(ZER, RED2, SEL2)   0  d1 d0  0
//   RED2 <- RED2 - RED1              guaranteed not to underflow
//   T2:T1:T0 shifted right by 8 bytes
//   T2:T1:T0 += RED2:RED1            carries chained through CAR1/CAR2
#define P256_FROMMONT_ROUND \
	VPERM  T1, T0, SEL1, RED2;    \
	VPERM  ZER, RED2, SEL2, RED1; \
	VSQ    RED1, RED2, RED2;      \
	VSLDB  $8, T1, T0, T0;        \
	VSLDB  $8, T2, T1, T1;        \
	VACCQ  T0, RED1, CAR1;        \
	VAQ    T0, RED1, T0;          \
	VACCCQ T1, RED2, CAR1, CAR2;  \
	VACQ   T1, RED2, CAR1, T1;    \
	VAQ    T2, CAR2, T2

TEXT ·p256FromMont(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in+24(FP), x_ptr

	VZERO T2
	VZERO ZER
	MOVD  $p256<>+0x00(SB), CPOOL
	VL    16(CPOOL), PL
	VL    0(CPOOL), PH
	VL    48(CPOOL), SEL2
	VL    64(CPOOL), SEL1

	// Input is stored most significant half first.
	VL (1*16)(x_ptr), T0
	VL (0*16)(x_ptr), T1

	P256_FROMMONT_ROUND // First round
	P256_FROMMONT_ROUND // Second round
	P256_FROMMONT_ROUND // Third round
	P256_FROMMONT_ROUND // Last round

	// ---------------------------------------------------

	// TT1:TT0 = T1:T0 - PH:PL. The borrow is propagated into T2, which is
	// then used directly as the VSEL mask.
	VSCBIQ  PL, T0, CAR1
	VSQ     PL, T0, TT0
	VSBCBIQ T1, PH, CAR1, CAR2
	VSBIQ   T1, PH, CAR1, TT1
	VSBIQ   T2, ZER, CAR2, T2

	// what output to use, TT1||TT0 or T1||T0?
	VSEL T0, TT0, T2, T0
	VSEL T1, TT1, T2, T1

	VST T0, (1*16)(res_ptr)
	VST T1, (0*16)(res_ptr)
	RET

#undef P256_FROMMONT_ROUND
#undef res_ptr
#undef x_ptr
#undef CPOOL
#undef T0
#undef T1
#undef T2
#undef TT0
#undef TT1
#undef ZER
#undef SEL1
#undef SEL2
#undef CAR1
#undef CAR2
#undef RED1
#undef RED2
#undef PL
#undef PH
    512 
    // ---------------------------------------
// func p256OrdMul(res, in1, in2 []byte)
//
// Multiplies the 256-bit values in1 (X1:X0) and in2 (Y1:Y0) with interleaved
// reduction by the modulus in p256ord<> (M1:M0), then performs one final
// conditional subtraction of the modulus and writes 32 bytes to res.
// The structure (a per-round factor MK0 derived from the constant K0, then
// a one-word shift) looks like word-serial Montgomery multiplication.
//
// Eight rounds are executed, one per 32-bit word of in2, starting with
// word 3 of Y0 and ending with word 0 of Y1. The running value is
// T2 : T1 : T0.
#define res_ptr R1
#define x_ptr R2
#define y_ptr R3
#define X0    V0
#define X1    V1
#define Y0    V2
#define Y1    V3
#define M0    V4
#define M1    V5
#define T0    V6
#define T1    V7
#define T2    V8
#define YDIG  V9

#define ADD1  V16
#define ADD1H V17
#define ADD2  V18
#define ADD2H V19
#define RED1  V20
#define RED1H V21
#define RED2  V22
#define RED2H V23
#define CAR1  V24
#define CAR1M V25

#define MK0   V30
#define K0    V31

/* *
 * ---+--------+--------+
 *  T2|   T1   |   T0   |
 * ---+--------+--------+
 *           *(add)*
 *    +--------+--------+
 *    |   X1   |   X0   |
 *    +--------+--------+
 *           *(mul)*
 *    +--------+--------+
 *    |  YDIG  |  YDIG  |
 *    +--------+--------+
 *           *(add)*
 *    +--------+--------+
 *    |   M1   |   M0   |
 *    +--------+--------+
 *           *(mul)*
 *    +--------+--------+
 *    |   MK0  |   MK0  |
 *    +--------+--------+
 *
 *   ---------------------
 *
 *    +--------+--------+
 *    |  ADD2  |  ADD1  |
 *    +--------+--------+
 *  +--------+--------+
 *  | ADD2H  | ADD1H  |
 *  +--------+--------+
 *    +--------+--------+
 *    |  RED2  |  RED1  |
 *    +--------+--------+
 *  +--------+--------+
 *  | RED2H  | RED1H  |
 *  +--------+--------+
 */

// Second half of every round: add MK0 * (M1:M0) to ADD2:ADD1, drop the low
// word (VSLDB $12) and sum the four partial results into T2:T1:T0.
// T0 is complete after the fourth VAQ, i.e. ready for the next MK0.
// The VACQ that produces T1 must stay ahead of the VACCCQ that overwrites
// CAR1.
#define P256_ORDMUL_REDUCE \
	VMALF  M0, MK0, ADD1, RED1;     \
	VMALHF M0, MK0, ADD1, RED1H;    \
	VMALF  M1, MK0, ADD2, RED2;     \
	VMALHF M1, MK0, ADD2, RED2H;    \
	VSLDB  $12, RED2, RED1, RED1;   \
	VSLDB  $12, T2, RED2, RED2;     \
	VACCQ  RED1, ADD1H, CAR1;       \
	VAQ    RED1, ADD1H, T0;         \
	VACCQ  RED1H, T0, CAR1M;        \
	VAQ    RED1H, T0, T0;           \
	VACQ   RED2, ADD2H, CAR1, T1;   \
	VACCCQ RED2, ADD2H, CAR1, CAR1; \
	VACCCQ RED2H, T1, CAR1M, T2;    \
	VACQ   RED2H, T1, CAR1M, T1;    \
	VAQ    CAR1, T2, T2

// Rounds 2..8: accumulate (X1:X0) * word idx of ysrc onto T1:T0, derive MK0
// from the low word of the result, then reduce.
#define P256_ORDMUL_ROUND(idx, ysrc) \
	VREPF  $idx, ysrc, YDIG;     \
	VMALF  X0, YDIG, T0, ADD1;   \
	VMLF   ADD1, K0, MK0;        \
	VREPF  $3, MK0, MK0;         \
	VMALF  X1, YDIG, T1, ADD2;   \
	VMALHF X0, YDIG, T0, ADD1H;  \
	VMALHF X1, YDIG, T1, ADD2H;  \
	P256_ORDMUL_REDUCE

TEXT ·p256OrdMul(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in1+24(FP), x_ptr
	MOVD in2+48(FP), y_ptr

	VZERO T2
	MOVD  $p256ordK0<>+0x00(SB), R4

	// VLEF    $3, 0(R4), K0
	WORD $0xE7F40000
	BYTE $0x38
	BYTE $0x03
	MOVD $p256ord<>+0x00(SB), R4
	VL   16(R4), M0
	VL   0(R4), M1

	// Operands are stored most significant half first.
	VL (1*16)(x_ptr), X0
	VL (0*16)(x_ptr), X1
	VL (1*16)(y_ptr), Y0
	VL (0*16)(y_ptr), Y1

	// ---------------------------------------------------------------------------/
	// Round 1: T1:T0 hold nothing yet, so the plain multiplies (VMLF/VMLHF)
	// are used instead of the multiply-and-add forms.
	VREPF $3, Y0, YDIG
	VMLF  X0, YDIG, ADD1
	VMLF  ADD1, K0, MK0
	VREPF $3, MK0, MK0

	VMLF  X1, YDIG, ADD2
	VMLHF X0, YDIG, ADD1H
	VMLHF X1, YDIG, ADD2H

	P256_ORDMUL_REDUCE

	// ---------------------------------------------------
	P256_ORDMUL_ROUND(2, Y0)
	P256_ORDMUL_ROUND(1, Y0)
	P256_ORDMUL_ROUND(0, Y0)
	P256_ORDMUL_ROUND(3, Y1)
	P256_ORDMUL_ROUND(2, Y1)
	P256_ORDMUL_ROUND(1, Y1)
	P256_ORDMUL_ROUND(0, Y1)

	// ---------------------------------------------------

	// ADD2:ADD1 = T1:T0 - M1:M0. The borrow is propagated into T2, which is
	// then used directly as the VSEL mask.
	VZERO   RED1
	VSCBIQ  M0, T0, CAR1
	VSQ     M0, T0, ADD1
	VSBCBIQ T1, M1, CAR1, CAR1M
	VSBIQ   T1, M1, CAR1, ADD2
	VSBIQ   T2, RED1, CAR1M, T2

	// what output to use, ADD2||ADD1 or T1||T0?
	VSEL T0, ADD1, T2, T0
	VSEL T1, ADD2, T2, T1

	VST T0, (1*16)(res_ptr)
	VST T1, (0*16)(res_ptr)
	RET

#undef P256_ORDMUL_ROUND
#undef P256_ORDMUL_REDUCE

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef M0
#undef M1
#undef T0
#undef T1
#undef T2
#undef YDIG

#undef ADD1
#undef ADD1H
#undef ADD2
#undef ADD2H
#undef RED1
#undef RED1H
#undef RED2
#undef RED2H
#undef CAR1
#undef CAR1M

#undef MK0
#undef K0
    891 
    892 // ---------------------------------------
    893 // p256MulInternal
    894 // V0-V3,V30,V31 - Not Modified
    895 // V4-V15 - Volatile
    896 
    897 #define CPOOL   R4
    898 
    899 // Parameters
    900 #define X0    V0 // Not modified
    901 #define X1    V1 // Not modified
    902 #define Y0    V2 // Not modified
    903 #define Y1    V3 // Not modified
    904 #define T0    V4
    905 #define T1    V5
    906 #define P0    V30 // Not modified
    907 #define P1    V31 // Not modified
    908 
    909 // Temporaries
    910 #define YDIG  V6 // Overloaded with CAR2, ZER
    911 #define ADD1H V7 // Overloaded with ADD3H
    912 #define ADD2H V8 // Overloaded with ADD4H
    913 #define ADD3  V9 // Overloaded with SEL2,SEL5
    914 #define ADD4  V10 // Overloaded with SEL3,SEL6
    915 #define RED1  V11 // Overloaded with CAR2
    916 #define RED2  V12
    917 #define RED3  V13 // Overloaded with SEL1
    918 #define T2    V14
    919 // Overloaded temporaries
    920 #define ADD1  V4 // Overloaded with T0
    921 #define ADD2  V5 // Overloaded with T1
    922 #define ADD3H V7 // Overloaded with ADD1H
    923 #define ADD4H V8 // Overloaded with ADD2H
    924 #define ZER   V6 // Overloaded with YDIG, CAR2
    925 #define CAR1  V6 // Overloaded with YDIG, ZER
    926 #define CAR2  V11 // Overloaded with RED1
    927 // Constant Selects
    928 #define SEL1  V13 // Overloaded with RED3
    929 #define SEL2  V9 // Overloaded with ADD3,SEL5
    930 #define SEL3  V10 // Overloaded with ADD4,SEL6
    931 #define SEL4  V6 // Overloaded with YDIG,CAR2,ZER
    932 #define SEL5  V9 // Overloaded with ADD3,SEL2
    933 #define SEL6  V10 // Overloaded with ADD4,SEL3
    934 
    935 /* *
    936  * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
    937  * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
    938  * With you, SIMD be...
    939  *
    940  *                                           +--------+--------+
    941  *                                  +--------|  RED2  |  RED1  |
    942  *                                  |        +--------+--------+
    943  *                                  |       ---+--------+--------+
    944  *                                  |  +---- T2|   T1   |   T0   |--+
    945  *                                  |  |    ---+--------+--------+  |
    946  *                                  |  |                            |
    947  *                                  |  |    ======================= |
    948  *                                  |  |                            |
    949  *                                  |  |       +--------+--------+<-+
    950  *                                  |  +-------|  ADD2  |  ADD1  |--|-----+
    951  *                                  |  |       +--------+--------+  |     |
    952  *                                  |  |     +--------+--------+<---+     |
    953  *                                  |  |     | ADD2H  | ADD1H  |--+       |
    954  *                                  |  |     +--------+--------+  |       |
    955  *                                  |  |     +--------+--------+<-+       |
    956  *                                  |  |     |  ADD4  |  ADD3  |--|-+     |
    957  *                                  |  |     +--------+--------+  | |     |
    958  *                                  |  |   +--------+--------+<---+ |     |
    959  *                                  |  |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
    960  *                                  |  |   +--------+--------+      | |   V
    961  *                                  |  | ------------------------   | | +--------+
    962  *                                  |  |                            | | |  RED3  |  [d0 0 0 d0]
    963  *                                  |  |                            | | +--------+
    964  *                                  |  +---->+--------+--------+    | |   |
    965  *   (T2[1w]||ADD2[4w]||ADD1[3w])   +--------|   T1   |   T0   |    | |   |
    966  *                                  |        +--------+--------+    | |   |
    967  *                                  +---->---+--------+--------+    | |   |
    968  *                                         T2|   T1   |   T0   |----+ |   |
    969  *                                        ---+--------+--------+    | |   |
    970  *                                        ---+--------+--------+<---+ |   |
    971  *                                    +--- T2|   T1   |   T0   |----------+
    972  *                                    |   ---+--------+--------+      |   |
    973  *                                    |  +--------+--------+<-------------+
    974  *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
    975  *                                    |  +--------+--------+     |    |   |
    976  *                                    |  +--------+<----------------------+
    977  *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
    978  *                                    |  +--------+              |    |
    979  *                                    +--->+--------+--------+   |    |
    980  *                                         |   T1   |   T0   |--------+
    981  *                                         +--------+--------+   |    |
    982  *                                   --------------------------- |    |
    983  *                                                               |    |
    984  *                                       +--------+--------+<----+    |
    985  *                                       |  RED2  |  RED1  |          |
    986  *                                       +--------+--------+          |
    987  *                                      ---+--------+--------+<-------+
    988  *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
    989  *                                      ---+--------+--------+
    990  *
    991  *                                                                *Mi obra de arte de siglo XXI @vpaprots
    992  *
    993  *
    994  * First group is special, doesn't get the two inputs:
    995  *                                             +--------+--------+<-+
    996  *                                     +-------|  ADD2  |  ADD1  |--|-----+
    997  *                                     |       +--------+--------+  |     |
    998  *                                     |     +--------+--------+<---+     |
    999  *                                     |     | ADD2H  | ADD1H  |--+       |
   1000  *                                     |     +--------+--------+  |       |
   1001  *                                     |     +--------+--------+<-+       |
   1002  *                                     |     |  ADD4  |  ADD3  |--|-+     |
   1003  *                                     |     +--------+--------+  | |     |
   1004  *                                     |   +--------+--------+<---+ |     |
   1005  *                                     |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
   1006  *                                     |   +--------+--------+      | |   V
   1007  *                                     | ------------------------   | | +--------+
   1008  *                                     |                            | | |  RED3  |  [d0 0 0 d0]
   1009  *                                     |                            | | +--------+
   1010  *                                     +---->+--------+--------+    | |   |
   1011  *   (T2[1w]||ADD2[4w]||ADD1[3w])            |   T1   |   T0   |----+ |   |
   1012  *                                           +--------+--------+    | |   |
   1013  *                                        ---+--------+--------+<---+ |   |
   1014  *                                    +--- T2|   T1   |   T0   |----------+
   1015  *                                    |   ---+--------+--------+      |   |
   1016  *                                    |  +--------+--------+<-------------+
   1017  *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
   1018  *                                    |  +--------+--------+     |    |   |
   1019  *                                    |  +--------+<----------------------+
   1020  *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
   1021  *                                    |  +--------+              |    |
   1022  *                                    +--->+--------+--------+   |    |
   1023  *                                         |   T1   |   T0   |--------+
   1024  *                                         +--------+--------+   |    |
   1025  *                                   --------------------------- |    |
   1026  *                                                               |    |
   1027  *                                       +--------+--------+<----+    |
   1028  *                                       |  RED2  |  RED1  |          |
   1029  *                                       +--------+--------+          |
   1030  *                                      ---+--------+--------+<-------+
   1031  *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
   1032  *                                      ---+--------+--------+
   1033  *
   1034  * Last 'group' needs RED2||RED1 shifted less
   1035  */
   1036 TEXT p256MulInternal<>(SB), NOSPLIT, $0-0
        	// p256MulInternal multiplies the 256-bit value in X1||X0 by the
        	// 256-bit value in Y1||Y0 and leaves a 256-bit result in T1||T0.
        	// P1||P0 is the value that is conditionally subtracted at the end
        	// (callers load the P256 constant into it), and CPOOL must point at
        	// the constant pool whose entries at offsets 32..112 are the VPERM
        	// selectors SEL1..SEL6 loaded below.
        	//
        	// Y is consumed in four groups of two 32-bit words (Y0 words 3,2;
        	// Y0 words 1,0; Y1 words 3,2; Y1 words 1,0). Each group accumulates
        	// partial products and the RED1/RED2/RED3 reduction terms as drawn in
        	// the diagram above; a final conditional subtraction picks between
        	// T and T-P.
        	// NOTE(review): presumably this is a Montgomery-style multiplication
        	// (interleaved multiply and reduce) -- confirm against the Go caller.
        	//
        	// The symbolic temporaries (YDIG, ADDn, ADDnH, REDn, T2, ZER, CAR1,
        	// CAR2, SEL1..SEL6) are overwritten; their register assignments are
        	// #defined earlier in the file.
        	// NOTE(review): the selectors are reloaded before every use, which
        	// suggests their registers are shared with other temporaries --
        	// confirm against those #defines before removing any reload.
   1037 	VL 32(CPOOL), SEL1
   1038 	VL 48(CPOOL), SEL2
   1039 	VL 64(CPOOL), SEL3
   1040 	VL 80(CPOOL), SEL4
   1041 
   1042 	// ---------------------------------------------------
        	// Group 1: Y0 words 3 and 2. Word 3 uses plain multiplies (no addend)
        	// and ZER is shifted in where later groups shift in T2.
   1043 
   1044 	VREPF $3, Y0, YDIG
   1045 	VMLHF X0, YDIG, ADD1H
   1046 	VMLHF X1, YDIG, ADD2H
   1047 	VMLF  X0, YDIG, ADD1
   1048 	VMLF  X1, YDIG, ADD2
   1049 
   1050 	VREPF  $2, Y0, YDIG
   1051 	VMALF  X0, YDIG, ADD1H, ADD3
   1052 	VMALF  X1, YDIG, ADD2H, ADD4
   1053 	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
   1054 	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
   1055 
   1056 	VZERO ZER
   1057 	VL    32(CPOOL), SEL1
   1058 	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   1059 
   1060 	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
   1061 	VSLDB $12, ZER, ADD2, T1  // ADD2 Free
   1062 
   1063 	VACCQ  T0, ADD3, CAR1
   1064 	VAQ    T0, ADD3, T0       // ADD3 Free
   1065 	VACCCQ T1, ADD4, CAR1, T2
   1066 	VACQ   T1, ADD4, CAR1, T1 // ADD4 Free
   1067 
   1068 	VL    48(CPOOL), SEL2
   1069 	VL    64(CPOOL), SEL3
   1070 	VL    80(CPOOL), SEL4
   1071 	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
   1072 	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
   1073 	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
   1074 	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
   1075 
   1076 	VSLDB $12, T1, T0, T0
   1077 	VSLDB $12, T2, T1, T1
   1078 
   1079 	VACCQ  T0, ADD3H, CAR1
   1080 	VAQ    T0, ADD3H, T0
   1081 	VACCCQ T1, ADD4H, CAR1, T2
   1082 	VACQ   T1, ADD4H, CAR1, T1
   1083 
   1084 	// ---------------------------------------------------
        	// Group 2: Y0 words 1 and 0. From here on the running T1||T0 is the
        	// addend of the first multiply-add and RED1/RED2 from the previous
        	// group are added in.
   1085 
   1086 	VREPF  $1, Y0, YDIG
   1087 	VMALHF X0, YDIG, T0, ADD1H
   1088 	VMALHF X1, YDIG, T1, ADD2H
   1089 	VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
   1090 	VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2
   1091 
   1092 	VREPF  $0, Y0, YDIG
   1093 	VMALF  X0, YDIG, ADD1H, ADD3
   1094 	VMALF  X1, YDIG, ADD2H, ADD4
   1095 	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
   1096 	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
   1097 
   1098 	VZERO ZER
   1099 	VL    32(CPOOL), SEL1
   1100 	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   1101 
   1102 	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0
   1103 	VSLDB $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free
   1104 
   1105 	VACCQ  T0, RED1, CAR1
   1106 	VAQ    T0, RED1, T0
   1107 	VACCCQ T1, RED2, CAR1, T2
   1108 	VACQ   T1, RED2, CAR1, T1
   1109 
   1110 	VACCQ  T0, ADD3, CAR1
   1111 	VAQ    T0, ADD3, T0
   1112 	VACCCQ T1, ADD4, CAR1, CAR2
   1113 	VACQ   T1, ADD4, CAR1, T1
   1114 	VAQ    T2, CAR2, T2
   1115 
   1116 	VL    48(CPOOL), SEL2
   1117 	VL    64(CPOOL), SEL3
   1118 	VL    80(CPOOL), SEL4
   1119 	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
   1120 	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
   1121 	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
   1122 	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
   1123 
   1124 	VSLDB $12, T1, T0, T0
   1125 	VSLDB $12, T2, T1, T1
   1126 
   1127 	VACCQ  T0, ADD3H, CAR1
   1128 	VAQ    T0, ADD3H, T0
   1129 	VACCCQ T1, ADD4H, CAR1, T2
   1130 	VACQ   T1, ADD4H, CAR1, T1
   1131 
   1132 	// ---------------------------------------------------
        	// Group 3: Y1 words 3 and 2 (same instruction sequence as group 2).
   1133 
   1134 	VREPF  $3, Y1, YDIG
   1135 	VMALHF X0, YDIG, T0, ADD1H
   1136 	VMALHF X1, YDIG, T1, ADD2H
   1137 	VMALF  X0, YDIG, T0, ADD1
   1138 	VMALF  X1, YDIG, T1, ADD2
   1139 
   1140 	VREPF  $2, Y1, YDIG
   1141 	VMALF  X0, YDIG, ADD1H, ADD3
   1142 	VMALF  X1, YDIG, ADD2H, ADD4
   1143 	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
   1144 	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
   1145 
   1146 	VZERO ZER
   1147 	VL    32(CPOOL), SEL1
   1148 	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   1149 
   1150 	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
   1151 	VSLDB $12, T2, ADD2, T1   // ADD2 Free
   1152 
   1153 	VACCQ  T0, RED1, CAR1
   1154 	VAQ    T0, RED1, T0
   1155 	VACCCQ T1, RED2, CAR1, T2
   1156 	VACQ   T1, RED2, CAR1, T1
   1157 
   1158 	VACCQ  T0, ADD3, CAR1
   1159 	VAQ    T0, ADD3, T0
   1160 	VACCCQ T1, ADD4, CAR1, CAR2
   1161 	VACQ   T1, ADD4, CAR1, T1
   1162 	VAQ    T2, CAR2, T2
   1163 
   1164 	VL    48(CPOOL), SEL2
   1165 	VL    64(CPOOL), SEL3
   1166 	VL    80(CPOOL), SEL4
   1167 	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
   1168 	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
   1169 	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
   1170 	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
   1171 
   1172 	VSLDB $12, T1, T0, T0
   1173 	VSLDB $12, T2, T1, T1
   1174 
   1175 	VACCQ  T0, ADD3H, CAR1
   1176 	VAQ    T0, ADD3H, T0
   1177 	VACCCQ T1, ADD4H, CAR1, T2
   1178 	VACQ   T1, ADD4H, CAR1, T1
   1179 
   1180 	// ---------------------------------------------------
        	// Group 4: Y1 words 1 and 0. The closing reduction uses SEL5/SEL6
        	// instead of SEL2..SEL4, and its RED1/RED2 are added in this group
        	// rather than being carried into a following one.
   1181 
   1182 	VREPF  $1, Y1, YDIG
   1183 	VMALHF X0, YDIG, T0, ADD1H
   1184 	VMALHF X1, YDIG, T1, ADD2H
   1185 	VMALF  X0, YDIG, T0, ADD1
   1186 	VMALF  X1, YDIG, T1, ADD2
   1187 
   1188 	VREPF  $0, Y1, YDIG
   1189 	VMALF  X0, YDIG, ADD1H, ADD3
   1190 	VMALF  X1, YDIG, ADD2H, ADD4
   1191 	VMALHF X0, YDIG, ADD1H, ADD3H
   1192 	VMALHF X1, YDIG, ADD2H, ADD4H
   1193 
   1194 	VZERO ZER
   1195 	VL    32(CPOOL), SEL1
   1196 	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   1197 
   1198 	VSLDB $12, ADD2, ADD1, T0
   1199 	VSLDB $12, T2, ADD2, T1
   1200 
   1201 	VACCQ  T0, RED1, CAR1
   1202 	VAQ    T0, RED1, T0
   1203 	VACCCQ T1, RED2, CAR1, T2
   1204 	VACQ   T1, RED2, CAR1, T1
   1205 
   1206 	VACCQ  T0, ADD3, CAR1
   1207 	VAQ    T0, ADD3, T0
   1208 	VACCCQ T1, ADD4, CAR1, CAR2
   1209 	VACQ   T1, ADD4, CAR1, T1
   1210 	VAQ    T2, CAR2, T2
   1211 
   1212 	VL    96(CPOOL), SEL5
   1213 	VL    112(CPOOL), SEL6
   1214 	VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
   1215 	VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0  0]
   1216 	VSQ   RED1, RED2, RED2     // Guaranteed not to underflow
   1217 
   1218 	VSLDB $12, T1, T0, T0
   1219 	VSLDB $12, T2, T1, T1
   1220 
   1221 	VACCQ  T0, ADD3H, CAR1
   1222 	VAQ    T0, ADD3H, T0
   1223 	VACCCQ T1, ADD4H, CAR1, T2
   1224 	VACQ   T1, ADD4H, CAR1, T1
   1225 
   1226 	VACCQ  T0, RED1, CAR1
   1227 	VAQ    T0, RED1, T0
   1228 	VACCCQ T1, RED2, CAR1, CAR2
   1229 	VACQ   T1, RED2, CAR1, T1
   1230 	VAQ    T2, CAR2, T2
   1231 
   1232 	// ---------------------------------------------------
        	// Final step: subtract P1||P0 from T2||T1||T0 into ADD2H||ADD1H while
        	// propagating the borrow into T2, then use T2 as the VSEL mask.
   1233 
   1234 	VZERO   RED3
   1235 	VSCBIQ  P0, T0, CAR1
   1236 	VSQ     P0, T0, ADD1H         // low half of T - P
   1237 	VSBCBIQ T1, P1, CAR1, CAR2
   1238 	VSBIQ   T1, P1, CAR1, ADD2H   // high half of T - P
   1239 	VSBIQ   T2, RED3, CAR2, T2    // RED3 is zero here: T2 = T2 - 0 - borrow
   1240 
   1241 	// what output to use, ADD2H||ADD1H or T1||T0?
   1242 	VSEL T0, ADD1H, T2, T0
   1243 	VSEL T1, ADD2H, T2, T1
   1244 	RET
   1245 
   1246 #undef CPOOL
   1247 
   1248 #undef X0
   1249 #undef X1
   1250 #undef Y0
   1251 #undef Y1
   1252 #undef T0
   1253 #undef T1
   1254 #undef P0
   1255 #undef P1
   1256 
   1257 #undef SEL1
   1258 #undef SEL2
   1259 #undef SEL3
   1260 #undef SEL4
   1261 #undef SEL5
   1262 #undef SEL6
   1263 
   1264 #undef YDIG
   1265 #undef ADD1H
   1266 #undef ADD2H
   1267 #undef ADD3
   1268 #undef ADD4
   1269 #undef RED1
   1270 #undef RED2
   1271 #undef RED3
   1272 #undef T2
   1273 #undef ADD1
   1274 #undef ADD2
   1275 #undef ADD3H
   1276 #undef ADD4H
   1277 #undef ZER
   1278 #undef CAR1
   1279 #undef CAR2
   1280 
   // p256SubInternal(T1, T0, X1, X0, Y1, Y0)
   //
   // Writes (X1||X0) - (Y1||Y0) into T1||T0, adding PH||PL back when the
   // subtraction borrows. The raw difference and difference+P are both
   // computed; SEL1 (derived from the borrow indication of the high half)
   // selects between them with VSEL, so there is no branch.
   //
   // Besides its parameters the macro uses the file-level names ZER, CAR1,
   // SEL1, TT0, TT1, PL and PH, and overwrites ZER, CAR1, SEL1, TT0, TT1.
   // T0 is written before X1/Y1 are read, so the T0 argument must not name
   // the same register as the X1 or Y1 argument.
   //
   // The last body line deliberately carries no trailing backslash: a
   // continuation there would splice the following source line into the macro.
   1281 #define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
   1282 	VZERO   ZER                \
   1283 	VSCBIQ  Y0, X0, CAR1       \
   1284 	VSQ     Y0, X0, T0         \
   1285 	VSBCBIQ X1, Y1, CAR1, SEL1 \
   1286 	VSBIQ   X1, Y1, CAR1, T1   \
   1287 	VSQ     SEL1, ZER, SEL1    \
   1288 	                           \
   1289 	VACCQ   T0, PL, CAR1       \
   1290 	VAQ     T0, PL, TT0        \
   1291 	VACQ    T1, PH, CAR1, TT1  \
   1292 	                           \
   1293 	VSEL    T0, TT0, SEL1, T0  \
   1294 	VSEL    T1, TT1, SEL1, T1
   1295 
   // p256AddInternal(T1, T0, X1, X0, Y1, Y0)
   //
   // Writes (X1||X0) + (Y1||Y0) into T1||T0 with the carry out in T2, then
   // subtracts PH||PL from T2||T1||T0 into TT1||TT0. SEL1 receives the top
   // word of that subtraction (T2 - 0 - borrow) and is the VSEL mask that
   // picks between the plain sum (T) and sum - P (TT); no branch is used.
   //
   // T2 here is the file-level temporary, not a macro parameter. Besides its
   // parameters the macro uses ZER, CAR1, CAR2, SEL1, TT0, TT1, T2, PL and PH
   // from the enclosing #defines and overwrites all of them except PL and PH.
   1296 #define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
   1297 	VACCQ   X0, Y0, CAR1        \
   1298 	VAQ     X0, Y0, T0          \
   1299 	VACCCQ  X1, Y1, CAR1, T2    \
   1300 	VACQ    X1, Y1, CAR1, T1    \
   1301 	                            \
   1302 	VZERO   ZER                 \
   1303 	VSCBIQ  PL, T0, CAR1        \
   1304 	VSQ     PL, T0, TT0         \
   1305 	VSBCBIQ T1, PH, CAR1, CAR2  \
   1306 	VSBIQ   T1, PH, CAR1, TT1   \
   1307 	VSBIQ   T2, ZER, CAR2, SEL1 \
   1308 	                            \
   1309 	VSEL    T0, TT0, SEL1, T0   \
   1310 	VSEL    T1, TT1, SEL1, T1
   1311 
   // p256HalfInternal(T1, T0, X1, X0)
   //
   // Halves X1||X0 modulo PH||PL and writes the result to T1||T0:
   //   1. SEL1 is a mask derived from X0 by the first VSBIQ (presumably from
   //      its least significant bit, i.e. whether X is even -- confirm
   //      against the VSBIQ definition).
   //   2. X + P is formed in T2||T1||T0 (T2 holds the carry out).
   //   3. VSEL picks X or X + P under SEL1 (and ZER or the carry for T2).
   //   4. The selected value is shifted right by one bit: VSRL by 1 on each
   //      half, with the bit that crosses a 128-bit boundary rebuilt through
   //      VSLDB $15 / VSL by 7 and ORed back in.
   //
   // T2 is the file-level temporary, not a parameter. The macro also uses
   // ZER, SEL1, CAR1, TT0, TT1, PL and PH and overwrites all but PL and PH.
   1312 #define p256HalfInternal(T1, T0, X1, X0) \
   1313 	VZERO  ZER                \
   1314 	VSBIQ  ZER, ZER, X0, SEL1 \
   1315 	                          \
   1316 	VACCQ  X0, PL, CAR1       \
   1317 	VAQ    X0, PL, T0         \
   1318 	VACCCQ X1, PH, CAR1, T2   \
   1319 	VACQ   X1, PH, CAR1, T1   \
   1320 	                          \
   1321 	VSEL   X0, T0, SEL1, T0   \
   1322 	VSEL   X1, T1, SEL1, T1   \
   1323 	VSEL   ZER, T2, SEL1, T2  \
   1324 	                          \
   1325 	VSLDB  $15, T2, ZER, TT1  \
   1326 	VSLDB  $15, T1, ZER, TT0  \
   1327 	VREPIB $1, SEL1           \
   1328 	VSRL   SEL1, T0, T0       \
   1329 	VSRL   SEL1, T1, T1       \
   1330 	VREPIB $7, SEL1           \
   1331 	VSL    SEL1, TT0, TT0     \
   1332 	VSL    SEL1, TT1, TT1     \
   1333 	VO     T0, TT0, T0        \
   1334 	VO     T1, TT1, T1
   1335 
   1336 // ---------------------------------------
   1337 // func p256MulAsm(res, in1, in2 []byte)
   1338 #define res_ptr R1
   1339 #define x_ptr   R2
   1340 #define y_ptr   R3
   1341 #define CPOOL   R4
   1342 
   1343 // Parameters
   1344 #define X0    V0
   1345 #define X1    V1
   1346 #define Y0    V2
   1347 #define Y1    V3
   1348 #define T0    V4
   1349 #define T1    V5
   1350 
   1351 // Constants
   1352 #define P0    V30
   1353 #define P1    V31
   1354 TEXT p256MulAsm(SB), NOSPLIT, $0
        	// Go-visible wrapper around p256MulInternal: res = in1 * in2 as
        	// computed by that routine. The three arguments are []byte values;
        	// only their data pointers are read (frame offsets 0, 24 and 48).
        	// 32 bytes are read from in1 and from in2 and 32 bytes are written to
        	// res. In memory the first 16 bytes map to the high register
        	// (X1/Y1/T1) and the next 16 bytes to the low register (X0/Y0/T0).
        	// NOTE(review): slice lengths are not checked here -- presumably the
        	// Go caller guarantees at least 32 bytes each; confirm.
        	// NOTE(review): Go-callable assembly entry points are normally
        	// declared with the package-qualifying middle dot (TEXT ·name(SB));
        	// it is absent on the TEXT line above, possibly lost in extraction.
        	// Confirm against the upstream file.
   1355 	MOVD res+0(FP), res_ptr
   1356 	MOVD in1+24(FP), x_ptr
   1357 	MOVD in2+48(FP), y_ptr
   1358 
   1359 	VL (1*16)(x_ptr), X0
   1360 	VL (0*16)(x_ptr), X1
   1361 	VL (1*16)(y_ptr), Y0
   1362 	VL (0*16)(y_ptr), Y1
   1363 
        	// P1||P0 = the P256 constant stored at the start of p256mul<>.
   1364 	MOVD $p256mul<>+0x00(SB), CPOOL
   1365 	VL   16(CPOOL), P0
   1366 	VL   0(CPOOL), P1
   1367 
   1368 	CALL p256MulInternal<>(SB)
   1369 
   1370 	VST T0, (1*16)(res_ptr)
   1371 	VST T1, (0*16)(res_ptr)
   1372 	RET
   1373 
   1374 #undef res_ptr
   1375 #undef x_ptr
   1376 #undef y_ptr
   1377 #undef CPOOL
   1378 
   1379 #undef X0
   1380 #undef X1
   1381 #undef Y0
   1382 #undef Y1
   1383 #undef T0
   1384 #undef T1
   1385 #undef P0
   1386 #undef P1
   1387 
   1388 // Point add with P2 being affine point
   1389 // If sign == 1 -> P2 = -P2
   1390 // If sel == 0 -> P3 = P1
   1391 // if zero == 0 -> P3 = P2
   1392 // p256PointAddAffineAsm(P3, P1, P2 *p256Point, sign, sel, zero int)
   1393 #define P3ptr   R1
   1394 #define P1ptr   R2
   1395 #define P2ptr   R3
   1396 #define CPOOL   R4
   1397 
   1398 // Temporaries in REGs
   1399 #define Y2L    V15
   1400 #define Y2H    V16
   1401 #define T1L    V17
   1402 #define T1H    V18
   1403 #define T2L    V19
   1404 #define T2H    V20
   1405 #define T3L    V21
   1406 #define T3H    V22
   1407 #define T4L    V23
   1408 #define T4H    V24
   1409 
   1410 // Temps for Sub and Add
   1411 #define TT0  V11
   1412 #define TT1  V12
   1413 #define T2   V13
   1414 
   1415 // p256MulAsm Parameters
   1416 #define X0    V0
   1417 #define X1    V1
   1418 #define Y0    V2
   1419 #define Y1    V3
   1420 #define T0    V4
   1421 #define T1    V5
   1422 
   1423 #define PL    V30
   1424 #define PH    V31
   1425 
   1426 // Names for zero/sel selects
   1427 // These reuse the multiply registers: X1*/X2* are V0/V1 (X0/X1), Y1* are
   1428 // V2/V3 (Y0/Y1), Z1*/Z2* are V4/V5 (T0/T1); X3*/Y3* reuse T1*/T3*.
   1427 #define X1L    V0
   1428 #define X1H    V1
   1429 #define Y1L    V2 // p256MulAsmParmY
   1430 #define Y1H    V3 // p256MulAsmParmY
   1431 #define Z1L    V4
   1432 #define Z1H    V5
   1433 #define X2L    V0
   1434 #define X2H    V1
   1435 #define Z2L    V4
   1436 #define Z2H    V5
   1437 #define X3L    V17 // T1L
   1438 #define X3H    V18 // T1H
   1439 #define Y3L    V21 // T3L
   1440 #define Y3H    V22 // T3H
   1441 #define Z3L    V28
   1442 #define Z3H    V29
   1443 
   1444 #define ZER   V6
   1445 #define SEL1  V7
   1446 #define CAR1  V8
   1447 #define CAR2  V9
   1448 /* *
   1449  * Three operand formula:
   1450  * Source: 2004 HankersonMenezesVanstone, page 91.
   1451  * T1 = Z1^2
   1452  * T2 = T1*Z1
   1453  * T1 = T1*X2
   1454  * T2 = T2*Y2
   1455  * T1 = T1-X1
   1456  * T2 = T2-Y1
   1457  * Z3 = Z1*T1
   1458  * T3 = T1^2
   1459  * T4 = T3*T1
   1460  * T3 = T3*X1
   1461  * T1 = 2*T3
   1462  * X3 = T2^2
   1463  * X3 = X3-T1
   1464  * X3 = X3-T4
   1465  * T3 = T3-X3
   1466  * T3 = T3*T2
   1467  * T4 = T4*Y1
   1468  * Y3 = T3-T4
   1469 
   1470  * Three operand formulas, but with MulInternal X,Y used to store temps
   1471 X=Z1; Y=Z1; MUL;T-   // T1 = Z1^2    T1
   1472 X=T ; Y-  ; MUL;T2=T // T2 = T1*Z1    T1   T2
   1473 X-  ; Y=X2; MUL;T1=T // T1 = T1*X2    T1   T2
   1474 X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2
   1475 SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
   1476 SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
   1477 X=Z1; Y- ;  MUL;Z3:=T// Z3 = Z1*T1         T2
   1478 X=Y;  Y- ;  MUL;X=T  // T3 = T1*T1         T2
   1479 X- ;  Y- ;  MUL;T4=T // T4 = T3*T1         T2        T4
   1480 X- ;  Y=X1; MUL;T3=T // T3 = T3*X1         T2   T3   T4
   1481 ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4
   1482 X=T2; Y=T2; MUL;T-   // X3 = T2*T2    T1   T2   T3   T4
   1483 SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4
   1484 SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
   1485 SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
   1486 X- ;  Y- ;  MUL;T3=T // T3 = T3*T2         T2   T3   T4
   1487 X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1              T3   T4
   1488 SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4
   1489 
   1490 	*/
   1491 TEXT p256PointAddAffineAsm(SB), NOSPLIT, $0
        	// Computes P3 = P1 + P2 where only x and y are read from P2 (affine
        	// point) and x, y, z are read from P1, then applies the sign/sel/zero
        	// overrides without branching.
        	// Point layout used below: 32 bytes per coordinate, high 16 bytes
        	// first; x at offset 0, y at 32, z at 64.
        	//   sign != 0: Y2 is replaced by P - Y2 before the addition.
        	//   sel  == 0: the computed result is replaced by P1.
        	//   zero == 0: the result is then replaced by X2, the (possibly
        	//              negated) Y2 and the constant at CPOOL+128/144 as Z.
        	// The zero override is applied after sel, so it wins if both are 0.
        	// All stores to P3 happen at the very end.
        	// NOTE(review): Go-callable assembly entry points are normally
        	// declared with the package-qualifying middle dot (TEXT ·name(SB));
        	// it is absent on the TEXT line above, possibly lost in extraction.
        	// Confirm against the upstream file.
   1492 	MOVD P3+0(FP), P3ptr
   1493 	MOVD P1+8(FP), P1ptr
   1494 	MOVD P2+16(FP), P2ptr
   1495 
   1496 	MOVD $p256mul<>+0x00(SB), CPOOL
   1497 	VL   16(CPOOL), PL
   1498 	VL   0(CPOOL), PH
   1499 
   1500 	//	if (sign == 1) {
   1501 	//		Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2  = P-Y2
   1502 	//	}
   1503 
   1504 	VL 32(P2ptr), Y2H
   1505 	VL 48(P2ptr), Y2L
   1506 
   1507 	VLREPG sign+24(FP), SEL1
   1508 	VZERO  ZER
   1509 	VCEQG  SEL1, ZER, SEL1 // mask is set when sign == 0
   1510 
   1511 	VSCBIQ Y2L, PL, CAR1
   1512 	VSQ    Y2L, PL, T1L       // T1 = P - Y2, low half
   1513 	VSBIQ  PH, Y2H, CAR1, T1H // T1 = P - Y2, high half
   1514 
   1515 	VSEL Y2L, T1L, SEL1, Y2L // keep Y2 when sign == 0, otherwise P - Y2
   1516 	VSEL Y2H, T1H, SEL1, Y2H
   1517 
   1518 /* *
   1519  * Three operand formula:
   1520  * Source: 2004 HankersonMenezesVanstone, page 91.
   1521  */
   1522 	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1^2    T1
   1523 	VL   64(P1ptr), X1       // Z1H
   1524 	VL   80(P1ptr), X0       // Z1L
   1525 	VLR  X0, Y0
   1526 	VLR  X1, Y1
   1527 	CALL p256MulInternal<>(SB)
   1528 
   1529 	// X=T ; Y-  ; MUL; T2=T // T2 = T1*Z1    T1   T2
   1530 	VLR  T0, X0
   1531 	VLR  T1, X1
   1532 	CALL p256MulInternal<>(SB)
   1533 	VLR  T0, T2L
   1534 	VLR  T1, T2H
   1535 
   1536 	// X-  ; Y=X2; MUL; T1=T // T1 = T1*X2    T1   T2
   1537 	VL   0(P2ptr), Y1        // X2H
   1538 	VL   16(P2ptr), Y0       // X2L
   1539 	CALL p256MulInternal<>(SB)
   1540 	VLR  T0, T1L
   1541 	VLR  T1, T1H
   1542 
   1543 	// X=T2; Y=Y2; MUL; T-   // T2 = T2*Y2    T1   T2
   1544 	VLR  T2L, X0
   1545 	VLR  T2H, X1
   1546 	VLR  Y2L, Y0
   1547 	VLR  Y2H, Y1
   1548 	CALL p256MulInternal<>(SB)
   1549 
   1550 	// SUB(T2<T-Y1)          // T2 = T2-Y1    T1   T2
   1551 	VL 32(P1ptr), Y1H
   1552 	VL 48(P1ptr), Y1L
   1553 	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)
   1554 
   1555 	// SUB(Y<T1-X1)          // T1 = T1-X1    T1   T2
   1556 	VL 0(P1ptr), X1H
   1557 	VL 16(P1ptr), X1L
   1558 	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)
   1559 
   1560 	// X=Z1; Y- ;  MUL; Z3:=T// Z3 = Z1*T1         T2
   1561 	VL   64(P1ptr), X1       // Z1H
   1562 	VL   80(P1ptr), X0       // Z1L
   1563 	CALL p256MulInternal<>(SB)
   1564 
   1565 	// VST T1, 64(P3ptr)
   1566 	// VST T0, 80(P3ptr)
   1567 	VLR T0, Z3L
   1568 	VLR T1, Z3H
   1569 
   1570 	// X=Y;  Y- ;  MUL; X=T  // T3 = T1*T1         T2
   1571 	VLR  Y0, X0
   1572 	VLR  Y1, X1
   1573 	CALL p256MulInternal<>(SB)
   1574 	VLR  T0, X0
   1575 	VLR  T1, X1
   1576 
   1577 	// X- ;  Y- ;  MUL; T4=T // T4 = T3*T1         T2        T4
   1578 	CALL p256MulInternal<>(SB)
   1579 	VLR  T0, T4L
   1580 	VLR  T1, T4H
   1581 
   1582 	// X- ;  Y=X1; MUL; T3=T // T3 = T3*X1         T2   T3   T4
   1583 	VL   0(P1ptr), Y1        // X1H
   1584 	VL   16(P1ptr), Y0       // X1L
   1585 	CALL p256MulInternal<>(SB)
   1586 	VLR  T0, T3L
   1587 	VLR  T1, T3H
   1588 
   1589 	// ADD(T1<T+T)           // T1 = T3+T3    T1   T2   T3   T4
   1590 	p256AddInternal(T1H,T1L, T1,T0,T1,T0)
   1591 
   1592 	// X=T2; Y=T2; MUL; T-   // X3 = T2*T2    T1   T2   T3   T4
   1593 	VLR  T2L, X0
   1594 	VLR  T2H, X1
   1595 	VLR  T2L, Y0
   1596 	VLR  T2H, Y1
   1597 	CALL p256MulInternal<>(SB)
   1598 
   1599 	// SUB(T<T-T1)           // X3 = X3-T1    T1   T2   T3   T4  (T1 = X3)
   1600 	p256SubInternal(T1,T0,T1,T0,T1H,T1L)
   1601 
   1602 	// SUB(T<T-T4) X3:=T     // X3 = X3-T4         T2   T3   T4
   1603 	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
   1604 	VLR T0, X3L
   1605 	VLR T1, X3H
   1606 
   1607 	// SUB(X<T3-T)           // T3 = T3-X3         T2   T3   T4
   1608 	p256SubInternal(X1,X0,T3H,T3L,T1,T0)
   1609 
   1610 	// X- ;  Y- ;  MUL; T3=T // T3 = T3*T2         T2   T3   T4
   1611 	CALL p256MulInternal<>(SB)
   1612 	VLR  T0, T3L
   1613 	VLR  T1, T3H
   1614 
   1615 	// X=T4; Y=Y1; MUL; T-   // T4 = T4*Y1              T3   T4
   1616 	VLR  T4L, X0
   1617 	VLR  T4H, X1
   1618 	VL   32(P1ptr), Y1       // Y1H
   1619 	VL   48(P1ptr), Y0       // Y1L
   1620 	CALL p256MulInternal<>(SB)
   1621 
   1622 	// SUB(T<T3-T) Y3:=T     // Y3 = T3-T4              T3   T4  (T3 = Y3)
   1623 	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)
   1624 
   1625 	//	if (sel == 0) {
   1626 	//		copy(P3.x[:], X1)
   1627 	//		copy(P3.y[:], Y1)
   1628 	//		copy(P3.z[:], Z1)
   1629 	//	}
   1630 
   1631 	VL 0(P1ptr), X1H
   1632 	VL 16(P1ptr), X1L
   1633 
   1634 	// Y1 already loaded, left over from addition
   1635 	VL 64(P1ptr), Z1H
   1636 	VL 80(P1ptr), Z1L
   1637 
   1638 	VLREPG sel+32(FP), SEL1
   1639 	VZERO  ZER
   1640 	VCEQG  SEL1, ZER, SEL1 // mask is set when sel == 0
   1641 
   1642 	VSEL X1L, X3L, SEL1, X3L
   1643 	VSEL X1H, X3H, SEL1, X3H
   1644 	VSEL Y1L, Y3L, SEL1, Y3L
   1645 	VSEL Y1H, Y3H, SEL1, Y3H
   1646 	VSEL Z1L, Z3L, SEL1, Z3L
   1647 	VSEL Z1H, Z3H, SEL1, Z3H
   1648 
   1649 	//	if (zero == 0) {
   1650 	//		copy(P3.x[:], X2)
   1651 	//		copy(P3.y[:], Y2)
   1652 	//		copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   1653 	//			0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01})  //(p256.z*2^256)%p
   1654 	//	}
   1655 	VL 0(P2ptr), X2H
   1656 	VL 16(P2ptr), X2L
   1657 
   1658 	// Y2 already loaded
   1659 	VL 128(CPOOL), Z2H
   1660 	VL 144(CPOOL), Z2L
   1661 
   1662 	VLREPG zero+40(FP), SEL1
   1663 	VZERO  ZER
   1664 	VCEQG  SEL1, ZER, SEL1 // mask is set when zero == 0
   1665 
   1666 	VSEL X2L, X3L, SEL1, X3L
   1667 	VSEL X2H, X3H, SEL1, X3H
   1668 	VSEL Y2L, Y3L, SEL1, Y3L
   1669 	VSEL Y2H, Y3H, SEL1, Y3H
   1670 	VSEL Z2L, Z3L, SEL1, Z3L
   1671 	VSEL Z2H, Z3H, SEL1, Z3H
   1672 
   1673 	// All done, store out the result!!!
   1674 	VST X3H, 0(P3ptr)
   1675 	VST X3L, 16(P3ptr)
   1676 	VST Y3H, 32(P3ptr)
   1677 	VST Y3L, 48(P3ptr)
   1678 	VST Z3H, 64(P3ptr)
   1679 	VST Z3L, 80(P3ptr)
   1680 
   1681 	RET
   1682 
   1683 #undef P3ptr
   1684 #undef P1ptr
   1685 #undef P2ptr
   1686 #undef CPOOL
   1687 
   1688 #undef Y2L
   1689 #undef Y2H
   1690 #undef T1L
   1691 #undef T1H
   1692 #undef T2L
   1693 #undef T2H
   1694 #undef T3L
   1695 #undef T3H
   1696 #undef T4L
   1697 #undef T4H
   1698 
   1699 #undef TT0
   1700 #undef TT1
   1701 #undef T2
   1702 
   1703 #undef X0
   1704 #undef X1
   1705 #undef Y0
   1706 #undef Y1
   1707 #undef T0
   1708 #undef T1
   1709 
   1710 #undef PL
   1711 #undef PH
   1712 
   1713 #undef X1L
   1714 #undef X1H
   1715 #undef Y1L
   1716 #undef Y1H
   1717 #undef Z1L
   1718 #undef Z1H
   1719 #undef X2L
   1720 #undef X2H
   1721 #undef Z2L
   1722 #undef Z2H
   1723 #undef X3L
   1724 #undef X3H
   1725 #undef Y3L
   1726 #undef Y3H
   1727 #undef Z3L
   1728 #undef Z3H
   1729 
   1730 #undef ZER
   1731 #undef SEL1
   1732 #undef CAR1
   1733 #undef CAR2
   1734 
   1735 // p256PointDoubleAsm(P3, P1 *p256Point)
   1736 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
   1737 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
   1738 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
   1739 #define P3ptr   R1
   1740 #define P1ptr   R2
   1741 #define CPOOL   R4
   1742 
   1743 // Temporaries in REGs
   1744 #define X3L    V15
   1745 #define X3H    V16
   1746 #define Y3L    V17
   1747 #define Y3H    V18
   1748 #define T1L    V19
   1749 #define T1H    V20
   1750 #define T2L    V21
   1751 #define T2H    V22
   1752 #define T3L    V23
   1753 #define T3H    V24
   1754 
   1755 #define X1L    V6
   1756 #define X1H    V7
   1757 #define Y1L    V8
   1758 #define Y1H    V9
   1759 #define Z1L    V10 // not referenced by the instructions of p256PointDoubleAsm
   1760 #define Z1H    V11 // same register as TT0; not referenced by the instructions of p256PointDoubleAsm
   1761 
   1762 // Temps for Sub and Add
   1763 #define TT0  V11
   1764 #define TT1  V12
   1765 #define T2   V13
   1766 
   1767 // p256MulAsm Parameters
   1768 #define X0    V0
   1769 #define X1    V1
   1770 #define Y0    V2
   1771 #define Y1    V3
   1772 #define T0    V4
   1773 #define T1    V5
   1774 
   1775 #define PL    V30
   1776 #define PH    V31
   1777 
   1778 #define Z3L    V23 // same register as T3L
   1779 #define Z3H    V24 // same register as T3H
   1780 
   1781 #define ZER   V26
   1782 #define SEL1  V27
   1783 #define CAR1  V28
   1784 #define CAR2  V29
   1785 /*
   1786  * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
   1787  * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
   1788  * Source: 2004 HankersonMenezesVanstone, page 91.
   1789  * 	A  = 3(X1-Z1^2)(X1+Z1^2)
   1790  * 	B  = 2*Y1
   1791  * 	Z3 = B*Z1
   1792  * 	C  = B^2
   1793  * 	D  = C*X1
   1794  * 	X3 = A^2-2D
   1795  * 	Y3 = (D-X3)*A-C^2/2
   1796  *
   1797  * Three-operand formula:
   1798  *       T1 = Z1^2
   1799  *       T2 = X1-T1
   1800  *       T1 = X1+T1
   1801  *       T2 = T2*T1
   1802  *       T2 = 3*T2
   1803  *       Y3 = 2*Y1
   1804  *       Z3 = Y3*Z1
   1805  *       Y3 = Y3^2
   1806  *       T3 = Y3*X1
   1807  *       Y3 = Y3^2
   1808  *       Y3 = half*Y3
   1809  *       X3 = T2^2
   1810  *       T1 = 2*T3
   1811  *       X3 = X3-T1
   1812  *       T1 = T3-X3
   1813  *       T1 = T1*T2
   1814  *       Y3 = T1-Y3
   1815  */
   1816 
   1817 TEXT p256PointDoubleAsm(SB), NOSPLIT, $0
        	// Computes P3 = 2*P1 following the three-operand formula in the
        	// comment above. Point layout used below: 32 bytes per coordinate,
        	// high 16 bytes first; x at offset 0, y at 32, z at 64.
        	// Each output coordinate is stored as soon as it is final (Z3, then
        	// X3, then Y3). No load from P1 follows a store to the same offsets
        	// of P3, so as far as the loads and stores visible here are
        	// concerned P3 may be the same memory as P1.
        	// X1L/X1H and Y1L/Y1H are loaded only after the preceding
        	// p256MulInternal call and are consumed before the next one.
        	// NOTE(review): Go-callable assembly entry points are normally
        	// declared with the package-qualifying middle dot (TEXT ·name(SB));
        	// it is absent on the TEXT line above, possibly lost in extraction.
        	// Confirm against the upstream file.
   1818 	MOVD P3+0(FP), P3ptr
   1819 	MOVD P1+8(FP), P1ptr
   1820 
   1821 	MOVD $p256mul<>+0x00(SB), CPOOL
   1822 	VL   16(CPOOL), PL
   1823 	VL   0(CPOOL), PH
   1824 
   1825 	// X=Z1; Y=Z1; MUL; T-    // T1 = Z1^2
   1826 	VL   64(P1ptr), X1       // Z1H
   1827 	VL   80(P1ptr), X0       // Z1L
   1828 	VLR  X0, Y0
   1829 	VLR  X1, Y1
   1830 	CALL p256MulInternal<>(SB)
   1831 
   1832 	// SUB(X<X1-T)            // T2 = X1-T1
   1833 	VL 0(P1ptr), X1H
   1834 	VL 16(P1ptr), X1L
   1835 	p256SubInternal(X1,X0,X1H,X1L,T1,T0)
   1836 
   1837 	// ADD(Y<X1+T)            // T1 = X1+T1
   1838 	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)
   1839 
   1840 	// X-  ; Y-  ; MUL; T-    // T2 = T2*T1
   1841 	CALL p256MulInternal<>(SB)
   1842 
   1843 	// ADD(T2<T+T); ADD(T2<T2+T)  // T2 = 3*T2
   1844 	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
   1845 	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)
   1846 
   1847 	// ADD(X<Y1+Y1)           // Y3 = 2*Y1
   1848 	VL 32(P1ptr), Y1H
   1849 	VL 48(P1ptr), Y1L
   1850 	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)
   1851 
   1852 	// X-  ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
   1853 	VL   64(P1ptr), Y1       // Z1H
   1854 	VL   80(P1ptr), Y0       // Z1L
   1855 	CALL p256MulInternal<>(SB)
   1856 	VST  T1, 64(P3ptr)
   1857 	VST  T0, 80(P3ptr)
   1858 
   1859 	// X-  ; Y=X ; MUL; T-    // Y3 = Y3^2
   1860 	VLR  X0, Y0
   1861 	VLR  X1, Y1
   1862 	CALL p256MulInternal<>(SB)
   1863 
   1864 	// X=T ; Y=X1; MUL; T3=T  // T3 = Y3*X1
   1865 	VLR  T0, X0
   1866 	VLR  T1, X1
   1867 	VL   0(P1ptr), Y1
   1868 	VL   16(P1ptr), Y0
   1869 	CALL p256MulInternal<>(SB)
   1870 	VLR  T0, T3L
   1871 	VLR  T1, T3H
   1872 
   1873 	// X-  ; Y=X ; MUL; T-    // Y3 = Y3^2
   1874 	VLR  X0, Y0
   1875 	VLR  X1, Y1
   1876 	CALL p256MulInternal<>(SB)
   1877 
   1878 	// HAL(Y3<T)              // Y3 = half*Y3
   1879 	p256HalfInternal(Y3H,Y3L, T1,T0)
   1880 
   1881 	// X=T2; Y=T2; MUL; T-    // X3 = T2^2
   1882 	VLR  T2L, X0
   1883 	VLR  T2H, X1
   1884 	VLR  T2L, Y0
   1885 	VLR  T2H, Y1
   1886 	CALL p256MulInternal<>(SB)
   1887 
   1888 	// ADD(T1<T3+T3)          // T1 = 2*T3
   1889 	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)
   1890 
   1891 	// SUB(X3<T-T1) X3:=X3    // X3 = X3-T1
   1892 	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
   1893 	VST X3H, 0(P3ptr)
   1894 	VST X3L, 16(P3ptr)
   1895 
   1896 	// SUB(X<T3-X3)           // T1 = T3-X3
   1897 	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)
   1898 
   1899 	// X-  ; Y-  ; MUL; T-    // T1 = T1*T2
   1900 	CALL p256MulInternal<>(SB)
   1901 
   1902 	// SUB(Y3<T-Y3)           // Y3 = T1-Y3
   1903 	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)
   1904 
   1905 	VST Y3H, 32(P3ptr)
   1906 	VST Y3L, 48(P3ptr)
   1907 	RET
   1908 
   1909 #undef P3ptr
   1910 #undef P1ptr
   1911 #undef CPOOL
   1912 #undef X3L
   1913 #undef X3H
   1914 #undef Y3L
   1915 #undef Y3H
   1916 #undef T1L
   1917 #undef T1H
   1918 #undef T2L
   1919 #undef T2H
   1920 #undef T3L
   1921 #undef T3H
   1922 #undef X1L
   1923 #undef X1H
   1924 #undef Y1L
   1925 #undef Y1H
   1926 #undef Z1L
   1927 #undef Z1H
   1928 #undef TT0
   1929 #undef TT1
   1930 #undef T2
   1931 #undef X0
   1932 #undef X1
   1933 #undef Y0
   1934 #undef Y1
   1935 #undef T0
   1936 #undef T1
   1937 #undef PL
   1938 #undef PH
   1939 #undef Z3L
   1940 #undef Z3H
   1941 #undef ZER
   1942 #undef SEL1
   1943 #undef CAR1
   1944 #undef CAR2
   1945 
   1946 // p256PointAddAsm(P3, P1, P2 *p256Point)
   1947 #define P3ptr  R1
   1948 #define P1ptr  R2
   1949 #define P2ptr  R3
   1950 #define CPOOL  R4
   1951 #define ISZERO R5
   1952 #define TRUE   R6
   1953 
   1954 // Temporaries in REGs
   1955 // Named after the intermediate values (T1, T2, U1, S1, H, R) of the
   1956 // three-operand point-addition formula documented further down; each value
   1957 // occupies a low (L) / high (H) register pair.
   1955 #define T1L   V16
   1956 #define T1H   V17
   1957 #define T2L   V18
   1958 #define T2H   V19
   1959 #define U1L   V20
   1960 #define U1H   V21
   1961 #define S1L   V22
   1962 #define S1H   V23
   1963 #define HL    V24
   1964 #define HH    V25
   1965 #define RL    V26
   1966 #define RH    V27
   1967 
   1968 // Temps for Sub and Add
   1969 #define ZER   V6
   1970 #define SEL1  V7
   1971 #define CAR1  V8
   1972 #define CAR2  V9
   1973 #define TT0  V11
   1974 #define TT1  V12
   1975 #define T2   V13
   1976 
   1977 // p256MulAsm Parameters
   1978 #define X0    V0
   1979 #define X1    V1
   1980 #define Y0    V2
   1981 #define Y1    V3
   1982 #define T0    V4
   1983 #define T1    V5
   1984 
   1985 #define PL    V30
   1986 #define PH    V31
   1987 /*
   1988  * https://choucroutage.com/Papers/SideChannelAttacks/ctrsa-2011-brown.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
   1989  *
   1990  * A = X1*Z2^2
   1991  * B = Y1*Z2^3
   1992  * C = X2*Z1^2-A
   1993  * D = Y2*Z1^3-B
   1994  * X3 = D^2 - 2A*C^2 - C^3
   1995  * Y3 = D*(A*C^2 - X3) - B*C^3
   1996  * Z3 = Z1*Z2*C
   1997  *
   1998  * Three-operand formula (adopted): http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
   1999  * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
   2000  *
   2001  * T1 = Z1*Z1
   2002  * T2 = Z2*Z2
   2003  * U1 = X1*T2
   2004  * H  = X2*T1
   2005  * H  = H-U1
   2006  * Z3 = Z1*Z2
   2007  * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
   2008  *
   2009  * S1 = Z2*T2
   2010  * S1 = Y1*S1
   2011  * R  = Z1*T1
   2012  * R  = Y2*R
   2013  * R  = R-S1
   2014  *
   2015  * T1 = H*H
   2016  * T2 = H*T1
   2017  * U1 = U1*T1
   2018  *
   2019  * X3 = R*R
   2020  * X3 = X3-T2
   2021  * T1 = 2*U1
   2022  * X3 = X3-T1 << store-out X3 result reg
   2023  *
   2024  * T2 = S1*T2
   2025  * Y3 = U1-X3
   2026  * Y3 = R*Y3
   2027  * Y3 = Y3-T2 << store-out Y3 result reg
   2028 
   2029  	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
   2030 	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
   2031 	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
   2032 	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
   2033 	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
   2034 	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
   2035 	// SUB(H<H-T)            // H  = H-U1
   2036 	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
   2037 	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
   2038 	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
   2039 	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
   2040 	// SUB(R<T-S1)           // R  = R-S1
   2041 	// X=H ; Y=H ; MUL; T-   // T1 = H*H
   2042 	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
   2043 	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
   2044 	// X=R ; Y=R ; MUL; T-   // X3 = R*R
   2045 	// SUB(T<T-T2)           // X3 = X3-T2
   2046 	// ADD(X<U1+U1)          // T1 = 2*U1
   2047 	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
   2048 	// SUB(Y<U1-T)           // Y3 = U1-X3
   2049 	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
   2050 	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
   2051 	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
   2052 	*/
   2053 TEXT p256PointAddAsm(SB), NOSPLIT, $0
        	// Jacobian point addition: computes P3 = P1 + P2 following the
        	// three-operand step list in the comment block above, and reports
        	// through ret whether both H and R came out as zero.
        	//
        	// Arguments (offsets from FP):
        	//   P3+0   pointer to the result point (written)
        	//   P1+8   pointer to the first input point (read)
        	//   P2+16  pointer to the second input point (read)
        	//   ret+24 64-bit flag written by this routine: 1 only when both
        	//          H and R are equal to 0 or to P, otherwise 0.
        	//          NOTE(review): presumably the caller uses this to detect
        	//          P1 == P2 and fall back to doubling -- confirm at call site.
        	//
        	// Point layout as accessed here: X at bytes 0-31, Y at bytes 32-63,
        	// Z at bytes 64-95. Each coordinate is moved as two 16-byte vectors,
        	// the high half at the lower offset and the low half 16 bytes later.
        	//
        	// Each multiplication passes its operands in X1:X0 and Y1:Y0 and reads
        	// the product from T1:T0. Steps marked "X-" or "Y-" reuse the operand
        	// left over from the previous call, so this code depends on
        	// p256MulInternal not modifying X0/X1/Y0/Y1 (defined outside this block).
        	//
        	// NOTE(review): the symbol name has no leading middle dot; in Go
        	// assembly a package-level function is normally spelled with one.
        	// It may have been lost when this listing was extracted -- verify.
   2054 	MOVD P3+0(FP), P3ptr
   2055 	MOVD P1+8(FP), P1ptr
   2056 	MOVD P2+16(FP), P2ptr
   2057 
        	// Load the P256 constant from the start of p256mul<>: high 128 bits
        	// (offset 0) into PH, low 128 bits (offset 16) into PL. PH:PL is
        	// used below for the "== P" comparisons.
   2058 	MOVD $p256mul<>+0x00(SB), CPOOL
   2059 	VL   16(CPOOL), PL
   2060 	VL   0(CPOOL), PH
   2061 
   2062 	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
   2063 	VL   64(P1ptr), X1       // Z1H
   2064 	VL   80(P1ptr), X0       // Z1L
   2065 	VLR  X0, Y0
   2066 	VLR  X1, Y1
   2067 	CALL p256MulInternal<>(SB)
   2068 
   2069 	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
        	// X still holds Z1; Y becomes Z1^2 and is kept for the next step too.
   2070 	VLR  T0, Y0
   2071 	VLR  T1, Y1
   2072 	CALL p256MulInternal<>(SB)
   2073 	VLR  T0, RL
   2074 	VLR  T1, RH
   2075 
   2076 	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
        	// Y is still Z1^2 from the previous step.
   2077 	VL   0(P2ptr), X1        // X2H
   2078 	VL   16(P2ptr), X0       // X2L
   2079 	CALL p256MulInternal<>(SB)
   2080 	VLR  T0, HL
   2081 	VLR  T1, HH
   2082 
   2083 	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
   2084 	VL   64(P2ptr), X1       // Z2H
   2085 	VL   80(P2ptr), X0       // Z2L
   2086 	VLR  X0, Y0
   2087 	VLR  X1, Y1
   2088 	CALL p256MulInternal<>(SB)
   2089 
   2090 	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
        	// X still holds Z2; Y becomes Z2^2 and is kept for the next step too.
   2091 	VLR  T0, Y0
   2092 	VLR  T1, Y1
   2093 	CALL p256MulInternal<>(SB)
   2094 	VLR  T0, S1L
   2095 	VLR  T1, S1H
   2096 
   2097 	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
        	// Y is still Z2^2 from the previous step.
   2098 	VL   0(P1ptr), X1        // X1H
   2099 	VL   16(P1ptr), X0       // X1L
   2100 	CALL p256MulInternal<>(SB)
   2101 	VLR  T0, U1L
   2102 	VLR  T1, U1H
   2103 
   2104 	// SUB(H<H-T)            // H  = H-U1
        	// T1:T0 still holds U1 (it was only copied into U1H:U1L above).
   2105 	p256SubInternal(HH,HL,HH,HL,T1,T0)
   2106 
   2107 	// if H == 0 or H^P == 0 then ret=1 else ret=0
   2108 	// clobbers T1H and T1L
        	// First test: OR the two halves of H and compare with zero.
        	// Second test: XOR H with P half by half, OR the results and compare
        	// with zero, i.e. H == P. VCEQGS sets the condition code from the
        	// comparison and MOVDEQ copies TRUE into ISZERO only when it reports
        	// equality, so no branch is taken on the value of H.
        	// NOTE(review): both 0 and P are accepted, presumably because the
        	// subtraction result is not guaranteed to be fully reduced below P --
        	// confirm against p256SubInternal.
   2109 	MOVD   $0, ISZERO
   2110 	MOVD   $1, TRUE
   2111 	VZERO  ZER
   2112 	VO     HL, HH, T1H
   2113 	VCEQGS ZER, T1H, T1H
   2114 	MOVDEQ TRUE, ISZERO
   2115 	VX     HL, PL, T1L
   2116 	VX     HH, PH, T1H
   2117 	VO     T1L, T1H, T1H
   2118 	VCEQGS ZER, T1H, T1H
   2119 	MOVDEQ TRUE, ISZERO
        	// Park the intermediate flag in the result slot; it is read back and
        	// combined with the R test further down.
   2120 	MOVD   ISZERO, ret+24(FP)
   2121 
   2122 	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
        	// Z1 and Z2 are reloaded from memory; this is the last read of either.
   2123 	VL   64(P1ptr), X1       // Z1H
   2124 	VL   80(P1ptr), X0       // Z1L
   2125 	VL   64(P2ptr), Y1       // Z2H
   2126 	VL   80(P2ptr), Y0       // Z2L
   2127 	CALL p256MulInternal<>(SB)
   2128 
   2129 	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
   2130 	VLR  T0, X0
   2131 	VLR  T1, X1
   2132 	VLR  HL, Y0
   2133 	VLR  HH, Y1
   2134 	CALL p256MulInternal<>(SB)
        	// Store Z3. Only bytes 64-95 of P3 are written and neither input's Z
        	// is loaded again below, so this is safe even when P3 points at the
        	// same memory as P1 or P2.
   2135 	VST  T1, 64(P3ptr)
   2136 	VST  T0, 80(P3ptr)
   2137 
   2138 	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
   2139 	VL   32(P1ptr), X1
   2140 	VL   48(P1ptr), X0
   2141 	VLR  S1L, Y0
   2142 	VLR  S1H, Y1
   2143 	CALL p256MulInternal<>(SB)
   2144 	VLR  T0, S1L
   2145 	VLR  T1, S1H
   2146 
   2147 	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
   2148 	VL   32(P2ptr), X1
   2149 	VL   48(P2ptr), X0
   2150 	VLR  RL, Y0
   2151 	VLR  RH, Y1
   2152 	CALL p256MulInternal<>(SB)
   2153 
   2154 	// SUB(R<T-S1)           // R  = T-S1
   2155 	p256SubInternal(RH,RL,T1,T0,S1H,S1L)
   2156 
   2157 	// if R == 0 or R^P == 0 then ret=ret else ret=0
   2158 	// clobbers T1H and T1L
        	// Same two-step zero / equal-to-P test as for H above, now on R.
        	// TRUE and ZER are set up again here rather than reused.
        	// NOTE(review): whether the intervening p256MulInternal calls or the
        	// Sub macro overwrite ZER (V6) or TRUE is not visible from this block.
   2159 	MOVD   $0, ISZERO
   2160 	MOVD   $1, TRUE
   2161 	VZERO  ZER
   2162 	VO     RL, RH, T1H
   2163 	VCEQGS ZER, T1H, T1H
   2164 	MOVDEQ TRUE, ISZERO
   2165 	VX     RL, PL, T1L
   2166 	VX     RH, PH, T1H
   2167 	VO     T1L, T1H, T1H
   2168 	VCEQGS ZER, T1H, T1H
   2169 	MOVDEQ TRUE, ISZERO
        	// Combine with the flag stored after the H test: the final ret is 1
        	// only if both tests succeeded.
   2170 	AND    ret+24(FP), ISZERO
   2171 	MOVD   ISZERO, ret+24(FP)
   2172 
   2173 	// X=H ; Y=H ; MUL; T-   // T1 = H*H
   2174 	VLR  HL, X0
   2175 	VLR  HH, X1
   2176 	VLR  HL, Y0
   2177 	VLR  HH, Y1
   2178 	CALL p256MulInternal<>(SB)
   2179 
   2180 	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
        	// X still holds H; Y becomes H^2 and is kept for the next step too.
   2181 	VLR  T0, Y0
   2182 	VLR  T1, Y1
   2183 	CALL p256MulInternal<>(SB)
   2184 	VLR  T0, T2L
   2185 	VLR  T1, T2H
   2186 
   2187 	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
        	// Y is still H^2 from the previous step.
   2188 	VLR  U1L, X0
   2189 	VLR  U1H, X1
   2190 	CALL p256MulInternal<>(SB)
   2191 	VLR  T0, U1L
   2192 	VLR  T1, U1H
   2193 
   2194 	// X=R ; Y=R ; MUL; T-   // X3 = R*R
   2195 	VLR  RL, X0
   2196 	VLR  RH, X1
   2197 	VLR  RL, Y0
   2198 	VLR  RH, Y1
   2199 	CALL p256MulInternal<>(SB)
   2200 
   2201 	// SUB(T<T-T2)           // X3 = X3-T2
   2202 	p256SubInternal(T1,T0,T1,T0,T2H,T2L)
   2203 
   2204 	// ADD(X<U1+U1)          // T1 = 2*U1
        	// The doubled value is placed in X1:X0, which is free between calls.
   2205 	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)
   2206 
   2207 	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
   2208 	p256SubInternal(T1,T0,T1,T0,X1,X0)
        	// Store X3. Nothing is loaded from P1 or P2 after this point.
   2209 	VST T1, 0(P3ptr)
   2210 	VST T0, 16(P3ptr)
   2211 
   2212 	// SUB(Y<U1-T)           // Y3 = U1-X3
        	// The difference goes straight into Y1:Y0 as the next multiplicand.
   2213 	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)
   2214 
   2215 	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
   2216 	VLR  RL, X0
   2217 	VLR  RH, X1
   2218 	CALL p256MulInternal<>(SB)
        	// U1H:U1L is reused to hold the partial Y3 while T2 is recomputed.
   2219 	VLR  T0, U1L
   2220 	VLR  T1, U1H
   2221 
   2222 	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
   2223 	VLR  S1L, X0
   2224 	VLR  S1H, X1
   2225 	VLR  T2L, Y0
   2226 	VLR  T2H, Y1
   2227 	CALL p256MulInternal<>(SB)
   2228 
   2229 	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
   2230 	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
   2231 	VST T1, 32(P3ptr)
   2232 	VST T0, 48(P3ptr)
   2233 
   2234 	RET
   2235