Home | History | Annotate | Download | only in poly1305
      1 // Copyright 2015 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // +build arm,!gccgo,!appengine,!nacl
      6 
      7 #include "textflag.h"
      8 
      9 // This code was translated into a form compatible with 5a from the public
     10 // domain source by Andrew Moon: github.com/floodyberry/poly1305-opt/blob/master/app/extensions/poly1305.
     11 
     12 DATA poly1305_init_constants_armv6<>+0(SB)/4, $0x03ffffff   // mask ANDed with limb 0 in poly1305_init_ext_armv6
     13 DATA poly1305_init_constants_armv6<>+4(SB)/4, $0x03ffff03   // mask ANDed with limb 1
     14 DATA poly1305_init_constants_armv6<>+8(SB)/4, $0x03ffc0ff   // mask ANDed with limb 2
     15 DATA poly1305_init_constants_armv6<>+12(SB)/4, $0x03f03fff  // mask ANDed with limb 3
     16 DATA poly1305_init_constants_armv6<>+16(SB)/4, $0x000fffff  // mask ANDed with limb 4
     17 GLOBL poly1305_init_constants_armv6<>(SB), RODATA, $20      // five 4-byte words, read-only
     18 
     19 // Warning: the linker may use R11 to synthesize certain instructions. Please
     20 // take care and verify that no synthetic instructions use it.
     21 
     22 TEXT poly1305_init_ext_armv6<>(SB), NOSPLIT, $0
     23 	// Needs 16 bytes of stack and 64 bytes of space pointed to by R0.  (It
     24 	// might look like it's only 60 bytes of space but the final four bytes
     25 	// will be written by another function.) We need to skip over four
     26 	// bytes of stack because that's saving the value of 'g'.
        	//
        	// Inputs: R0 = destination buffer, R1 = source of eight 32-bit words
        	// (poly1305_auth_armv6 passes its context structure and the key).
        	//
        	// Words written at R0:
        	//   0-4   first four source words repacked into five limbs, each
        	//         ANDed with the matching poly1305_init_constants_armv6 word
        	//   5-9   zero
        	//   10-13 last four source words, unchanged
        	//   14    zero
        	//
        	// R4-R7 are preserved through 8(R13)..20(R13). R0-R3, R8, R9, g, R11
        	// and R12 are overwritten.

        	// Save R4-R7 at 8(R13)..20(R13) (increment-before from R13+4).
     27 	ADD       $4, R13, R8
     28 	MOVM.IB   [R4-R7], (R8)
        	// Load the first four source words; R1 advances by 16.
     29 	MOVM.IA.W (R1), [R2-R5]
     30 	MOVW      $poly1305_init_constants_armv6<>(SB), R7
        	// Repack the 128 loaded bits at 26-bit boundaries into R8, R9, g, R11
        	// and R12. The upper bits of each register are cleared by the masks.
     31 	MOVW      R2, R8
     32 	MOVW      R2>>26, R9
     33 	MOVW      R3>>20, g
     34 	MOVW      R4>>14, R11
     35 	MOVW      R5>>8, R12
     36 	ORR       R3<<6, R9, R9
     37 	ORR       R4<<12, g, g
     38 	ORR       R5<<18, R11, R11
        	// Load the five masks and apply them limb by limb.
     39 	MOVM.IA   (R7), [R2-R6]
     40 	AND       R8, R2, R2
     41 	AND       R9, R3, R3
     42 	AND       g, R4, R4
     43 	AND       R11, R5, R5
     44 	AND       R12, R6, R6
        	// Store the masked limbs to words 0-4; R0 advances by 20.
     45 	MOVM.IA.W [R2-R6], (R0)
        	// Zero R2-R6 and store them to words 5-9; R0 advances by 20.
     46 	EOR       R2, R2, R2
     47 	EOR       R3, R3, R3
     48 	EOR       R4, R4, R4
     49 	EOR       R5, R5, R5
     50 	EOR       R6, R6, R6
     51 	MOVM.IA.W [R2-R6], (R0)
        	// Copy the last four source words to words 10-13. R6 is still zero,
        	// so word 14 is written as zero.
     52 	MOVM.IA.W (R1), [R2-R5]
     53 	MOVM.IA   [R2-R6], (R0)
        	// Restore R4-R7 (decrement-after from R13+20 reads 8(R13)..20(R13)).
     54 	ADD       $20, R13, R0
     55 	MOVM.DA   (R0), [R4-R7]
     56 	RET
     57 
        // MOVW_UNALIGNED copies the four bytes at disp(Rfrom)..disp+3(Rfrom) to
        // disp(Rto)..disp+3(Rto) one byte at a time, so neither address needs to
        // be word aligned. Rbyte is overwritten; Rfrom and Rto are not modified.
     58 #define MOVW_UNALIGNED(Rfrom, Rto, Rbyte, disp) \
     59 	MOVBU (disp+0)(Rfrom), Rbyte; \
     60 	MOVBU Rbyte, (disp+0)(Rto); \
     61 	MOVBU (disp+1)(Rfrom), Rbyte; \
     62 	MOVBU Rbyte, (disp+1)(Rto); \
     63 	MOVBU (disp+2)(Rfrom), Rbyte; \
     64 	MOVBU Rbyte, (disp+2)(Rto); \
     65 	MOVBU (disp+3)(Rfrom), Rbyte; \
     66 	MOVBU Rbyte, (disp+3)(Rto)
     67 
     68 TEXT poly1305_blocks_armv6<>(SB), NOSPLIT, $0
     69 	// Needs 24 bytes of stack for saved registers and then 88 bytes of
     70 	// scratch space after that. We assume that 24 bytes at (R13) have
     71 	// already been used: four bytes for the link register saved in the
     72 	// prelude of poly1305_auth_armv6, four bytes for saving the value of g
     73 	// in that function and 16 bytes of scratch space used around
     74 	// poly1305_finish_ext_armv6_skip1.
        	//
        	// Inputs: R0 = context pointer, R1 = message pointer, R2 = byte count.
        	// Whole 16-byte blocks are consumed while at least 16 bytes remain.
        	//
        	// Context words used: 0-4 = multiplier limbs r0..r4, 5-9 = accumulator
        	// limbs h0..h4 (read on entry, written back on exit), 14 = flag that
        	// suppresses the extra high bit added to each block when non-zero.
        	//
        	// Stack slots, as offsets from R13:
        	//   28..48    saved R4-R8 and R14
        	//   52..80    four 64-bit products (low word first): d1, d2, d3, d4
        	//   84        value ORed into the top limb of each block (1<<24 or 0)
        	//   88        context pointer
        	//   92        current message pointer
        	//   96        remaining byte count
        	//   100..112  aligned copy of an unaligned input block
        	//   116..132  copy of r0..r4
        	//
        	// g is used as a scratch register throughout.
     75 	ADD     $24, R13, R12
     76 	MOVM.IB [R4-R8, R14], (R12)
     77 	MOVW    R0, 88(R13)
     78 	MOVW    R1, 92(R13)
     79 	MOVW    R2, 96(R13)
     80 	MOVW    R1, R14
     81 	MOVW    R2, R12
        	// R6 = 1<<24 when context word 14 is zero, otherwise 0. Bit 24 of the
        	// fifth 26-bit limb has weight 2^(4*26+24) = 2^128.
     82 	MOVW    56(R0), R8
     83 	WORD    $0xe1180008                // TST R8, R8 not working see issue 5921
     84 	EOR     R6, R6, R6
     85 	MOVW.EQ $(1<<24), R6
     86 	MOVW    R6, 84(R13)
        	// Load r0..r4 into R0-R4 and h0..h4 into R5-R9, then keep a copy of
        	// r0..r4 at 116(R13) because R0-R4 are reused for message data.
     87 	ADD     $116, R13, g
     88 	MOVM.IA (R0), [R0-R9]
     89 	MOVM.IA [R0-R4], (g)
        	// Fewer than 16 bytes: nothing to process.
     90 	CMP     $16, R12
     91 	BLO     poly1305_blocks_armv6_done
     92 
     93 poly1305_blocks_armv6_mainloop:
        	// If the message pointer is not word aligned, copy the block byte by
        	// byte to 100(R13) and load it from there.
     94 	WORD    $0xe31e0003                            // TST R14, #3 not working see issue 5921
     95 	BEQ     poly1305_blocks_armv6_mainloop_aligned
     96 	ADD     $100, R13, g
     97 	MOVW_UNALIGNED(R14, g, R0, 0)
     98 	MOVW_UNALIGNED(R14, g, R0, 4)
     99 	MOVW_UNALIGNED(R14, g, R0, 8)
    100 	MOVW_UNALIGNED(R14, g, R0, 12)
    101 	MOVM.IA (g), [R0-R3]
    102 	ADD     $16, R14
    103 	B       poly1305_blocks_armv6_mainloop_loaded
    104 
    105 poly1305_blocks_armv6_mainloop_aligned:
        	// Aligned input: load four words directly; R14 advances by 16.
    106 	MOVM.IA.W (R14), [R0-R3]
    107 
    108 poly1305_blocks_armv6_mainloop_loaded:
        	// Split the 128-bit block in R0-R3 into 26-bit limbs held in R0, g,
        	// R11, R12 and R4, OR the value from 84(R13) into the top limb, and
        	// add the limbs to h0..h4 (R5-R9). The advanced message pointer is
        	// saved to 92(R13) because R14 is reused below.
    109 	MOVW    R0>>26, g
    110 	MOVW    R1>>20, R11
    111 	MOVW    R2>>14, R12
    112 	MOVW    R14, 92(R13)
    113 	MOVW    R3>>8, R4
    114 	ORR     R1<<6, g, g
    115 	ORR     R2<<12, R11, R11
    116 	ORR     R3<<18, R12, R12
    117 	BIC     $0xfc000000, R0, R0
    118 	BIC     $0xfc000000, g, g
    119 	MOVW    84(R13), R3
    120 	BIC     $0xfc000000, R11, R11
    121 	BIC     $0xfc000000, R12, R12
    122 	ADD     R0, R5, R5
    123 	ADD     g, R6, R6
    124 	ORR     R3, R4, R4
    125 	ADD     R11, R7, R7
    126 	ADD     $116, R13, R14
    127 	ADD     R12, R8, R8
    128 	ADD     R4, R9, R9
        	// Reload r0..r4 into R0-R4.
    129 	MOVM.IA (R14), [R0-R4]
        	// 64-bit products, written as (high, low) register pairs:
        	//   (R11, g)   accumulates d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
        	//   (R14, R12) accumulates d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
        	// R4 and R3 become 5*r4 and 5*r3 part-way through.
    130 	MULLU   R4, R5, (R11, g)
    131 	MULLU   R3, R5, (R14, R12)
    132 	MULALU  R3, R6, (R11, g)
    133 	MULALU  R2, R6, (R14, R12)
    134 	MULALU  R2, R7, (R11, g)
    135 	MULALU  R1, R7, (R14, R12)
    136 	ADD     R4<<2, R4, R4
    137 	ADD     R3<<2, R3, R3
    138 	MULALU  R1, R8, (R11, g)
    139 	MULALU  R0, R8, (R14, R12)
    140 	MULALU  R0, R9, (R11, g)
    141 	MULALU  R4, R9, (R14, R12)
        	// Spill d4 to 76/80(R13) and d3 to 68/72(R13).
    142 	MOVW    g, 76(R13)
    143 	MOVW    R11, 80(R13)
    144 	MOVW    R12, 68(R13)
    145 	MOVW    R14, 72(R13)
        	//   (R11, g)   accumulates d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
        	//   (R14, R12) accumulates d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
        	// R2 and R1 become 5*r2 and 5*r1 part-way through.
    146 	MULLU   R2, R5, (R11, g)
    147 	MULLU   R1, R5, (R14, R12)
    148 	MULALU  R1, R6, (R11, g)
    149 	MULALU  R0, R6, (R14, R12)
    150 	MULALU  R0, R7, (R11, g)
    151 	MULALU  R4, R7, (R14, R12)
    152 	ADD     R2<<2, R2, R2
    153 	ADD     R1<<2, R1, R1
    154 	MULALU  R4, R8, (R11, g)
    155 	MULALU  R3, R8, (R14, R12)
    156 	MULALU  R3, R9, (R11, g)
    157 	MULALU  R2, R9, (R14, R12)
        	// Spill d2 to 60/64(R13) and d1 to 52/56(R13).
    158 	MOVW    g, 60(R13)
    159 	MOVW    R11, 64(R13)
    160 	MOVW    R12, 52(R13)
    161 	MOVW    R14, 56(R13)
        	//   (R11, g) = d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
    162 	MULLU   R0, R5, (R11, g)
    163 	MULALU  R4, R6, (R11, g)
    164 	MULALU  R3, R7, (R11, g)
    165 	MULALU  R2, R8, (R11, g)
    166 	MULALU  R1, R9, (R11, g)
        	// Reload the spilled products: (R1, R0) = d1, (R3, R2) = d2,
        	// (R5, R4) = d3, (R7, R6) = d4. d0 is still in (R11, g).
    167 	ADD     $52, R13, R0
    168 	MOVM.IA (R0), [R0-R7]
        	// Carry propagation. Each carry is bits 26 and up of a 64-bit product
        	// and each limb is reduced to 26 bits. Two chains run interleaved:
        	// d0 -> d1 -> d2 -> d3, and d3 -> d4 -> (times 5) -> d0 -> d1.
    169 	MOVW    g>>26, R12
    170 	MOVW    R4>>26, R14
    171 	ORR     R11<<6, R12, R12
    172 	ORR     R5<<6, R14, R14
    173 	BIC     $0xfc000000, g, g
    174 	BIC     $0xfc000000, R4, R4
    175 	ADD.S   R12, R0, R0
    176 	ADC     $0, R1, R1
    177 	ADD.S   R14, R6, R6
    178 	ADC     $0, R7, R7
    179 	MOVW    R0>>26, R12
    180 	MOVW    R6>>26, R14
    181 	ORR     R1<<6, R12, R12
    182 	ORR     R7<<6, R14, R14
    183 	BIC     $0xfc000000, R0, R0
    184 	BIC     $0xfc000000, R6, R6
        	// The carry out of the top limb is multiplied by 5 and added to limb 0.
    185 	ADD     R14<<2, R14, R14
    186 	ADD.S   R12, R2, R2
    187 	ADC     $0, R3, R3
    188 	ADD     R14, g, g
    189 	MOVW    R2>>26, R12
    190 	MOVW    g>>26, R14
    191 	ORR     R3<<6, R12, R12
        	// New accumulator limbs: R5 = h0, R7 = h2, R8 = h3, R9 = h4. h1 is in
        	// R0 and is moved to R6 below.
    192 	BIC     $0xfc000000, g, R5
    193 	BIC     $0xfc000000, R2, R7
    194 	ADD     R12, R4, R4
    195 	ADD     R14, R0, R0
    196 	MOVW    R4>>26, R12
    197 	BIC     $0xfc000000, R4, R8
    198 	ADD     R12, R6, R9
        	// Loop control: reload the remaining count and message pointer.
        	// Continue while the count before subtracting 16 was at least 32.
        	// SUB and MOVW leave the flags set by CMP untouched.
    199 	MOVW    96(R13), R12
    200 	MOVW    92(R13), R14
    201 	MOVW    R0, R6
    202 	CMP     $32, R12
    203 	SUB     $16, R12, R12
    204 	MOVW    R12, 96(R13)
    205 	BHS     poly1305_blocks_armv6_mainloop
    206 
    207 poly1305_blocks_armv6_done:
        	// Write h0..h4 (R5-R9) back to context words 5-9, restore the saved
        	// registers from 28(R13)..48(R13) and return.
    208 	MOVW    88(R13), R12
    209 	MOVW    R5, 20(R12)
    210 	MOVW    R6, 24(R12)
    211 	MOVW    R7, 28(R12)
    212 	MOVW    R8, 32(R12)
    213 	MOVW    R9, 36(R12)
    214 	ADD     $48, R13, R0
    215 	MOVM.DA (R0), [R4-R8, R14]
    216 	RET
    217 
        // MOVHUP_UNALIGNED copies two bytes from (Rfrom) to (Rto) one byte at a
        // time, post-incrementing both pointers by one after each byte. Rbyte is
        // overwritten.
    218 #define MOVHUP_UNALIGNED(Rfrom, Rto, Rbyte) \
    219 	MOVBU.P 1(Rfrom), Rbyte; \
    220 	MOVBU.P Rbyte, 1(Rto); \
    221 	MOVBU.P 1(Rfrom), Rbyte; \
    222 	MOVBU.P Rbyte, 1(Rto)
    223 
        // MOVWP_UNALIGNED copies four bytes from (Rfrom) to (Rto) as two
        // MOVHUP_UNALIGNED steps, leaving both pointers advanced by four. Rbyte
        // is overwritten.
    224 #define MOVWP_UNALIGNED(Rfrom, Rto, Rbyte) \
    225 	MOVHUP_UNALIGNED(Rfrom, Rto, Rbyte); \
    226 	MOVHUP_UNALIGNED(Rfrom, Rto, Rbyte)
    227 
    228 // func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]byte)
        //
        // Writes a 16-byte result to out, computed from the mlen bytes at m and
        // the 32 bytes at key. The symbol carries the leading middle dot so that
        // it lives in the package namespace and links against the Go declaration
        // above.
        //
        // Register roles after setup: R5 = context pointer, R6 = pointer to the
        // unprocessed message tail, R7 = tail length, R8 = out.
    229 TEXT ·poly1305_auth_armv6(SB), $196-16
    230 	// The value 196, just above, is the sum of 64 (the size of the context
    231 	// structure) and 132 (the amount of stack needed).
    232 	//
    233 	// At this point, the stack pointer (R13) has been moved down. It
    234 	// points to the saved link register and there's 196 bytes of free
    235 	// space above it.
    236 	//
    237 	// The stack for this function looks like:
    238 	//
    239 	// +---------------------
    240 	// |
    241 	// | 64 bytes of context structure
    242 	// |
    243 	// +---------------------
    244 	// |
    245 	// | 112 bytes for poly1305_blocks_armv6
    246 	// |
    247 	// +---------------------
    248 	// | 16 bytes of final block, constructed at
    249 	// | poly1305_finish_ext_armv6_skip8
    250 	// +---------------------
    251 	// | four bytes of saved 'g'
    252 	// +---------------------
    253 	// | lr, saved by prelude    <- R13 points here
    254 	// +---------------------
        	// g is used as a scratch register below; it is restored before RET.
    255 	MOVW g, 4(R13)
    256 
    257 	MOVW out+0(FP), R4
    258 	MOVW m+4(FP), R5
    259 	MOVW mlen+8(FP), R6
    260 	MOVW key+12(FP), R7
    261 
    262 	ADD  $136, R13, R0 // 136 = 4 + 4 + 16 + 112
    263 	MOVW R7, R1
    264 
    265 	// poly1305_init_ext_armv6 will write to the stack from R13+4, but
    266 	// that's ok because none of the other values have been written yet.
    267 	BL    poly1305_init_ext_armv6<>(SB)
        	// R2 = mlen rounded down to a multiple of 16. If it is zero there are
        	// no whole blocks to process.
    268 	BIC.S $15, R6, R2
    269 	BEQ   poly1305_auth_armv6_noblocks
        	// Process the whole blocks. R5 is advanced past them and R6 becomes
        	// the number of leftover bytes.
    270 	ADD   $136, R13, R0
    271 	MOVW  R5, R1
    272 	ADD   R2, R5, R5
    273 	SUB   R2, R6, R6
    274 	BL    poly1305_blocks_armv6<>(SB)
    275 
    276 poly1305_auth_armv6_noblocks:
        	// R0 = context, R1 = tail pointer, R2 = tail length, R3 = out.
    277 	ADD  $136, R13, R0
    278 	MOVW R5, R1
    279 	MOVW R6, R2
    280 	MOVW R4, R3
    281 
        	// Keep the same four values in R5-R8. R0-R3 are reused as scratch.
    282 	MOVW  R0, R5
    283 	MOVW  R1, R6
    284 	MOVW  R2, R7
    285 	MOVW  R3, R8
        	// No leftover bytes: skip construction of the final block.
    286 	AND.S R2, R2, R2
    287 	BEQ   poly1305_finish_ext_armv6_noremaining
        	// Zero the 16-byte scratch block at 8(R13), then copy the leftover
        	// bytes into it. Bits 3, 2, 1 and 0 of the length in R2 select copies
        	// of 8, 4, 2 and 1 bytes. R9 is the write cursor.
    288 	EOR   R0, R0
    289 	ADD   $8, R13, R9                           // 8 = offset to 16 byte scratch space
    290 	MOVW  R0, (R9)
    291 	MOVW  R0, 4(R9)
    292 	MOVW  R0, 8(R9)
    293 	MOVW  R0, 12(R9)
        	// A word-aligned source takes the word/halfword copy path; otherwise
        	// every piece is copied byte by byte.
    294 	WORD  $0xe3110003                           // TST R1, #3 not working see issue 5921
    295 	BEQ   poly1305_finish_ext_armv6_aligned
    296 	WORD  $0xe3120008                           // TST R2, #8 not working see issue 5921
    297 	BEQ   poly1305_finish_ext_armv6_skip8
    298 	MOVWP_UNALIGNED(R1, R9, g)
    299 	MOVWP_UNALIGNED(R1, R9, g)
    300 
    301 poly1305_finish_ext_armv6_skip8:
    302 	WORD $0xe3120004                     // TST $4, R2 not working see issue 5921
    303 	BEQ  poly1305_finish_ext_armv6_skip4
    304 	MOVWP_UNALIGNED(R1, R9, g)
    305 
    306 poly1305_finish_ext_armv6_skip4:
    307 	WORD $0xe3120002                     // TST $2, R2 not working see issue 5921
    308 	BEQ  poly1305_finish_ext_armv6_skip2
    309 	MOVHUP_UNALIGNED(R1, R9, g)
    310 	B    poly1305_finish_ext_armv6_skip2
    311 
    312 poly1305_finish_ext_armv6_aligned:
    313 	WORD      $0xe3120008                             // TST R2, #8 not working see issue 5921
    314 	BEQ       poly1305_finish_ext_armv6_skip8_aligned
    315 	MOVM.IA.W (R1), [g-R11]
    316 	MOVM.IA.W [g-R11], (R9)
    317 
    318 poly1305_finish_ext_armv6_skip8_aligned:
    319 	WORD   $0xe3120004                             // TST $4, R2 not working see issue 5921
    320 	BEQ    poly1305_finish_ext_armv6_skip4_aligned
    321 	MOVW.P 4(R1), g
    322 	MOVW.P g, 4(R9)
    323 
    324 poly1305_finish_ext_armv6_skip4_aligned:
    325 	WORD    $0xe3120002                     // TST $2, R2 not working see issue 5921
    326 	BEQ     poly1305_finish_ext_armv6_skip2
    327 	MOVHU.P 2(R1), g
    328 	MOVH.P  g, 2(R9)
    329 
    330 poly1305_finish_ext_armv6_skip2:
    331 	WORD    $0xe3120001                     // TST $1, R2 not working see issue 5921
    332 	BEQ     poly1305_finish_ext_armv6_skip1
    333 	MOVBU.P 1(R1), g
    334 	MOVBU.P g, 1(R9)
    335 
    336 poly1305_finish_ext_armv6_skip1:
        	// Write a single 1 byte right after the copied bytes. The rest of the
        	// scratch block is already zero. Setting context word 14 to 1 stops
        	// poly1305_blocks_armv6 adding its own high bit to this block.
    337 	MOVW  $1, R11
    338 	MOVBU R11, 0(R9)
    339 	MOVW  R11, 56(R5)
        	// Process the padded block: R0 = context, R1 = scratch block, R2 = 16.
    340 	MOVW  R5, R0
    341 	ADD   $8, R13, R1
    342 	MOVW  $16, R2
    343 	BL    poly1305_blocks_armv6<>(SB)
    344 
    345 poly1305_finish_ext_armv6_noremaining:
        	// Load accumulator limbs h0..h4 from context words 5-9.
    346 	MOVW      20(R5), R0
    347 	MOVW      24(R5), R1
    348 	MOVW      28(R5), R2
    349 	MOVW      32(R5), R3
    350 	MOVW      36(R5), R4
        	// Full carry pass: fold the carry out of h4 (times 5) into h0, then
        	// ripple carries h0 -> h1 -> h2 -> h3 -> h4, leaving h0..h3 at 26 bits.
    351 	MOVW      R4>>26, R12
    352 	BIC       $0xfc000000, R4, R4
    353 	ADD       R12<<2, R12, R12
    354 	ADD       R12, R0, R0
    355 	MOVW      R0>>26, R12
    356 	BIC       $0xfc000000, R0, R0
    357 	ADD       R12, R1, R1
    358 	MOVW      R1>>26, R12
    359 	BIC       $0xfc000000, R1, R1
    360 	ADD       R12, R2, R2
    361 	MOVW      R2>>26, R12
    362 	BIC       $0xfc000000, R2, R2
    363 	ADD       R12, R3, R3
    364 	MOVW      R3>>26, R12
    365 	BIC       $0xfc000000, R3, R3
    366 	ADD       R12, R4, R4
        	// Compute h + 5 - 2^130 into R6, R7, g, R11, R9: add 5 to limb 0,
        	// ripple the carries, and subtract 1<<26 from the top limb.
    367 	ADD       $5, R0, R6
    368 	MOVW      R6>>26, R12
    369 	BIC       $0xfc000000, R6, R6
    370 	ADD       R12, R1, R7
    371 	MOVW      R7>>26, R12
    372 	BIC       $0xfc000000, R7, R7
    373 	ADD       R12, R2, g
    374 	MOVW      g>>26, R12
    375 	BIC       $0xfc000000, g, g
    376 	ADD       R12, R3, R11
    377 	MOVW      $-(1<<26), R12
    378 	ADD       R11>>26, R12, R12
    379 	BIC       $0xfc000000, R11, R11
    380 	ADD       R12, R4, R9
        	// Branch-free select. R12 = (sign bit of R9) - 1, which is zero when
        	// the subtraction went negative and all ones otherwise. The mask keeps
        	// either the subtracted limbs or, after MVN, the original limbs.
    381 	MOVW      R9>>31, R12
    382 	SUB       $1, R12
    383 	AND       R12, R6, R6
    384 	AND       R12, R7, R7
    385 	AND       R12, g, g
    386 	AND       R12, R11, R11
    387 	AND       R12, R9, R9
    388 	MVN       R12, R12
    389 	AND       R12, R0, R0
    390 	AND       R12, R1, R1
    391 	AND       R12, R2, R2
    392 	AND       R12, R3, R3
    393 	AND       R12, R4, R4
    394 	ORR       R6, R0, R0
    395 	ORR       R7, R1, R1
    396 	ORR       g, R2, R2
    397 	ORR       R11, R3, R3
    398 	ORR       R9, R4, R4
        	// Repack the five 26-bit limbs into four 32-bit words in R0-R3.
    399 	ORR       R1<<26, R0, R0
    400 	MOVW      R1>>6, R1
    401 	ORR       R2<<20, R1, R1
    402 	MOVW      R2>>12, R2
    403 	ORR       R3<<14, R2, R2
    404 	MOVW      R3>>18, R3
    405 	ORR       R4<<8, R3, R3
        	// Add context words 10-13 (the last 16 key bytes stored by
        	// poly1305_init_ext_armv6) as a 128-bit value with carry propagation,
        	// then store the four result words to out.
    406 	MOVW      40(R5), R6
    407 	MOVW      44(R5), R7
    408 	MOVW      48(R5), g
    409 	MOVW      52(R5), R11
    410 	ADD.S     R6, R0, R0
    411 	ADC.S     R7, R1, R1
    412 	ADC.S     g, R2, R2
    413 	ADC.S     R11, R3, R3
    414 	MOVM.IA   [R0-R3], (R8)
        	// Zero R0-R7 and overwrite all 64 bytes of the context with zeros
        	// (two stores of eight words each).
    415 	MOVW      R5, R12
    416 	EOR       R0, R0, R0
    417 	EOR       R1, R1, R1
    418 	EOR       R2, R2, R2
    419 	EOR       R3, R3, R3
    420 	EOR       R4, R4, R4
    421 	EOR       R5, R5, R5
    422 	EOR       R6, R6, R6
    423 	EOR       R7, R7, R7
    424 	MOVM.IA.W [R0-R7], (R12)
    425 	MOVM.IA   [R0-R7], (R12)
        	// Restore g, saved at function entry.
    426 	MOVW      4(R13), g
    427 	RET
    428