2021-09-01 22:02:08 +08:00

156 lines
5.5 KiB
ArmAsm

// func keccakf(state *[25]uint64)
//
// keccakf runs the 24-round Keccak-f[1600] permutation in place on *state,
// using the Armv8.2 SHA3 vector instructions (EOR3, RAX1, XAR, BCAX).
// No CPU feature check is performed here.
//
// The Go prototype has a single pointer argument and no results, so the
// argument frame is 8 bytes ($0-8).
//
// Register use:
//   R0       pointer to state (rewound to the base after the loads)
//   R1       cursor into round_consts, advanced by 8 bytes every round
//   R2       rounds remaining (24 down to 0)
//   V0-V24   state lanes; A[i][j] denotes state[5*i+j], kept in V(5*i+j)
//   V25-V29  theta column parities C[0..4]
//   V30,V31,V27,V28,V29
//            theta D[1], D[2], D[3], D[4], D[0] (overwriting C[2..4])
//   V26      round constant during chi/iota
//
// Only the low 64-bit lane (.D1) of each vector register is loaded from and
// stored back to memory.
TEXT ·keccakf(SB),$0-8
	MOVD	state+0(FP), R0
	MOVD	$round_consts(SB), R1 // TODO: move this to the place that the const table is used
	MOVD	$24, R2 // round counter

	// Load the 25 lanes, two per instruction, post-incrementing R0 by 16.
	VLD1.P	16(R0), [V0.D1, V1.D1]
	VLD1.P	16(R0), [V2.D1, V3.D1]
	VLD1.P	16(R0), [V4.D1, V5.D1]
	VLD1.P	16(R0), [V6.D1, V7.D1]
	VLD1.P	16(R0), [V8.D1, V9.D1]
	VLD1.P	16(R0), [V10.D1, V11.D1]
	VLD1.P	16(R0), [V12.D1, V13.D1]
	VLD1.P	16(R0), [V14.D1, V15.D1]
	VLD1.P	16(R0), [V16.D1, V17.D1]
	VLD1.P	16(R0), [V18.D1, V19.D1]
	VLD1.P	16(R0), [V20.D1, V21.D1]
	VLD1.P	16(R0), [V22.D1, V23.D1]
	VLD1	(R0), [V24.D1]
	SUB	$192, R0, R0 // undo the 12 post-increments so R0 points at state[0] again

loop:
	// Theta: C[j] = A[0][j] ^ A[1][j] ^ A[2][j] ^ A[3][j] ^ A[4][j] in V25..V29.
	VEOR3	V20.B16, V15.B16, V10.B16, V25.B16
	VEOR3	V21.B16, V16.B16, V11.B16, V26.B16
	VEOR3	V22.B16, V17.B16, V12.B16, V27.B16
	VEOR3	V23.B16, V18.B16, V13.B16, V28.B16
	VEOR3	V24.B16, V19.B16, V14.B16, V29.B16
	VEOR3	V25.B16, V5.B16, V0.B16, V25.B16
	VEOR3	V26.B16, V6.B16, V1.B16, V26.B16
	VEOR3	V27.B16, V7.B16, V2.B16, V27.B16
	VEOR3	V28.B16, V8.B16, V3.B16, V28.B16
	VEOR3	V29.B16, V9.B16, V4.B16, V29.B16

	// D[j] = C[j-1] ^ rol(C[j+1], 1). The order matters: each C is consumed
	// before the register holding it is overwritten with a D value.
	VRAX1	V27.D2, V25.D2, V30.D2 // V30 = D[1]
	VRAX1	V28.D2, V26.D2, V31.D2 // V31 = D[2]
	VRAX1	V29.D2, V27.D2, V27.D2 // V27 = D[3]
	VRAX1	V25.D2, V28.D2, V28.D2 // V28 = D[4]
	VRAX1	V26.D2, V29.D2, V29.D2 // V29 = D[0]

	// Theta + Rho + Pi: each VXAR xors a lane with its D value and rotates
	// right by 64-r (i.e. left by r), writing the result to the register of
	// the lane it moves to. Lanes are processed along the pi cycle so that
	// every source is read before its register is overwritten; a few results
	// are parked in the C/D temporaries or in an already-free lane register.
	// Comments of the form "Vn holds A[i][j]" name the permuted lane that
	// register Vn carries into chi.
	VXAR	$64-1, V30.D2, V1.D2, V25.D2 // V25 (was C[0]) holds A[2][0]

	VXAR	$64-44, V30.D2, V6.D2, V1.D2
	VXAR	$64-20, V28.D2, V9.D2, V6.D2
	VXAR	$64-61, V31.D2, V22.D2, V9.D2
	VXAR	$64-39, V28.D2, V14.D2, V22.D2
	VXAR	$64-18, V29.D2, V20.D2, V14.D2

	VXAR	$64-62, V31.D2, V2.D2, V26.D2 // V26 (was C[1]) holds A[4][0]

	VXAR	$64-43, V31.D2, V12.D2, V2.D2
	VXAR	$64-25, V27.D2, V13.D2, V12.D2
	VXAR	$64-8, V28.D2, V19.D2, V13.D2
	VXAR	$64-56, V27.D2, V23.D2, V19.D2
	VXAR	$64-41, V29.D2, V15.D2, V23.D2

	VXAR	$64-27, V28.D2, V4.D2, V15.D2

	VXAR	$64-14, V28.D2, V24.D2, V28.D2 // V28 (was D[4]) holds A[0][4]
	VXAR	$64-2, V30.D2, V21.D2, V24.D2
	VXAR	$64-55, V27.D2, V8.D2, V8.D2 // V8 holds A[4][1]
	VXAR	$64-45, V30.D2, V16.D2, V4.D2 // V4 holds A[1][3]
	VXAR	$64-36, V29.D2, V5.D2, V16.D2

	VXAR	$64-28, V27.D2, V3.D2, V5.D2

	VEOR	V29.B16, V0.B16, V0.B16 // A[0][0] ^= D[0]; this lane is not rotated or moved

	VXAR	$64-21, V27.D2, V18.D2, V27.D2 // V27 (was D[3]) holds A[0][3]
	VXAR	$64-15, V31.D2, V17.D2, V3.D2 // V3 holds A[3][3]
	VXAR	$64-10, V30.D2, V11.D2, V30.D2 // V30 (was D[1]) holds A[3][2]
	VXAR	$64-6, V31.D2, V7.D2, V31.D2 // V31 (was D[2]) holds A[2][1]
	VXAR	$64-3, V29.D2, V10.D2, V29.D2 // V29 (was D[0]) holds A[1][2]

	// Chi + Iota: A[i][j] = B[i][j] ^ (~B[i][j+1] & B[i][j+2]), one row at a
	// time, reading the parked lanes from the registers noted above.

	// Row 4: inputs V26, V8, V22, V23, V24.
	VBCAX	V8.B16, V22.B16, V26.B16, V20.B16
	VBCAX	V22.B16, V23.B16, V8.B16, V21.B16
	VBCAX	V23.B16, V24.B16, V22.B16, V22.B16
	VBCAX	V24.B16, V26.B16, V23.B16, V23.B16
	VBCAX	V26.B16, V8.B16, V24.B16, V24.B16

	// V26 is free now: fetch this round's constant (replicated to both
	// 64-bit lanes) and advance the table cursor.
	VLD1R.P	8(R1), [V26.D2]

	// Row 3: inputs V15, V16, V30, V3, V19.
	VBCAX	V3.B16, V19.B16, V30.B16, V17.B16
	VBCAX	V19.B16, V15.B16, V3.B16, V18.B16
	VBCAX	V15.B16, V16.B16, V19.B16, V19.B16
	VBCAX	V16.B16, V30.B16, V15.B16, V15.B16
	VBCAX	V30.B16, V3.B16, V16.B16, V16.B16

	// Row 2: inputs V25, V31, V12, V13, V14.
	VBCAX	V31.B16, V12.B16, V25.B16, V10.B16
	VBCAX	V12.B16, V13.B16, V31.B16, V11.B16
	VBCAX	V13.B16, V14.B16, V12.B16, V12.B16
	VBCAX	V14.B16, V25.B16, V13.B16, V13.B16
	VBCAX	V25.B16, V31.B16, V14.B16, V14.B16

	// Row 1: inputs V5, V6, V29, V4, V9.
	VBCAX	V4.B16, V9.B16, V29.B16, V7.B16
	VBCAX	V9.B16, V5.B16, V4.B16, V8.B16
	VBCAX	V5.B16, V6.B16, V9.B16, V9.B16
	VBCAX	V6.B16, V29.B16, V5.B16, V5.B16
	VBCAX	V29.B16, V4.B16, V6.B16, V6.B16

	// Row 0: inputs V0, V1, V2, V27, V28.
	VBCAX	V28.B16, V0.B16, V27.B16, V3.B16
	VBCAX	V0.B16, V1.B16, V28.B16, V4.B16
	VBCAX	V1.B16, V2.B16, V0.B16, V0.B16 // chi for A[0][0]; iota follows below
	VBCAX	V2.B16, V27.B16, V1.B16, V1.B16
	VBCAX	V27.B16, V28.B16, V2.B16, V2.B16

	VEOR	V26.B16, V0.B16, V0.B16 // Iota: A[0][0] ^= round constant

	SUBS	$1, R2, R2
	BNE	loop

	// Store the 25 lanes back in the same order they were loaded.
	VST1.P	[V0.D1, V1.D1], 16(R0)
	VST1.P	[V2.D1, V3.D1], 16(R0)
	VST1.P	[V4.D1, V5.D1], 16(R0)
	VST1.P	[V6.D1, V7.D1], 16(R0)
	VST1.P	[V8.D1, V9.D1], 16(R0)
	VST1.P	[V10.D1, V11.D1], 16(R0)
	VST1.P	[V12.D1, V13.D1], 16(R0)
	VST1.P	[V14.D1, V15.D1], 16(R0)
	VST1.P	[V16.D1, V17.D1], 16(R0)
	VST1.P	[V18.D1, V19.D1], 16(R0)
	VST1.P	[V20.D1, V21.D1], 16(R0)
	VST1.P	[V22.D1, V23.D1], 16(R0)
	VST1	[V24.D1], (R0)
	RET
// round_consts holds the 24 iota round constants, one 8-byte word per round,
// laid out in round order (byte offset = 8 * round index).

DATA	round_consts+0(SB)/8, $0x0000000000000001	// round 0
DATA	round_consts+8(SB)/8, $0x0000000000008082	// round 1
DATA	round_consts+16(SB)/8, $0x800000000000808a	// round 2
DATA	round_consts+24(SB)/8, $0x8000000080008000	// round 3

DATA	round_consts+32(SB)/8, $0x000000000000808b	// round 4
DATA	round_consts+40(SB)/8, $0x0000000080000001	// round 5
DATA	round_consts+48(SB)/8, $0x8000000080008081	// round 6
DATA	round_consts+56(SB)/8, $0x8000000000008009	// round 7

DATA	round_consts+64(SB)/8, $0x000000000000008a	// round 8
DATA	round_consts+72(SB)/8, $0x0000000000000088	// round 9
DATA	round_consts+80(SB)/8, $0x0000000080008009	// round 10
DATA	round_consts+88(SB)/8, $0x000000008000000a	// round 11

DATA	round_consts+96(SB)/8, $0x000000008000808b	// round 12
DATA	round_consts+104(SB)/8, $0x800000000000008b	// round 13
DATA	round_consts+112(SB)/8, $0x8000000000008089	// round 14
DATA	round_consts+120(SB)/8, $0x8000000000008003	// round 15

DATA	round_consts+128(SB)/8, $0x8000000000008002	// round 16
DATA	round_consts+136(SB)/8, $0x8000000000000080	// round 17
DATA	round_consts+144(SB)/8, $0x000000000000800a	// round 18
DATA	round_consts+152(SB)/8, $0x800000008000000a	// round 19

DATA	round_consts+160(SB)/8, $0x8000000080008081	// round 20
DATA	round_consts+168(SB)/8, $0x8000000000008080	// round 21
DATA	round_consts+176(SB)/8, $0x0000000080000001	// round 22
DATA	round_consts+184(SB)/8, $0x8000000080008008	// round 23

// Flag value 8|16 is RODATA (8) combined with NOPTR (16) as defined in Go's
// textflag.h; the symbol is 24 words * 8 bytes = 192 bytes long.
GLOBL	round_consts(SB), 8|16, $192