mirror of
https://github.com/signalwire/freeswitch.git
synced 2025-02-05 10:34:54 +00:00
d2edcad66e
Thanks to Phil Zimmermann for the code and for the license exception we needed to include it. There remains some build system integration work to be done before this code will build properly in the FreeSWITCH tree.
174 lines
4.9 KiB
ArmAsm
@ lbnarm.s - 32-bit bignum primitives for ARM processors with 32x32-bit multiply
@
@ This uses the standard ARM calling convention, which is that arguments
@ are passed, and results returned, in r0..r3. r0..r3, r12 (IP) and r14 (LR)
@ are volatile across the function; all others are callee-save.
@ However, note that r14 (LR) is the return address, so it would be
@ wise to save it somewhere before trashing it. Fortunately, there is
@ a neat trick possible, in that you can pop LR from the stack straight
@ into r15 (PC), effecting a return at the same time.
@
@ Also, r13 (SP) is probably best left alone, and r15 (PC) is obviously
@ reserved by hardware. Temps should use lr, then r4..r9 in order.
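
@ For orientation: under the convention described above, the four arguments of
@ each routine implemented below arrive as r0 = out, r1 = in, r2 = len and
@ r3 = k.  BNWORD32 is assumed here to be a 32-bit unsigned type such as
@ uint32_t.  Collected prototypes, as given in the per-function comments:
@
@       void     lbnMulN1_32  (BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k);
@       BNWORD32 lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k);
@       BNWORD32 lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k);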

        .text
        .align  2
@ out[0..len] = in[0..len-1] * k
@ void lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
        .global lbnMulN1_32
        .type   lbnMulN1_32, %function
lbnMulN1_32:
        stmfd   sp!, {r4, r5, lr}
        ldr     lr, [r1], #4            @ lr = *in++
        umull   r5, r4, lr, r3          @ (r4,r5) = lr * r3
        str     r5, [r0], #4            @ *out++ = r5
        movs    r2, r2, lsr #1
        bcc     m32_even
        mov     r5, r4                  @ Get carry in the right register
        beq     m32_done
m32_loop:
        @ carry is in r5
        ldr     lr, [r1], #4            @ lr = *in++
        mov     r4, #0
        umlal   r5, r4, lr, r3          @ (r4,r5) += lr * r3
        str     r5, [r0], #4            @ *out++ = r5
m32_even:
        @ carry is in r4
        ldr     lr, [r1], #4            @ lr = *in++
        mov     r5, #0
        umlal   r4, r5, lr, r3          @ (r5,r4) += lr * r3
        subs    r2, r2, #1
        str     r4, [r0], #4            @ *out++ = r4

        bne     m32_loop
m32_done:
        str     r5, [r0, #0]            @ store carry
        ldmfd   sp!, {r4, r5, pc}
        .size   lbnMulN1_32, .-lbnMulN1_32
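
@ For reference, a portable C sketch of what lbnMulN1_32 computes.  This is an
@ illustration only: the "_ref" name is made up here, BNWORD32 is assumed to be
@ a 32-bit unsigned type so a uint64_t holds a full 32x32 product, and (like
@ the assembly above) it expects len >= 1.
@
@       #include <stdint.h>
@       typedef uint32_t BNWORD32;
@
@       /* out[0..len] = in[0..len-1] * k (out receives len+1 words) */
@       static void lbnMulN1_32_ref(BNWORD32 *out, BNWORD32 const *in,
@                                   unsigned len, BNWORD32 k)
@       {
@               uint64_t carry = 0;
@               unsigned i;
@
@               for (i = 0; i < len; i++) {
@                       uint64_t t = (uint64_t)in[i] * k + carry;
@                       out[i] = (BNWORD32)t;   /* low word, like the str above */
@                       carry = t >> 32;        /* high word carries into the next step */
@               }
@               out[len] = (BNWORD32)carry;     /* final carry word */
@       }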
@ out[0..len-1] += in[0..len-1] * k, return carry
@ BNWORD32
@ lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)

        .global lbnMulAdd1_32
        .type   lbnMulAdd1_32, %function
lbnMulAdd1_32:
        stmfd   sp!, {r4, r5, lr}

        mov     r4, #0
        ldr     lr, [r1], #4            @ lr = *in++
        ldr     r5, [r0, #0]            @ r5 = *out
        mov     r4, #0
        umlal   r5, r4, lr, r3          @ (r4,r5) += lr * r3
        str     r5, [r0], #4            @ *out++ = r5
        movs    r2, r2, lsr #1
        bcc     ma32_even
        beq     ma32_done
ma32_loop:
        @ carry is in r4
        ldr     lr, [r1], #4            @ lr = *in++
        mov     r5, #0
        umlal   r4, r5, lr, r3          @ (r5,r4) += lr * r3
        ldr     lr, [r0, #0]            @ lr = *out
        adds    lr, lr, r4              @ lr += product.low
        str     lr, [r0], #4            @ *out++ = lr
        adc     r4, r5, #0              @ Compute carry and move back to r4
ma32_even:
        @ another unrolled copy
        ldr     lr, [r1], #4            @ lr = *in++
        mov     r5, #0
        umlal   r4, r5, lr, r3          @ (r5,r4) += lr * r3
        ldr     lr, [r0, #0]            @ lr = *out
        adds    lr, lr, r4              @ lr += product.low
        adc     r4, r5, #0              @ Compute carry and move back to r4
        str     lr, [r0], #4            @ *out++ = lr
        subs    r2, r2, #1

        bne     ma32_loop
ma32_done:
        mov     r0, r4
        ldmfd   sp!, {r4, r5, pc}
        .size   lbnMulAdd1_32, .-lbnMulAdd1_32
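
@ A matching C sketch of lbnMulAdd1_32 (illustrative only, same uint32_t
@ assumption as above; the assembly again expects len >= 1):
@
@       #include <stdint.h>
@       typedef uint32_t BNWORD32;
@
@       /* out[0..len-1] += in[0..len-1] * k, return the word of carry out */
@       static BNWORD32 lbnMulAdd1_32_ref(BNWORD32 *out, BNWORD32 const *in,
@                                         unsigned len, BNWORD32 k)
@       {
@               uint64_t carry = 0;
@               unsigned i;
@
@               for (i = 0; i < len; i++) {
@                       uint64_t t = (uint64_t)in[i] * k + out[i] + carry;
@                       out[i] = (BNWORD32)t;
@                       carry = t >> 32;        /* never exceeds one word */
@               }
@               return (BNWORD32)carry;
@       }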
@@@ This is a bit messy... punt for now...

@ out[0..len-1] -= in[0..len-1] * k, return carry (borrow)
@ BNWORD32
@ lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
        .global lbnMulSub1_32
        .type   lbnMulSub1_32, %function
lbnMulSub1_32:
        stmfd   sp!, {r4, r5, lr}

        mov     r4, #0
        mov     r5, #0
        ldr     lr, [r1], #4            @ lr = *in++
        umull   r4, r5, lr, r3          @ (r5,r4) = lr * r3
        ldr     lr, [r0, #0]            @ lr = *out
        subs    lr, lr, r4              @ lr -= product.low
        str     lr, [r0], #4            @ *out++ = lr
        addcc   r5, r5, #1              @ propagate carry

        movs    r2, r2, lsr #1
        bcc     ms32_even
        mov     r4, r5
        beq     ms32_done
ms32_loop:
        @ carry is in r4
        ldr     lr, [r1], #4            @ lr = *in++
        mov     r5, #0
        umlal   r4, r5, lr, r3          @ (r5,r4) += lr * r3
        ldr     lr, [r0, #0]            @ lr = *out
        subs    lr, lr, r4              @ lr -= product.low
        str     lr, [r0], #4            @ *out++ = lr
        addcc   r5, r5, #1              @ propagate carry
ms32_even:
        @ carry is in r5
        ldr     lr, [r1], #4            @ lr = *in++
        mov     r4, #0
        umlal   r5, r4, lr, r3          @ (r4,r5) += lr * r3
        ldr     lr, [r0, #0]            @ lr = *out
        subs    lr, lr, r5              @ lr -= product.low
        str     lr, [r0], #4            @ *out++ = lr
        addcc   r4, r4, #1              @ Propagate carry

        subs    r2, r2, #1
        bne     ms32_loop
ms32_done:
        mov     r0, r4
        ldmfd   sp!, {r4, r5, pc}

        .size   lbnMulSub1_32, .-lbnMulSub1_32
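
@ And a C sketch of lbnMulSub1_32 (illustrative only, same assumptions).  The
@ borrow is accumulated the way the assembly does it: the product's high word,
@ plus one more if subtracting the low word underflows *out.
@
@       #include <stdint.h>
@       typedef uint32_t BNWORD32;
@
@       /* out[0..len-1] -= in[0..len-1] * k, return the word of borrow */
@       static BNWORD32 lbnMulSub1_32_ref(BNWORD32 *out, BNWORD32 const *in,
@                                         unsigned len, BNWORD32 k)
@       {
@               uint64_t borrow = 0;
@               unsigned i;
@
@               for (i = 0; i < len; i++) {
@                       uint64_t t = (uint64_t)in[i] * k + borrow;
@                       BNWORD32 lo = (BNWORD32)t;
@
@                       borrow = t >> 32;
@                       if (out[i] < lo)        /* low-word subtract underflows? */
@                               borrow++;       /* ...so one more word of borrow */
@                       out[i] -= lo;
@               }
@               return (BNWORD32)borrow;
@       }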
@@
@@ It's possible to eliminate the store traffic by doing the multiplies
@@ in a different order, forming all the partial products in one column
@@ at a time.  But it requires a 32x32 + 64 -> 65-bit MAC.  The
@@ ARM has the MAC, but no carry out.
@@
@@ The question is, is it faster to do the add directly (3 instructions),
@@ or can we compute the carry out in 1 instruction (+1 to do the add)?
@@ Well... it takes at least 1 instruction to copy the original accumulator
@@ out of the way, and 1 to do a compare, so no.
@@
@@ Now, the overall loop... this is an nxn->2n multiply.  For i=0..n-1,
@@ we sum i+1 multiplies in each (plus the carry in from the
@@ previous one).  For i = n..2*n-1 we sum 2*n-1-i, plus the previous
@@ carry.
@@
@@ This "non-square" structure makes things more complicated.
@@
@@ void
@@ lbnMulX_32(BNWORD32 *prod, BNWORD32 const *num1, BNWORD32 const *num2,
@@      unsigned len)
@       .global lbnMulX_32
@       .type   lbnMulX_32, %function
@lbnMulX_32:
@       stmfd   sp!, {r4, r5, r6, r7, lr}
@
@       mov     r4, #0
@       mov     r5, #0
@       mov     r0, r4
@       ldmfd   sp!, {r4, r5, pc}
@       .size   lbnMulX_32, .-lbnMulX_32
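
@ For comparison with the column-wise scheme discussed above, here is a hedged
@ C sketch of one straightforward row-wise way to get the nxn->2n product out
@ of the two routines this file does implement.  The "_ref" name is made up;
@ the prototypes are the ones given in the comments above, BNWORD32 is assumed
@ to be uint32_t, and prod must have room for 2*len words.
@
@       #include <stdint.h>
@       typedef uint32_t BNWORD32;
@
@       void     lbnMulN1_32  (BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k);
@       BNWORD32 lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k);
@
@       static void lbnMulX_32_ref(BNWORD32 *prod, BNWORD32 const *num1,
@                                  BNWORD32 const *num2, unsigned len)
@       {
@               unsigned i;
@
@               /* Row 0: prod[0..len] = num1[0..len-1] * num2[0] */
@               lbnMulN1_32(prod, num1, len, num2[0]);
@
@               /* Row i adds num1 * num2[i] shifted up i words; the carry out
@                  lands in the one word of prod not yet written. */
@               for (i = 1; i < len; i++)
@                       prod[len + i] = lbnMulAdd1_32(prod + i, num1, len, num2[i]);
@       }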