Travis Cross d2edcad66e Merge Phil Zimmermann's libzrtp as a FreeSWITCH library
Thanks to Phil Zimmermann for the code and for the license exception
we needed to include it.

There remains some build system integration work to be done before
this code will build properly in the FreeSWITCH tree.
2012-03-31 23:42:27 +00:00

415 lines
9.1 KiB
NASM

;;; Copyright (c) 1995, Colin Plumb.
;;; For licensing and other legal details, see the file legal.c.
;;;
;;; Assembly primitives for bignum library, 80386 family, 32-bit code.
;;;
;;; Several primitives are included here. Only lbnMulAdd1 is *really*
;;; critical, but once that's written, lbnMulN1 and lbnMulSub1 are quite
;;; easy to write, so they are included here as well.
;;; lbnDiv21 and lbnModQ are so easy to write that they're included, too.
;;;
;;; All functions here are for 32-bit flat mode. I.e. near code and
;;; near data, although the near offsets are 32 bits.
;;;
;;; The usual 80x86 calling conventions have AX, BX, CX and DX
;;; volatile, and SI, DI, SP and BP preserved across calls.
;;; This includes the "E"xtended forms of all of those registers
;;;
;;; However, just to be confusing, recent 32-bit DOS compilers have
;;; quietly changed that to require EBX preserved across calls, too.
;;; Joy.
;; ---------------------------------------------------------------------
;; Assembler setup (MASM-style directives).
;; ---------------------------------------------------------------------
;; Select the 80386 instruction set; every routine below uses 32-bit
;; registers and scaled-index addressing.
.386
;_TEXT segment para public use32 'CODE' ; 16-byte aligned because 486 cares
;_TEXT ends
;; Declare a group named FLAT containing _TEXT when either
;;   - the assembler does not define @Version at all, or
;;   - @Version is defined and is 510 or lower.
;; For larger @Version values no group is declared here.
;; NOTE(review): presumably newer assemblers already provide FLAT
;; themselves -- confirm against the assembler actually used.
ifdef @Version
if @Version le 510
FLAT group _TEXT
endif
else
FLAT group _TEXT
endif
;; All three segment registers are assumed to address the FLAT group,
;; i.e. code, data and stack share one 32-bit address space.
assume cs:FLAT, ds:FLAT, ss:FLAT
;; Open the code segment.  Paragraph (16-byte) alignment lets the
;; "align 16" directives in front of the multiply routines take effect.
_TEXT segment para public use32 'CODE' ; 16-byte aligned because 486 cares
;; Entry points exported to the linker.
;; NOTE(review): the leading underscore presumably matches the C
;; compiler's symbol decoration -- confirm for the target toolchain.
public _lbnMulN1_32
public _lbnMulAdd1_32
public _lbnMulSub1_32
public _lbnDiv21_32
public _lbnModQ_32
;; Register usage:
;; eax - low half of product
;; ebx - carry to next iteration
;; ecx - multiplier (k)
;; edx - high half of product
;; esi - source pointer
;; edi - dest pointer
;; ebp - loop counter
;;
;; Stack frame:
;; +--------+ esp+20 esp+24 esp+28 esp+32 esp+36
;; | k |
;; +--------+ esp+16 esp+20 esp+24 esp+28 esp+32
;; | len |
;; +--------+ esp+12 esp+16 esp+20 esp+24 esp+28
;; | in |
;; +--------+ esp+8 esp+12 esp+16 esp+20 esp+24
;; | out |
;; +--------+ esp+4 esp+8 esp+12 esp+16 esp+20
;; | return |
;; +--------+ esp esp+4 esp+8 esp+12 esp+16
;; | esi |
;; +--------+ esp esp+4 esp+8 esp+12
;; | ebp |
;; +--------+ esp esp+4 esp+8
;; | ebx |
;; +--------+ esp esp+4
;; | edi |
;; +--------+ esp
;; ---------------------------------------------------------------------
;; _lbnMulN1_32 -- multiply a multi-word number by one 32-bit word.
;;
;; Stack arguments, lowest address first (see the frame diagram above):
;;   out, in, len, k
;; Effect:
;;   out[i] (0 <= i < len) receives the low 32 bits of in[i]*k plus the
;;   carry out of word i-1; the carry out of the last word is stored to
;;   out[len], so len+1 words are written in total.
;; Requirements visible in the code:
;;   len must be at least 1 -- word 0 is processed unconditionally
;;   before the length is ever tested.
;; Registers:
;;   esi, ebp, ebx and edi are saved and restored.
;;   eax, ecx, edx and the flags are clobbered; eax is not loaded with a
;;   dedicated return value.
;;
;; The trailing "U" / "V" / "NP" notes on each line are the author's
;; scheduling annotations.
;; NOTE(review): presumably Pentium U-pipe / V-pipe / not-pairable --
;; confirm before re-ordering any instruction.
;; ---------------------------------------------------------------------
align 16
_lbnMulN1_32 proc near
;; Prologue: each push moves esp down by 4, so the argument offsets in
;; the loads below grow as more registers are saved.
push esi ; U
mov esi,[esp+12] ; V load in
push ebp ; U
mov ebp,[esp+20] ; V load len
push ebx ; U
mov ecx,[esp+28] ; V load k
push edi ; U
mov edi,[esp+20] ; V load out
;; First multiply step has no carry in.
;; The unrolling set-up is interleaved with it:
;;   ebx = ((len-1)*4) AND 12 = 4 * ((len-1) mod 4)
;; which is both the byte offset of the last word handled by the partial
;; pass and the byte index into the jump table.
mov eax,[esi] ; U
lea ebx,[ebp*4-4] ; V loop unrolling
mul ecx ; NP first multiply
mov [edi],eax ; U
and ebx,12 ; V loop unrolling
;; Bias both pointers so that the "case1" step always addresses [esi] and
;; [edi]; the other steps use fixed negative displacements from there.
add esi,ebx ; U loop unrolling
add edi,ebx ; V loop unrolling
;; Enter the 4x unrolled loop at the step that leaves a multiple of four
;; words still to do.
jmp DWORD PTR m32_jumptable[ebx] ; NP loop unrolling
;; Dispatch table, indexed by 4 * ((len-1) mod 4).  It lives in the code
;; segment directly after the indirect jump.
align 4
m32_jumptable:
dd m32_case0
dd m32_case1
dd m32_case2
dd m32_case3
;; Padding only: execution never falls through here, because the jmp
;; above is unconditional.  The nops position m32_loop relative to the
;; 8-byte boundary.
nop
align 8
nop
nop
nop ; Get loop nicely aligned
;; case0: no partial pass is needed.  ebp still holds len; subtract 4
;; and leave if that reaches zero or borrows (len <= 4, i.e. len == 1
;; on this path).
m32_case0:
sub ebp,4 ; U
jbe SHORT m32_done ; V
;; Main loop: four words per pass.  On entry edx holds the carry (high
;; half of the previous product).  The pointers are advanced by 16 near
;; the top, hence the -12/-8/-4/0 displacements further down.
m32_loop:
mov eax,[esi+4] ; U
mov ebx,edx ; V Remember carry for later
add esi,16 ; U
add edi,16 ; V
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
adc edx,0 ; U
mov [edi-12],eax ; V
m32_case3:
mov eax,[esi-8] ; U
mov ebx,edx ; V Remember carry for later
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
adc edx,0 ; U
mov [edi-8],eax ; V
m32_case2:
mov eax,[esi-4] ; U
mov ebx,edx ; V Remember carry for later
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
adc edx,0 ; U
mov [edi-4],eax ; V
m32_case1:
mov eax,[esi] ; U
mov ebx,edx ; V Remember carry for later
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
adc edx,0 ; U
mov [edi],eax ; V
;; Continue only while the count stays strictly positive after removing
;; four (unsigned test: no borrow and non-zero).
sub ebp,4 ; U
ja SHORT m32_loop ; V
;; edi addresses the last word written, so [edi+4] is out[len]; store
;; the final carry there, then restore registers in reverse push order.
m32_done:
mov [edi+4],edx ; U
pop edi ; V
pop ebx ; U
pop ebp ; V
pop esi ; U
ret ; NP
_lbnMulN1_32 endp
;; ---------------------------------------------------------------------
;; _lbnMulAdd1_32 -- multiply-accumulate: out += in * k.
;;
;; Stack arguments, lowest address first: out, in, len, k
;; Effect:
;;   For 0 <= i < len, in[i]*k plus the running carry is added into
;;   out[i].  Exactly len words of out are read and rewritten.
;; Returns:
;;   eax = the carry word left over after the last position.
;; Requirements visible in the code:
;;   len must be at least 1 -- word 0 is processed unconditionally.
;; Registers:
;;   esi, edi, ebp and ebx are saved and restored (note the push order
;;   differs from _lbnMulN1_32); ecx, edx and the flags are clobbered.
;; Register roles: esi = in, edi = out, ebp = len counter, ecx = k,
;;   edx = carry / high half, ebx = carry copy and then the out word.
;; ---------------------------------------------------------------------
align 16
_lbnMulAdd1_32 proc near
;; Prologue: the argument offsets grow by 4 after each push.
push esi ; U
mov esi,[esp+12] ; V load in
push edi ; U
mov edi,[esp+12] ; V load out
push ebp ; U
mov ebp,[esp+24] ; V load len
push ebx ; U
mov ecx,[esp+32] ; V load k
;; First multiply step has no carry in.
;; eax is reused for the unrolling index once the low product has been
;; added into ebx; lea does not touch the flags, so the carry from the
;; add is still intact for the adc that follows it.
mov eax,[esi] ; U
mov ebx,[edi] ; V
mul ecx ; NP first multiply
add ebx,eax ; U
lea eax,[ebp*4-4] ; V loop unrolling
adc edx,0 ; U
;; eax = 4 * ((len-1) mod 4): pointer bias and jump-table byte index.
and eax,12 ; V loop unrolling
mov [edi],ebx ; U
add esi,eax ; V loop unrolling
add edi,eax ; U loop unrolling
jmp DWORD PTR ma32_jumptable[eax] ; NP loop unrolling
;; Dispatch table placed in the code segment right after the jump.
align 4
ma32_jumptable:
dd ma32_case0
dd ma32_case1
dd ma32_case2
dd ma32_case3
;; Padding only; never executed because the jmp above is unconditional.
nop
align 8
nop
nop
nop ; To align loop properly
;; case0: no partial pass; leave at once if len - 4 is zero or borrows.
ma32_case0:
sub ebp,4 ; U
jbe SHORT ma32_done ; V
;; Main loop, four words per pass.  Every step performs two additions,
;; each followed by "adc edx,0":
;;   1. low product + previous carry
;;   2. that sum    + existing out word
;; so edx ends up as high product plus both carry bits.
ma32_loop:
mov eax,[esi+4] ; U
mov ebx,edx ; V Remember carry for later
add esi,16 ; U
add edi,16 ; V
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
mov ebx,[edi-12] ; V
adc edx,0 ; U
add ebx,eax ; V
adc edx,0 ; U
mov [edi-12],ebx ; V
ma32_case3:
mov eax,[esi-8] ; U
mov ebx,edx ; V Remember carry for later
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
mov ebx,[edi-8] ; V
adc edx,0 ; U
add ebx,eax ; V
adc edx,0 ; U
mov [edi-8],ebx ; V
ma32_case2:
mov eax,[esi-4] ; U
mov ebx,edx ; V Remember carry for later
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
mov ebx,[edi-4] ; V
adc edx,0 ; U
add ebx,eax ; V
adc edx,0 ; U
mov [edi-4],ebx ; V
ma32_case1:
mov eax,[esi] ; U
mov ebx,edx ; V Remember carry for later
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
mov ebx,[edi] ; V
adc edx,0 ; U
add ebx,eax ; V
adc edx,0 ; U
mov [edi],ebx ; V
;; Loop while the count is still strictly positive after removing four.
sub ebp,4 ; U
ja SHORT ma32_loop ; V
;; Epilogue: restore in reverse push order; the final carry in edx
;; becomes the return value.
ma32_done:
pop ebx ; U
pop ebp ; V
mov eax,edx ; U
pop edi ; V
pop esi ; U
ret ; NP
_lbnMulAdd1_32 endp
;; ---------------------------------------------------------------------
;; _lbnMulSub1_32 -- multiply-subtract: out -= in * k.
;;
;; Stack arguments, lowest address first: out, in, len, k
;; Effect:
;;   For 0 <= i < len, in[i]*k plus the running borrow word is
;;   subtracted from out[i].  Exactly len words of out are rewritten.
;; Returns:
;;   eax = the borrow word left over after the last position.
;; Requirements:
;;   len must be at least 1 -- word 0 is processed unconditionally.
;; Registers:
;;   esi, edi, ebp and ebx are saved and restored; ecx, edx and the
;;   flags are clobbered.
;; Register roles: esi = in, edi = out, ebp = len counter, ecx = k,
;;   edx = borrow / high half, ebx = borrow copy and then the out word.
;;
;; The prologue must push exactly the four registers that the epilogue
;; pops (esi, edi, ebp, ebx).  A stale second prologue used to follow
;; the first one here; it reloaded the arguments from the wrong stack
;; slots and left three extra words on the stack at "ret".
;; ---------------------------------------------------------------------
align 16
_lbnMulSub1_32 proc near
;; Prologue: the argument offsets grow by 4 after each push.
push esi ; U
mov esi,[esp+12] ; V load in
push edi ; U
mov edi,[esp+12] ; V load out
push ebp ; U
mov ebp,[esp+24] ; V load len
push ebx ; U
mov ecx,[esp+32] ; V load k
;; First multiply step has no carry in.
;; "sub" sets CF on borrow and lea leaves the flags alone, so the
;; following adc folds that borrow into the high half of the product.
mov eax,[esi] ; U
mov ebx,[edi] ; V
mul ecx ; NP first multiply
sub ebx,eax ; U
lea eax,[ebp*4-4] ; V loop unrolling
adc edx,0 ; U
;; eax = 4 * ((len-1) mod 4): pointer bias and jump-table byte index.
and eax,12 ; V loop unrolling
mov [edi],ebx ; U
add esi,eax ; V loop unrolling
add edi,eax ; U loop unrolling
jmp DWORD PTR ms32_jumptable[eax] ; NP loop unrolling
;; Dispatch table placed in the code segment right after the jump.
align 4
ms32_jumptable:
dd ms32_case0
dd ms32_case1
dd ms32_case2
dd ms32_case3
;; Padding only; never executed because the jmp above is unconditional.
nop
align 8
nop
nop
nop ; To align loop properly
;; case0: no partial pass; leave at once if len - 4 is zero or borrows.
ms32_case0:
sub ebp,4 ; U
jbe SHORT ms32_done ; V
;; Main loop, four words per pass.  In every step:
;;   eax = low product + previous borrow word   (carry -> edx)
;;   out word -= eax                            (borrow -> edx)
ms32_loop:
mov eax,[esi+4] ; U
mov ebx,edx ; V Remember carry for later
add esi,16 ; U
add edi,16 ; V
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
mov ebx,[edi-12] ; V
adc edx,0 ; U
sub ebx,eax ; V
adc edx,0 ; U
mov [edi-12],ebx ; V
ms32_case3:
mov eax,[esi-8] ; U
mov ebx,edx ; V Remember carry for later
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
mov ebx,[edi-8] ; V
adc edx,0 ; U
sub ebx,eax ; V
adc edx,0 ; U
mov [edi-8],ebx ; V
ms32_case2:
mov eax,[esi-4] ; U
mov ebx,edx ; V Remember carry for later
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
mov ebx,[edi-4] ; V
adc edx,0 ; U
sub ebx,eax ; V
adc edx,0 ; U
mov [edi-4],ebx ; V
ms32_case1:
mov eax,[esi] ; U
mov ebx,edx ; V Remember carry for later
mul ecx ; NP
add eax,ebx ; U Add carry in from previous word
mov ebx,[edi] ; V
adc edx,0 ; U
sub ebx,eax ; V
adc edx,0 ; U
mov [edi],ebx ; V
;; Loop while the count is still strictly positive after removing four.
sub ebp,4 ; U
ja SHORT ms32_loop ; V
;; Epilogue: four pops matching the four pushes, in reverse order; the
;; final borrow word in edx becomes the return value.
ms32_done:
pop ebx ; U
pop ebp ; V
mov eax,edx ; U
pop edi ; V
pop esi ; U
ret ; NP
_lbnMulSub1_32 endp
;; ---------------------------------------------------------------------
;; _lbnDiv21_32 -- divide a two-word value by a one-word value.
;;
;; BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
;; entry offset from esp:         4            8           12          16
;;
;; The 64-bit dividend nh:nl is divided by d.  The quotient is written
;; through q and the remainder is returned in eax.
;; The hardware divide raises a divide error when d is zero or when the
;; quotient does not fit in 32 bits, so callers need nh < d.
;; Clobbers ecx, edx and the flags; nothing is pushed.
;; ---------------------------------------------------------------------
align 4
_lbnDiv21_32 proc near
mov ecx,[esp+4] ; ecx = q, destination for the quotient
mov eax,[esp+12] ; eax = nl, low half of the dividend
mov edx,[esp+8] ; edx = nh, high half of the dividend
div DWORD PTR [esp+16] ; edx:eax / d -> eax = quotient, edx = remainder
mov [ecx],eax ; *q = quotient
mov eax,edx ; hand the remainder back to the caller
ret
_lbnDiv21_32 endp
;; ---------------------------------------------------------------------
;; _lbnModQ_32 -- remainder of a multi-word number modulo one word.
;;
;; unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
;; entry offset from esp:            4           8          12
;;
;; Used to speed up key generation.  A plain loop of 32-bit divides is
;; fast enough, so no unrolling is attempted.
;;
;; The modulus (kept in ecx) is often only 16 bits wide.  Because each
;; dividend word is 32 bits, the top word is rarely smaller than the
;; modulus, so testing whether the first divide could be skipped would
;; cost more cycles than it saves.
;;
;; The words are consumed from n[len-1] down to n[0]; the remainder of
;; each divide stays in edx and becomes the high half of the next
;; dividend, so no divide can overflow.
;; Requirements visible in the code: len >= 1 (the loop body runs before
;; the count is tested) and d != 0 (hardware divide).
;; Returns the remainder in eax.  ebp and esi are saved and restored;
;; ecx, edx and the flags are clobbered.
;; ---------------------------------------------------------------------
align 4
_lbnModQ_32 proc near
push ebp
push esi
;; With two registers saved the arguments sit 8 bytes higher.
mov eax,[esp+12] ; eax = n
mov ebp,[esp+16] ; ebp = len, the countdown
mov ecx,[esp+20] ; ecx = d, the modulus
lea esi,[ebp*4+eax-4] ; esi -> n[len-1], the most significant word
xor edx,edx ; running remainder starts at zero
modq32_next:
mov eax,[esi] ; next lower word becomes the low dividend half
sub esi,4 ; step towards n[0]
div ecx ; edx = edx:eax mod d (quotient in eax is dropped)
dec ebp
jnz SHORT modq32_next
mov eax,edx ; remainder is the return value
pop esi
pop ebp
ret
_lbnModQ_32 endp
end