// file kernel/n/ppc32/burnikel.S: Burnikel-Ziegler division
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                     Division de Burnikel et Ziegler                   |
 |                                                                       |
 +-----------------------------------------------------------------------*/

; void xn(burnidiv)(chiffre *a, long lc, chiffre *b, long lb, chiffre *c)
;
; input:
; a = natural number of length lc+lb
; b = natural number of length lb
; c = natural number of length lc
;
; constraints:
; lb >= 2, lc > 0, the most significant bit of b is set,
; a < BASE^lc*b
; a, b, c are distinct (no aliasing)
;
; output:
; a <- a mod b
; c <- floor(a/b)

#ifdef assembly_sn_burnidiv
#define L(x) .Lsn_burnidiv_##x

; sn_burnidiv: recursive division, producing one slice of at most q
; quotient digits per loop iteration.
;
; registers on entry, as consumed by the code below:
;   r3 = a, r4 = lc, r5 = b, r6 = lb, r7 = c
;
; When lb <= burnidiv_lim or lc <= div_small_c_lim the routine branches
; (without link) to Lsn_div_n2, leaving r3-r7 and lr untouched.
; Otherwise b is split as b = b0 + BASE^p*b1 with p = lb/2, q = lb-p, and
; for each slice of r <= q quotient digits, most significant slice first:
;   1. c <- quotient of a[p..p+q+r-1] by b1 (recursive call), or BASE^r-1
;      when the top q digits of the slice are equal to b1;
;   2. a[0..lb] -= c*b0, the product being built in a stack buffer;
;   3. while the result is negative: c -= 1, a += b.
;
; Lsn_div_n2, Lsn_inc, Lsn_dec and Lsn_toommul are defined elsewhere; the
; meaning of their arguments and results is inferred from the call sites
; here and should be confirmed against their own headers.
;
; With debug_burnidiv defined the entry point is _sn_burnidiv_buggy and the
; label Lsn_burnidiv (target of the recursive call) is not defined in this
; block.
#ifdef debug_burnidiv
.globl _sn_burnidiv_buggy
_sn_burnidiv_buggy:
#else
.globl _sn_burnidiv
_sn_burnidiv:
Lsn_burnidiv:
#endif

	cmpwi  cr0,   r6, burnidiv_lim	; small divisor (lb <= burnidiv_lim) ?
	ble    Lsn_div_n2		; => quadratic (n^2) algorithm
	cmpwi  cr0,   r4, div_small_c_lim	; short quotient (lc <= div_small_c_lim) ?
	ble    Lsn_div_n2		; => quadratic (n^2) algorithm as well
	
	; recursive case: split b in two halves and divide by slices of q digits
	;
	; frame layout and register roles (offsets relative to the new r1):
	;   36(r1)  x   product buffer, lb words
	;   32(r1)  ra  saved link register
	;   r31 a, r30 b, r29 c   pointers to the current slice of a, to b, and
	;                         to the current slice of c
	;   r28 lc  quotient digits still to be produced after this slice
	;   r27 p, r26 q          lengths of the low and high halves of b
	;   r25 r   number of quotient digits in the current slice
	#define _x_  36(r1)
	#define _a_  r31
	#define _b_  r30
	#define _c_  r29
	#define _lc_ r28
	#define _p_  r27
	#define _q_  r26
	#define _r_  r25
	#define _ra_ 32(r1)

	stmw   r25,  4(r1)	; save r25-r31 at 4..31 above the incoming r1
	slwi   r2,   r6,    2	; reserve lb digits + lr + stack frame,
	addi   r2,   r2,   48
	clrrwi r2,   r2,    4	; rounded down to a multiple of 16 bytes
				; (still >= 4*lb+36, so x at 36(r1) fits)
	neg    r2,   r2
	stwux  r1,   r1,   r2	; push the frame, old r1 stored at new 0(r1)
	mflr   r0
	stw    r0,   _ra_	; save the return address

	srwi   _p_,  r6,   1	; p <- lb/2
	subf   _q_,  _p_,  r6	; q <- lb-p
	mr     _lc_, r4
	mr     _r_,  _lc_	; r <- min(lc,q)
	cmpw   cr0,  _lc_, _q_
	ble    1f
	mr     _r_,  _q_
1:
	subf   _lc_, _r_,  _lc_	; lc -= r
	slwi   r2,   _lc_,  2	; r2 <- byte offset of the top slice
	add    _a_,  r2,   r3	; a += lc
	add    _c_,  r2,   r7	; c += lc
	mr     _b_,  r5

	; loop over the slices
L(loop):
	; compare a[p+r..p+q+r-1] with b[p..p+q-1]
	slwi   r2,   _p_,  2
	add    r3,   r2,   _a_	; r3 <- &a[p]
	add    r5,   r2,   _b_	; r5 <- &b[p]
	slwi   r2,   _r_,  2
	add    r10,  r2,   r3	; r10 <- &a[p+r]
	subi   r10,  r10,  4	; r10 <- &a[p+r-1] (lwzu pre-increments)
	subi   r11,  r5,   4	; r11 <- &b[p-1]
	mtctr  _q_
1:
	lwzu   r4,   4(r10)
	lwzu   r6,   4(r11)
	xor.   r4,   r4,   r6	; CR0.eq <- digits equal, r4 <- 0 if so
	bdnzt  eq,   1b		; go on while equal and digits remain
	bne    L(small_a)	; a difference was found

	; here a[p+r..p+q+r-1] = b[p..p+q-1], quotient <- BASE^r-1
	; all q digits were compared, so r10 = &a[p+q+r-1] and r4 = 0
	mtctr  _q_
	addi   r10,  r10,  4	; r10 <- &a[p+q+r]
1:
	stwu   r4,   -4(r10)	; a[p+r..p+q+r-1] <- 0
	bdnz   1b
	subi   r10,  _c_,  4
	mtctr  _r_
	li     r3,   -1		; all-ones digit
1:
	stwu   r3,   4(r10)	; c <- BASE^r - 1
	bdnz   1b
	slwi   r2,   _p_,  2
	add    r3,   r2,   _a_	; r3 <- &a[p]
	add    r4,   _q_,  _r_
	add    r5,   r2,   _b_	; r5 <- &b[p]
	mr     r6,   _q_
	bl     Lsn_inc		; a[p..p+q+r-1] += b[p..p+q-1]
	b      L(mulsub)

	; here a[p+r..p+q+r-1] < b[p..p+q-1], divide a1 by b1
	; r3 = &a[p] and r5 = &b[p] are still those computed at the loop head
L(small_a):
	mr     r4,  _r_
	mr     r6,  _q_
	mr     r7,  _c_
	bl     Lsn_burnidiv	; recursive call on (a+p, r, b+p, q, c)

	; a0 + BASE^p*r1 -= c*b0
L(mulsub):
	cmpw   cr0,  _r_,  _p_	; put the longer operand first
	ble    1f
	mr     r3,   _c_	; r > p: operands are (c,r) then (b0,p)
	mr     r4,   _r_
	mr     r5,   _b_
	mr     r6,   _p_
	b      2f
1:
	mr     r5,   _c_	; r <= p: operands are (b0,p) then (c,r)
	mr     r6,   _r_
	mr     r3,   _b_
	mr     r4,   _p_
2:
	la     r7,   _x_	; destination: buffer in our frame
	bl     Lsn_toommul	; x <- c*b0
	mr     r3,   _a_
	add    r4,   _p_,  _q_
	addi   r4,   r4,   1	; r4 <- lb+1
	la     r5,   _x_
	add    r6,   _p_,  _r_	; r6 <- p+r, length of the product
	bl     Lsn_dec		; a -= c*b0

	; fix up if the result is negative
	; r3 is tested as returned by Lsn_dec, presumably the outgoing borrow
	and.   r3,   r3,   r3
	beq    L(next)
1:
	subi   r10,   _c_,  4	; c--
2:
	lwzu   r3,   4(r10)
	and.   r3,   r3,   r3	; CR0.eq <- digit was zero (borrow goes on)
	subi   r3,   r3,   1	; subi and stw leave CR0 unchanged
	stw    r3,   0(r10)
	beq    2b		; propagate the borrow to the next digit
	mr     r3,   _a_
	add    r6,   _p_,  _q_	; r6 <- lb
	addi   r4,   r6,   1	; r4 <- lb+1
	mr     r5,   _b_
	bl     Lsn_inc		; a += b
	and.   r3,   r3,   r3	; r3 from Lsn_inc, presumably the carry out
	beq    1b		; zero: correct once more

	; next slice
L(next):
	mr     _r_,  _lc_	; r <- min(lc,q)
	cmpw   cr0,  _lc_, _q_
	ble    1f
	mr     _r_,  _q_
1:
	subf   _lc_, _r_,  _lc_	; lc -= r
	slwi.  r2,   _r_,  2	; CR0 <- sign of 4*r, kept by the two subf
	subf   _a_,  r2,   _a_	; a -= r
	subf   _c_,  r2,   _c_	; c -= r
	bgt    L(loop)		; loop while r > 0

	; done
	lwz   r0,    _ra_	; reload the return address
	mtlr  r0
	lwz   r1,    0(r1)	; pop the frame through the back chain
	lmw   r25,   4(r1)	; restore r25-r31
	blr

	#undef _x_
	#undef _a_
	#undef _b_
	#undef _c_
	#undef _lc_
	#undef _p_
	#undef _q_
	#undef _r_
	#undef _ra_

#undef L
#endif /* assembly_sn_burnidiv */
#if !defined(assembly_sn_burnidiv) || defined(debug_burnidiv)
	; REPLACE is a macro defined elsewhere; presumably it supplies a
	; substitute implementation of sn_burnidiv (TODO confirm).
	REPLACE(sn_burnidiv)
#endif
