// file kernel/n/ppc32/gcd_n2.S: O(n^2) greatest common divisor
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                           PGCD quadratique                            |
 |                                                                       |
 +-----------------------------------------------------------------------*/

                        ; +------------------------+
                        ; |     Two-digit GCD      |
                        ; +------------------------+
        
; void xn(gcd_2)(chiffre *x)
;
; input:
;   x = array of 8 digits [a0,a1,b0,b1,p,s,q,r]
;   a = a0 + BASE*a1, b = b0 + BASE*b1
;
; constraint: 0 < b < a < BASE*b
;
; Expands the fraction a/b as a continued fraction for as long as the
; coefficients fit in one digit
;
; output:
;   [a0,a1,b0,b1] <- undefined
;   [p,s,q,r]     <- coefficients of the combinations performed
        
#ifdef assembly_sn_gcd_2
; ----------------------------------------------------------------------
; void xn(gcd_2)(chiffre *x) : two-digit continued-fraction expansion
;
; In:   r3 = x, an array of 8 words [a0,a1,b0,b1,p,s,q,r].
;       a = a0 + BASE*a1 and b = b0 + BASE*b1 are loaded from words 0..3.
; Out:  words 4..7 of x receive p, s, q, r in that order.
;       a and b are only updated in registers and are never stored back.
; Uses: r2, r4-r12, CR0 and the carry bit. No stack access, no calls.
;
; The routine alternates two half-steps. The first divides a by b with
; a shift-and-subtract loop: b, p and r are doubled while a >= 2b, then
; for every shift level from the highest down to 0 the shifted b is
; subtracted from a when it fits, and each subtraction is recorded as
; q <- q + p, s <- s + r. The second half-step is the mirror image with
; the roles of a and b swapped: a, s and q are shifted, and each
; subtraction is recorded as p <- p + q, r <- r + s.
;
; With these update rules, at the top of each half-step the registers
; satisfy  a = s*a_in - q*b_in  and  b = p*b_in - r*a_in, where a_in
; and b_in are the values loaded on entry.
;
; The routine stops when a coefficient would no longer fit in a word
; (carry out of q+p or p+q, or top bit of p or q set before a doubling;
; the pending step is then abandoned) or when a or b reaches zero.
;
; NOTE(review): r2 is used as a scratch register. This presumably relies
; on an ABI where r2 is volatile (the underscore-prefixed symbol and the
; semicolon comments suggest Darwin) -- confirm before porting.
; ----------------------------------------------------------------------
#undef L
#define L(x) .Lsn_gcd_2_##x
.globl _sn_gcd_2
_sn_gcd_2:

	; local variables (register assignment)
	#define _a0_ r2
	#define _a1_ r4
	#define _b0_ r5
	#define _b1_ r6
	#define _p_  r7
	#define _q_  r8
	#define _r_  r9
	#define _s_  r10
	#define _c_  r11
	#define _x_  r12

	lwz  _a0_,  0(r3)		; a0 <- x[0], low word of a
	lwz  _a1_,  4(r3)		; a1 <- x[1], high word of a
	lwz  _b0_,  8(r3)		; b0 <- x[2], low word of b
	lwz  _b1_, 12(r3)		; b1 <- x[3], high word of b

        ; [p,q,r,s] <- identity
	li   _p_,   1
	li   _q_,   0
	li   _r_,   0
	li   _s_,   1
	li   _c_,  -1		; shift counter, becomes 0 on the first pass below

        ; here a > b, p > r, q >= s-1
        ; shift b,p,r left while a >= 2b
L(loop):
	subfc  _a0_, _b0_, _a0_	; a <- a - b, so the next subtraction tests a against 2b
	subfe  _a1_, _b1_, _a1_
L(shift_1):
	subfc  _a0_, _b0_, _a0_	; a <- a - b (two-word subtract, borrow through carry)
	subfe  _a1_, _b1_, _a1_
	subfe. _x_,  _x_,  _x_	; x <- 0 if no borrow, -1 if borrow; sets CR0
	addi   _c_,  _c_,  1	; c <- c + 1 (leaves CR0 and carry alone)
	bne    L(div_1)		; borrow: stop shifting and start the division
	and.   _p_,  _p_,  _p_	; CR0 <- sign of p; top bit set means 2p would overflow
	addc   _b0_, _b0_, _b0_	; b <- 2b (non-record forms, CR0 is kept)
	adde   _b1_, _b1_, _b1_
	blt    L(unshift_1)	; p cannot be doubled: abandon this step and exit
	add    _r_,  _r_,  _r_	; r <- 2r
	add    _p_,  _p_,  _p_	; p <- 2p
	b      L(shift_1)
	
        ; here 2^i*b <= a < 2^(i+1)*b, p > r, q >= s-1
        ; compute a/b by subtractions and shifts
L(div_1):
	addc   _a0_, _b0_, _a0_	; add b back once: the trial subtraction borrowed
	adde   _a1_, _b1_, _a1_
L(loop_10):
	add    _s_,  _r_,  _s_	; s <- s + r
	addc   _q_,  _p_,  _q_	; q <- q + p, carry set if the sum overflows
	subfe. _x_,  _x_,  _x_	; x <- 0 if q overflowed, -1 otherwise
	bne    L(loop_12)	; no overflow: go on to the next shift level
	subf   _s_,  _r_,  _s_	; overflow: undo the additions
	subf   _q_,  _p_,  _q_
L(unshift_1):
	srw    _p_,  _p_,  _c_	; divide p and r by 2^c to undo the shifts
	srw    _r_,  _r_,  _c_
	b      L(done)
L(loop_11):
	srwi   _b0_, _b0_,  1	; b <- b / 2 as a two-word right shift:
	insrwi _b0_, _b1_,  1,0	;   low bit of b1 moves into the top bit of b0
	srwi   _b1_, _b1_,  1
	srwi   _p_,  _p_,   1	; p <- p / 2
	srwi   _r_,  _r_,   1	; r <- r / 2
	subfc  _a0_, _b0_, _a0_	; trial subtraction a <- a - b
	subfe  _a1_, _b1_, _a1_
	subfe. _x_,  _x_,  _x_	; x <- 0 if no borrow, -1 if borrow
	beq    L(loop_10)	; it fit: record it in q and s
	addc   _a0_, _b0_, _a0_	; it did not fit: restore a
	adde   _a1_, _b1_, _a1_
L(loop_12):
	subic. _c_,  _c_,  1	; c <- c - 1, sets CR0
	bge    L(loop_11)	; loop while c >= 0; c is back to -1 on exit
	or.    _x_,  _a0_, _a1_	; if a = 0, finished
	beq    L(done)

        ; here b > a, p > r, q >= s
        ; shift a,s,q left while b >= 2a
	subfc  _b0_, _a0_, _b0_	; b <- b - a, so the next subtraction tests b against 2a
	subfe  _b1_, _a1_, _b1_
L(shift_2):
	subfc  _b0_, _a0_, _b0_	; b <- b - a (two-word subtract)
	subfe  _b1_, _a1_, _b1_
	subfe. _x_,  _x_,  _x_	; x <- 0 if no borrow, -1 if borrow; sets CR0
	addi   _c_,  _c_,  1	; c <- c + 1 (leaves CR0 and carry alone)
	bne    L(div_2)		; borrow: stop shifting and start the division
	and.   _q_,  _q_,  _q_	; CR0 <- sign of q; top bit set means 2q would overflow
	addc   _a0_, _a0_, _a0_	; a <- 2a (non-record forms, CR0 is kept)
	adde   _a1_, _a1_, _a1_
	blt    L(unshift_2)	; q cannot be doubled: abandon this step and exit
	add    _s_,  _s_,  _s_	; s <- 2s
	add    _q_,  _q_,  _q_	; q <- 2q
	b      L(shift_2)
	
        ; here 2^i*a <= b < 2^(i+1)*a, p > r, q >= s
        ; compute b/a by subtractions and shifts
L(div_2):
	addc   _b0_, _a0_, _b0_	; add a back once: the trial subtraction borrowed
	adde   _b1_, _a1_, _b1_
L(loop_20):
	add    _r_,  _s_,  _r_	; r <- r + s
	addc   _p_,  _q_,  _p_	; p <- p + q, carry set if the sum overflows
	subfe. _x_,  _x_,  _x_	; x <- 0 if p overflowed, -1 otherwise
	bne    L(loop_22)	; no overflow: go on to the next shift level
	subf   _r_,  _s_,  _r_	; overflow: undo the additions
	subf   _p_,  _q_,  _p_
L(unshift_2):
	srw    _s_,  _s_,  _c_	; divide q and s by 2^c to undo the shifts
	srw    _q_,  _q_,  _c_
	b      L(done)
L(loop_21):
	srwi   _a0_, _a0_,  1	; a <- a / 2 as a two-word right shift:
	insrwi _a0_, _a1_,  1,0	;   low bit of a1 moves into the top bit of a0
	srwi   _a1_, _a1_,  1
	srwi   _s_,  _s_,   1	; s <- s / 2
	srwi   _q_,  _q_,   1	; q <- q / 2
	subfc  _b0_, _a0_, _b0_	; trial subtraction b <- b - a
	subfe  _b1_, _a1_, _b1_
	subfe. _x_,  _x_,  _x_	; x <- 0 if no borrow, -1 if borrow
	beq    L(loop_20)	; it fit: record it in p and r
	addc   _b0_, _a0_, _b0_	; it did not fit: restore b
	adde   _b1_, _a1_, _b1_
L(loop_22):
	subic. _c_,  _c_,  1	; c <- c - 1, sets CR0
	bge    L(loop_21)	; loop while c >= 0; c is back to -1 on exit
	or.    _x_,  _b0_, _b1_	; if b = 0, finished; otherwise start a new round
	bne    L(loop)

L(done):
	stw    _p_,  16(r3)	; x[4] <- p
	stw    _s_,  20(r3)	; x[5] <- s
	stw    _q_,  24(r3)	; x[6] <- q
	stw    _r_,  28(r3)	; x[7] <- r
	blr

	#undef _a0_
	#undef _a1_
	#undef _b0_
	#undef _b1_
	#undef _p_
	#undef _q_
	#undef _r_
	#undef _s_
	#undef _c_
	#undef _x_

#undef L
#endif /* assembly_sn_gcd_2 */

                      ; +-----------------------------+
                      ; |     Two-digit half-GCD      |
                      ; +-----------------------------+

; void xn(hgcd_2)(chiffre *x)
;
; input:
;   x = array of 8 digits [a0,a1,b0,b1,p,s,q,r]
;   a = a0 + BASE*a1, b = b0 + BASE*b1
;
; constraint: 0 < b < a
;
; Expands the fractions a/(b+1) and (a+1)/b as continued fractions for
; as long as the quotients coincide and the coefficients fit in one
; digit
;
; output:
;   [a0,a1,b0,b1] <- undefined
;   [p,s,q,r]     <- coefficients of the combinations performed
        
#ifdef assembly_sn_hgcd_2
; ----------------------------------------------------------------------
; void xn(hgcd_2)(chiffre *x) : two-digit half-GCD step
;
; In:   r3 = x, an array of 8 words [a0,a1,b0,b1,p,s,q,r].
;       a = a0 + BASE*a1 and b = b0 + BASE*b1 are loaded from words 0..3.
; Out:  words 4..7 of x receive p, s, q, r in that order.
;       a and b are only updated in registers and are never stored back.
; Uses: r2, r4-r12, CR0 and the carry bit. No stack access, no calls.
;
; Shift-and-subtract continued-fraction expansion on biased operands.
; During the first half-step the registers hold a-q and b+p and the
; quotient of (a-q) by (b+p) is accumulated as q <- q + p, s <- s + r.
; The operands are then re-biased to b-r and a+s, and the second
; half-step accumulates the quotient of (b-r) by (a+s) as p <- p + q,
; r <- r + s. Subtracting the biased divisor keeps the bias consistent
; with the updated coefficients, so no correction is needed inside the
; division loops.
;
; The routine stops when a re-biased comparison borrows (b-r < a+s or
; a-q < b+p), when a coefficient sum carries out of a word, or when the
; top bit of p or q is set before a doubling.
;
; NOTE(review): r2 is used as a scratch register. This presumably relies
; on an ABI where r2 is volatile (the underscore-prefixed symbol and the
; semicolon comments suggest Darwin) -- confirm before porting.
; ----------------------------------------------------------------------
#undef L
#define L(x) .Lsn_hgcd_2_##x
.globl _sn_hgcd_2
_sn_hgcd_2:

	; local variables (register assignment)
	#define _a0_ r2
	#define _a1_ r4
	#define _b0_ r5
	#define _b1_ r6
	#define _p_  r7
	#define _q_  r8
	#define _r_  r9
	#define _s_  r10
	#define _c_  r11
	#define _x_  r12

	lwz  _a0_,  0(r3)		; a0 <- x[0], low word of a
	lwz  _a1_,  4(r3)		; a1 <- x[1], high word of a
	lwz  _b0_,  8(r3)		; b0 <- x[2], low word of b
	lwz  _b1_, 12(r3)		; b1 <- x[3], high word of b

        ; [p,q,r,s] <- identity
	li   _p_,   1
	li   _q_,   0
	li   _r_,   0
	li   _s_,   1
	li   _c_,  -1		; shift counter, becomes 0 on the first pass below

	addc   _b0_, _p_,  _b0_	; b <- b + p
	addze  _b1_, _b1_	; propagate the carry into the high word
	subfc  _a0_, _b0_, _a0_	; a <- (a-q) - (b+p); q is 0 at this point
	subfe  _a1_, _b1_, _a1_

        ; here a-q >= b+p, p > r, q >= s-1
        ; shift b,p,r left while a-q >= 2(b+p)
L(shift_1):
	subfc  _a0_, _b0_, _a0_	; subtract the biased b once more (two-word subtract)
	subfe  _a1_, _b1_, _a1_
	subfe. _x_,  _x_,  _x_	; x <- 0 if no borrow, -1 if borrow; sets CR0
	addi   _c_,  _c_,  1	; c <- c + 1 (leaves CR0 and carry alone)
	bne    L(div_1)		; borrow: stop shifting and start the division
	and.   _p_,  _p_,  _p_	; CR0 <- sign of p; top bit set means 2p would overflow
	addc   _b0_, _b0_, _b0_	; double the biased b (non-record forms, CR0 is kept)
	adde   _b1_, _b1_, _b1_
	blt    L(unshift_1)	; p cannot be doubled: abandon this step and exit
	add    _r_,  _r_,  _r_	; r <- 2r
	add    _p_,  _p_,  _p_	; p <- 2p
	b      L(shift_1)
	
        ; here 2^i*(b+p) <= a-q < 2^(i+1)*(b+p), p > r, q >= s-1
        ; compute (a-q)/(b+p) by subtractions and shifts
L(div_1):
	addc   _a0_, _b0_, _a0_	; add the divisor back once: the trial subtraction borrowed
	adde   _a1_, _b1_, _a1_
L(loop_10):
	add    _s_,  _r_,  _s_	; s <- s + r
	addc   _q_,  _p_,  _q_	; q <- q + p, carry set if the sum overflows
	subfe. _x_,  _x_,  _x_	; x <- 0 if q overflowed, -1 otherwise
	bne    L(loop_12)	; no overflow: go on to the next shift level
	subf   _s_,  _r_,  _s_	; overflow: undo the additions
	subf   _q_,  _p_,  _q_
L(unshift_1):
	srw    _p_,  _p_,  _c_	; divide p and r by 2^c to undo the shifts
	srw    _r_,  _r_,  _c_
	b      L(done)
L(loop_11):
	srwi   _b0_, _b0_,  1	; halve the divisor as a two-word right shift:
	insrwi _b0_, _b1_,  1,0	;   low bit of b1 moves into the top bit of b0
	srwi   _b1_, _b1_,  1
	srwi   _p_,  _p_,   1	; p <- p / 2
	srwi   _r_,  _r_,   1	; r <- r / 2
	subfc  _a0_, _b0_, _a0_	; trial subtraction of the divisor
	subfe  _a1_, _b1_, _a1_
	subfe. _x_,  _x_,  _x_	; x <- 0 if no borrow, -1 if borrow
	beq    L(loop_10)	; it fit: record it in q and s
	addc   _a0_, _b0_, _a0_	; it did not fit: restore the remainder
	adde   _a1_, _b1_, _a1_
L(loop_12):
	subic. _c_,  _c_,  1	; c <- c - 1, sets CR0
	bge    L(loop_11)	; loop while c >= 0; c is back to -1 on exit

	; end of the division of a-q by b+p
	subfc  _b0_, _p_,  _b0_	; b <- b - r: the register holds b+p, so remove p ...
	addme  _b1_, _b1_	;   (high word gets carry - 1, i.e. minus the borrow)
	subfc  _b0_, _r_,  _b0_	; ... and then remove r
	addme  _b1_, _b1_
	addc   _a0_, _s_,  _a0_	; a <- a + s: the register holds a-q, so add s ...
	addze  _a1_, _a1_	;   (high word gets the carry)
	addc   _a0_, _q_,  _a0_	; ... and then add q
	addze  _a1_, _a1_
	subfc  _b0_, _a0_, _b0_	; b <- (b-r) - (a+s)
	subfe  _b1_, _a1_, _b1_
	subfe. _x_,  _x_,  _x_	; if < 0 (borrow), finished
	bne    L(done)

        ; here b-r >= a+s, p > r, q >= s
        ; shift a,s,q left while b-r >= 2(a+s)
L(shift_2):
	subfc  _b0_, _a0_, _b0_	; subtract the biased a once more (two-word subtract)
	subfe  _b1_, _a1_, _b1_
	subfe. _x_,  _x_,  _x_	; x <- 0 if no borrow, -1 if borrow; sets CR0
	addi   _c_,  _c_,  1	; c <- c + 1 (leaves CR0 and carry alone)
	bne    L(div_2)		; borrow: stop shifting and start the division
	and.   _q_,  _q_,  _q_	; CR0 <- sign of q; top bit set means 2q would overflow
	addc   _a0_, _a0_, _a0_	; double the biased a (non-record forms, CR0 is kept)
	adde   _a1_, _a1_, _a1_
	blt    L(unshift_2)	; q cannot be doubled: abandon this step and exit
	add    _s_,  _s_,  _s_	; s <- 2s
	add    _q_,  _q_,  _q_	; q <- 2q
	b      L(shift_2)
	
        ; here 2^i*(a+s) <= b-r < 2^(i+1)*(a+s), p > r, q >= s
        ; compute (b-r)/(a+s) by subtractions and shifts
L(div_2):
	addc   _b0_, _a0_, _b0_	; add the divisor back once: the trial subtraction borrowed
	adde   _b1_, _a1_, _b1_
L(loop_20):
	add    _r_,  _s_,  _r_	; r <- r + s
	addc   _p_,  _q_,  _p_	; p <- p + q, carry set if the sum overflows
	subfe. _x_,  _x_,  _x_	; x <- 0 if p overflowed, -1 otherwise
	bne    L(loop_22)	; no overflow: go on to the next shift level
	subf   _r_,  _s_,  _r_	; overflow: undo the additions
	subf   _p_,  _q_,  _p_
L(unshift_2):
	srw    _s_,  _s_,  _c_	; divide q and s by 2^c to undo the shifts
	srw    _q_,  _q_,  _c_
	b      L(done)
L(loop_21):
	srwi   _a0_, _a0_,  1	; halve the divisor as a two-word right shift:
	insrwi _a0_, _a1_,  1,0	;   low bit of a1 moves into the top bit of a0
	srwi   _a1_, _a1_,  1
	srwi   _s_,  _s_,   1	; s <- s / 2
	srwi   _q_,  _q_,   1	; q <- q / 2
	subfc  _b0_, _a0_, _b0_	; trial subtraction of the divisor
	subfe  _b1_, _a1_, _b1_
	subfe. _x_,  _x_,  _x_	; x <- 0 if no borrow, -1 if borrow
	beq    L(loop_20)	; it fit: record it in p and r
	addc   _b0_, _a0_, _b0_	; it did not fit: restore the remainder
	adde   _b1_, _a1_, _b1_
L(loop_22):
	subic. _c_,  _c_,  1	; c <- c - 1, sets CR0
	bge    L(loop_21)	; loop while c >= 0; c is back to -1 on exit

	; end of the division of b-r by a+s
	subfc  _a0_, _s_,  _a0_	; a <- a - q: the register holds a+s, so remove s ...
	addme  _a1_, _a1_	;   (high word gets carry - 1, i.e. minus the borrow)
	subfc  _a0_, _q_,  _a0_	; ... and then remove q
	addme  _a1_, _a1_
	addc   _b0_, _p_,  _b0_	; b <- b + p: the register holds b-r, so add p ...
	addze  _b1_, _b1_	;   (high word gets the carry)
	addc   _b0_, _r_,  _b0_	; ... and then add r
	addze  _b1_, _b1_
	subfc  _a0_, _b0_, _a0_	; a <- (a-q) - (b+p)
	subfe  _a1_, _b1_, _a1_
	subfe. _x_,  _x_,  _x_	; if < 0 (borrow), finished
	beq    L(shift_1)	; no borrow: start another round

L(done):
	stw    _p_,  16(r3)	; x[4] <- p
	stw    _s_,  20(r3)	; x[5] <- s
	stw    _q_,  24(r3)	; x[6] <- q
	stw    _r_,  28(r3)	; x[7] <- r
	blr

	#undef _a0_
	#undef _a1_
	#undef _b0_
	#undef _b1_
	#undef _p_
	#undef _q_
	#undef _r_
	#undef _s_
	#undef _c_
	#undef _x_

#undef L
#endif /* assembly_sn_hgcd_2 */
