// file kernel/n/alpha/sqrt_n2.S: O(n^2) square root of natural integers
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                         Quadratic square root                         |
 |                                                                       |
 +-----------------------------------------------------------------------*/


                             # +-----------------+
                             # |   Square root   |
                             # +-----------------+

   # void xn(sqrt_n2)(chiffre *a, long la, chiffre *b)
   #
   # input:
   # a = natural integer of length la
   # b = natural integer of length la/2
   #
   # constraints:
   # la > 0, la even, BASE/16 <= a[la-1] < BASE/4
   # a, b distinct
   #
   # output:
   # b <- 2*floor(sqrt(a))
   # a <- a - b^2/4

#ifdef assembly_sn_sqrt_n2
#define L(x) .Lsn_sqrt_n2_##x

	# Quadratic square root on 64-bit digits (all accesses are ldq/stq).
	# On entry the arguments are read from $16 (a), $17 (la), $18 (b).
	# Phase 1 extracts the leading digit of b bit by bit from the two
	# leading digits of a.  Phase 2 obtains each further digit with the
	# division macro, subtracts by jumping into sn_mulsubloop, and calls
	# into sn_addloop as long as the top word ah is nonzero.
	# NOTE(review): INVERSE, DIV, REPLACE, sn_mulsubloop and sn_addloop
	# come from outside this file; the byte offsets added to the loop
	# addresses below depend on their exact instruction layout - confirm
	# before changing either side.
        .align 5
#ifdef debug_sqrt_n2
        .globl sn_sqrt_n2_buggy
        .ent   sn_sqrt_n2_buggy
sn_sqrt_n2_buggy:
        .frame $30,0,$26,0
        .prologue 1
	ldgp   $gp,  0($27)
#else
        .globl sn_sqrt_n2
        .ent   sn_sqrt_n2
sn_sqrt_n2:
        .frame $30,0,$26,0
        .prologue 1
	ldgp   $gp,  0($27)
L(nogp):
#endif
	# Register aliases.  The aliases i and w share $2: in the code
	# visible here w is only used while extracting the leading digit
	# and i only in the main loop.
	#define _a_    $20
	#define _la_   $17
	#define _b_    $16
	#define _lb_   $19
	#define _bi_   $21
	#define _bh_   $22
	#define _ah_   $23
	#define _msub_ $25
	#define _add_  $28
	#define _i_    $2
	#define _u_    $0
	#define _v_    $1
	#define _w_    $2
	#define _x_    $3
	#define _y_    $4
	#define _z_    $5
	#define _q_    $8

	subq   _la_, 2,    _la_ # la -= 2
	addq   $31,  2,    _lb_ # lb <- 2
	# $16 still holds the incoming a here; the next line but one
	# overwrites it with the b pointer, so the order matters.
	s8addq _la_, $16,  _a_	# a += la-2
	s4addq _la_, $18,  _b_  # b += la/2-1 (original la)

	# compute the square root of the two leading digits
	ldq    _u_,  0(_a_)	# u:v <- a[0]:a[1]
	ldq    _v_,  8(_a_)
	addq   $31,  1,    _bh_	# bh <- BASE/2
	sll    _bh_, 63,   _bh_
	srl    _bh_, 3,    _y_
	subq   _v_,  _y_,  _v_  # u:v -= (bh/2)^2 = BASE^2/16
	srl    _bh_, 2,    _y_  # y <- BASE/8 (test bit)
	srl    _u_,  62,   _w_  # shift the remainder left by 2 bits
	sll    _u_,  2,    _u_
	s4addq _v_,  _w_,  _v_

	.align 5
1:
	addq   _bh_, _y_, _bh_  # test bit <- 1
	cmplt  _u_,  $31, _w_   # shift the remainder left by one bit
	cmplt  _v_,  $31, _z_   # z <- bit shifted out of v
	addq   _u_,  _u_, _u_
	addq   _v_,  _v_, _v_
	addq   _v_,  _w_, _v_
	cmpule _bh_, _v_, _w_   # does bh fit in the remainder ?
	or     _w_,  _z_, _w_   # it also fits when a bit left v
	beq    _w_,  2f
	subq   _v_,  _bh_,_v_	# if so, update the remainder
	addq   _bh_, _y_, _bh_  # commit the new bit
2:
	bic    _bh_, _y_, _bh_  # clear the test bit
	srl    _y_,  1,   _y_
	bne    _y_,  1b         # loop until the test bit is shifted out

	stq    _v_,  0(_a_)     # store the remainder in a[0]:a[1]
	stq    $31,  8(_a_)
	stq    _bh_, 0(_b_)     # store the leading digit of b
	bne    _la_, 3f         # more digits to compute ?
	ret    $31,  ($26),1    # no: the original la was 2
3:
	
	# set up the entry points into the unrolled inner loops
	lda    _b_,  8(_b_)     # b <- &b[lb]
	lda    _msub_, sn_mulsubloop
	lda    _add_,  sn_addloop
	lda    _msub_, 1344(_msub_) # jump address into mulsub for 2 digits
	lda    _add_,   964(_add_)  # jump address into add for 2 digits

	# NOTE(review): INVERSE is not defined in this file; presumably it
	# sets bi to an approximate inverse of bh using y and z as scratch
	# registers - confirm against the macro definition.
        INVERSE(_bh_,_bi_,_y_,_z_)

	# compute the following digits by divisions
	.align 5
L(loop):
	# approximate quotient, may be too large by one or two units
	# NOTE(review): the division macro is not defined in this file;
	# presumably it sets q to the quotient of v:u by bh, using bi and
	# the scratch registers x and y - confirm.
	ldq    _v_, 0(_a_)
	ldq    _u_,-8(_a_)
	bis    _v_, _v_, _ah_   # ah <- copy of the top word a[0]
        DIV(_u_,_v_,_bh_,_bi_,_q_,_x_,_y_)

	# a <- a - v*q - q^2
	subq   $31, _lb_, _i_	# i <- counter
	bic    _i_, 31,   _i_
	s8addq _i_, _a_,  _a_	# align a and b on the preceding multiple of 32
	s8addq _i_, _b_,  _b_
	mulq   _q_, _q_, _v_    # carry <- q^2
	umulh  _q_, _q_, _u_
	jsr    $27, (_msub_)    # enter sn_mulsubloop, link in $27
	subq   _ah_,_u_, _ah_	# last carry

	# correct the quotient and the remainder if < 0
	beq    _ah_,   L(q_ok)
L(corr):
	subq   $31, _lb_, _i_	# i <- counter
	bic    _i_, 31,   _i_
	s8addq _i_, _a_,  _a_	# align a and b on the preceding multiple of 32
	s8addq _i_, _b_,  _b_
	bis    _q_, _q_,  _u_	# carry <- 2q-1
	subq   _q_, 1,    _q_
	bis    _q_, _q_,  _v_
	bis    _a_, _a_,  $18   # $18 <- a
	jsr    $27, (_add_)     # enter sn_addloop, link in $27
	addq   _ah_,_u_, _ah_	# last carry
	bne    _ah_, L(corr)    # repeat until the top word is zero

	# b <- b + 2*q
L(q_ok):
	stq    $31,  0(_a_)     # a[lb] <- 0
	sll    _lb_, 3,  _u_	# r18 <- &b[0]
	subq   _b_,  _u_, $18
	ldq    _v_, 8($18)
	srl    _q_, 63,  _u_    # add 2q to 0:b[1]
	sll    _q_, 1,   _q_
	addq   _u_, _v_, _u_
	stq    _q_, 0($18)
	stq    _u_, 8($18)
	
	# next digit
	lda    _msub_, -44(_msub_) # move the jump addresses back
	lda    _add_,  -32(_add_)
	and    _lb_, 31, _u_	   # if we cross a multiple of 32
	bne    _u_,  1f
	lda    _msub_, 1408(_msub_)# restart at the end of the loop (32*44).
	lda    _add_,  1024(_add_) # same for add (32*32)
1:
	subq   _la_, 2,   _la_  # la -= 2
	addq   _lb_, 1,   _lb_  # lb++
	lda    _a_,  -8(_a_)    # a--
	bne    _la_, L(loop)
	
	# done
	ret    $31,  ($26),1

	#undef _a_
	#undef _la_
	#undef _b_
	#undef _lb_
	#undef _bi_
	#undef _bh_
	#undef _ah_
	#undef _msub_
	#undef _add_
	#undef _i_
	#undef _u_
	#undef _v_
	#undef _w_
	#undef _x_
	#undef _y_
	#undef _z_
	#undef _q_

#ifdef debug_sqrt_n2
	.end sn_sqrt_n2_buggy
#else
	.end sn_sqrt_n2
#endif
#undef L
#endif /* assembly_sn_sqrt_n2 */
#if !defined(assembly_sn_sqrt_n2) || defined(debug_sqrt_n2)
	# NOTE(review): REPLACE is not defined in this file; presumably it
	# provides the sn_sqrt_n2 symbol from another implementation when
	# the assembly version is absent or built as the debug variant.
	REPLACE(sn_sqrt_n2)
#endif

