// file kernel/n/alpha/toom.S: Toom multiplication of natural integers
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                          Multiplication de Toom                       |
 |                                                                       |
 +-----------------------------------------------------------------------*/

#if defined(assembly_sn_toommul) || defined(assembly_sn_toomsqr)
        
                         # +-------------------------+
                         # |  Addition/soustraction  |
                         # +-------------------------+

   # void xn(add_sub3)(chiffre *a, long p, chiffre *b, long q)
   #
   # entre :
   #   a = naturel de longueur 2p+q
   #   b = naturel de longueur 2p+2
   # contraintes : 0 < q <= p
   #
   # sortie :
   #   b[0..p]      <-  a[0..p-1] + a[p..2p-1] + a[2p..2p+q-1]
   #   b[p+1..2p+1] <-  |a[0..p-1] - a[p..2p-1] + a[2p..2p+q-1]|
   #   retourne le signe de la diffrence

#define L(x) .Lsn_add_sub3_##x

        .align 5
        .globl sn_add_sub3
        .ent   sn_add_sub3
sn_add_sub3:
        .frame $30,0,$26,0
        .prologue 1
	ldgp   $gp,  0($27)
L(nogp):

	# c[0..p-1] <- a0+a2
	bis    $31,  $31,  $0   # r0 <- 0 (retenue)
	ldq    $1,   0($16)     # r1 <- a0[0] (retenue)
	subq   $31,  $19,   $2  # r2 <- -q
	subq   $17,  $19,   $19 # r19 <- p - q
	and    $2,   31,    $3	# r3 <- (-q) % 32
	bic    $2,   31,    $2  # r2 <- -32*ceil(q/32)
	sll    $3,   3,     $3  # r3 <- 8*((-q) % 32)
	subq   $16,  $3,    $16 # cadre les pointeurs  sur le mult. prc. de 32
	subq   $18,  $3,    $20
	s8addq $17,  $16,   $18
	s8addq $17,  $18,   $18
	lda    $27,  sn_addloop
	addq   $3,   3,     $3  # r3 <- nb d instructions a sauter
	s4addq $3,   $27,   $27 # r27 <- adresse d entre dans la boucle
	jsr    $27,  ($27)	# additionne les chiffres communs
	
        beq    $19,  2f         # propage la retenue
        .align 5
1:
        ldq    $1,   0($16)
        lda    $16,  8($16)
        lda    $19,  -1($19)
        addq   $1,   $0,   $1
        cmpult $1,   $0,   $0
        stq    $1,   0($20)
        lda    $20,  8($20)
        bne    $19,  1b
2:
	bis    $0,  $0,  $7     # r7 <- retenue(a0+a2)

	s8addq $17, $16, $23    # r23 <- &a1[p]
	subq   $31, $17, $17    # r17 <- -p
	bis    $20, $20, $22    # r22 <- &b[p]
	and    $17, 31,  $0     # r0 <- (-p) mod 32
	bic    $17, 31,  $2     # r2 <- -ceil(p/32)
	sll    $0,  3,   $1
	subq   $16, $1,  $18    # cadre les pointeurs sur le mult. prc. de 32
	subq   $20, $1,  $21
	s8addq $17, $21, $20
	bis    $20, $20, $16
	lda    $21, 8($21)
	subq   $1,  $0,  $1     # calcule l adresse de saut pour addsub
	lda    $27, sn_addsubloop
	s8addq $1,  $27, $27
	bne    $7,  2f          # si a0+a2 >= BASE^p alors a0-a1+a2 > 0
1:
	lda    $17, 1($17)      # sinon, compare a0+a2 et a1
	ldq    $0,  -8($22)
	ldq    $1,  -8($23)
	lda    $22, -8($22)
	lda    $23, -8($23)
	cmpule $0,  $1,  $19
	cmovne $19, $17, $19
	beq    $19, 2f
	cmpult $0,  $1,  $19
	beq    $19, 1b
	bis    $18, $18, $16    # si a0+a2 < a1, change les pointeurs
	bis    $20, $20, $18
2:
	bis    $31, $31, $0     # init retenues
	bis    $31, $31, $1
	jsr    $27, ($27)       # b[0..p-1] <- a0+a1+a2, b[p+1..2p] <- |a0-a1+a2|
	addq   $7,  $0,  $0     # derniers chiffres
	subq   $7,  $1,  $1
	stq    $0,  0($20)
	stq    $1,  0($21)

	bis    $19, $19, $0     # r0 <- signe(a0-a1+a2)
	ret    $31, ($26),1

        .end   sn_add_sub3
#undef L

                        # +--------------------------+
                        # |  Addition avec dcalage  |
                        # +--------------------------+

   # void xn(add_base)(chiffre *a, chiffre *b, long p, long q)
   #
   # entre :
   #   a = naturel de longueur 2p+q
   #   b = naturel de longueur p+3
   # contraintes : 0 < q <= p, p > 2
   #
   # sortie :
   #   b <-  a[0..p-1] + BASE*a[p..2p-1] + BASE^2*a[2p..2p+q-1]

#define L(x) .Lsn_add_base_##x

        .align 5
        .globl sn_add_base
        .ent   sn_add_base
sn_add_base:
        .frame $30,0,$26,0
        .prologue 1
	ldgp   $gp,  0($27)
L(nogp):

	subq   $18, 2,   $1     # r1 <- p-2
	bis    $1,  $1,  $2     # r2 <- p-2
	subq   $19, $1,  $6     # r6 <- q - (p-2)
	cmovlt $6,  $19, $1     # r1 <- min(p-2,q)
	subq   $2,  $1,  $2     # r2 <- p-2 - min(p-2,q)
	
	bis    $17, $17, $19    # r19 <- b0
	s8addq $18, $16, $17    # r17 <- &a1
	s8addq $18, $17, $18    # r18 <- &a2

	# traite les deux premiers chiffres  part
	ldq    $3,  0($16)
	stq    $3,  0($19)      # b[0] <- a0[0]
	ldq    $3,  8($16)
	ldq    $4,  0($17)
	addq   $4,  $3,  $3
	stq    $3,  8($19)      # b[1] <- a0[1] + a1[0]
	cmpult $3,  $4,  $0     # r0 <- retenue
	
	# additionne les chiffres communs
	.align 5
L(loop_1):
	ldq    $3, 16($16)      # r3 <- a0[i+2]
	ldq    $4,  8($17)      # r4 <- a1[i+1]
	ldq    $5,  0($18)      # r5 <- a2[i]
	addq   $0,  $3,  $3     # r3 <- a0[i+2] + ret
	cmpult $3,  $0,  $0
	addq   $4,  $3,  $3     # r3 <- a0[i+2] + a1[i+1] + ret
	cmpult $3,  $4,  $4
	addq   $5,  $3,  $3     # r3 <- a0[i+2] + a1[i+1] + a2[i] + ret
	cmpult $3,  $5,  $5
	addq   $4,  $0,  $0
	addq   $5,  $0,  $0     # r0 <- somme des retenues
	stq    $3, 16($19)      # sauve a0[i+2] + a1[i+1] + a2[i]
	lda    $1,  -1($1)      # i++
	lda    $16, 8($16)      # avance les pointeurs
	lda    $17, 8($17)
	lda    $18, 8($18)
	lda    $19, 8($19)
	bne    $1,  L(loop_1)

	# continue sans a2
	beq    $2,  L(done)
	.align 5
L(loop_2):
	ldq    $3, 16($16)      # r3 <- a0[i+2]
	ldq    $4,  8($17)      # r4 <- a1[i+1]
	addq   $0,  $3,  $3     # r3 <- a0[i+2] + ret
	cmpult $3,  $0,  $0
	addq   $4,  $3,  $3     # r3 <- a0[i+2] + a1[i+1] + ret
	cmpult $3,  $4,  $4
	addq   $4,  $0,  $0     # r0 <- somme des retenues
	stq    $3, 16($19)      # sauve a0[i+2] + a1[i+1] + a2[i]
	lda    $2,  -1($2)      # i++
	lda    $16, 8($16)      # avance les pointeurs
	lda    $17, 8($17)
	lda    $19, 8($19)
	bne    $2,  L(loop_2)
L(done):

	# derniers chiffres
	subq   $6,  1,  $6     # r6 <- q - (p-1)
	blt    $6,  1f
	ldq    $1,  0($18)     # r1 <- 0 ou a2[p-2]
	beq    $6,  1f
	ldq    $2,  8($18)     # r2 <- 0 ou a2[p-1]
1:
	ldq    $3,  8($17)     # r3 <- a1[p-1]
	addq   $0,  $3,  $3    # r3 <- a1[p-1] + ret
	cmpult $3,  $0,  $0
	addq   $1,  $3,  $3    # r3 <- a1[p-1] + a2[p-2] + ret
	cmpult $3,  $1,  $1
	addq   $1,  $0,  $0    # r0 <- retenue
	addq   $0,  $2,  $2    # r2 <- a2[p-1] + ret
	cmpult $2,  $0,  $0    # r0 <- retenue
	stq    $3, 16($19)     # sauve a1[p-1] + a2[p-2]
	stq    $2, 24($19)     # sauve a2[p-1]
	stq    $0, 32($19)     # sauve la retenue
	ret    $31, ($26),1

	.end   sn_add_base
#undef L
	
#endif /* defined(assembly_sn_toommul) || defined(assembly_sn_toomsqr) */

                            # +------------------+
                            # |  Multiplication  |
                            # +------------------+
        

   #  void xn(toommul)(chiffre *a, long la, chiffre *b, long lb, chiffre *c)
   #
   #  entre :
   #  a = naturel de longueur la
   #  b = naturel de longueur lb
   #  c = naturel de longueur la+lb, non confondu avec a ou b
   #  contraintes : 0 < lb <= la
   #
   #  sortie :
   #  c <- a*b

#ifdef assembly_sn_toommul
#define L(x) .Lsn_toommul_##x

        .align 5
#ifdef debug_toommul
        .globl sn_toommul_buggy
        .ent   sn_toommul_buggy
sn_toommul_buggy:
        .frame $30,0,$26,0
        .prologue 1
	ldgp   $gp,  0($27)
#else
        .globl sn_toommul
        .ent   sn_toommul
sn_toommul:
        .frame $30,0,$26,0
        .prologue 1
	ldgp   $gp,  0($27)
L(nogp):
#endif

	cmpule $19,  toommul_lim, $0 # petite multiplication ?
	bne    $0,   .Lsn_karamul_nogp # => algorithme de Karatsuba

	lda    $0, 0x5555($31)
	sll    $0,  16,  $1
	or     $1,  $0,  $0
	sll    $0,  32,  $1
	or     $1,  $0,  $0	# r0 <- (BASE-1)/3
	umulh  $0,  $17, $0
	addq   $0,  1,   $0     # r0 <- ceil(la/3) = p
	addq   $0,  $0,  $1     # r1 <- 2p
	subq   $19, $1,  $2     # r2 <- lb - 2p = r
	ble    $2,  L(tranches) # si lb <= 2p, dcoupe en tranches

	# ici lb > 2p, dcoupage de Toom
	# variables locales
	#define _d_  64($30)
	#define _x_  56($30)
        #define _a_  48($30)
        #define _b_  40($30)
        #define _c_  32($30)
	#define _p_  24($30)
	#define _q_  16($30)
	#define _r_   8($30)
	#define _ra_  0($30)
	
	subq   $17, $1,  $19    # r19 <- la - 2p = q
	addq   $0,  $1,  $1     # rserve 6p+18 chiffres dans la pile
	sll    $1,  4,   $1
	addq   $1,  144, $1
	subq   $30, $1,  $30
	stq    $31,  _x_        # sauve les paramtres
	stq    $16,  _a_
	stq    $18,  _b_
	stq    $20,  _c_
	stq    $0,   _p_
	stq    $19,  _q_
	stq    $2,   _r_
	stq    $26,  _ra_

	# c[0..p] <- a0 + a1 + a2, c[p+1..2p+1] <- |a0 - a1 + a2|
	bis    $0,  $0,  $17	# r17 <- p
	bis    $20, $20, $18	# r18 <- &c
	bsr    $26, .Lsn_add_sub3_nogp
	stq    $0, _x_

	# c[2p+2..3p+2] <- b0 + b1 + b2, c[3p+3..4p+3] <- |b0 - b1 + b2|
	ldq    $16, _b_
	ldq    $17, _p_
	ldq    $18, _c_
	s4addq $17, 4,   $0
	s4addq $0,  $18, $18	# r18 <- &c[2p+2]
	ldq    $19, _r_
	bsr    $26, .Lsn_add_sub3_nogp
	ldq    $1,  _x_
	xor    $1,  $0,  $0
	stq    $0,  _x_

        # d <- (a0 + a1 + a2)(b0 + b1 + b2) = c0 + c1 + c2 + c3 + c4
	ldq    $16, _c_
	ldq    $17, _p_
	addq   $17, 1,   $17    # r17 <- p+1
	s8addq $17, $16, $18
	s8addq $17, $18, $18    # r18 <- &c[2p+2]
	bis    $17, $17, $19    # r19 <- p+1
	lda    $20, _d_
	bsr    $26,  L(nogp)

        # e <- |a0 - a1 + a2||b0 - b1 + b2| = |c0 - c1 + c2 - c3 + c4|
	ldq    $16, _c_
	ldq    $17, _p_
	addq   $17, 1,   $17    # r17 <- p+1
	s8addq $17, $16, $16    # r16 <- &c[p+1]
	s8addq $17, $16, $18
	s8addq $17, $18, $18    # r18 <- &c[3p+3]
	bis    $17, $17, $19    # r19 <- p+1
	lda    $20, _d_
	s8addq $17, $20, $20
	s8addq $17, $20, $20    # r20 <- &e
	bsr    $26,  L(nogp)

        # f <- (a0 + BASE*a1 + BASE^2*a2)*(b0 + BASE*b1 + BASE^2*b2)
        #    = c0 + BASE*c1 + BASE^2*c2 + BASE^3*c3 + BASE^4*c4
	ldq    $16, _a_
	ldq    $17, _c_
	ldq    $18, _p_
	ldq    $19, _q_
	bsr    $26, .Lsn_add_base_nogp
	
	ldq    $16, _b_
	ldq    $17, _c_
	ldq    $18, _p_
	ldq    $19, _r_
	s8addq $18, $17, $17
	lda    $17, 24($17)	# r17 <- &c[p+3]
	bsr    $26, .Lsn_add_base_nogp

	ldq    $16, _c_
	ldq    $17, _p_
	addq   $17, 3,   $17	# r17 <- p+3
	s8addq $17, $16, $18    # r18 <- &c[p+3]
	bis    $17, $17, $19    # r19 <- p+3
	lda    $20, _d_
	s4addq $17, -8,  $0     # r0 <- 4p+4
	s8addq $0,  $20, $20    # r20 <- &f

	# diminue les longeurs des facteurs si les chiffres de tte sont nuls
	# seuls les chiffres de rang <= 2p+1 de f vont tre utiliss, donc ce
	# n est pas grave si les suivants ne sont pas initialiss
	s8addq $17, $16, $0
	ldq    $1,   -8($0)
	ldq    $2,  -16($0)
	cmpeq  $1,  $31, $1
	cmpeq  $2,  $31, $2
	and    $1,  $2,  $2
	addq   $1,  $2,  $1     # r1 <- nb. de zros de tte pour a
	subq   $17, $1,  $17
	s8addq $19, $18, $0
	ldq    $1,   -8($0)
	ldq    $2,  -16($0)
	cmpeq  $1,  $11, $1
	cmpeq  $2,  $11, $2
	and    $1,  $2,  $2
	addq   $1,  $2,  $1     # r1 <- nb. de zros de tte pour b
	subq   $19, $1,  $19
	cmpult $17, $19, $0     # si a est plus court, change les facteurs
	beq    $0,  1f
	bis    $16, $16, $0
	bis    $18, $18, $16
	bis    $0,  $0,  $18
	bis    $17, $17, $0
	bis    $19, $19, $17
	bis    $0,  $0,  $19
1:
	bsr    $26,  L(nogp)
	
        # c[0..2p-1] <- a0*b0 = c0, c[4*p..4p+q+r-1] <- a2*b2 = c4
	ldq    $16, _a_
	ldq    $17, _p_
	ldq    $18, _b_
	bis    $17, $17, $19	# r19 <- p
	ldq    $20, _c_
	bsr    $26,  L(nogp)
	
	ldq    $16, _a_
	ldq    $17, _q_
	ldq    $18, _b_
	ldq    $19, _r_
	ldq    $20, _c_
	ldq    $0,  _p_
	s4addq $0,  0,   $0	# r0 <- 4p
	s4addq $0,  $16, $16    # r16 <- &a[2p]
	s4addq $0,  $18, $18    # r18 <- &b[2p]
	s8addq $0,  $20, $20    # r20 <- &c[4p]
	bsr    $26,  L(nogp)
	
        # point de chute pour toomsqr
.Lsn_toom_aux:

	# d <- (d-e)/2 mod BASE^(2p+1) = c1 + c3, e <- (d+e)/2 = c0 + c2 + c4
	ldq    $20, _p_
	sll    $20, 1,   $20
	addq   $20, 1,   $20	# r20 <- 2p+1
	lda    $16, _d_
	s8addq $20, $16, $17    # r17 <- &e
	lda    $17, 8($17)
	ldq    $0,  _x_
	cmoveq $0,  $17, $18    # r18 <- &e (cas (a0-a1+a2)(b0-b1+b2) >= 0)
	cmoveq $0,  $16, $19    # r19 <- &d
	cmovne $0,  $17, $19    # r19 <- &e (cas (a0-a1+a2)(b0-b1+b2) <  0)
	cmovne $0,  $16, $18    # r18 <- &d
	bsr    $26, .Lsn_half_addsub_nogp

        # c[2p..4p] <- e - c0 - c4 = c2
        # f <- BASE^4*c4 - f = -c0 - BASE*c1 - BASE^2*c2 - BASE^3*c3
	ldq    $0,  _p_
	addq   $0,  $0,  $0    # r0  <- 2p
	ldq    $16, _c_        # r16 <- &c0
	s8addq $0,  $16, $20   # r20 <- &c2
	s8addq $0,  $20, $18   # r18 <- &c4
	lda    $19, _d_
	s8addq $0,  $19, $19   # r19 <- &e
	lda    $19, 16($19)
	s8addq $0,  $19, $21   # r21 <- &f[4]
	lda    $21, 48($21)
	ldq    $1,  _q_
	ldq    $2,  _r_
	addq   $1,  $2,  $22   # r22 <- q+r
	subq   $0,  $22, $23   # r23 <- 2p-q-r

	# f[0..3] <- -f[0..3]
	ldq    $5,  -32($21)   # r5 <- f[0]
	ldq    $6,  -24($21)   # r6 <- f[1]
	ldq    $7,  -16($21)   # r7 <- f[2]
	ldq    $8,   -8($21)   # r8 <- f[3]
	subq   $31, $5,  $5    # ngation sur 4 chiffres
	cmpeq  $5,  $31, $4
	eqv    $6,  $31, $6
	addq   $6,  $4,  $6
	cmpult $6,  $4,  $4
	eqv    $7,  $31, $7
	addq   $7,  $4,  $7
	cmpult $7,  $4,  $4
	eqv    $8,  $31, $8
	addq   $8,  $4,  $8
	stq    $5,  -32($21)
	stq    $6,  -24($21)
	stq    $7,  -16($21)
	stq    $8,   -8($21)
	cmpule $4,  $8,  $1    # r1 <- retenue(BASE^4*c4-f)
	bis    $31, $31, $0    # r0 <- retenue(d-c0-c4)

	# combinaison des chiffres communs
	.align 5
1:
	ldq    $2,  0($19)     # r2 <- e[i]
	ldq    $3,  0($21)     # r3 <- f[i+4]
	ldq    $4,  0($16)     # r4 <- c0[i]
	ldq    $5,  0($18)     # r5 <- c4[i]

	subq   $2,  $0,  $0    # r0 <- e[i] - ret(e-c0-c4)
	cmpult $2,  $0,  $2
	subq   $0,  $4,  $4    # r4 <- e[i] - c0[i] - ret(e-c0-c4)
	cmpult $0,  $4,  $0
	addq   $2,  $0,  $0
	subq   $4,  $5,  $2    # r2 <- e[i] - c0[i] - c4[i] - ret(e-c0-c4)
	cmpult $4,  $2,  $4
	addq   $4,  $0,  $0    # r0 <- retenue(e-c0-c4)
	stq    $2,  0($20)     # sauve c2[i]
	
	addq   $3,  $1,  $3    # r3 <- f[i+4] + ret(BASE^4*c4-f)
	cmpult $3,  $1,  $1
	subq   $5,  $3,  $3    # r3 <- c4[i] - f[i+4] - ret(BASE^4*c4-f)
	cmpult $5,  $3,  $5
	addq   $5,  $1,  $1    # r1 <- retenue(BASE^4*c4-f)
	stq    $3,  0($21)     # sauve f[i+4]
	
	lda    $22, -1($22)    # i++
	lda    $16,  8($16)    # avance les pointeurs
	lda    $19,  8($19)
	lda    $21,  8($21)
	lda    $18,  8($18)
	lda    $20,  8($20)
	bne    $22,  1b

	# continue sans c4
	beq    $23,  2f
	.align 5
1:
	ldq    $2,  0($19)     # r2 <- e[i]
	ldq    $3,  0($21)     # r3 <- f[i+4]
	ldq    $4,  0($16)     # r4 <- c0[i]

	subq   $2,  $0,  $0    # r0 <- e[i] - ret(e-c0-c4)
	cmpult $2,  $0,  $2
	subq   $0,  $4,  $4    # r4 <- e[i] - c0[i] - ret(e-c0-c4)
	cmpult $0,  $4,  $0
	addq   $2,  $0,  $0    # r0 <- retenue(e-c0-c4)
	stq    $4,  0($20)     # sauve c2[i]

	addq   $3,  $1,  $3    # r3 <- f[i+4] + ret(BASE^4*c4-f)
	cmpult $3,  $1,  $1
	subq   $31, $3,  $3    # r3 <- -f[i+4] - ret(BASE^4*c4-f)
	cmpult $31, $3,  $5
	addq   $5,  $1,  $1    # r1 <- retenue(BASE^4*c4-f)
	stq    $3,  0($21)     # sauve f[i+4]
	
	lda    $23, -1($23)    # i++
	lda    $16,  8($16)    # avance les pointeurs
	lda    $19,  8($19)
	lda    $21,  8($21)
	lda    $20,  8($20)
	bne    $23,  1b
2:

	# ajoute le dernier chiffre de c2  c4
	ldq    $2,  0($19)     # r2 <- e[2p]
	subq   $2,  $0,  $0    # r0 <- e[2p] - ret(e-c0-c4)
	subq   $31, 1,   $2    # r2 <- illimit
	bsr    $27, sn_incloop

        # f <- (f + c0 + BASE*d + BASE^2*c2)/(1-BASE^2) = BASE*c3
        # e[3..2p+3] <- d - f/BASE = c1
	ldq    $0,  _p_
	addq   $0,  $0, $0     # r0 <- 2p
	ldq    $16, _c_        # r16 <- &c0
	s8addq $0,  $16, $17   # r17 <- &c2
	lda    $18, _d_        # r18 <- &d
	addq   $0,  5,   $1
	s8addq $1,  $18, $19   # r19 <- &e[3]
	s8addq $0,  $19, $20   # r20 <- &f[1]
	subq   $0,  2,   $21   # r21 <- 2p-2

	# traite les deux premiers chiffres  part
	bis    $31, $31, $3     # r3 <- 0 (retenue de la division)
	bis    $31, $31, $4     # r4 <- 0 (= nf[0])
	ldq    $2,  0($20)	# r2 <- f[1]
	ldq    $0,  0($16)
	ldq    $1,  8($16)
	cmpult $31, $0,  $0     # r0 <- c0[0] != 0 (= retenue(f+c0))
	addq   $2,  $0,  $2     # r1 <- f[1] + ret
	cmpult $2,  $0,  $0
	addq   $2,  $1,  $1     # r1 <- f[1] + c0[1] + ret
	cmpult $1,  $2,  $2
	addq   $2,  $0,  $0
	ldq    $7,  0($18)      # r7 <- d[0]
	addq   $1,  $7,  $5     # r5 <- f[1] + c0[1] + d[0] + ret = nf[1]
	cmpult $5,  $1,  $1
	addq   $1,  $0,  $0
	stq    $5,  0($20)      # sauve nf[1]
	cmpult $7,  $5,  $6     # r6 <- retenue(d - f/BASE)
	subq   $7,  $5,  $7     # r7 <- d[0] - nf[1]
	stq    $7,  0($19)      # sauve nd[0]

	# boucle sur les chiffres communs
	.align 5
1:
	ldq    $1,  8($20)
	addq   $1,  $0,  $2	# r2 <- f[i] + ret
	cmpult $2,  $1,  $0
	ldq    $1,  16($16)
	addq   $2,  $1,  $1     # r1 <- f[i] + c0[i] + ret
	cmpult $1,  $2,  $2
	addq   $2,  $0,  $0
	ldq    $7,  8($18)      # r7 <- d[i-1]
	addq   $1,  $7,  $2     # r2 <- f[i] + c0[i] + d[i-1] + ret
	cmpult $2,  $1,  $1
	addq   $1,  $0,  $0
	ldq    $1,  0($17)
	addq   $2,  $1,  $1     # r1 <- f[i] + c0[i] + d[i-1] + c2[i-2] + ret
	cmpult $1,  $2,  $2
	addq   $2,  $0,  $0
	addq   $1,  $4,  $1     # r1 += nf[i-2]
	cmpult $1,  $4,  $2
	bis    $5,  $5,  $4     # r4 <- nf[i-1]
	addq   $1,  $3,  $5     # r5 <- nf[i]
	cmpult $5,  $3,  $3
	addq   $2,  $3,  $3     # r3 <- nouvelle retenue de la division
	stq    $5,  8($20)      # sauve nf[i]
	addq   $5,  $6,  $6
	cmpult $6,  $5,  $2
	subq   $7,  $6,  $6     # r6 <- d[i-1] - nf[i] - ret
	cmpult $7,  $6,  $7
	stq    $6,  8($19)      # sauve nd[i-1]
	addq   $2,  $7,  $6     # r6 <- retenue(d - f/BASE)
	
	lda    $21, -1($21)     # i++
	lda    $16,  8($16)     # avance les pointeurs
	lda    $18,  8($18)
	lda    $20,  8($20)
	lda    $17,  8($17)
	lda    $19,  8($19)
	bne    $21,  1b

	# derniers chiffres
	ldq    $2,  8($20)
	addq   $2,  $0,  $1     # r1 <- f[2p] + ret
	cmpult $1,  $2,  $0
	ldq    $7,  8($18)      # r7 <- d[2p-1]
	addq   $1,  $7,  $2     # r2 <- f[2p] + d[2p-1] + ret
	cmpult $2,  $1,  $1
	addq   $1,  $0,  $0
	ldq    $1,  0($17)
	addq   $2,  $1,  $1     # r1 <- f[2p] + d[2p-1] + c2[2p-2] + ret
	cmpult $1,  $2,  $2
	addq   $2,  $0,  $0
	addq   $1,  $4,  $1     # r1 += nf[2p-2]
	cmpult $1,  $4,  $2
	bis    $5,  $5,  $4     # r4 <- nf[2p-1]
	addq   $1,  $3,  $5     # r5 <- nf[2p]
	cmpult $5,  $3,  $3
	addq   $2,  $3,  $3     # r3 <- nouvelle retenue de la division
	stq    $5,  8($20)      # sauve nf[2p]
	addq   $5,  $6,  $6
	cmpult $6,  $5,  $2
	subq   $7,  $6,  $6     # r6 <- d[2p-1] - nf[2p] - ret
	cmpult $7,  $6,  $7
	stq    $6,  8($19)      # sauve nd[2p-1]
	addq   $2,  $7,  $6     # r6 <- retenue(d + f/BASE)

	ldq    $2,  16($20)
	addq   $2,  $0,  $1     # r1 <- f[2p+1] + ret
	ldq    $7,  16($18)     # r7 <- d[2p]
	addq   $1,  $7,  $2     # r2 <- f[2p+1] + d[2p] + ret
	ldq    $1,  8($17)
	addq   $2,  $1,  $1     # r1 <- f[2p+1] + d[2p] + c2[2p-1] + ret
	addq   $1,  $4,  $1     # r1 += nf[2p-1]
	addq   $1,  $3,  $5     # r5 <- nf[2p+1]
	stq    $5,  16($20)     # sauve nf[2p+1]
	addq   $5,  $6,  $6
	subq   $7,  $6,  $6     # r6 <- d[2p] - nf[2p+1] - ret

        # ajoute nd[2p]  nf
	ldq    $0,  16($19)
	addq   $0,  $6,  $6
	stq    $6,  16($19)
	cmpult $6,  $0,  $0
	beq    $0,  2f
1:
	lda    $19, 8($19)
	ldq    $6,  16($19)
	addq   $6,  1,  $6
	stq    $6,  16($19)
	beq    $6,  1f
2:

        # injecte c1,c3 dans c
	ldq    $0,  _p_
	ldq    $1,  _q_
	ldq    $2,  _r_
	addq   $0,  $0,  $3     # r3  <- 2p
	addq   $0,  $3,  $4
	addq   $1,  $4,  $4     # r4  <- 3p+q
	addq   $4,  $2,  $17	# r17 <- 3p+q+r
	addq   $4,  1,   $19    # r19 <- 3p+q+1
	ldq    $16, _c_
	s8addq $0,  $16, $16    # r16 <- &c[p]
	lda    $18, _d_
	addq   $3,  5,   $3
	s8addq $3,  $18, $18    # r18 <- &e[3]
	bsr    $26, .Lsn_inc_nogp
	
	# termin
	ldq    $26, _ra_
	ldq    $0,  _p_
	addq   $0,  $0,  $1     # nettoie la pile
	addq   $0,  $1,  $0
	addq   $0,  $0,  $0
	addq   $0,  18,  $0
	s8addq $0,  $30, $30
	ret    $31, ($26),1

	#undef _x_
	#undef _a_
	#undef _b_
	#undef _c_
	#undef _d_
	#undef _p_
	#undef _q_
	#undef _r_
	#undef _ra_

        # ici lb <= 2*ceil(la/3) : dcoupage en tranches de longueur lb
        # variables locales
	#define _sp_    80($30)
        #define _d_     72($30)
        #define _a_     64($30)
        #define _la_    56($30)
        #define _b_     48($30)
        #define _lb_    40($30)
        #define _c_  	32($30)
	#define _l_  	24($30)
        #define _ra_ 	16($30)
	#define _add_  	 8($30)
	#define _move_ 	 0($30)
	
        .align 5
L(tranches):
	s8addq $19,  88,   $1 	# rserve lb+10 chiffres dans la pile
	bic    $1,   15,   $1   # en arrondissant  un compte pair
	subq   $30,  $1,   $30
	stq    $16,  _a_        # sauve les paramtres
	stq    $17,  _la_
	stq    $18,  _b_
	stq    $19,  _lb_
	stq    $31,  _l_
	stq    $26,  _ra_

	# prpare le droulement des boucles
	subq   $31,  $19,  $0	# r0 <- -lb
	and    $0,   31,   $0	# r0 <- (-lb) % 32
	lda    $1,   sn_cpuploop
	s8addq $0,   $1,   $1   # r1 <- adresse de saut pour move
	stq    $1,   _move_
	sll    $0,   3,    $0   # r0 <- 8*((-lb) % 32)
	lda    $1,   sn_addloop
	s4addq $0,   $1,   $1   # r1 <- adresse de saut pour add
	stq    $1,   _add_
	subq   $20,  $0,   $1   # r1 <- c cadr sur un multiple de 32
	stq    $1,   _c_
	lda    $1,   _sp_
	subq   $1,   $0,   $1   # r1 <- d cadr sur un multiple de 32
	stq    $1,   _d_
	
        # premire multiplication : c <- a[0..lb-1]*b
	bis    $19,  $19,  $17
	bsr    $26,  L(nogp)
	br     $31,  3f

        # multiplications suivantes
	.align 5
1:
	stq    $18,  _a_	# sauvegarde les paramtres
	stq    $19,  _la_
	stq    $16,  _c_
	cmpult $17,  $19,  $0	# l <- min(lb,la)
	cmovne $0,   $17,  $19
	stq    $19,  _l_

	# sauvegarde c[0..lb-1] dans la pile
	ldq    $27,  _move_
	ldq    $20,  _d_
	subq   $31,  $17,  $2	# r2 <- -lb
	jsr    $27,  ($27)      # d <- c[0..lb-1]

        # multiplication
	sll    $17,  3,   $0
	subq   $16,  $0,  $20   # r20 <- &c[0]
	ldq    $16,  _b_
	bsr    $26,  L(nogp)

	# ajoute d
	ldq    $16,  _c_
	ldq    $18,  _d_
	ldq    $2,   _lb_
	ldq    $27,  _add_
	subq   $31,  $2,   $2   # r2 <- -lb
	bis    $16,  $16,  $20
	bis    $31,  $31,  $0   # r0 <- 0 (retenue)
	jsr    $27,  ($27)	# effectue l addition
	ldq    $2,   _l_
	jsr    $27,  sn_incloop # propage la retenue

3:
	ldq    $18,  _a_	# rcupre les paramtres
	ldq    $19,  _la_
	ldq    $17,  _lb_
	ldq    $16,  _c_
	s8addq $17,  $18,  $18  # a  += lb
	s8addq $17,  $16,  $16  # c  += lb
	subq   $19,  $17,  $19  # la -= lb
	bgt    $19,  1b

	addq   $17,  1,   $17   # nettoie la pile
	bic    $17,  1,   $17   
	ldq    $26,  _ra_
	lda    $30,  _sp_
	s8addq $17,  $30,  $30
	ret    $31,  ($26),1
	
        #undef  _sp_
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _d_
        #undef  _la_
        #undef  _lb_
        #undef  _l_
	#undef  _ra_
	#undef _move_
	#undef _add_

#ifdef debug_toommul
	.end sn_toommul_buggy
#else
	.end sn_toommul
#endif
#undef L
#endif /* assembly_sn_toommul */
#if !defined(assembly_sn_toommul) || defined(debug_toommul)
	REPLACE(sn_toommul)
#endif

                                 # +---------+
                                 # |  Carr  |
	                         # +---------+

   #  void xn(toomsqr)(chiffre *a, long la, chiffre *c)
   #
   #  entre :
   #  a = naturel de longueur la
   #  c = naturel de longueur 2*la, non confondu avec a
   #  contraintes : 0 < la
   #
   #  sortie :
   #  c <- a^2

#ifdef assembly_sn_toomsqr
#define L(x) .Lsn_toomsqr_##x

        .align 5
#ifdef debug_toommul
        .globl sn_toomsqr_buggy
        .ent   sn_toomsqr_buggy
sn_toomsqr_buggy:
        .frame $30,0,$26,0
        .prologue 1
	ldgp   $gp,  0($27)
#else
        .globl sn_toomsqr
        .ent   sn_toomsqr
sn_toomsqr:
        .frame $30,0,$26,0
        .prologue 1
	ldgp   $gp,  0($27)
L(nogp):
#endif
      
	cmpule $17,  toomsqr_lim, $0 # petit carr ?
	bne    $0,   .Lsn_karasqr_nogp # => algorithme de Karatsuba

	lda    $0, 0x5555($31)
	sll    $0,  16,  $1
	or     $1,  $0,  $0
	sll    $0,  32,  $1
	or     $1,  $0,  $0	# r0 <- (BASE-1)/3
	umulh  $0,  $17, $0
	addq   $0,  1,   $0     # r0 <- ceil(la/3) = p
	addq   $0,  $0,  $1     # r1 <- 2p
	subq   $17, $1,  $19    # r19 <- la - 2p = q

	# variables locales
	#define _d_  64($30)
	#define _x_  56($30)
        #define _a_  48($30)
        #define _b_  40($30)
        #define _c_  32($30)
	#define _p_  24($30)
	#define _q_  16($30)
	#define _r_   8($30)
	#define _ra_  0($30)
	
	addq   $0,  $1,  $1     # rserve 6p+18 chiffres dans la pile
	sll    $1,  4,   $1
	addq   $1,  144, $1
	subq   $30, $1,  $30
	stq    $31,  _x_        # sauve les paramtres
	stq    $16,  _a_
	stq    $16,  _b_
	stq    $18,  _c_
	stq    $0,   _p_
	stq    $19,  _q_
	stq    $19,  _r_
	stq    $26,  _ra_

	# c[0..p] <- a0 + a1 + a2, c[p+1..2p+1] <- |a0 - a1 + a2|
	bis    $0,  $0,  $17	# r17 <- p
	bsr    $26, .Lsn_add_sub3_nogp

        # d <- (a0 + a1 + a2)^2 = c0 + c1 + c2 + c3 + c4
	ldq    $16, _c_
	ldq    $17, _p_
	addq   $17, 1,   $17    # r17 <- p+1
	lda    $18, _d_
	bsr    $26,  L(nogp)

        # e <- |a0 - a1 + a2|^2 = c0 - c1 + c2 - c3 + c4
	ldq    $16, _c_
	ldq    $17, _p_
	addq   $17, 1,   $17    # r17 <- p+1
	s8addq $17, $16, $16    # r16 <- &c[p+1]
	lda    $18, _d_
	s8addq $17, $18, $18
	s8addq $17, $18, $18    # r18 <- &e
	s8addq $17, $16, $0     # teste si a0-a1+a2 >= 0
	ldq    $0,  -8($0)
	bge    $0,  2f
	bis    $16, $16, $0     # sinon, remplace par l oppos
	bis    $17, $17, $1
	addq   $31, 1,   $2
	.align 5
1:
	lda    $1, -1($1)
	ldq    $3,  0($0)
	eqv    $3,  $31, $3
	addq   $2,  $3,  $3
	cmpult $3,  $2,  $2
	stq    $3,  0($0)
	lda    $0,  8($0)
	bne    $1,  1b
2:
	bsr    $26,  L(nogp)
	
        # f <- (a0 + BASE*a1 + BASE^2*a2)^2
        #    = c0 + BASE*c1 + BASE^2*c2 + BASE^3*c3 + BASE^4*c4
	ldq    $16, _a_
	ldq    $17, _c_
	ldq    $18, _p_
	ldq    $19, _q_
	bsr    $26, .Lsn_add_base_nogp
	
	ldq    $16, _c_
	ldq    $17, _p_
	addq   $17, 3,   $17	# r17 <- p+3
	lda    $18, _d_
	s4addq $17, -8,  $0     # r0 <- 4p+4
	s8addq $0,  $18, $18    # r18 <- &f

	# diminue les longeurs des facteurs si les chiffres de tte sont nuls
	# seuls les chiffres de rang <= 2p+1 de f vont tre utiliss, donc ce
	# n est pas grave si les suivants ne sont pas initialiss
	s8addq $17, $16, $0
	ldq    $1,   -8($0)
	ldq    $2,  -16($0)
	cmpeq  $1,  $31, $1
	cmpeq  $2,  $31, $2
	and    $1,  $2,  $2
	addq   $1,  $2,  $1     # r1 <- nb. de zros de tte pour a
	subq   $17, $1,  $17
	bsr    $26,  L(nogp)
	
        # c[0..2p-1] <- a0^2 = c0, c[4*p..4p+q+r-1] <- a2^2 = c4
	ldq    $16, _a_
	ldq    $17, _p_
	ldq    $18, _c_
	bsr    $26,  L(nogp)
	
	ldq    $16, _a_
	ldq    $17, _q_
	ldq    $18, _c_
	ldq    $0,  _p_
	s4addq $0,  0,   $0	# r0 <- 4p
	s4addq $0,  $16, $16    # r16 <- &a[2p]
	s8addq $0,  $18, $18    # r18 <- &c[4p]
	bsr    $26,  L(nogp)
	
	# continue avec toommul
	br     $31,  .Lsn_toom_aux

        #undef  _a_
        #undef  _b_
        #undef  _c_
	#undef  _d_
	#undef  _p_
	#undef  _q_
	#undef  _r_
	#undef  _x_
	#undef  _ra_

#ifdef debug_toommul
	.end sn_toomsqr_buggy
#else
	.end sn_toomsqr
#endif
#undef L
#endif /* assembly_sn_toomsqr */
#if !defined(assembly_sn_toomsqr) || defined(debug_toommul)
	REPLACE(sn_toomsqr)
#endif
