/*
 * Copyright (C) 2002  Terence M. Welsh
 * Ported to Linux by Tugrul Galatali <tugrul@galatali.com>
 *
 * rsMath is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as 
 * published by the Free Software Foundation.
 *
 * rsMath is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include "config.h"
#include "rsMath.h"
#include "rsDefines.h"

/*
 * Bitmask of the SIMD extensions selected at run time (on x86 at least):
 * 1 - 3DNow (FLAG_3DNOW), 2 - SSE (FLAG_SSE).
 * rsCPUDetect() writes it when DETECT_X86_EXTS is defined; rsVec_normalize()
 * and rsVec_cross() test it to choose an assembly path.  Being a file-scope
 * object without an initializer it starts at 0, i.e. no extension selected.
 */
int cpuid;

/* Bit values stored in cpuid. */
#define FLAG_3DNOW 1
#define FLAG_SSE 2

/*
 * DETECT_X86_EXTS enables the CPUID probe in rsCPUDetect().  It is defined
 * as soon as at least one of the SIMD code paths (USE_SSE, USE_3DNOW) is
 * compiled in.
 */
#ifdef USE_SSE
#define DETECT_X86_EXTS
#endif

#ifdef USE_3DNOW
#define DETECT_X86_EXTS
#endif

/*
 * Probe the processor with the CPUID instruction and store the outcome in
 * the global cpuid.  Does nothing unless DETECT_X86_EXTS is defined.
 *
 *  - If EBX of CPUID leaf 0 reads "Auth" (as in "AuthenticAMD"), extended
 *    leaf 0x80000001 is queried and EDX bit 31 is shifted down to bit 0, so
 *    cpuid becomes FLAG_3DNOW or 0.  SSE is not probed on this path.
 *  - Otherwise, if leaf 1 is available, EDX bit 25 is shifted down to bit 1,
 *    so cpuid becomes FLAG_SSE or 0.
 *
 * The jump targets are numeric local labels: named labels inside inline
 * assembly become duplicate symbols whenever the compiler emits the asm
 * body more than once (inlining, cloning).
 */
void rsCPUDetect() {
#ifdef DETECT_X86_EXTS
	__asm__ (
	/* leaf 0: EAX = highest standard leaf, EBX = first vendor bytes */
"\n xor	%%eax, %%eax"

"\n cpuid"

	/* 0x68747541 is "Auth" read as a little-endian 32-bit word */
"\n cmp	$0x68747541, %%ebx"
"\n jne	1f"

	/* "Auth" vendor: extended leaf, keep only EDX bit 31 as bit 0 */
"\n mov	$0x80000001, %%eax"
"\n cpuid"

"\n shr	$31, %%edx"
"\n jmp	2f"

	/* other vendors: report nothing unless standard leaf 1 exists */
"\n 1:"
"\n xor	%%edx, %%edx"
"\n cmp	$1, %%eax"
"\n jl	2f"

"\n xor	%%eax, %%eax"
"\n inc	%%eax"

"\n cpuid"

	/* move EDX bit 25 to bit 1 (the value of FLAG_SSE) and mask the rest */
"\n shr	$24, %%edx"
"\n and	$2, %%edx"

"\n 2:"

	: "=d" (cpuid)
	:
	: "%eax", "%ebx", "%ecx"
	);
#endif
}

float rsVec_length (float *v)
{
	return (float)sqrt (v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
}

float rsVec_normalize (float *v)
{
	float length;

#ifdef USE_3DNOW
	if (cpuid & FLAG_3DNOW) {
		__asm__ (
"\n femms"

"\n movq            (%1), %%mm0             /* 1 | 0 */"
"\n movq            8(%1), %%mm1            /* - | 2 */"

"\n movq            %%mm0, %%mm2            /* 1 | 0 */"
"\n movq            %%mm1, %%mm3            /* - | 2 */"

"\n pfmul           %%mm0, %%mm0            /* 1 * 1 | 0 * 0 */"
"\n pfmul           %%mm1, %%mm1            /* -     | 2 * 2 */"

"\n movq            %%mm0, %%mm7            /* 1 * 1 | 0 * 0 */"
"\n punpckhdq       %%mm7, %%mm7            /* 1 * 1 | 1 * 1 */"

"\n pfadd           %%mm1, %%mm0            /* -     | 0 * 0 + 2 * 2 */"
"\n pfadd           %%mm7, %%mm0            /* -     | 0 * 0 + 2 * 2 + 1 * 1 */"

"\n pfrsqrt         %%mm0, %%mm1            /* 24-bit sqrt */"
"\n movq            %%mm1, %%mm4"
"\n pfmul           %%mm1, %%mm1"
"\n punpckldq       %%mm0, %%mm0"
"\n pfrsqit1        %%mm0, %%mm1"
"\n pfrcpit2        %%mm4, %%mm1"
"\n pfmul           %%mm1, %%mm0"

"\n movd            %%mm0, %0               /* length */"

"\n pfrcp           %%mm0, %%mm1            /* 24-bit reciprocal */"
"\n movq            %%mm0, %%mm4"
"\n punpckldq       %%mm4, %%mm4"
"\n pfrcpit1        %%mm1, %%mm4"
"\n pfrcpit2        %%mm1, %%mm4"

"\n pfmul           %%mm4, %%mm2            /* 1 / length | 0 / length */"
"\n pfmul           %%mm4, %%mm3            /* -          | 2 / length */"

"\n movq            %%mm2, (%1)             /* 1 | 0 */"
"\n movq            %%mm3, 8(%1)            /* - | 2 */"

"\n femms"

		: "=m" (length)
		: "d" (v)
		);

		return length;
	}
#endif

#ifdef USE_SSE
	if (cpuid & FLAG_SSE) {
		__asm__ (
"\n movups          (%1), %%xmm0            /* 0 | 1 | 2 | - */"
"\n movups          %%xmm0, %%xmm1          /* 0 | 1 | 2 | - */"

"\n mulps           %%xmm0, %%xmm0          /* 0 * 0 | 1 * 1 | 2 * 2 | - */"
"\n movups          %%xmm0, %%xmm2          /* 0 * 0 | 1 * 1 | 2 * 2 | - */"

"\n shufps          $9, %%xmm2, %%xmm2      /* 1 * 1                 | 2 * 2 | - | - */"
"\n addss           %%xmm2, %%xmm0          /* 0 * 0 + 1 * 1         | -     | - | - */"
"\n shufps          $1, %%xmm2, %%xmm2      /* 2 * 2                 | -     | - | - */"
"\n addss           %%xmm2, %%xmm0          /* 0 * 0 + 1 * 1 + 2 * 2 | -     | - | - */"

"\n sqrtss          %%xmm0, %%xmm0          /* length | -      | -      | - */"
"\n movss           %%xmm0, %0              /* length */"

"\n unpcklps        %%xmm0, %%xmm0          /* length | length | -      | - */"
"\n unpcklps        %%xmm0, %%xmm0          /* length | length | length | length */"
"\n divps           %%xmm0, %%xmm1          /* 1 / length | 2 / length | 3 / length | - */"

"\n movups          %%xmm1, (%1)            /* 1 / length | 2 / length | 3 / length | - */"
		
		: "=m" (length)
		: "d" (v)
		);

		return length;
	}
#endif

	length = (float)sqrt (v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);

	if (length == 0.0f) {
		v[1] = 1.0f;
		return (0.0f);
	}

	v[0] /= length;
	v[1] /= length;
	v[2] /= length;

	return length;
}

/*
 * Store the cross product vec1 x vec2 in v.
 *
 * Three implementations are selected at run time through the global cpuid:
 * a 3DNow! path, an SSE path and a portable C fallback.  Both assembly
 * paths load 16 bytes from vec1 and vec2 and store 16 bytes to v, so all
 * three arrays must hold four floats when those paths are compiled in.
 *
 * v may be the same array as vec1 or vec2: every implementation reads both
 * inputs completely before writing the result.
 *
 * The asm statements list "memory" and the vector registers they use as
 * clobbers, since they access the arrays only through pointer operands.
 * The pointers use the generic "r" constraint; the asm refers to them by
 * operand number only, and fixed registers such as EBX may be unavailable
 * (it is the PIC register on i386).
 */
void rsVec_cross (float *v, float vec1[4], float vec2[4])
{
#ifdef USE_3DNOW
	if (cpuid & FLAG_3DNOW) {
		__asm__ (

"\n femms"

"\n movq            (%0), %%mm0             /* 1.1 | 1.0 */"
"\n movq            8(%0), %%mm1            /* -   | 1.2 */"

"\n movq            (%1), %%mm2             /* 2.1 | 2.0 */"
"\n movq            8(%1), %%mm3            /* -   | 2.2 */"

"\n movq            %%mm0, %%mm4            /* 1.1 | 1.0 */"
"\n punpckhdq       %%mm4, %%mm4            /* 1.1 | 1.1 */"
"\n punpckldq       %%mm1, %%mm4            /* 1.2 | 1.1 */"

"\n movq            %%mm2, %%mm5            /* 2.1 | 2.0 */"
"\n punpckhdq       %%mm2, %%mm5            /* 2.1 | 2.1 */"
"\n punpckldq       %%mm3, %%mm5            /* 2.2 | 2.1 */"

"\n pfmul           %%mm2, %%mm4            /* 1.2 * 2.1 | 2.0 * 1.1 */"
"\n pfmul           %%mm0, %%mm5            /* 1.1 * 2.2 | 1.0 * 2.1 */"

"\n pfmul           %%mm0, %%mm3            /* - | 1.0 * 2.2 */"
"\n pfmul           %%mm2, %%mm1            /* - | 2.0 * 1.2 */"

"\n pfsub           %%mm4, %%mm5            /* 0 | 2 */"
"\n pfsub           %%mm3, %%mm1            /* - | 1 */"

"\n movq            %%mm5, %%mm6            /* 0 | 2 */"
"\n punpckldq       %%mm1, %%mm1            /* 1 | 1 */"
"\n punpckhdq       %%mm1, %%mm6            /* 1 | 0 */"

"\n movq            %%mm6, (%2)             /* 1 | 0 */"
"\n movq            %%mm5, 8(%2)            /* - | 2 */"

"\n femms"

		:
		: "r" (vec1), "r" (vec2), "r" (v)
		: "memory", "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6"
		);

		return;
	}
#endif

#ifdef USE_SSE
	if (cpuid & FLAG_SSE) {
		__asm__ (

"\n movups  (%0), %%xmm0                    /* 1.0 | 1.1 | 1.2 | - */"
"\n movups  (%1), %%xmm1                    /* 2.0 | 2.1 | 2.2 | - */"

"\n movups  %%xmm0, %%xmm2                  /* 1.0 | 1.1 | 1.2 | - */"
"\n shufps  $201, %%xmm2, %%xmm2            /* 1.1 | 1.2 | 1.0 | - */"
"\n movups  %%xmm1, %%xmm3                  /* 2.0 | 2.1 | 2.2 | - */"
"\n shufps  $201, %%xmm3, %%xmm3            /* 2.1 | 2.2 | 2.0 | - */"

"\n mulps   %%xmm1, %%xmm2                  /* 1.1 * 2.0 | 1.2 * 2.1 | 1.0 * 2.2 | - */"
"\n mulps   %%xmm0, %%xmm3                  /* 2.1 * 1.0 | 2.2 * 1.1 | 2.0 * 1.2 | - */"

"\n subps   %%xmm2, %%xmm3                  /* 2 | 0 | 1 | - */"

"\n shufps  $201, %%xmm3, %%xmm3            /* 0 | 1 | 2 | - */"

"\n movups  %%xmm3, (%2)                    /* 0 | 1 | 2 | - */"

		:
		: "r" (vec1), "r" (vec2), "r" (v)
		: "memory", "xmm0", "xmm1", "xmm2", "xmm3"
		);

		return;
	}
#endif

	{
		/*
		 * Compute all three components before storing any of them so
		 * that v may alias vec1 or vec2.
		 */
		const float cx = vec1[1] * vec2[2] - vec2[1] * vec1[2];
		const float cy = vec1[2] * vec2[0] - vec2[2] * vec1[0];
		const float cz = vec1[0] * vec2[1] - vec2[0] * vec1[1];

		v[0] = cx;
		v[1] = cy;
		v[2] = cz;
	}
}

/*
 * Multiply the first three components of v by scale, in place.
 */
void rsVec_scale (float *v, float scale)
{
	int axis;

	for (axis = 0; axis < 3; axis++)
		v[axis] *= scale;
}

/*
 * Copy the three components of v into dest, lowest index first.
 */
void rsVec_copy (float v[3], float *dest)
{
	int axis = 0;

	while (axis < 3) {
		dest[axis] = v[axis];
		axis++;
	}
}

/*
 * Component-wise sum: dest[i] = v[i] + vec[i] for i = 0, 1, 2.
 */
void rsVec_add (float v[3], float vec[3], float *dest)
{
	int axis;

	for (axis = 0; axis < 3; ++axis)
		dest[axis] = v[axis] + vec[axis];
}

/*
 * Component-wise difference: dest[i] = v[i] - vec[i] for i = 0, 1, 2.
 */
void rsVec_subtract (float v[3], float vec[3], float *dest)
{
	int axis;

	for (axis = 0; axis < 3; ++axis)
		dest[axis] = v[axis] - vec[axis];
}

/*
 * Fill q with a quaternion built from the angle a and the axis (x, y, z):
 * q[0..2] = sin(a / 2) * (x, y, z) and q[3] = cos(a / 2).
 *
 * When a lies strictly between -RSEPSILON and RSEPSILON, q is set to
 * (0, 0, 0, 1) instead.  The axis is used as given; it is not normalized
 * here.
 */
void rsQuat_make (float *q, float a, float x, float y, float z)
{
	float half;
	float s;

	if (a < RSEPSILON && a > -RSEPSILON) {
		q[0] = q[1] = q[2] = 0.0f;
		q[3] = 1.0f;
		return;
	}

	half = a * 0.5f;
	s = (float) sin (half);

	q[0] = s * x;
	q[1] = s * y;
	q[2] = s * z;
	q[3] = (float) cos (half);
}

/*
 * Replace q with the quaternion product q * postQuat.
 *
 * Quaternions are stored as (x, y, z, w).  With q1 = (v1, s1) and
 * q2 = (v2, s2):
 *
 *   q1q2 = s1v2 + s2v1 + v1xv2, s1s2 - v1.v2
 *
 * Both operands are copied before q is written, so postQuat may be the
 * same array as q (squaring a quaternion in place).
 */
void rsQuat_preMult (float *q, float postQuat[4])
{
	const float x1 = q[0], y1 = q[1], z1 = q[2], w1 = q[3];
	const float x2 = postQuat[0], y2 = postQuat[1];
	const float z2 = postQuat[2], w2 = postQuat[3];

	q[0] = w1 * x2 + w2 * x1 + y1 * z2 - y2 * z1;
	q[1] = w1 * y2 + w2 * y1 + z1 * x2 - z2 * x1;
	q[2] = w1 * z2 + w2 * z1 + x1 * y2 - x2 * y1;
	q[3] = w1 * w2 - x1 * x2 - y1 * y2 - z1 * z2;
}

/*
 * Replace q with the quaternion product preQuat * q.
 *
 * Quaternions are stored as (x, y, z, w).  Both operands are copied before
 * q is written, so preQuat may be the same array as q.
 */
void rsQuat_postMult (float *q, float preQuat[4])
{
	const float x = q[0], y = q[1], z = q[2], w = q[3];
	const float px = preQuat[0], py = preQuat[1];
	const float pz = preQuat[2], pw = preQuat[3];

	q[0] = pw * x + w * px + py * z - y * pz;
	q[1] = pw * y + w * py + pz * x - z * px;
	q[2] = pw * z + w * pz + px * y - x * py;
	q[3] = pw * w - px * x - py * y - pz * z;
}

/*
 * Expand the quaternion q = (x, y, z, w) into the 16-element matrix mat.
 *
 * If the vector part of q is exactly zero, mat is set to the identity.
 * Otherwise the terms are scaled by 2 / (x*x + y*y + z*z + w*w), so q does
 * not have to be of unit length.  Elements 3, 7, 11, 12, 13 and 14 are
 * always 0 and element 15 is always 1.
 */
void rsQuat_toMat (float *q, float *mat)
{
	const float qx = q[0];
	const float qy = q[1];
	const float qz = q[2];
	float qw, scale;
	float sx, sy, sz;
	float wx, wy, wz;
	float xx, xy, xz, yy, yz, zz;

	/* no axis: identity (ones at indices 0, 5, 10 and 15) */
	if (qx == 0.0f && qy == 0.0f && qz == 0.0f) {
		int i;

		for (i = 0; i < 16; i++)
			mat[i] = (i % 5 == 0) ? 1.0f : 0.0f;
		return;
	}

	qw = q[3];
	scale = 2.0f / (qx * qx + qy * qy + qz * qz + qw * qw);

	sx = qx * scale;
	sy = qy * scale;
	sz = qz * scale;

	wx = qw * sx;
	wy = qw * sy;
	wz = qw * sz;

	xx = qx * sx;
	xy = qx * sy;
	xz = qx * sz;
	yy = qy * sy;
	yz = qy * sz;
	zz = qz * sz;

	/* elements 0-3 */
	mat[0] = 1.0f - yy - zz;
	mat[1] = xy + wz;
	mat[2] = xz - wy;
	mat[3] = 0.0f;

	/* elements 4-7 */
	mat[4] = xy - wz;
	mat[5] = 1.0f - xx - zz;
	mat[6] = yz + wx;
	mat[7] = 0.0f;

	/* elements 8-11 */
	mat[8] = xz + wy;
	mat[9] = yz - wx;
	mat[10] = 1.0f - xx - yy;
	mat[11] = 0.0f;

	/* elements 12-15 */
	mat[12] = mat[13] = mat[14] = 0.0f;
	mat[15] = 1.0f;
}
