    /*        Fast GEMM routine for Alpha 21164/21264      */
    /*         on  Linux, Digital UNIX                     */
    /*        by Kazushige Goto <goto@statabo.rim.or.jp>   */

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <sys/mman.h>
#include <pthread.h>
#include "common.h"
#include "version.h"

/* Size in bytes (16 MB) of the anonymous mmap() scratch region that is
   handed to the GEMM kernels as their work buffer. */
#define BUFFER_SIZE (16<<20)
/* Low-bit mask (8 MB - 1), not a size: the scratch pointer is rounded up
   to the next 8 MB boundary with (p + ALIGN_SIZE) & ~ALIGN_SIZE. */
#define ALIGN_SIZE 0x7fffff

/* Version string taken from the VERSION macro.  It is not referenced in
   the visible part of this file -- presumably kept so the version text
   ends up in the object file; confirm before removing. */
static char *version = VERSION;

#ifdef SMP
/* Job description shared between ZGEMM_ (which fills it in) and the
   ZGEMM_THREAD workers (which only read it). */
typedef struct {
  int trans;        /* index into the 16-entry kernel table: the A op is in
                       bits 2-3, the B op in bits 0-1 */
  int m, n, k;      /* problem dimensions as received by ZGEMM_ */
  FLOAT alpha[2];   /* the two components of alpha, copied from the caller */
  FLOAT *a;         /* matrix A as passed by the caller */
  int lda;          /* caller's LDA multiplied by 2 */
  FLOAT *b;         /* matrix B as passed by the caller */
  int ldb;          /* caller's LDB multiplied by 2 */
  FLOAT *c;         /* matrix C as passed by the caller (updated in place) */
  int ldc;          /* caller's LDC multiplied by 2 */
} ZGEMM_PARAM_T;

/* Single file-scope job slot, rewritten on every ZGEMM_ call.
   NOTE(review): because this is shared static state, two concurrent
   ZGEMM_ calls in an SMP build would overwrite each other's job --
   confirm callers never do that. */
static ZGEMM_PARAM_T param;
/* Number of columns of C assigned to each worker (set by ZGEMM_). */
static int           offset;

int ZGEMM_THREAD(void *arg){
  long current = (long)arg;
  int jn, info, pthread_flag;
  FLOAT *b, *buffer, *a_buffer;
  int (*zgemm[])(int, int, int, FLOAT *, FLOAT *, int, FLOAT* ,
		 int, FLOAT *, int, FLOAT *)
    ={ ZGEMM_NN, ZGEMM_NT, ZGEMM_NR, ZGEMM_NC,
       ZGEMM_TN, ZGEMM_TT, ZGEMM_TR, ZGEMM_TC,
       ZGEMM_RN, ZGEMM_RT, ZGEMM_RR, ZGEMM_RC,
       ZGEMM_CN, ZGEMM_CT, ZGEMM_CR, ZGEMM_CC};
  
  if ((buffer = (FLOAT *)mmap(0, BUFFER_SIZE, PROT_READ | PROT_WRITE,
		     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)) == NULL){
    info = INFO_NUM;
#ifdef DGEMM
    xerbla_("ZGEMM ", &info, 6L);
#else
    xerbla_("CGEMM ", &info, 6L);
#endif
    return 0;
  }

  a_buffer = (FLOAT *)(((long)buffer + ALIGN_SIZE) & ~ALIGN_SIZE);

  pthread_flag = (current & 1);
  current >>= 1;

  jn = MIN(offset, param.n - current);

  if (!( param.trans & 1)){
    b = param.b + current * param.ldb;
  }else{
    b = param.b + current * 2;
  }

  (zgemm[param.trans])(param.m, jn, param.k,
			param.alpha, param.a, param.lda,
			b, param.ldb,
			param.c + current*param.ldc, param.ldc, a_buffer);

  munmap((void *)buffer, BUFFER_SIZE);

  if (pthread_flag) pthread_exit(NULL);
  return 0;
};
#endif

/*
 * ZGEMM_ -- BLAS-style driver for the complex matrix-matrix multiply.
 * All scalar arguments are passed by pointer.
 *
 *   TRANSA, TRANSB  operation selectors; 'N', 'T', 'R' or 'C', case
 *                   insensitive.  For 'N'/'R' the operand is used with its
 *                   stored row count (m for A, k for B); for 'T'/'C' with
 *                   k for A and n for B.  (Presumably R = conjugate only
 *                   and C = conjugate transpose -- confirm against the
 *                   kernels.)
 *   M, N, K         problem dimensions, must be >= 0.
 *   alpha, beta     two FLOATs each (read as [0] and [1]).
 *   a, b, c         the matrices; c is updated in place.
 *   LDA, LDB, LDC   leading dimensions; doubled internally before being
 *                   handed to the kernels.
 *
 * Processing order: validate arguments, return early when m or n is 0,
 * scale C through ZGEMM_BETA unless beta == (1, 0), return early when
 * alpha == (0, 0) or k == 0, then run the kernel selected by the two
 * operation codes.  In SMP builds the columns of C are split into slabs
 * that are processed by ZGEMM_THREAD workers.
 *
 * Returns 1 after reporting an invalid argument through xerbla_ (info is
 * the 1-based position of the offending argument), 0 in every other case,
 * including a scratch-buffer allocation failure (reported with INFO_NUM).
 */
int ZGEMM_(char *TRANSA, char *TRANSB, int *M, int *N, int *K,
	   FLOAT *alpha, FLOAT *a, int *LDA,
	                 FLOAT *b, int *LDB, 
	   FLOAT *beta,  FLOAT *c, int *LDC){

  int   m, n, k;
  int lda, ldb, ldc;
  int nota, notb;
  int conja, conjb;
  int trana, tranb;
  int refa, refb;

  int info;
  int trans;
  int nrowa, nrowb;
  char transA, transB;

  FLOAT alpha_r, alpha_i;
  FLOAT beta_r, beta_i;
#ifndef SMP
  void  *buffer;
  FLOAT *a_buffer;
#endif

#ifdef SMP
  int   i, j;
  int   nthreads;
  pthread_t threads[CPU_NUM];
  int num_of_cpu;
#endif

#ifndef SMP
  /* Kernel table indexed by (A op * 4 + B op). */
  int (*zgemm[])(int, int, int, FLOAT *, FLOAT *, int, FLOAT* ,
		 int, FLOAT *, int, FLOAT *)
    ={ ZGEMM_NN, ZGEMM_NT, ZGEMM_NR, ZGEMM_NC,
       ZGEMM_TN, ZGEMM_TT, ZGEMM_TR, ZGEMM_TC,
       ZGEMM_RN, ZGEMM_RT, ZGEMM_RR, ZGEMM_RC,
       ZGEMM_CN, ZGEMM_CT, ZGEMM_CR, ZGEMM_CC};
#endif

  alpha_r = *(alpha + 0);
  alpha_i = *(alpha + 1);

  beta_r = *(beta + 0);
  beta_i = *(beta + 1);

  m = *M;
  n = *N;
  k = *K;

  lda = *LDA*2;
  ldb = *LDB*2;
  ldc = *LDC*2;

  /* toupper() is only defined for unsigned char values (and EOF), so the
     plain chars are converted first. */
  transA = (char)toupper((unsigned char)*TRANSA);
  transB = (char)toupper((unsigned char)*TRANSB);

  nota  = (transA == 'N');
  notb  = (transB == 'N');
  trana = (transA == 'T');
  tranb = (transB == 'T');
  refa  = (transA == 'R');
  refb  = (transB == 'R');
  conja = (transA == 'C');
  conjb = (transB == 'C');

  if (nota || refa) nrowa = m; else nrowa = k;
  if (notb || refb) nrowb = k; else nrowb = n;
  
  /*     Test the input parameters. */
  info = 0;
  if (! nota && ! conja && ! trana && ! refa ) {
    info = 1;
  } else if (! notb && ! conjb && ! tranb && ! refb) {
    info = 2;
  } else if (m < 0) {
    info = 3;
  } else if (n < 0) {
    info = 4;
  } else if (k < 0) {
    info = 5;
  } else if (lda < MAX(1,nrowa)*2) {
    info = 8;
  } else if (ldb < MAX(1,nrowb)*2) {
    info = 10;
  } else if (ldc < MAX(1,m)*2) {
    info = 13;
  }
  if (info != 0) {
#ifdef DGEMM
    xerbla_("ZGEMM ", &info, 6L);
#else
    xerbla_("CGEMM ", &info, 6L);
#endif
    return 1;
  }
  
  /* Quick return if possible. */
  if ((m == 0) || (n == 0) ) return 0;
  
  if (beta_r != 1. || beta_i != 0.) ZGEMM_BETA(m, n, c, ldc, beta_r, beta_i);
  
  /* And when  alpha.eq.zero. */
  if ((alpha_r == 0. && alpha_i == 0.) || k==0) return 0;

  /* Kernel index: A op in bits 2-3, B op in bits 0-1 (N=0, T=1, R=2, C=3).
     Exactly one flag per operand is set once validation has passed. */
  trans = 0x0;
  if (trana) trans  = 0x4;
  if (refa)  trans  = 0x8;
  if (conja) trans  = 0xc;

  if (tranb) trans |= 0x1;
  if (refb)  trans |= 0x2;
  if (conjb) trans |= 0x3;

#ifndef SMP
  /* mmap() signals failure with MAP_FAILED, never with NULL. */
  buffer = mmap(0, BUFFER_SIZE, PROT_READ | PROT_WRITE,
		MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (buffer == MAP_FAILED){
    info = INFO_NUM;
    /* Report under this routine's own name, like the argument checks. */
#ifdef DGEMM
    xerbla_("ZGEMM ", &info, 6L);
#else
    xerbla_("CGEMM ", &info, 6L);
#endif
    return 0;
  }

  /* Round the scratch pointer up to the next 8 MB boundary. */
  a_buffer = (FLOAT *)(((long)buffer + ALIGN_SIZE) & ~ALIGN_SIZE);

  (zgemm[trans])(m, n, k, alpha, a, lda, b, ldb, c, ldc, a_buffer);

  munmap(buffer, BUFFER_SIZE);

#else

  /* Publish the job for the workers. */
  param.trans = trans;
  param.m     = m;
  param.n     = n;
  param.k     = k;
  param.alpha[0] = alpha_r;
  param.alpha[1] = alpha_i;
  param.a     = a;
  param.lda   = lda;
  param.b     = b;
  param.ldb   = ldb;
  param.c     = c;
  param.ldc   = ldc;

  /* Columns per worker: an even split over CPU_NUM, but at least 8. */
  offset = (n-1)/(CPU_NUM)+1;
  if (offset < 8) offset = 8;

  /* Number of slabs actually needed; never exceeds CPU_NUM. */
  num_of_cpu = (n-1)/offset + 1;

  /* Start one thread per slab except the last.  The argument packs the
     first column of the slab with a low "running as a thread" bit.
     NOTE(review): ZGEMM_THREAD returns int rather than void *, so it is
     called through a converted function pointer -- confirm this is
     acceptable on every target ABI. */
  nthreads = 0;
  for(j=0, i=0; i < num_of_cpu - 1; j+= offset, i++){
    if (pthread_create(&threads[nthreads], NULL,
		       (void *(*)(void *))ZGEMM_THREAD,
		       (void *)(long)((j<<1)|1)) == 0){
      nthreads++;
    }else{
      /* Thread could not be started: compute this slab here so that no
	 part of C is silently skipped. */
      ZGEMM_THREAD((void *)(long)((j<<1)));
    }
  }

  /* The calling thread handles the final slab itself. */
  ZGEMM_THREAD((void *)(long)((j<<1)));

  /* Wait only for the threads that were really created. */
  for(i=0; i < nthreads; i++){
    pthread_join(threads[i], NULL);
  }

#endif
  return 0;
}
