#include "atlas_misc.h"
#include "atlas_level1.h"
#include "atlas_lvl2.h"
#include "atlas_reflvl2.h"
#include "atlas_reflevel2.h"
#if defined(ATL_INL1)
   #include Mstr(Mjoin(Mjoin(atlas_,PRE),her2_L1.h))
   #define ATL_her2 Mjoin(PATL,her2_L1)
#elif defined(ATL_INL2)
   #include Mstr(Mjoin(Mjoin(atlas_,PRE),her2_L2.h))
   #define ATL_her2 Mjoin(PATL,her2_L2)
#else
   #include Mstr(Mjoin(Mjoin(atlas_,PRE),her2.h))
   #define ATL_her2 Mjoin(PATL,her2)
#endif

#ifdef ATL_NXTUNE
   extern int ATL_KERN_NX;
   #define ATL_S2NX ATL_KERN_NX
#else
   #include Mstr(Mjoin(Mjoin(atlas_,PRE),syr2NX.h))
   #ifndef ATL_S2NX
      #define ATL_S2NX 128
   #endif
#endif

/*
 * MY_GERK: apply the rank-2 GER kernel to an m_ x n_ panel of A_.
 *
 * The macro expands to a bare braced block (not a do/while), and it reads
 * the identifiers FNU, minN, minM, nu and gerk from whatever scope it is
 * expanded in; they are not macro parameters.
 *
 *  - When FNU is nonzero, nnu is n_ rounded down to a multiple of nu if
 *    n_ >= minN and m_ >= minM, and 0 otherwise.  The first nnu columns are
 *    handed to gerk, and the remaining nr = n_-nnu columns to ATL_GENGERK.
 *  - When FNU is zero, gerk is called on the whole panel.
 *
 * Macro arguments are evaluated more than once, so they must be free of
 * side effects.
 *
 * NOTE(review): the remainder offsets (+nnu on the vectors, nnu*(lda_) on
 * A_) are not doubled, whereas the functions in this file double their
 * indices (j+j, lda+lda).  The macro is not referenced in the visible part
 * of this file; confirm the offsets before using it here.
 */
#define MY_GERK(m_, n_, x_, y_, w_, z_, A_, lda_) \
{ \
   if (FNU) \
   { \
      ATL_CINT nnu = ((n_) >= minN && (m_) >= minM) ? ((n_)/nu)*nu : 0, \
               nr = (n_)-nnu; \
      if (nnu) \
         gerk(m_, nnu, x_, y_, w_, z_, A_, lda_); \
      if (nr) \
         ATL_GENGERK(m_, nr, x_, (y_)+nnu, w_, (z_)+nnu, \
                     (A_)+nnu*(lda_), lda_); \
   } \
   else \
      gerk(m_, n_, x_, y_, w_, z_, A_, lda_); \
}

/*
 * Upper-triangle kernel for one N x N diagonal block of the rank-2 update.
 *
 * The leading columns (a count derived from ATL_S2NX, rounded to a multiple
 * of ATL_s2U_NU and capped at N) are handled by the reference routine.
 * After that, each group of ATL_s2U_NU columns is processed in two steps:
 * the rectangle above the diagonal goes to the GER2 kernel, and the small
 * diagonal block goes to ATL_HER2U_nu.  Any columns left over past the last
 * full group are finished with ATL_GENGERK plus the reference routine.
 *
 * All vector and matrix offsets are doubled (colj+colj, lda+lda) when
 * indexing the TYPE arrays.
 */
void Mjoin(PATL,her2_kU)
(
   ATL_r2kern_t gerk0,          /* func ptr to selected GER kernel */
   int FNU,                  /* nonzero: gerk0 doesn't handle N%ATL_sNU != 0 */
   ATL_CINT N,                  /* size of prob to solve */
   const SCALAR alpha,          /* need orig alpha to pass to ref blas */
   const TYPE *x,               /* input vector x */
   const TYPE *xh,              /* conj(alpha)*x^H */
   const TYPE *y,               /* input vector y */
   const TYPE *yh,              /* alpha * y^H */
   TYPE *A,                     /* hermitian matrix, A = A + x*y^H + y*x^H*/
   ATL_CINT lda                 /* row stride of A */
)
{
   ATL_r2kern_t kern = gerk0;
   /* not referenced by the visible code in this routine; retained as-is */
   TYPE one[2] = {ATL_rone, ATL_rzero};
   ATL_CINT lda_x2 = lda+lda;
   ATL_CINT nfull = (N/ATL_s2U_NU)*ATL_s2U_NU;  /* N rounded down to NU */
   ATL_INT nhead, ntail, colj;

/*
 * Number of leading columns given to the reference routine
 */
   if (ATL_S2NX >= ATL_s2U_NU)
      nhead = (ATL_S2NX/ATL_s2U_NU)*ATL_s2U_NU;
   else
      nhead = ATL_s2U_NU;
   nhead = Mmin(nhead,N);

   Mjoin(PATL,refher2U)(nhead, alpha, x, 1, y, 1, A, lda);
   if (nhead == N)
      return;
/*
 * Full groups of ATL_s2U_NU columns: rectangle above, then diagonal block
 */
   colj = nhead;
   while (colj < nfull)
   {
      #if ATL_MIN_RESTRICTED_M > 0
         if (colj >= ATL_MIN_RESTRICTED_M)
            kern = gerk0;
         else
            kern = ATL_GENGERK;
      #endif
      kern(colj, ATL_s2U_NU, x, yh+colj+colj, y, xh+colj+colj,
           A+colj*lda_x2, lda);
      ATL_HER2U_nu(A+colj*(lda_x2+2), lda, x+colj+colj, y+colj+colj,
                   xh+colj+colj, yh+colj+colj);
      colj += ATL_s2U_NU;
   }
/*
 * Columns beyond the last full group
 */
   ntail = N - nfull;
   if (ntail != 0)
   {
      ATL_GENGERK(colj, ntail, x, yh+colj+colj, y, xh+colj+colj,
                  A+colj*lda_x2, lda);
      Mjoin(PATL,refher2U)(ntail, alpha, x+colj+colj, 1, y+colj+colj, 1,
                           A+colj*(lda_x2+2), lda);
   }
}

/*
 * Lower-triangle kernel for one N x N diagonal block of the rank-2 update.
 *
 * Columns are consumed from the left in groups of ATL_s2L_NU: the small
 * diagonal block goes to ATL_HER2L_nu, and the rectangle below it goes to
 * the GER2 kernel.  The trailing nref columns are finished by the
 * reference routine.  nref starts as min(ATL_S2NX, N) and is enlarged so
 * that the kernel-handled part is a multiple of ATL_s2L_NU.
 *
 * The vector and matrix pointers are advanced in place as the loop walks
 * down the diagonal.
 */
void Mjoin(PATL,her2_kL)
(
   ATL_r2kern_t gerk0,          /* func ptr to selected GER kernel */
   ATL_CINT N,                  /* size of prob to solve */
   const SCALAR alpha,          /* need orig alpha to pass to ref blas */
   const TYPE *x,               /* input vector x */
   const TYPE *xh,              /* conj(alpha)*x^H */
   const TYPE *y,               /* input vector y */
   const TYPE *yh,              /* alpha * y^H */
   TYPE *A,                     /* hermitian matrix, A = A + x*y^H + y*x^H*/
   ATL_CINT lda                 /* row stride of A */
)
{
   ATL_r2kern_t kern = gerk0;
   ATL_CINT lda_x2 = lda+lda;
   ATL_CINT diagStep = ATL_s2L_NU*(lda_x2+2);    /* next diagonal block */
   ATL_CINT vecStep = ATL_s2L_NU+ATL_s2L_NU;     /* NU entries of a vector */
   /* not referenced by the visible code in this routine; retained as-is */
   const TYPE one[2] = {ATL_rone, ATL_rzero};
   ATL_INT nref = Mmin(ATL_S2NX,N), nkern, done, below;

/*
 * Kernel-handled column count is rounded down to a multiple of ATL_s2L_NU;
 * everything else is left for the reference routine
 */
   nkern = ((N - nref)/ATL_s2L_NU)*ATL_s2L_NU;
   nref = N - nkern;

   done = 0;
   while (done < nkern)
   {
      ATL_HER2L_nu(A, lda, x, y, xh, yh);
      below = N - done - ATL_s2L_NU;        /* rows under the diagonal block */
      #if ATL_MIN_RESTRICTED_M > 0
         if (below >= ATL_MIN_RESTRICTED_M)
            kern = gerk0;
         else
            kern = ATL_GENGERK;
      #endif
      kern(below, ATL_s2L_NU, x+vecStep, yh, y+vecStep, xh, A+vecStep, lda);
      A += diagStep;
      x += vecStep;
      y += vecStep;
      xh += vecStep;
      yh += vecStep;
      done += ATL_s2L_NU;
   }
   Mjoin(PATL,refher2L)(nref, alpha, x, 1, y, 1, A, lda);
}

/*
 * Driver for the rank-2 update of the Uplo triangle of the N x N matrix A
 * using vectors X and Y (strides incX, incY) and scalar alpha0.
 *
 * Outline:
 *   1. Return immediately when N < 1 or alpha is zero.
 *   2. Problems with N <= ATL_S2NX, and any call whose workspace malloc
 *      fails, are passed unchanged to the reference routine.
 *   3. Otherwise pick a GER2 kernel, build xh = conj(alpha*x) and
 *      yh = alpha*conj(y) once for the whole problem, and walk the matrix in
 *      blocks of mb: diagonal blocks go to her2_kU/her2_kL, off-diagonal
 *      panels go straight to the GER2 kernel.
 *   4. X and/or Y are copied block-by-block into aligned scratch space when
 *      they are non-unit-stride or do not meet the kernel's alignment
 *      request.
 *
 * The diagonal kernels receive the original alpha0 because they call the
 * reference routine on the unscaled x and y.
 */
void Mjoin(PATL,her2)(const enum ATLAS_UPLO Uplo, ATL_CINT N,
                      const SCALAR alpha0, const TYPE *X, ATL_CINT incX,
                      const TYPE *Y, ATL_CINT incY, TYPE *A, ATL_CINT lda)
{
   size_t t1, t2;
   const TYPE *alpha = alpha0;
   const TYPE calpha[2]={*alpha0, -alpha0[1]};
   ATL_CINT lda2 = lda+lda, incx = incX+incX, incy = incY+incY;
   const int ALPHA_IS_ONE = (alpha[0] == ATL_rone && alpha[1] == ATL_rzero);
   TYPE *x, *y, *xh, *yh;
   void *vp=NULL;
   ATL_r2kern_t gerk, gerk0;
   int MB, NB, mb, nb, mu, nu, minM, minN, alignX, alignXt, FNU;
   int COPYX=0, COPYY, ALIGNX2A=0;
   ATL_INT CacheElts, i, n;
   ATL_CINT S2NU = (Uplo == AtlasUpper) ? ATL_s2U_NU : ATL_s2L_NU;

   if (N < 1 || SCALAR_IS_ZERO(alpha))
      return;
/*
 * For small problems, avoid overhead of func calls & data copy
 */
   if (N <= ATL_S2NX)
   {
      Mjoin(PATL,refher2)(Uplo, N, alpha, X, incX, Y, incY, A, lda);
      return;
   }
/*
 * Determine the GER kernel to use, and its parameters
 */
   ATL_GetPartS2(A, lda, mb, nb);
   if (!mb || !nb || mb > N || nb > N)
   {
      MB = N-S2NU;
      NB = N-S2NU;
      mb = N;
      nb = N;
   }
   else
   {
      MB = mb;
      NB = nb;
   }
   gerk = ATL_GetR2Kern(MB, NB, A, lda, &mu, &nu, &minM, &minN, &alignX,
                        &ALIGNX2A, &alignXt, &FNU, &CacheElts);
   COPYY = (incY != 1);
   if (!COPYY)  /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make Y's modulo with Cachelen match that of A if you want A & Y to have
 *    the same alignment;  We use Y in the same way as X in GER2, so
 *    its alignment is controlled by the X align settings, while
 *    Y^H's alignment is controlled by the Y align settings.
 */
      if (ALIGNX2A)
      {
         t1 = (size_t) A;
         t2 = (size_t) Y;
         COPYY = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignX)
      {
         t1 = (size_t) Y;
         COPYY = ((t1/alignX)*alignX != t1);
      }
   }
   COPYX = (incX != 1);
   if (!COPYX)  /* may still need to copy due to alignment issues */
   {
/*
 *    Same test as for Y above, applied to X: compare X's offset within a
 *    cache line against A's when the kernel wants them to match, otherwise
 *    check X against the requested alignX.
 */
      if (ALIGNX2A)
      {
         t1 = (size_t) A;
         t2 = (size_t) X;   /* must test X here, not Y */
         COPYX = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignX)
      {
         t1 = (size_t) X;
         COPYX = ((t1/alignX)*alignX != t1);
      }
   }
/*
 * Workspace: N elements each for xh and yh, plus one mb-sized block for each
 * of x and y that must be copied, plus slack for the pointer alignments
 */
   i = N+N + (COPYX+COPYY)*mb;
   vp = malloc(ATL_MulBySize(i)+4*ATL_Cachelen);
   if (!vp)
   {
      Mjoin(PATL,refher2)(Uplo, N, alpha, X, incX, Y, incY, A, lda);
      return;
   }
   xh = ATL_AlignPtr(vp);
   yh = xh + N + N;
   yh = ATL_AlignPtr(yh);
   y = yh + N + N;
   if (COPYX)
   {
      x = ALIGNX2A ? ATL_Align2Ptr(y, A) : ATL_AlignPtr(y);
      y = x + mb+mb;
   }
   else
      x = (TYPE*) X;
   if (COPYY)
      y = ALIGNX2A ? ATL_Align2Ptr(y, A) : ATL_AlignPtr(y);
   else
      y = (TYPE*) Y;
   if (ALPHA_IS_ONE)
   {
      Mjoin(PATL,copyConj)(N, X, incX, xh, 1);
      Mjoin(PATL,copyConj)(N, Y, incY, yh, 1);
   }
   else
   {
      Mjoin(PATL,moveConj)(N, calpha, X, incX, xh, 1); /* xh = conj(alpha*x) */
      Mjoin(PATL,moveConj)(N, alpha, Y, incY, yh, 1);  /* yh = alpha*conj(y) */
   }
   if (Uplo == AtlasUpper)
   {
      for (i=0; i < N; i += mb)
      {
         n = N-i;
         mb = Mmin(n, mb);   /* last block may be short */
         n -= mb;            /* columns to the right of the diagonal block */
         #if ATL_MIN_RESTRICTED_M > 0
            if (n < minN)
               gerk = ATL_GENGERK;
         #endif
         if (COPYX)
            Mjoin(PATL,copy)(mb, X+i*incx, incX, x, 1);
         if (COPYY)
            Mjoin(PATL,copy)(mb, Y+i*incy, incY, y, 1);
         Mjoin(PATL,her2_kU)(gerk, FNU, mb, alpha0, x, xh+i+i, y, yh+i+i,
                             A+i*(lda2+2), lda);
         if (n)
            gerk(mb, n, x, yh+((i+mb)<<1), y, xh+((i+mb)<<1),
                    A+(mb+i)*lda2+i+i, lda);
         if (!COPYX)
            x += mb+mb;
         if (!COPYY)
            y += mb+mb;
      }
   }
   else         /* Uplo == AtlasLower */
   {
      i = ((N-1)/mb)*mb;
      MB = N - i;         /* at least MIN(mb,N) col here due to N-1 above */
      gerk0 = gerk;
      #if ATL_MIN_RESTRICTED_M > 0
         gerk = (MB >= minN) ? gerk : ATL_GENGERK;
      #endif
      if (COPYX)
         Mjoin(PATL,copy)(MB, X, incX, x, 1);
      if (COPYY)
         Mjoin(PATL,copy)(MB, Y, incY, y, 1);
      Mjoin(PATL,her2_kL)(gerk, MB, alpha0, x, xh, y, yh, A, lda);
      for (i=MB; i < N; i += mb)
      {
         gerk = (i >= minN && mb >= minM) ? gerk0 : ATL_GENGERK;
/*
 *       Make x and y point at entries i..i+mb-1 of the vectors: either a
 *       fresh copy in the mb-sized scratch block, or directly into the
 *       caller's unit-stride array.  They are block-local from here on, so
 *       no further +i+i offset may be applied to them.
 */
         if (COPYX)
            Mjoin(PATL,copy)(mb, X+i*incx, incX, x, 1);
         else
            x = (TYPE*) (X+i+i);
         if (COPYY)
            Mjoin(PATL,copy)(mb, Y+i*incy, incY, y, 1);
         else
            y = (TYPE*) (Y+i+i);
         /* panel to the left of the diagonal block: rows i.., columns 0..i-1 */
         gerk(mb, i, x, yh, y, xh, A+i+i, lda);
         Mjoin(PATL,her2_kL)(gerk, mb, alpha0, x, xh+i+i, y, yh+i+i,
                             A+i*(lda2+2), lda);
      }
   }

   free(vp);
}
