#include "atlas_misc.h"
#include "atlas_level1.h"
#include "atlas_reflvl2.h"
#include "atlas_reflevel2.h"
#include "atlas_lvl2.h"
#if defined(ATL_INL1)
   #include Mstr(Mjoin(Mjoin(atlas_,PRE),syr_L1.h))
   #define ATL_her Mjoin(PATL,her_L1)
#elif defined(ATL_INL2)
   #include Mstr(Mjoin(Mjoin(atlas_,PRE),syr_L2.h))
   #define ATL_her Mjoin(PATL,her_L2)
#else
   #include Mstr(Mjoin(Mjoin(atlas_,PRE),syr.h))
   #define ATL_her Mjoin(PATL,her)
#endif

#ifdef ATL_NXTUNE
   extern int ATL_KERN_NX;
   #define ATL_S1NX ATL_KERN_NX
#else
   #include Mstr(Mjoin(Mjoin(atlas_,PRE),syrNX.h))
   #ifndef ATL_S1NX
      #define ATL_S1NX 128
   #endif
#endif
void Mjoin(PATL,her_kU)
(
   ATL_r1kern_t gerk0,          /* func ptr to selected GER kernel */
   ATL_CINT FNU,                /* non-0: kern does not handle N%NU != 0 */
   ATL_CINT N,                  /* size of prob to solve */
   const TYPE alpha,            /* alpha */
   const TYPE *x,               /* input vector X */
   const TYPE *xh,              /* alpha*X^H */
   TYPE *A,                     /* hermitian matrix, A = A + x*xh */
   ATL_CINT lda                 /* row stride of A */
)
{
   ATL_r1kern_t gerk=gerk0;
   ATL_INT nx, j;
   TYPE one[2] = {ATL_rone, ATL_rzero};
   ATL_CINT lda2 = lda+lda;
   ATL_CINT NN = (N/ATL_s1U_NU)*ATL_s1U_NU;

   nx = (ATL_S1NX >= ATL_s1U_NU) ? (ATL_S1NX/ATL_s1U_NU)*ATL_s1U_NU : ATL_s1U_NU;
   nx = Mmin(nx, N);
   Mjoin(PATL,refherU)(nx, alpha, x, 1, A, lda);
   for (j=nx; j < NN; j += ATL_s1U_NU)
   {
      #if ATL_MIN_RESTRICTED_M > 0
         gerk = (j >= ATL_MIN_RESTRICTED_M) ? gerk0 : ATL_GENGERK;
      #endif
      gerk(j, ATL_s1U_NU, x, xh+j+j, A+j*lda2, lda);
      ATL_HER1U_nu(A+j*(lda2+2), lda, x+j+j, xh+j+j);
   }
   nx = N - j;
   if (nx)
   {
      ATL_GENGERK(j, nx, x, xh+j+j, A+j*lda2, lda);
      Mjoin(PATL,refherU)(nx, alpha, x+j+j, 1, A+j*(lda2+2), lda);
   }
}

void Mjoin(PATL,her_kL)
(
   ATL_r1kern_t gerk0,          /* func ptr to selected GER kernel */
   ATL_CINT N,                  /* size of prob to solve */
   const TYPE alpha,            /* alpha */
   const TYPE *x,               /* input vector X */
   const TYPE *xh,              /* alpha*X^H */
   TYPE *A,                     /* hermitian matrix, A = A + x*xh */
   ATL_CINT lda                 /* row stride of A */
)
{
   ATL_r1kern_t gerk=gerk0;
   ATL_INT nx=Mmin(ATL_S1NX,N), i, NN, n;
   ATL_CINT lda2 = lda+lda;
   const TYPE one[2] = {ATL_rone, ATL_rzero};

   i = N - nx;
   i = (i/ATL_s1L_NU)*ATL_s1L_NU;
   if (i != N-nx)
      nx += N-nx-i;
   NN = N - nx;
   for (i=0; i < NN; i += ATL_s1L_NU)
   {
      ATL_HER1L_nu(A, lda, x, xh);
      n = N-i-ATL_s1L_NU;
      #if ATL_MIN_RESTRICTED_M > 0
         gerk = (n >= ATL_MIN_RESTRICTED_M) ? gerk0 : ATL_GENGERK;
      #endif
      gerk(n, ATL_s1L_NU, x+ATL_s1L_NU+ATL_s1L_NU, xh, A+ATL_s1L_NU+ATL_s1L_NU, lda);
      A += ATL_s1L_NU*(lda2+2);
      xh += ATL_s1L_NU+ATL_s1L_NU;
      x += ATL_s1L_NU+ATL_s1L_NU;
   }
   Mjoin(PATL,refher)(AtlasLower, nx, alpha, x, 1, A, lda);
}

#define MY_GERK(m_, n_, x_, xt_, A_, lda_) \
{ \
   if (FNU) \
   { \
      ATL_CINT nnu = ((n_) >= minN && (m_) >= minM) ? ((n_)/nu)*nu : 0, \
               nr = (n_)-nnu; \
      if (nnu) \
         gerk(m_, nnu, x_, xt_, A_, lda_); \
      if (nr) \
         ATL_GENGERK(m_, nr, x_, (xt_)+nnu, (A_)+nnu*(lda_), lda_); \
   } \
   else \
      gerk(m_, n_, x_, xt_, A_, lda_); \
}
void ATL_her(const enum ATLAS_UPLO Uplo, ATL_CINT N, const TYPE alpha,
               const TYPE *X, ATL_CINT incX, TYPE *A, ATL_CINT lda)
{
   size_t t1, t2;
   const TYPE one[2] = {ATL_rone, ATL_rzero}, calpha[2] = {alpha, ATL_rzero};
   ATL_CINT lda2 = lda+lda, incx = incX+incX;
   void *vp=NULL;
   TYPE *x, *xh;
   ATL_r1kern_t gerk, gerk0;
   ATL_INT MB, NB, mb, nb, Nmb, n, i, CacheElts;
   int mu, nu, minM, minN, alignX, alignXt, FNU;
   int COPYX=0, COPYXt=0, ALIGNX2A=0;
   const int ALPHA_IS_ONE=(alpha == ATL_rone);

   if (N < 1 || (alpha == ATL_rzero))
      return;
/*
 * For very small problems, avoid overhead of func calls & data copy
 */
   if (N < 50)
   {
      Mjoin(PATL,refher)(Uplo, N, alpha, X, incX, A, lda);
      return;
   }
/*
 * Determine the GER kernel to use, and its parameters
 */
   ATL_GetPartS1(A, lda, mb, nb);
   if (!mb || !nb || mb > N || nb > N)
   {
      MB = N-ATL_s1L_NU;
      NB = N-ATL_s1L_NU;
      mb = nb = N;
   }
   else
   {
      MB = mb;
      NB = nb;
   }
   gerk = ATL_GetR1Kern(MB, NB, A, lda, &mu, &nu, &minM, &minN, &alignX,
                        &ALIGNX2A, &alignXt, &FNU, &CacheElts);
/*
 * Determine if we need to copy the vectors
 */
   COPYX = (incX != 1);
   if (!COPYX)  /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make X's % with Cachelen match that of A if you want A & X to have
 *    the same alignment
 */
      if (ALIGNX2A)
      {
         t1 = (size_t) A;
         t2 = (size_t) X;
         COPYX = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignX)
      {
         t1 = (size_t) X;
         COPYX = ((t1/alignX)*alignX != t1);
      }
   }
   i = N + COPYX*mb;
   vp = malloc(ATL_MulBySize(i)+2*ATL_Cachelen);
   if (!vp)
   {
      Mjoin(PATL,refher)(Uplo, N, alpha, X, incX, A, lda);
      return;
   }
   xh = ATL_AlignPtr(vp);
   if (COPYX)
   {
      x = xh + N+N;
      x = ALIGNX2A ? ATL_Align2Ptr(x, A) : ATL_AlignPtr(x);
   }
   else
      x = (TYPE*) X;
   if (ALPHA_IS_ONE)
      Mjoin(PATL,copyConj)(N, X, incX, xh, 1);
   else
      Mjoin(PATL,moveConj)(N, calpha, X, incX, xh, 1);
   Nmb = ((N-1)/mb)*mb;
   if (Uplo == AtlasUpper)
   {
      for (i=0; i < Nmb; i += mb)
      {
         n = N-i-MB;
         if (COPYX)
            Mjoin(PATL,copy)(MB, X+i*incx, incX, x, 1);
         Mjoin(PATL,her_kU)(gerk, FNU, MB, alpha, x, xh+i+i, A+i*(lda2+2), lda);
         MY_GERK(MB, n, x, xh+((i+MB)<<1), A+(MB+i)*lda2+i+i, lda);
         if (!COPYX)
            x += MB+MB;
      }
      mb = N - Nmb;
      if (COPYX)
         Mjoin(PATL,copy)(mb, X+Nmb*incx, incX, x, 1);
      Mjoin(PATL,her_kU)(gerk, FNU, mb, alpha, x, xh+Nmb+Nmb,
                         A+Nmb*(lda2+2), lda);
   }
   else         /* Uplo == AtlasLower */
   {
      mb = N - Nmb;
      #if ATL_MIN_RESTRICTED_M > 0
         gerk0 = gerk = (mb >= minM) ? gerk0 : ATL_GENGERK;
      #endif
      if (COPYX)
         Mjoin(PATL,copy)(mb, X, incX, x, 1);
      Mjoin(PATL,her_kL)(gerk, mb, alpha, x, xh, A, lda);
      for (i=mb; i < N; i += MB)
      {
         #if ATL_MIN_RESTRICTED_M > 0
            gerk = (i >= minN) ? gerk0 : ATL_GENGERK;
         #endif
         if (COPYX)
            Mjoin(PATL,copy)(MB, X+i*incx, incX, x, 1);
         else
            x += mb+mb;
         MY_GERK(MB, i, x, xh, A+i+i, lda);
         Mjoin(PATL,her_kL)(gerk, MB, alpha, x, xh+i+i, A+i*(lda2+2), lda);
         mb = MB;
      }
   }

   if (vp)
     free(vp);
}
