#include "atlas_misc.h"
#include "atlas_level1.h"
#include "atlas_lvl2.h"
#include "atlas_reflvl2.h"
#include "atlas_reflevel2.h"
/*
 * Select which generated SYR2 kernel header is included and which routine
 * name ATL_syr2 expands to: the _L1 variant when ATL_INL1 is defined, the
 * _L2 variant when ATL_INL2 is defined, and the unsuffixed one otherwise.
 * The header name is built from the precision prefix PRE.
 */
#if defined(ATL_INL1)
   #include Mstr(Mjoin(Mjoin(atlas_,PRE),syr2_L1.h))
   #define ATL_syr2 Mjoin(PATL,syr2_L1)
#elif defined(ATL_INL2)
   #include Mstr(Mjoin(Mjoin(atlas_,PRE),syr2_L2.h))
   #define ATL_syr2 Mjoin(PATL,syr2_L2)
#else
   #include Mstr(Mjoin(Mjoin(atlas_,PRE),syr2.h))
   #define ATL_syr2 Mjoin(PATL,syr2)
#endif

/*
 * ATL_S2NX is the size crossover used throughout this file: problems (or
 * triangular sub-blocks) of order <= ATL_S2NX are handed to the reference
 * SYR2 implementation instead of the GER-kernel-based code.
 * With ATL_NXTUNE it is read from the external variable ATL_KERN_NX (so it
 * is a runtime value); otherwise it comes from the included syr2NX header,
 * with 128 as the fallback when that header does not define it.
 */
#ifdef ATL_NXTUNE
   extern int ATL_KERN_NX;
   #define ATL_S2NX ATL_KERN_NX
#else
   #include Mstr(Mjoin(Mjoin(atlas_,PRE),syr2NX.h))
   #ifndef ATL_S2NX
      #define ATL_S2NX 128
   #endif
#endif

/*
 * MY_GERK: wrapper for a rank-2 GER kernel call that respects the kernel's
 * column-unrolling restriction.
 *
 * The macro is NOT self-contained: it reads the variables FNU, minM, minN,
 * nu and gerk from the scope in which it is expanded.
 *
 * If FNU is nonzero, the leading nnu columns (n_ rounded down to a multiple
 * of nu, or 0 when n_ < minN or m_ < minM) go to gerk and the remaining
 * nr = n_ - nnu columns go to ATL_GENGERK, with y_, z_ and A_ advanced past
 * the first nnu columns.  If FNU is zero, the whole update goes to gerk.
 *
 * The expansion is a braced block, and m_ and n_ are evaluated more than
 * once, so pass side-effect-free expressions.
 */
#define MY_GERK(m_, n_, x_, y_, w_, z_, A_, lda_) \
{ \
   if (FNU) \
   { \
      ATL_CINT nnu = ((n_) >= minN && (m_) >= minM) ? ((n_)/nu)*nu : 0, \
               nr = (n_)-nnu; \
      if (nnu) \
         gerk(m_, nnu, x_, y_, w_, z_, A_, lda_); \
      if (nr) \
         ATL_GENGERK(m_, nr, x_, (y_)+nnu, w_, (z_)+nnu, \
                     (A_)+nnu*(lda_), lda_); \
   } \
   else \
      gerk(m_, n_, x_, y_, w_, z_, A_, lda_); \
}

/*
 * Upper-triangular SYR2 on one N x N diagonal block, using unit-stride
 * vectors:  A += x*y^T + y*x^T  (any alpha has already been folded into
 * x or y by the caller).
 *
 * The work is split into three phases:
 *   1. the leading nref columns are done by the reference SYR2,
 *   2. whole panels of ATL_s2U_NU columns: the rectangle above the diagonal
 *      goes to the GER kernel, the NU x NU diagonal piece to ATL_SYR2U_nu,
 *   3. any columns left over (N not a multiple of ATL_s2U_NU) are done by
 *      the general GER kernel plus the reference SYR2.
 *
 * FNU is accepted for interface compatibility but is not consulted here.
 */
void Mjoin(PATL,syr2_kU)
(
   ATL_r2kern_t gerk0,          /* func ptr to selected GER kernel */
   int FNU,                  /* nonzero: gerk0 doesn't handle N%ATL_sNU != 0 */
   ATL_CINT N,                  /* size of prob to solve */
   const TYPE *x,               /* vector x -- alpha applied to x or y */
   const TYPE *y,               /* vector y -- alpha applied to x or y */
   TYPE *A,                     /* symmetric matrix, A = A + x*y^T + y*x^T */
   ATL_CINT lda                 /* row stride of A */
)
{
   ATL_r2kern_t kern = gerk0;
   ATL_CINT Nfull = (N/ATL_s2U_NU)*ATL_s2U_NU;  /* N rounded down to NU */
   ATL_INT nref, col, nleft;

/*
 * Number of leading columns given to the reference code: the crossover
 * rounded down to a multiple of NU, but never less than one panel and
 * never more than N
 */
   if (ATL_S2NX >= ATL_s2U_NU)
      nref = (ATL_S2NX/ATL_s2U_NU)*ATL_s2U_NU;
   else
      nref = ATL_s2U_NU;
   nref = Mmin(nref,N);

   Mjoin(PATL,refsyr2U)(nref, ATL_rone, x, 1, y, 1, A, lda);
   if (nref == N)
      return;
/*
 * Full panels of NU columns
 */
   for (col = nref; col < Nfull; col += ATL_s2U_NU)
   {
      TYPE *Adiag = A + col*(lda+1);
      const TYPE *xcol = x + col, *ycol = y + col;

      #if ATL_MIN_RESTRICTED_M > 0
         kern = (col >= ATL_MIN_RESTRICTED_M) ? gerk0 : ATL_GENGERK;
      #endif
      kern(col, ATL_s2U_NU, x, ycol, y, xcol, A+col*lda, lda);
      ATL_SYR2U_nu(Adiag, lda, xcol, ycol);
   }
/*
 * Columns that do not fill a whole panel
 */
   nleft = N - Nfull;
   if (nleft)
   {
      ATL_GENGERK(Nfull, nleft, x, y+Nfull, y, x+Nfull, A+Nfull*lda, lda);
      Mjoin(PATL,refsyr2U)(nleft, ATL_rone, x+Nfull, 1, y+Nfull, 1,
                           A+Nfull*(lda+1), lda);
   }
}

/*
 * Lower-triangular SYR2 on one N x N diagonal block, using unit-stride
 * vectors:  A += x*y^T + y*x^T  (any alpha has already been folded into
 * x or y by the caller).
 *
 * The block is walked down its diagonal in panels of ATL_s2L_NU columns.
 * For each panel, ATL_SYR2L_nu updates the NU x NU diagonal piece and the
 * GER kernel updates the rectangle below it.  The trailing nref x nref
 * corner is finished by the reference SYR2; nref is at least
 * min(ATL_S2NX,N) and is enlarged so that the part handled by panels is a
 * multiple of ATL_s2L_NU.
 */
void Mjoin(PATL,syr2_kL)
(
   ATL_r2kern_t gerk0,          /* func ptr to selected GER kernel */
   ATL_CINT N,                  /* size of prob to solve */
   const TYPE *x,               /* vector x -- alpha applied to x or y */
   const TYPE *y,               /* vector y -- alpha applied to x or y */
   TYPE *A,                     /* symmetric matrix, A = A + x*y^T + y*x^T */
   ATL_CINT lda                 /* row stride of A */
)
{
   ATL_r2kern_t kern = gerk0;
   ATL_CINT diagStep = ATL_s2L_NU*(lda+1);  /* one panel down the diagonal */
   ATL_INT nref = Mmin(ATL_S2NX,N), npan, done, below;

/*
 * Columns handled by panels: N-nref rounded down to a multiple of NU;
 * whatever the rounding drops is absorbed into the reference corner
 */
   npan = ((N - nref)/ATL_s2L_NU)*ATL_s2L_NU;
   nref = N - npan;

   done = 0;
   while (done < npan)
   {
      ATL_SYR2L_nu(A, lda, x, y);
      below = N - done - ATL_s2L_NU;     /* rows under this diagonal piece */
      #if ATL_MIN_RESTRICTED_M > 0
         kern = (below >= ATL_MIN_RESTRICTED_M) ? gerk0 : ATL_GENGERK;
      #endif
      kern(below, ATL_s2L_NU, x+ATL_s2L_NU, y, y+ATL_s2L_NU, x,
           A+ATL_s2L_NU, lda);
      A += diagStep;
      x += ATL_s2L_NU;
      y += ATL_s2L_NU;
      done += ATL_s2L_NU;
   }
   Mjoin(PATL,refsyr2L)(nref, ATL_rone, x, 1, y, 1, A, lda);
}

/*
 * Symmetric rank-2 update  A += alpha*X*Y^T + alpha*Y*X^T,  touching only
 * the triangle of A selected by Uplo.
 *
 * Uplo   : AtlasUpper or AtlasLower (anything not Upper is treated as Lower)
 * N      : order of A; nothing is done when N < 1
 * alpha0 : scalar; nothing is done when it is zero
 * X,incX : vector X and its stride
 * Y,incY : vector Y and its stride
 * A,lda  : matrix and its leading dimension
 *
 * Small problems (N <= ATL_S2NX), and the case where the workspace cannot
 * be allocated, are passed straight to the reference SYR2.  Otherwise A is
 * processed in diagonal blocks of order mb: each diagonal block is updated
 * by syr2_kU/syr2_kL and the rectangle beside it by the rank-2 GER kernel.
 *
 * Each input vector is used in two roles with separate alignment rules:
 * x/y are the per-block "no-transpose" operands (length mb), xt/yt the
 * full-length "transpose" operands (always unit stride).  alpha is folded
 * into exactly one role -- either into x and y, or into xt and yt -- so
 * that every product term carries alpha exactly once.
 */
void Mjoin(PATL,syr2)(const enum ATLAS_UPLO Uplo, ATL_CINT N,
                      const SCALAR alpha0, const TYPE *X, ATL_CINT incX,
                      const TYPE *Y, ATL_CINT incY, TYPE *A, ATL_CINT lda)
{
   size_t t1, t2;
   const TYPE alpha = alpha0;
   TYPE alphaX = alpha0, alphaY=alpha0;
   ATL_INT incx=incX, incy=incY;
   int XisXt, YisYt, COPYYt, COPYXt, ApplyAlphaToXt=0, ApplyAlphaToYt=0;
   void *vp2;
   TYPE *x, *xt, *y, *yt;
   const int ALPHA_IS_ONE = (alpha == ATL_rone);
   void *vp=NULL;
   ATL_r2kern_t gerk, gerk0;
   int MB, NB, mb, nb, mu, nu, minM, minN, alignX, alignXt, FNU;
   int COPYX=0, COPYY, ALIGNX2A=0;
   ATL_INT CacheElts, i, n;
   ATL_CINT S2NU = (Uplo == AtlasUpper) ? ATL_s2U_NU : ATL_s2L_NU;

   if (N < 1 || SCALAR_IS_ZERO(alpha))
      return;
/*
 * For small problems, avoid overhead of func calls & data copy
 */
   if (N <= ATL_S2NX)
   {
      Mjoin(PATL,refsyr2)(Uplo, N, alpha, X, incX, Y, incY, A, lda);
      return;
   }
/*
 * Determine the GER kernel to use, and its parameters
 */
   ATL_GetPartS2(A, lda, mb, nb);
   if (!mb || !nb || mb > N || nb > N)
   {
      MB = N-S2NU;
      NB = N-S2NU;
      mb = N;
      nb = N;
   }
   else
   {
      MB = mb;
      NB = nb;
   }
   gerk = ATL_GetR2Kern(MB, NB, A, lda, &mu, &nu, &minM, &minN, &alignX,
                        &ALIGNX2A, &alignXt, &FNU, &CacheElts);
/*
 * See if it is OK to have transpose vectors same as no-transpose
 */
   YisYt = XisXt = ALPHA_IS_ONE;
   if (ALPHA_IS_ONE && alignXt > sizeof(TYPE)) /* align rest may prevent */
   {
      if (ALIGNX2A)
      {
         t1 = (size_t) A;
         if ((t1/alignXt)*alignXt != t1)
            YisYt = XisXt = 0;
      }
      else if (alignXt > alignX)
      {
         if ((alignXt/alignX)*alignX != alignXt)
            YisYt = XisXt = 0;
         else
            alignX = alignXt;
      }
      else if ((alignX/alignXt)*alignXt != alignX)
         YisYt = XisXt = 0;
   }
/*
 * See if we have to copy the no-transpose vectors
 */
   COPYY = (incY != 1);
   if (!COPYY)  /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make Y's modulo with Cachelen match that of A if you want A & Y
 *    to have the same alignment
 */
      if (ALIGNX2A)
      {
         t1 = (size_t) A;
         t2 = (size_t) Y;
         COPYY = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignX)
      {
         t1 = (size_t) Y;
         COPYY = ((t1/alignX)*alignX != t1);
      }
   }
   COPYX = (incX != 1);
   if (!COPYX)  /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make X's modulo with Cachelen match that of A if you want A & X
 *    to have the same alignment
 */
      if (ALIGNX2A)
      {
         t1 = (size_t) A;
         t2 = (size_t) X;
         COPYX = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignX)
      {
         t1 = (size_t) X;
         COPYX = ((t1/alignX)*alignX != t1);
      }
   }
/*
 * See if we have to copy the transpose vectors
 */
   COPYYt = (incY != 1);
   if (!COPYYt && alignXt > sizeof(TYPE))
   {                /* may still need copy due to alignment issues */
      t1 = (size_t) Y;
      COPYYt = ((t1/alignXt)*alignXt != t1);
   }
   COPYXt = (incX != 1);
   if (!COPYXt && alignXt > sizeof(TYPE))
   {                /* may still need copy due to alignment issues */
      t1 = (size_t) X;
      COPYXt = ((t1/alignXt)*alignXt != t1);
   }
/*
 * See if applying alpha will force a copy; must apply alpha to either
 * no-transpose or transpose vectors, not mixture
 */
   if (!ALPHA_IS_ONE)
   {
      if (!COPYX && !COPYXt)
         COPYX = 1;
      else
         ApplyAlphaToXt = !COPYX;
      if (ApplyAlphaToXt)
         COPYYt = ApplyAlphaToYt = 1;
      else   /* must apply alpha to Y */
         COPYY = 1;
   }
/*
 * Compute amount of space necessary to allocate any needed vectors
 */
   i = 0;
   if (!YisYt)  /* vectors are distinct */
   {
      i += (COPYY) ? mb : 0;
      i += (COPYYt) ? N : 0;
   }
   else if (COPYY || COPYYt)
      i = N;

   if (!XisXt)  /* vectors are distinct */
   {
      i += (COPYX) ? mb : 0;
      i += (COPYXt) ? N : 0;
   }
   else if (COPYX || COPYXt)
      i += N;
/*
 * Allocate space, and set vector pointers; start out assuming all vectors
 * come from original input
 */
   x = xt = (TYPE*) X;
   y = yt = (TYPE*) Y;
   if (i)
   {
/*
 *    Up to four vectors are carved out of one allocation, each aligned
 *    separately, hence the 4*ATL_Cachelen of slack
 */
      vp2 = vp = malloc(ATL_MulBySize(i) + 4*ATL_Cachelen);
      if (!vp)
      {
         Mjoin(PATL,refsyr2)(Uplo, N, alpha, X, incX, Y, incY, A, lda);
         return;
      }
      if (COPYYt)
      {
         if (YisYt)
            yt = ALIGNX2A ? ATL_Align2Ptr(vp2, A) : ATL_AlignPtr(vp2);
         else
            yt = ATL_AlignPtr(vp2);
         if (ApplyAlphaToYt && !ALPHA_IS_ONE)
            Mjoin(PATL,cpsc)(N, alpha, Y, incY, yt, 1);
         else
            Mjoin(PATL,copy)(N, Y, incY, yt, 1);
         if (YisYt)
         {
            y = yt;
            COPYY = 0;
            incy = 1;
         }
         vp2 = yt + N;
      }
      if (COPYY)
      {
         y = ALIGNX2A ? ATL_Align2Ptr(vp2, A) : ATL_AlignPtr(vp2);
         if (ApplyAlphaToYt)   /* alpha already in yt: copy y unscaled */
            alphaY = ATL_rone;
         vp2 = y + mb;
      }
      if (COPYXt)
      {
         if (XisXt)
            xt = ALIGNX2A ? ATL_Align2Ptr(vp2, A) : ATL_AlignPtr(vp2);
         else
            xt = ATL_AlignPtr(vp2);
         if (ApplyAlphaToXt && !ALPHA_IS_ONE)
            Mjoin(PATL,cpsc)(N, alpha, X, incX, xt, 1);
         else
            Mjoin(PATL,copy)(N, X, incX, xt, 1);
         if (XisXt)
         {
            x = xt;
            COPYX = 0;
            incx = 1;
         }
         vp2 = xt + N;
      }
      if (COPYX)
      {
         x = ALIGNX2A ? ATL_Align2Ptr(vp2, A) : ATL_AlignPtr(vp2);
         if (ApplyAlphaToXt)   /* alpha already in xt: copy x unscaled */
            alphaX = ATL_rone;
      }
   }
/*
 * If we are copying the no-transpose vectors, see if we can copy them from
 * the transpose vectors.  This case can occur due to differing align restr.
 */
   if (COPYX && COPYXt && ALPHA_IS_ONE)
   {
      X = (const TYPE*) xt;
      incx = 1;
   }
   if (COPYY && COPYYt && ALPHA_IS_ONE)
   {
      Y = (const TYPE*) yt;
      incy = 1;
   }
/*
 * In both loops below x and y always address the current block of the
 * no-transpose vectors: when they are copied, the mb-long workspace is
 * refilled for each block; when they are not, the pointers walk the input.
 * xt and yt are full-length and unit-stride, so they are indexed by element
 * offset only (never by incx/incy).
 * Rectangular updates go through MY_GERK so that a kernel which cannot
 * handle a column count that is not a multiple of nu (FNU nonzero) has its
 * remainder columns sent to the general kernel.
 */
   if (Uplo == AtlasUpper)
   {
      for (i=0; i < N; i += mb)
      {
         n = N-i;
         mb = Mmin(n, mb);
         n -= mb;         /* columns to the right of this diagonal block */
         #if ATL_MIN_RESTRICTED_M > 0
            if (n < minN)
               gerk = ATL_GENGERK;
         #endif
         if (COPYX)
            Mjoin(PATL,cpsc)(mb, alphaX, X+i*incx, incx, x, 1);
         if (COPYY)
            Mjoin(PATL,cpsc)(mb, alphaY, Y+i*incy, incy, y, 1);
         Mjoin(PATL,syr2_kU)(gerk, FNU, mb, x, yt+i, A+i*(lda+1), lda);
         if (n)
         {
            MY_GERK(mb, n, x, yt+i+mb, y, xt+i+mb, A+(mb+i)*lda+i, lda);
         }
         if (!COPYX)
            x += mb;
         if (!COPYY)
            y += mb;
      }
   }
   else         /* Uplo == AtlasLower */
   {
      i = ((N-1)/mb)*mb;
      MB = N - i;         /* at least MIN(mb,N) col here due to N-1 above */
      gerk0 = gerk;
      #if ATL_MIN_RESTRICTED_M > 0
         gerk = (MB >= minN) ? gerk : ATL_GENGERK;
      #endif
/*
 *    First diagonal block has order MB (1 <= MB <= mb); every later block
 *    has order exactly mb
 */
      if (COPYX)
         Mjoin(PATL,cpsc)(MB, alphaX, X, incx, x, 1);
      if (COPYY)
         Mjoin(PATL,cpsc)(MB, alphaY, Y, incy, y, 1);
      Mjoin(PATL,syr2_kL)(gerk, MB, x, yt, A, lda);
      if (!COPYX)
         x += MB;         /* step over the block just done, not over mb */
      if (!COPYY)
         y += MB;
      for (i=MB; i < N; i += mb)
      {
         gerk = (i >= minN && mb >= minM) ? gerk0 : ATL_GENGERK;
/*
 *       Scale with alphaX/alphaY (not alpha): they are one when alpha has
 *       already been folded into the transpose copies
 */
         if (COPYX)
            Mjoin(PATL,cpsc)(mb, alphaX, X+i*incx, incx, x, 1);
         if (COPYY)
            Mjoin(PATL,cpsc)(mb, alphaY, Y+i*incy, incy, y, 1);
/*
 *       Rows i..i+mb-1: rectangle left of the diagonal (columns 0..i-1),
 *       then the diagonal block itself
 */
         MY_GERK(mb, i, x, yt, y, xt, A+i, lda);
         Mjoin(PATL,syr2_kL)(gerk, mb, x, yt+i, A+i*(lda+1), lda);
         if (!COPYX)
            x += mb;
         if (!COPYY)
            y += mb;
      }
   }

   free(vp);   /* free(NULL) is a no-op */
}
