#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include "mr1.h"
#include "m2_function_table.h"

static mr1tablecoefficients *MR1TableCoefficients[1000];
static int  MR1SetForceModeBuf[1000] = {0,1, 0,1, -1,-1, 0,1, -1,-1, -1,-1};

static int flag_s=1;

void MR1SetTable_emu(char *filename, int tblno, int flag)
{
/* flag 0 -- force
        1 -- potential
*/
  M2_FUNCTION_TABLE ftb;
  int i,j;
  mr1tablecoefficients *tc;
  char fn[1000];

  if(tblno<500 || tblno>=1000){
    fprintf(stderr,"** error : tblno must be from 500 to 999 **\n");
    exit(1);
  }
  if((MR1TableCoefficients[tblno]=(mr1tablecoefficients *)
      malloc(sizeof(mr1tablecoefficients)))==NULL){
    fprintf(stderr,"** error : malloc of MR1TableCoefficients failed **\n");
    exit(1);
  }
  tc=MR1TableCoefficients[tblno];
  sprintf(fn,"%s.table",filename);
  /*  printf("Table file is %s, %s\n",filename,fn);*/
  ftb=m2_read_function_table(fn);
  tc->estart=ftb.estart;
  tc->mshift=ftb.mshift;
#if 0
  printf("MR1SetTable_emu: estart=%d mshift=%d\n",ftb.estart,ftb.mshift);
#endif
  tc->domain_max=ftb.domain_max;
  /*  printf("estart=%d mshift=%d domain_max=%e\n",
      tc->estart,tc->mshift,tc->domain_max);*/
  for(i=0;i<M2_NUM_RACO_WORDS;i++){
    for(j=0;j<6;j++){
      if(j<5) (tc->c)[i][j]=(double)(ftb.coefficient_table[i][j]);
      else    (tc->c)[i][j]=pow(2,ftb.coefficient_table[i][j]);
      (tc->ci)[i][j]=ftb.coefficient_table[i][j];
    }
  }
  for(i=0;i<M2_NUM_RAEX_WORDS;i++){
    tc->e[i]=ftb.exponents[i];
    tc->ed[i]=pow(2,ftb.exponents[i]-122);
  }
  MR1SetForceModeBuf[tblno]=flag;
}


void mr1settable_emu_(char *filename, int *tblno, int *flag)
{
    MR1SetTable_emu(filename,*tblno,*flag);
}

void mr1settable_emu__(char *filename, int *tblno, int *flag)
{
    MR1SetTable_emu(filename,*tblno,*flag);
}


void mr1settable_emu_nf_(char *fname, int *tblno, int *flag, int *n)
{
    char *f;

    if((f=(char *)malloc(sizeof(char)*(*n+1)))==NULL){
	fprintf(stderr,"** error : can't malloc f **\n");
	exit(1);
    }
    snprintf(f,*n+1,"%s",fname);
    MR1SetTable_emu(f,*tblno,*flag);
    free(f);
}


void mr1settable_emu_nf__(char *filename, int *tblno, int *flag, int *n)
{
    mr1settable_emu_nf_(filename,tblno,flag,n);
}


static float r2_unit(float dr[3], float e)
{
  float dx2,dy2,dz2,e_plus_dx2,dy2_plus_dz2,r2;

  dx2=dr[0]*dr[0];
  dy2=dr[1]*dr[1];
  dz2=dr[2]*dr[2];
  e_plus_dx2 = e + dx2;
  dy2_plus_dz2 = dy2 + dz2;
  r2 = e_plus_dx2 + dy2_plus_dz2;

  return r2;
}


static int fev_segmentation_v2(float r2,int estart,int mshift,
			int *er2,int *caddr,int *dx)
{
    FLOAT x;

    x.f=r2;
    *er2=SLICE(x.i,23,8);
    if((*er2-estart)<0) return 1;
    *caddr=(SLICE(*er2-estart,0,mshift)<<(10-mshift))
		+SLICE(x.i,13+mshift,10-mshift);

    if(mshift<7){
	*dx=SLICE(x.i,0,13+mshift)<<(7-mshift);
    }
    else{
	*dx=SLICE(x.i,mshift-7,MDGP2_WID_TBL_DX);
    }
    *dx^=0x80000;
    if(flag_s==0){
	printf("  er2=%x(%d)  caddr=%x(%d)  dx=%x(%d)\n",
	       *er2,*er2,*caddr,*caddr,*dx,*dx);
    }
    return 0;
}


static void fev_poly_v2(int c[],int dx,int *z)
{
    int xsign;
    long long x,x_11,x_16,lz;
    long long muc4, muc3, muc2, muc1;
    double xd,x_11d,x_16d,lzd;
    double muc4d,muc3d,muc2d,muc1d;

    x = (long long)dx&MASK(MDGP2_WID_TBL_DX-1);
    xsign = (dx>>(MDGP2_WID_TBL_DX-1))&1;
    if (xsign) x=x-(1<<(MDGP2_WID_TBL_DX-1));

    x_11=x>>(MDGP2_WID_TBL_DX-MDGP2_WID_TBL_4);
    x_16=x>>(MDGP2_WID_TBL_DX-MDGP2_WID_TBL_3);

    lz=(int)c[4];
#if 0
    printf("      x=%llx x_11=%llx x_16=%llx\n",x,x_11,x_16);
    printf("      lz=%llx\n",lz);
#endif
    muc4 = lz * x_11;
    lz=(int)c[3]+(muc4>>(MDGP2_WID_TBL_4-1));
#if 0
    printf("      lz=%llx\n",lz);
#endif
    muc3 = lz * x_16;
    lz=(int)c[2]+(muc3>>(MDGP2_WID_TBL_3-1));
#if 0
    printf("      lz=%llx\n",lz);
#endif
    muc2 = lz * x;
    lz=(int)c[1]+(muc2>>(MDGP2_WID_TBL_2-1));
#if 0
    printf("      lz=%llx\n",lz);
#endif
    muc1 = lz * x;
#if 0
    printf("      muc1=%llx\n",muc1);
#endif
    lz=(int)c[0]+(muc1>>(MDGP2_WID_TBL_DX-1));
#if 0
    printf("      muc1>>(MDGP2_WID_TBL_DX-1)=%x\n",
	   (muc1>>(MDGP2_WID_TBL_DX-1)));
    printf("      lz=%llx\n",lz);
#endif

    *z=lz;
    /*
    xd=x;x_11d=x_11;x_16d=x_16;
    lzd=c[4]*pow(2,;
    muc4d=lzd*x_11d;*/
    
}


static void accumulator_unit_v2(float bjg, const float dr[3], int potflag, 
			 double fi[3]){
	int i;
	float float_f[3];
	double double_f[3];

	if(potflag==0){
	  for(i = 0; i < 3; i ++){
	    float_f[i] = bjg * dr[i];
	    fi[i]+=(double)float_f[i];
	  }
	}
	else{
	  for(i = 0; i < 3; i ++){
	    fi[i]+=(double)bjg;
	  }
	}
}


static void mdgp2pipe_v3(double rid[3], double rjd[3], 
			 double ad, double bd, double ed, double hd,
			 double size_1, double xmax_scale1, double xmax_scale2,
			 int table[][6], double tabled[][6], double exp_table[],
			 int estart, int mshift,
			 int potflag, double fi[3])
{
    float a,b,h,e,r2,dr[3],r2a,g,bg;
    int k;
    SDOUBLE dru[3];
    double ri[3],rj[3];

#ifdef DEBUG_MDGP2PIPEV3
    printf("  in mdgp2pipe_v3 rid=%e %e %e\n",rid[0],rid[1],rid[2]);
    printf("    rjd=%e %e %e\n",rjd[0],rjd[1],rjd[2]);
    printf("    ad=%e bd=%e ed=%e hd=%e\n",ad,bd,ed,hd);
    printf("    size_1=%e xmax_scale1=%e xmax_scale2=%e\n",
	   size_1,xmax_scale1,xmax_scale2);
    printf("    estart=%d mshift=%d potflag=%d\n",estart,mshift,potflag);
#endif
#if 0
    for(k=0;k<256;k++)
      printf("    exp_table[%d]=%e\n",k,exp_table[k]);
#endif
    a = (float)(xmax_scale1 * ad);
    if(potflag==0) b=(float)(bd*xmax_scale2);
    else           b=(float)(bd);
    h=(float)hd;
    e=(float)ed;
    
    for(k=0;k<3;k++){
        ri[k]=rid[k]*size_1+0x1800;
        /*        fi[k]=0.0;*/
    }
    for(k=0;k<3;k++){
        rj[k]=rjd[k]*size_1+0x1800;
        dru[k].f=rj[k]-ri[k];
        dru[k].f+=0x1800;
#ifdef DEBUG_MDGP2PIPEV3
	printf("    dru[%d]=%e\n",k,dru[k].f);
#endif
        dru[k].i&=LMASK(MDGP2_WID_R);
        if(BIT(dru[k].i,MDGP2_WID_R-1)!=0){
            dru[k].i|=~LMASK(MDGP2_WID_R);
        }
        dr[k]=dru[k].i;
    }
    r2=r2_unit(dr,e);
    r2a=r2*a;
#ifdef DEBUG_MDGP2PIPEV3
    printf("    dr=%e %e %e\n",dr[0],dr[1],dr[2]);
    printf("    r2=%e r2a=%e a=%e\n",r2,r2a,a);
#endif
    {
        int er2,caddr,dx,z;
        if(fev_segmentation_v2(r2a,estart,mshift,
                               &er2,&caddr,&dx)!=0){
            g=0.0;
        }
        else{
            fev_poly_v2(table[caddr],dx,&z);
            if(BIT(z,27)){
                z|=~MASK(28);
            }
            g=z * tabled[caddr][5] * exp_table[er2];
#ifdef DEBUG_MDGP2PIPEV3
	    printf("    table[%d]=%d %d %d %d %d %d\n",
		   caddr,table[caddr][0],table[caddr][1],
		   table[caddr][2],table[caddr][3],table[caddr][4],
		   table[caddr][5]);
	    printf("    exp_table[%d]=%e\n",er2,exp_table[er2]);
	    printf("    caddr=%d er2=%d dx=%x\n",caddr,er2,dx);
#endif
        }
    }
    bg=g*b;
    accumulator_unit_v2(bg, dr, potflag, fi);
#ifdef DEBUG_MDGP2PIPEV3
    printf("    g=%e bg=%e dr=%e %e %e\n",
	   g,bg,dr[0],dr[1],dr[2]);
    printf("    fi=%e %e %e\n",fi[0],fi[1],fi[2]);
#endif
}


void MR1calccoulomb_nlist_emu(double x[], int n, double q[], double rscale,
			      int tblno, double xmax, int periodicflag,
			      int numex[], int natex[], double factor,
			      double force[])
{
  /*
      charge of i-particle is always multiplied 

      periodicflag bit 0 ---- 0 : non periodic
                              1 : periodic
                   bit 1 ---- 0 : use Newton's third law
                                  do not include duplicate list
                              1 : do not use Newton's third law
                   bit 2 ---- 0 : multiply charge and rscale
                              1 : do not multiply charge nor rscale
   */
  mr1tablecoefficients *tc;
  int i,i3,j,j3,k,potflag,nc,jj;
  double ri[3],rj[3],size_1,rscale_sq,dtmp;
  unsigned long long ril[3],rjl[3];
  SDOUBLE dru[3];
  float a,b,h=0.0,e=0.0,r2,dr[3],r2a,g,bg;
  float dx2,dy2,dz2,e_plus_dx2,dy2_plus_dz2;
  double fi[3],xmax_scale1,xmax_scale2,ad,bd;
  int ic,iexcl,ijflag,multicharge;

  if(tblno<500 || tblno>=1000){
    fprintf(stderr,"** error : tblno=%d is out of range **\n",tblno);
    exit(1);
  }
  tc=MR1TableCoefficients[tblno];
  potflag=MR1SetForceModeBuf[tblno];
  if((periodicflag & 1)==0) xmax*=2.0;
  if((periodicflag & 2)==0) ijflag=1;
  else                      ijflag=0;
  if((periodicflag & 4)==0) multicharge=1;
  else                      multicharge=0;
  rscale_sq=rscale * rscale;
  size_1=1.0/xmax;
  dtmp=xmax / tc->domain_max;
  xmax_scale1=dtmp * dtmp * pow(2, -18);
  xmax_scale2=pow(2, -40) * xmax;
  iexcl = 0;
  for(i=i3=0;i<n;i++,i3+=3){
      for(jj=iexcl;jj<iexcl+numex[i];jj++){
          if(natex[jj]>=0){
              j=natex[jj];
              j3=j*3;
              ad=rscale_sq;
              bd=q[j];
	      if(bd!=0.0){
		  for(k=0;k<3;k++) fi[k]=0.0;
		  mdgp2pipe_v3(x+i3,x+j3,ad,bd,0.0,0.0,
			       size_1,xmax_scale1,xmax_scale2,
			       tc->ci,tc->c,tc->ed,
			       tc->estart,tc->mshift,
			       potflag,fi);
#if 0
		  if(i<3){
		    printf(" emu i=%d j=%d rscale=%f qi=%f qj=%f fc=%f\n",
			   i,j,rscale,q[i],q[j],factor);
		    printf("   xi=%f %f %f xj=%f %f %f\n",
			   x[i3],x[i3+1],x[i3+2],x[j3],x[j3+1],x[j3+2]);
		    printf("   f=%f %f %f\n",fi[0],fi[1],fi[2]);
		    printf("   fq=%f %f %f\n",
			   fi[0]*factor*q[i],
			   fi[1]*factor*q[i],
			   fi[2]*factor*q[i]);
		    printf("   f a^3 q=%f %f %f\n",
			   fi[0]*rscale_sq*rscale*factor*q[i],
			   fi[1]*rscale_sq*rscale*factor*q[i],
			   fi[2]*rscale_sq*rscale*factor*q[i]);
		  }
#endif
#if 0
		  if(i<4)
		    printf("    force_old i=%d,%f j=%d,%f\n",
			   i,force[i3],j,force[j3]);
#endif
		  if(potflag==0){
                    if(multicharge!=0)
		      dtmp=q[i]*rscale_sq*rscale*factor;
		    else
		      dtmp=factor;
		    for(k=0;k<3;k++){
		      force[i3+k]+=fi[k]*dtmp;
		      if(ijflag!=0) force[j3+k]-=fi[k]*dtmp;
		    }
		  }
		  else{
                    if(multicharge!=0)
		      dtmp=q[i]*rscale*factor;
		    else
		      dtmp=factor;
		    force[i3]+=fi[0]*dtmp;
		    if(ijflag!=0) force[j3]+=fi[0]*dtmp;
		  }
#if 0
		  if(i<4)
		    printf("    force_new i=%d,%f j=%d,%f\n",
			   i,force[i3],j,force[j3]);
#endif
	      }
          }
      }
      iexcl = iexcl + numex[i];
  }
}


void mr1calccoulomb_nlist_emu_(double x[], int *n, double q[], double *rscale,
			       int *tblno, double *xmax, int *periodicflag,
			       int numex[], int natex[], double *factor,
			       double force[])
{
    int i,*natex2,s;

    for(i=s=0;i<*n;i++){
        s+=numex[i];
    }
    if(s==0) return;
    if((natex2=(int *)malloc(sizeof(int)*s))==NULL){
        fprintf(stderr,
                "** error at malloc natex2 in mr1calccoulomb_nlist_emu_ **\n");
        exit(1);
    }
    for(i=0;i<s;i++)  natex2[i]=natex[i]-1;
    MR1calccoulomb_nlist_emu(x,*n,q,*rscale,
			     *tblno,*xmax,*periodicflag,
			     numex,natex2,*factor,force);
    free(natex2);
}


void mr1calccoulomb_nlist_emu__(double x[], int *n, double q[], double *rscale,
				int *tblno, double *xmax, int *periodicflag,
				int numex[], int natex[], double *factor,
				double force[])
{
  mr1calccoulomb_nlist_emu_(x,n,q,rscale,tblno,xmax,periodicflag,
			    numex,natex,factor,force);
}


void MR1calcvdw_nlist_emu2(double x[], int n, int atype[], int nat,
			   double gscale[], double rscale[], int tblno,
			   double xmax, int periodicflag,
			   int numex[], int natex[], double factor,
			   double force[])
{
  /*
      periodicflag bit 0 ---- 0 : non periodic
                              1 : periodic
                   bit 1 ---- 0 : use Newton's third law
                                  do not include duplicate list
                              1 : do not use Newton's third law
   */
  mr1tablecoefficients *tc;
  int i,i3,j,j3,k,potflag,nc,jj;
  double ri[3],rj[3],size_1,rscale_sq,dtmp;
  unsigned long long ril[3],rjl[3];
  SDOUBLE dru[3];
  float a,b,h=0.0,e=0.0,r2,dr[3],r2a,g,bg;
  float dx2,dy2,dz2,e_plus_dx2,dy2_plus_dz2;
  double fi[3],xmax_scale1,xmax_scale2,ad,bd;
  int ic,iexcl,ijflag;

  if(tblno<500 || tblno>=1000){
    fprintf(stderr,"** error : tblno=%d is out of range **\n",tblno);
    exit(1);
  }
  tc=MR1TableCoefficients[tblno];
  potflag=MR1SetForceModeBuf[tblno];
  if((periodicflag & 1)==0) xmax*=2.0;
  if((periodicflag & 2)==0) ijflag=1;
  else                      ijflag=0;
  size_1=1.0/xmax;
  dtmp=xmax / tc->domain_max;
  xmax_scale1=dtmp * dtmp * pow(2, -18);
  xmax_scale2=pow(2, -40) * xmax;
  iexcl = 0;
  for(i=i3=0;i<n;i++,i3+=3){
      for(jj=iexcl;jj<iexcl+numex[i];jj++){
          if(natex[jj]>=0){
              j=natex[jj];
              j3=j*3;
              ic=atype[i]*nat+atype[j];
              ad=rscale[ic];
              bd=gscale[ic];
	      if(bd!=0.0){
		  for(k=0;k<3;k++) fi[k]=0.0;
		  mdgp2pipe_v3(x+i3,x+j3,ad,bd,0.0,0.0,
			       size_1,xmax_scale1,xmax_scale2,
			       tc->ci,tc->c,tc->ed,
			       tc->estart,tc->mshift,
			       potflag,fi);
		  if(potflag==0){
		      for(k=0;k<3;k++){
			  dtmp=factor*fi[k];
			  force[i3+k]+=dtmp;
			  if(ijflag!=0) force[j3+k]-=dtmp;
		      }
		  }
		  else{
		      dtmp=factor*fi[0];
		      force[i3]+=dtmp;
		      if(ijflag!=0) force[j3]+=dtmp;

		  }
	      }
          }
      }
      iexcl = iexcl + numex[i];
  }
}


void mr1calcvdw_nlist_emu2_(double x[], int *n, int atype[], int *nat,
			    double gscale[], double rscale[], int *tblno,
			    double *xmax, int *periodicflag,
			    int numex[], int natex[], double *factor,
			    double force[])
{
    int *atype2,i,*natex2,s;

    for(i=s=0;i<*n;i++){
        s+=numex[i];
    }
    if(s==0) return;
    if((atype2=(int *)malloc(sizeof(int)*(*n)))==NULL){
        fprintf(stderr,
                "** error at malloc atype2 in mr1calcvdw_nlist_emu_ **\n");
        exit(1);
    }
    if((natex2=(int *)malloc(sizeof(int)*s))==NULL){
        fprintf(stderr,
                "** error at malloc natex2 in mr1calcvdw_nlist_emu_ **\n");
        exit(1);
    }
    for(i=0;i<*n;i++) atype2[i]=atype[i]-1;
    for(i=0;i<s;i++)  natex2[i]=natex[i]-1;
/*    for(i=0;i<2;i++) printf("i=%d numex=%d\n",i,numex[i]);
    for(i=0;i<numex[0];i++) printf(" %d",natex[i]);
    printf("\n");
    for(i=0;i<numex[0];i++) printf(" %d",natex2[i]);
    printf("\n");*/
    MR1calcvdw_nlist_emu2(x,*n,atype2,*nat,
			  gscale,rscale,*tblno,
			  *xmax,*periodicflag,
			  numex,natex2,*factor,force);
    free(atype2);free(natex2);
}


void mr1calcvdw_nlist_emu2__(double x[], int *n, int atype[], int *nat,
			     double gscale[], double rscale[], int *tblno,
			     double *xmax, int *periodicflag,
			     int numex[], int natex[], double *factor,
			     double force[])
{
  mr1calcvdw_nlist_emu2_(x,n,atype,nat,
			 gscale,rscale,tblno,
			 xmax,periodicflag,
			 numex,natex,factor,
			 force);
}


static double det(a)
double a[3][3];
{
   int i,j,k;
   double deter=0.0;
   
   for(i=0,j=1,k=2;i<3;i++,j=(j+1)%3,k=(k+1)%3)
      deter+=a[0][i]*(a[1][j]*a[2][k]-a[1][k]*a[2][j]);
   return deter;
}      


static void invert_matrix(a, b)
double a[3][3];  /* in */
double b[3][3];  /* out */
{
   int i,j,k,l,m,n;
   double deter;
        
   if((deter=det(a))==0.0){
       fprintf(stderr,"Error: can't inverse this matrix\n");
       return;
   }
   deter=1.0/deter;
   for(i=0,j=1,k=2;i<3;i++,j=(j+1)%3,k=(k+1)%3)
      for(l=0,m=1,n=2;l<3;l++,m=(m+1)%3,n=(n+1)%3)
         b[l][i]=deter*(a[j][m]*a[k][n]-a[j][n]*a[k][m]);
}


static void MR1calcewald_sub1(n,knum,xn,rk,kv,bs,bc,cs,cc,
			      factor1,factor2,fs,fc)
double **xn,**rk,**kv,**bs,**bc,**cs,**cc,**factor1,**factor2,**fs,**fc;
int n,knum;
{
    if(*xn==NULL){
	if((*xn=(double *)malloc(sizeof(double)*n*3))==NULL){
	    fprintf(stderr,"** error at malloc xn in MR1calcewald_sub1 **\n");
	    exit(1);
	}
    }
    if(*rk==NULL){
	if((*rk=(double *)malloc(sizeof(double)*knum*3))==NULL){
	    fprintf(stderr,"** error at malloc rk in MR1calcewald_sub1 **\n");
	    exit(1);
	}
    }
    if(*kv==NULL){
	if((*kv=(double *)malloc(sizeof(double)*knum*3))==NULL){
	    fprintf(stderr,"** error at malloc kv in MR1calcewald_sub1 **\n");
	    exit(1);
	}
    }
    if(*bs==NULL){
	if((*bs=(double *)malloc(sizeof(double)*knum))==NULL){
	    fprintf(stderr,"** error at malloc bs in MR1calcewald_sub1 **\n");
	    exit(1);
	}
    }
    if(*bc==NULL){
	if((*bc=(double *)malloc(sizeof(double)*knum))==NULL){
	    fprintf(stderr,"** error at malloc bc in MR1calcewald_sub1 **\n");
	    exit(1);
	}
    }
    if(*cs==NULL){
	if((*cs=(double *)malloc(sizeof(double)*knum*3))==NULL){
	    fprintf(stderr,"** error at malloc cs in MR1calcewald_sub1 **\n");
	    exit(1);
	}
    }
    if(*cc==NULL){
	if((*cc=(double *)malloc(sizeof(double)*knum*3))==NULL){
	    fprintf(stderr,"** error at malloc cc in MR1calcewald_sub1 **\n");
	    exit(1);
	}
    }
    if(*factor1==NULL){
	if((*factor1=(double *)malloc(sizeof(double)*knum))==NULL){
	    fprintf(stderr,"** error at malloc factor1 in MR1calcewald_sub1 **\n");
	    exit(1);
	}
    }
    if(*factor2==NULL){
	if((*factor2=(double *)malloc(sizeof(double)*knum))==NULL){
	    fprintf(stderr,"** error at malloc factor2 in MR1calcewald_sub1 **\n");
	    exit(1);
	}
    }
    if(*fs==NULL){
	if((*fs=(double *)malloc(sizeof(double)*n*3))==NULL){
	    fprintf(stderr,"** error at malloc fs in MR1calcewald_sub1 **\n");
	    exit(1);
	}
    }
    if(*fc==NULL){
	if((*fc=(double *)malloc(sizeof(double)*n*3))==NULL){
	    fprintf(stderr,"** error at malloc fc in MR1calcewald_sub1 **\n");
	    exit(1);
	}
    }
}


static void MR1calcewald_sub2(k,rk,kv,knum,factor1,factor2,
			      cell,cell_inv,alpha,epsilon)
double *rk,*kv,*factor1,*factor2,cell[3][3],cell_inv[3][3],alpha,epsilon;
int *k,knum;
{
    int i,i3,j;
    double vol1,alpha4,eps1,r2;
    
    vol1=1.0/det(cell);
    //    for(i=0;i<3;i++) printf("cell[%d]=%e %e %e\n",i,cell[i][0],cell[i][1],cell[i][2]);
    invert_matrix(cell,cell_inv);
    eps1=1.0/epsilon;
    alpha4=1.0/(4.0*alpha*alpha);
    for(i=i3=0;i<knum;i++,i3+=3){
	for(j=0;j<3;j++) rk[i3+j]=k[i3+j];
	kv[i3]  =cell_inv[0][0]*rk[i3] + cell_inv[1][0]*rk[i3+1];
	kv[i3+1]=cell_inv[0][1]*rk[i3] + cell_inv[1][1]*rk[i3+1];
	kv[i3+2]=cell_inv[0][2]*rk[i3] + cell_inv[1][2]*rk[i3+1]
	                               + cell_inv[2][2]*rk[i3+2];
	for(j=0;j<3;j++) kv[i3+j]*=2.0*M_PI;
	r2=0.0;
	for(j=0;j<3;j++) r2+=kv[i3+j]*kv[i3+j];
	factor1[i]=2.0*eps1*vol1*exp(-r2*alpha4)/r2;
#if 0
	if(i<2){
	  printf("knum=%d eps=%e vol=%e\n",i,1.0/eps1,1.0/vol1);
	  printf("r2=%e alpha4=%e\n",r2,alpha4);
	}
#endif
	factor2[i]=2.0*(1.0+r2*alpha4)/r2;
/*	if(i<4){
	    fprintf(stderr,"MDone:kv[%d]=%e %e %e  vol=%e\n",
		    i,kv[i3+0],kv[i3+1],kv[i3+2],1.0/vol1);
	}*/
    }
}
		       
    
static void MR1calcewald_sub3(x,xn,n,cell_inv)
double *x,*xn,cell_inv[3][3];
int n;
{
    int i,i3,j;
    
    for(i=i3=0;i<n;i++,i3+=3){
        for(j=0;j<3;j++){
            xn[i3+j]= cell_inv[j][0]*x[i3]
                     +cell_inv[j][1]*x[i3+1]
                     +cell_inv[j][2]*x[i3+2];
        }
    }
}


static void MR1calcewald_sub4(bs,bc,kv,knum,factor1,factor2,pot,stress)
double *bs,*bc,*kv,*factor1,*factor2,*pot,stress[3][3];
int knum;
{
    int k,k3,i,j;
    double anbsc;

    for(k=k3=0;k<knum;k++,k3+=3){
	anbsc=0.5*factor1[k]*(bs[k]*bs[k]+bc[k]*bc[k]);
/*	if(k<10){
	    fprintf(stderr,"MDone:k=%d coeff=%e pe=%e\n",
		    k,factor1[k],anbsc);
	}*/
	*pot+=anbsc;
#if 0
	if(k<10){
	  printf("factor1=%e bc=%e bs=%e\n",factor1[k],bc[k],bs[k]);
	  printf("%d pot=%e %e\n",k,*pot,anbsc);
	}
#endif
/*	printf("MR2 %d factor1=%e anbsc=%e pot=%e\n",
	       k,factor1[k],anbsc,*pot);*/
	for(i=0;i<3;i++){
	    stress[i][i]+=anbsc;
	    for(j=i;j<3;j++)
		stress[i][j]-=anbsc*factor2[k]*kv[k3+i]*kv[k3+j];
	}
    }
}


static void MR1calcewald_sub5(bs,bc,rk,factor1,factor2,knum,cs,cc)
double *bs,*bc,*rk,*cs,*cc,*factor1,*factor2;
int knum;
{
    int i,i3,j;
    double tmp1,tmp2;
    
    for(i=i3=0;i<knum;i++,i3+=3){
        tmp1=factor1[i]*bs[i];
        tmp2=factor1[i]*bc[i];
        for(j=0;j<3;j++){
            cs[i3+j]=tmp1*rk[i3+j];
            cc[i3+j]=tmp2*rk[i3+j];
        }
    }
}


static void MR1calcewald_sub5eng(bs,bc,rk,factor1,factor2,knum,cs,cc)
double *bs,*bc,*rk,*cs,*cc,*factor1,*factor2;
int knum;
{
    int i,i3,j;
    double tmp1,tmp2;
    
    for(i=i3=0;i<knum;i++,i3+=3){
        tmp1=factor1[i]*bs[i];
        tmp2=factor1[i]*bc[i];
        for(j=0;j<3;j++){
            cs[i3+j]=tmp2;
            cc[i3+j]=tmp1;
        }
    }
}


static void MR1calcewald_sub6(fs,fc,chg,n,cell_inv,force)
double *fs,*fc,*chg,cell_inv[3][3],*force;
int n;
{
    int i,i3,j;
    double ftmp[3];
    
    for(i=i3=0;i<n;i++,i3+=3){
	for(j=0;j<3;j++) ftmp[j]=chg[i]*(fs[i3+j]-fc[i3+j]);
        force[i3]  +=2.0*M_PI
	                *(cell_inv[0][0]*ftmp[0] + cell_inv[1][0]*ftmp[1]);
        force[i3+1]+=2.0*M_PI
	                *(cell_inv[0][1]*ftmp[0] + cell_inv[1][1]*ftmp[1]);
        force[i3+2]+=2.0*M_PI
	                *(cell_inv[0][2]*ftmp[0] + cell_inv[1][2]*ftmp[1]
	                                    + cell_inv[2][2]*ftmp[2]);
    }
}


static void MR1calcewald_sub6eng(fs,fc,chg,n,cell_inv,force)
double *fs,*fc,*chg,cell_inv[3][3],*force;
int n;
{
    int i,i3,j;
    double ftmp[3];
    
    for(i=i3=0;i<n;i++,i3+=3){
	for(j=0;j<3;j++) ftmp[j]=chg[i]*(fs[i3+j]+fc[i3+j]);
        force[i3]  +=ftmp[0];
        force[i3+1]+=ftmp[1];
        force[i3+2]+=ftmp[2];
    }
}


static void MR1calcewald_dft_host(rk,knum,xn,n,chg,bs,bc)
int n,knum;
double *rk,*xn,*chg,*bs,*bc;
{
    int i,i3,j,j3,c;
    double th;

    for(i=i3=0;i<knum;i++,i3+=3){
	bs[i]=bc[i]=0.0;
	for(j=j3=0;j<n;j++,j3+=3){
	    th=0.0;
	    for(c=0;c<3;c++) th+=rk[i3+c]*xn[j3+c];
	    th*=2.0*M_PI;
	    bs[i]+=chg[j]*sin(th);
	    bc[i]+=chg[j]*cos(th);
	}
    }
}


static void MR1calcewald_idft_host(rk,cs,cc,knum,xn,n,fs,fc)
int n,knum;
double *rk,*cs,*cc,*xn,*fs,*fc;
{
    int i,i3,j,j3,c;
    double th,sth,cth;

    for(i=i3=0;i<n;i++,i3+=3){
	for(c=0;c<3;c++) fs[i3+c]=fc[i3+c]=0.0;
	for(j=j3=0;j<knum;j++,j3+=3){
	    th=0.0;
	    for(c=0;c<3;c++) th+=rk[j3+c]*xn[i3+c];
	    th*=2.0*M_PI;
	    sth=sin(th);
	    cth=cos(th);
	    for(c=0;c<3;c++){
		fs[i3+c]+=cc[j3+c]*sth;
		fc[i3+c]+=cs[j3+c]*cth;
	    }
	}
    }
}


void MR1calcewald_host(k,knum_org,x,n,chg,alpha,epsilon,cell,
		       force,tpot,stress)
int n,*k,knum_org;
double *x,*chg,cell[3][3],*force,*tpot,alpha,epsilon,stress[3][3];
{
    double cell_inv[3][3];
    static double *xn,*rk,*kv,*bs,*bc,*fs,*fc,*factor1,*factor2,*cs,*cc;
    int knum;

    knum=knum_org<0 ? -knum_org:knum_org;
/*    {
	int i,j;
	fprintf(stderr,"knum=%d n=%d alpha=%e epsilon=%e\n",
		knum,n,alpha,epsilon);
	for(i=0;i<3;i++) fprintf(stderr,"stress[%d]=%e %e %e\n",
				 i,stress[i][0],stress[i][1],stress[i][2]);
	for(i=0;i<3;i++) fprintf(stderr,"cell[%d]=%e %e %e\n",
				 i,cell[i][0],cell[i][1],cell[i][2]);
	for(i=0;i<4;i++){
	    fprintf(stderr,"k[%d]=%d %d %d  chg=%e\n",
		    i,k[i*3],k[i*3+1],k[i*3+2],chg[i]);
	    fprintf(stderr,"x[%d]=%e %e %e\n",i,x[i*3],x[i*3+1],x[i*3+2]);
	}
    }*/
/*    {
	int ii,jj;
	double tmp;
	for(ii=0;ii<4;ii++){
	    tmp=0.0;
	    for(jj=0;jj<n;jj++){
		tmp+=chg[jj]*sin(2.0*M_PI*
				 (k[ii*3  ]*x[jj*3  ]
				 +k[ii*3+1]*x[jj*3+1]
				 +k[ii*3+2]*x[jj*3+2])/cell[0][0]);
	    }
	    fprintf(stderr,"myMDone:bs[%d]=%e\n",ii,tmp);
	}
    }*/
    
    MR1calcewald_sub1(n,knum,&xn,&rk,&kv,&bs,&bc,&cs,&cc,
		      &factor1,&factor2,&fs,&fc);
    MR1calcewald_sub2(k,rk,kv,knum,factor1,factor2,cell,cell_inv,
		      alpha,epsilon);
    MR1calcewald_sub3(x,xn,n,cell_inv);
    
/*    {
	int ii,jj;
	double tmp;
	for(ii=0;ii<4;ii++){
	    tmp=0.0;
	    for(jj=0;jj<n;jj++){
		tmp+=chg[jj]*sin(2.0*M_PI*
				 (rk[ii*3  ]*xn[jj*3  ]
				 +rk[ii*3+1]*xn[jj*3+1]
				 +rk[ii*3+2]*xn[jj*3+2]));
	    }
	    fprintf(stderr,"myMDone2:bs[%d]=%e\n",ii,tmp);
	}
    }*/
    
    MR1calcewald_dft_host(rk,knum,xn,n,chg,bs,bc);
#if 1
    {
      int i,imax;
      imax=10;
      for(i=0;i<imax;i++) printf("host  bs[%d]=%e bc=%e\n",i,bs[i],bc[i]);
    }
#endif
    MR1calcewald_sub4(bs,bc,kv,knum,factor1,factor2,tpot,stress);
    if(knum_org<0)
      MR1calcewald_sub5eng(bs,bc,rk,factor1,factor2,knum,cs,cc);
    else
      MR1calcewald_sub5(bs,bc,rk,factor1,factor2,knum,cs,cc);
    MR1calcewald_idft_host(rk,cs,cc,knum,xn,n,fs,fc);
    if(knum_org<0)
      MR1calcewald_sub6eng(fs,fc,chg,n,cell_inv,force);
    else
      MR1calcewald_sub6(fs,fc,chg,n,cell_inv,force);
/*    MR1calcewald_sub7(rk,kv,bs,bc,cs,cc,factor1,factor2);*/
}


void mr1calcewald_host_(k,knum,x,n,chg,alpha,epsilon,cell,
			force,tpot,stress)
int *n,*k,*knum;
double *x,*chg,cell[3][3],*force,*tpot,*alpha,*epsilon,stress[3][3];
{
    MR1calcewald_host(k,*knum,x,*n,chg,*alpha,*epsilon,cell,
		      force,tpot,stress);
}


void mr1calcewald_host__(k,knum,x,n,chg,alpha,epsilon,cell,
			 force,tpot,stress)
int *n,*k,*knum;
double *x,*chg,cell[3][3],*force,*tpot,*alpha,*epsilon,stress[3][3];
{
    MR1calcewald_host(k,*knum,x,*n,chg,*alpha,*epsilon,cell,
		      force,tpot,stress);
}
