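/*
  Example benchmark sweep (bash syntax): runs the program for 2048 to 4096
  particles in steps of 16 with calc_mode 2 and 10 steps, keeping only the
  "Gflops" and "Number of particle" output lines and logging them to log2.txt.

  for ((n=2048;n<=4096;n+=16));do ./a.out $n 2 10|egrep -e Gflops -e "Number of particle" ;done | tee log2.txt 
 */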


#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>

#include "fastexpf.c"
#include "mr3.h"

/*
 * Lap timer built on gettimeofday().
 *
 * Despite the name this measures wall-clock time, not CPU time.
 *
 *   laptime   in/out: on entry the timestamp (in seconds) stored by the
 *             previous call; on return the current timestamp.
 *   sprittime out: current timestamp minus the value *laptime had on entry,
 *             i.e. the seconds elapsed since the previous call.
 *
 * *laptime is read before it is written, so the caller must initialise it
 * before the first call.
 */
static void get_cputime(double *laptime, double *sprittime)
{
  struct timeval tv;
  double now;

  /* POSIX leaves the behaviour unspecified when the timezone argument is
     non-null, so always pass NULL. */
  gettimeofday(&tv, NULL);
  now = (double)tv.tv_sec + (double)tv.tv_usec * 1e-6;

  *sprittime = now - *laptime;
  *laptime = now;
}


void MR3calcnacl_CPU_double(double x[], int n, int atype[], int nat,
			    double pol[], double sigm[], double ipotro[],
			    double pc[], double pd[], double zz[],
			    int tblno, double xmax, int periodicflag,
			    double force[])
{
  int i,j,k,t;
  double xmax1,dn2,r,inr,inr2,inr4,inr8,d3,dr[3],fi[3];
  double pb=0.338e-19/(14.39*1.60219e-19),dphir; 
  if((periodicflag & 1)==0) xmax *= 2;
  xmax1 = 1.0 / xmax;
#pragma omp parallel for private(k,j,dn2,dr,r,inr,inr2,inr4,inr8,t,d3,dphir,fi)
  for(i=0; i<n; i++){
    for(k=0; k<3; k++) fi[k] = 0.0;
    for(j=0; j<n; j++){
      dn2 = 0.0;
      for(k=0; k<3; k++){
	dr[k] =  x[i*3+k] - x[j*3+k];
	dr[k] -= rint(dr[k] * xmax1) * xmax;
	dn2   += dr[k] * dr[k];
      }
      if(dn2 != 0.0){
	r     = sqrt(dn2);
	inr   = 1.0  / r;
	inr2  = inr  * inr;
	inr4  = inr2 * inr2;
	inr8  = inr4 * inr4;
	t     = atype[i] * nat + atype[j];
	d3    = pb * pol[t] * exp( (sigm[t] - r) * ipotro[t]);
	dphir = ( d3 * ipotro[t] * inr
		  - 6.0 * pc[t] * inr8
		  - 8.0 * pd[t] * inr8 * inr2
		  + inr2 * inr * zz[t] );
	for(k=0; k<3; k++) fi[k] += dphir * dr[k];
      }
    }
    for(k=0; k<3; k++) force[i*3+k] = fi[k];
  }
}
    

/*
 * Variant of MR3calcnacl_CPU_double that evaluates the exponential with
 * Exp() instead of exp() and keeps the per-particle sum in three scalars
 * rather than a small array.
 *
 * Arguments and output layout are the same as MR3calcnacl_CPU_double:
 * force[3*i .. 3*i+2] is overwritten with the sum over all j at non-zero
 * wrapped separation.  tblno is accepted but not used.
 *
 * NOTE(review): Exp() is not defined in this file; presumably it comes from
 * the included fastexpf.c and needs InitExp() to have been called first --
 * confirm.
 */
void MR3calcnacl_CPU_optimized_kadai456(double x[], int n, int atype[], int nat,
			    double pol[], double sigm[], double ipotro[],
			    double pc[], double pd[], double zz[],
			    int tblno, double xmax, int periodicflag,
			    double force[])
{
  const double pb = 0.338e-19/(14.39*1.60219e-19);
  double box, inv_box;
  int i;

  (void)tblno;

  box     = (periodicflag & 1) ? xmax : xmax * 2;
  inv_box = 1.0 / box;

  /* Everything declared inside the loop body is private to each iteration. */
#pragma omp parallel for
  for(i = 0; i < n; i++){
    double sum_x = 0.0, sum_y = 0.0, sum_z = 0.0;
    int j, k;

    for(j = 0; j < n; j++){
      double sep[3];
      double dist2 = 0.0;

      for(k = 0; k < 3; k++){
        double d = x[i*3+k] - x[j*3+k];
        d     -= rint(d * inv_box) * box;   /* wrap into the nearest image */
        dist2 += d * d;
        sep[k] = d;
      }

      /* Zero separation covers the i == j case (and coincident particles). */
      if(dist2 == 0.0) continue;

      const double r    = sqrt(dist2);
      const double inr  = 1.0  / r;
      const double inr2 = inr  * inr;
      const double inr4 = inr2 * inr2;
      const double inr8 = inr4 * inr4;
      const int    t    = atype[i] * nat + atype[j];
      const double d3   = pb * pol[t] * Exp( (sigm[t] - r) * ipotro[t]);
      const double dphir = ( d3 * ipotro[t] * inr
                             - 6.0 * pc[t] * inr8
                             - 8.0 * pd[t] * inr8 * inr2
                             + inr2 * inr * zz[t] );

      sum_x += dphir * sep[0];
      sum_y += dphir * sep[1];
      sum_z += dphir * sep[2];
    }

    force[i*3+0] = sum_x;
    force[i*3+1] = sum_y;
    force[i*3+2] = sum_z;
  }
}
    
/*
 * Inner kernel for one (type-i, type-j) pair of particle groups.
 *
 * Every particle in xi interacts with every particle in xj using a single
 * set of scalar pair parameters, so no per-pair table lookup is needed in
 * the inner loop.  The result is ADDED to force[3*i .. 3*i+2]; the caller
 * must zero force before the first call for a given xi group.
 *
 *   xi, ni       coordinates (3 doubles each) and count of the i group
 *   xj, nj       coordinates (3 doubles each) and count of the j group
 *   pol..zz      pair parameters shared by every (i, j) pair in this call
 *   tblno        accepted but not used
 *   xmax         box length used for the wrap; doubled when bit 0 of
 *                periodicflag is clear
 *   force        in/out accumulator, 3 doubles per i particle
 *
 * The nearest-image wrap is applied to every separation, exactly as in
 * MR3calcnacl_CPU_double.  Without it xmax and periodicflag would have no
 * effect and the result would differ from that reference kernel for pairs
 * more than half a box apart.
 *
 * NOTE(review): Exp() is not defined in this file; presumably it comes from
 * the included fastexpf.c and needs InitExp() to have been called first --
 * confirm.
 */
void MR3calcnacl_CPU_optimized_kadai7core(double xi[], double xj[], int ni, int nj,
					  double pol, double sigm, double ipotro,
					  double pc, double pd, double zz,
					  int tblno, double xmax, int periodicflag,
					  double force[])
{
  const double pb = 0.338e-19/(14.39*1.60219e-19);
  double xmax1;
  int i;

  (void)tblno;

  if((periodicflag & 1)==0) xmax *= 2;
  xmax1 = 1.0 / xmax;

  /* Everything declared inside the loop body is private to each iteration. */
#pragma omp parallel for
  for(i=0; i<ni; i++){
    double fix = 0.0, fiy = 0.0, fiz = 0.0;
    int j, k;

    for(j=0; j<nj; j++){
      double dr[3];
      double dn2 = 0.0;

      for(k=0; k<3; k++){
        dr[k] =  xi[i*3+k] - xj[j*3+k];
        dr[k] -= rint(dr[k] * xmax1) * xmax;   /* wrap into the nearest image */
        dn2   += dr[k] * dr[k];
      }

      /* Zero separation: the same particle appearing in both groups. */
      if(dn2 == 0.0) continue;

      const double r    = sqrt(dn2);
      const double inr  = 1.0  / r;
      const double inr2 = inr  * inr;
      const double inr4 = inr2 * inr2;
      const double inr8 = inr4 * inr4;
      const double d3   = pb * pol * Exp( (sigm - r) * ipotro);
      const double dphir = ( d3 * ipotro * inr
                             - 6.0 * pc * inr8
                             - 8.0 * pd * inr8 * inr2
                             + inr2 * inr * zz );

      fix += dphir * dr[0];
      fiy += dphir * dr[1];
      fiz += dphir * dr[2];
    }

    force[i*3+0] += fix;
    force[i*3+1] += fiy;
    force[i*3+2] += fiz;
  }
}
    
/*
 * Type-sorted driver around MR3calcnacl_CPU_optimized_kadai7core.
 *
 * For each type ati the positions of that type are packed into xi; for each
 * type atj the positions of that type are packed into xj and the core kernel
 * is called with the scalar pair parameters at index ati*nat + atj.  The
 * per-group result accumulated in fi is then scattered back into force at
 * the original particle indices.
 *
 * Arguments are the same as MR3calcnacl_CPU_double.  force entries of
 * particles whose atype lies outside [0, nat) are never written.
 *
 * Exits the process with status 1 if a scratch buffer cannot be allocated.
 */
void MR3calcnacl_CPU_optimized(double x[], int n, int atype[], int nat,
			    double pol[], double sigm[], double ipotro[],
			    double pc[], double pd[], double zz[],
			    int tblno, double xmax, int periodicflag,
			    double force[])
{
  int i,k,ati,atj,ni,nj,t;
  double *xi,*xj,*fi;

  /* Scratch buffers sized for the worst case: all n particles of one type. */
  if((xi=(double *)malloc(sizeof(double)*n*3))==NULL){
    fprintf(stderr,"** error : can't malloc xi **\n");
    exit(1);
  }
  if((xj=(double *)malloc(sizeof(double)*n*3))==NULL){
    fprintf(stderr,"** error : can't malloc xj **\n");
    exit(1);
  }
  if((fi=(double *)malloc(sizeof(double)*n*3))==NULL){
    fprintf(stderr,"** error : can't malloc fi **\n");
    exit(1);
  }

  for(ati=0;ati<nat;ati++){
    // copy positions with atype=ati to xi
    ni=0;
    for(i=0;i<n;i++){
      if(atype[i]==ati){
        for(k=0;k<3;k++) xi[ni*3+k]=x[i*3+k];
        ni++;
      }
    }

    // the core kernel accumulates, so clear the group force first
    for(i=0;i<ni;i++) for(k=0;k<3;k++) fi[i*3+k]=0.0;

    for(atj=0;atj<nat;atj++){
      // copy positions with atype=atj to xj
      nj=0;
      for(i=0;i<n;i++){
        if(atype[i]==atj){
          for(k=0;k<3;k++) xj[nj*3+k]=x[i*3+k];
          nj++;
        }
      }

      // calculate ati-atj pair
      t=ati*nat+atj;
      MR3calcnacl_CPU_optimized_kadai7core(xi,xj,ni,nj,
					   pol[t],sigm[t],ipotro[t],
					   pc[t],pd[t],zz[t],
					   tblno,xmax,periodicflag,
					   fi);
    }

    // copy fi to force, walking the particles in the same order used to pack xi
    ni=0;
    for(i=0;i<n;i++){
      if(atype[i]==ati){
        for(k=0;k<3;k++) force[i*3+k]=fi[ni*3+k];
        ni++;
      }
    }
  }

  free(xi);
  free(xj);
  free(fi);   /* was previously leaked on every call */
}
    
/*
 * Benchmark driver.
 *
 * Usage: prog number_of_particles[.number_of_kvectors] calc_mode [number_of_steps]
 *
 * Fills the particle and parameter arrays with drand48() values, runs the
 * routine selected by calc_mode nstep times while timing it, prints the
 * elapsed time and a Gflops figure, then recomputes the result with the
 * matching host/reference routine and prints the average relative error.
 *
 * Returns 0 on success, 1 on a usage error, invalid argument, allocation
 * failure or unsupported calc_mode.
 */
int main(int argc, char **argv)
{
  int i,j,n=0,nstep=1,nat=2,knum=1000;
  double *x,*a1,*a2,*q;
  double *pol,*sigm,*ipotro,*pc,*pd,*zz;
  int *atype;
  double xmax=100.0;
  double cell[3][3]={{xmax,0.0,0.0},{0.0,xmax,0.0},{0.0,0.0,xmax}};
  /* get_cputime() reads *laptime before writing it, so start from zero. */
  double ltime=0.0,stime=0.0;
  double avr,aone,err,eone;
  int calcmode=0;
  int *ki;
  double *bsbc,ksize=10.0,tpot1,tpot2,stress[3][3];

  if(argc!=3 && argc!=4){
    printf("usage : %s number_of_particles.(number_of_kvectors) calc_mode (number_of_steps)\n",argv[0]);
    printf("  calc_mode : 0 -- CPU original routine is used for NaCl\n");
    printf("              1 -- CPU optimized routine is used for NaCl\n");
    printf("              2 -- GPU is used for NaCl\n");
    printf("              10 - GPU is used for coulomb force\n");
    printf("              11 - GPU is used for coulomb pot\n");
    printf("              16 - GPU is used for real force\n");
    printf("              17 - GPU is used for real pot\n");
    printf("              20 - GPU is used for vdw force\n");
    printf("              21 - GPU is used for vdw pot\n");
    printf("              25 - GPU is used for vdw r6 force\n");
    printf("              40 - GPU is used for ewald force\n");
    printf("              41 - GPU is used for ewald potential\n");
    return 1;
  }

  // set number of particles (the ".knum" suffix is optional)
  if(sscanf(argv[1],"%d.%d",&n,&knum)<1 || n<1 || knum<1){
    fprintf(stderr,"** error : invalid number_of_particles.(number_of_kvectors) '%s' **\n",argv[1]);
    return 1;
  }
  printf("Number of particle is %d\n",n);
  printf("Number of k-vectors is %d\n",knum);

  // set number of steps
  if(argc==4){
    if(sscanf(argv[3],"%d",&nstep)!=1){
      fprintf(stderr,"** error : invalid number_of_steps '%s' **\n",argv[3]);
      return 1;
    }
  }
  printf("Number of steps is %d\n",nstep);

  // set calcmode
  if(sscanf(argv[2],"%d",&calcmode)!=1){
    fprintf(stderr,"** error : invalid calc_mode '%s' **\n",argv[2]);
    return 1;
  }
  printf("calcmode=%d\n",calcmode);
  if(calcmode>=20){
    nat=64;
  }
  printf("Number of atom types is %d\n",nat);


  // allocate variables
  if((x=(double *)malloc(sizeof(double)*n*3))==NULL){
    fprintf(stderr,"** error : can't malloc x **\n");
    return 1;
  }
  if((q=(double *)malloc(sizeof(double)*n))==NULL){
    fprintf(stderr,"** error : can't malloc q **\n");
    return 1;
  }
  if((a1=(double *)malloc(sizeof(double)*n*3))==NULL){
    fprintf(stderr,"** error : can't malloc a1 **\n");
    return 1;
  }
  if((a2=(double *)malloc(sizeof(double)*n*3))==NULL){
    fprintf(stderr,"** error : can't malloc a2 **\n");
    return 1;
  }
  if((atype=(int *)malloc(sizeof(int)*n))==NULL){
    fprintf(stderr,"** error : can't malloc atype **\n");
    return 1;
  }
  if((pol=(double *)malloc(sizeof(double)*nat*nat))==NULL){
    fprintf(stderr,"** error : can't malloc pol **\n");
    return 1;
  }
  if((sigm=(double *)malloc(sizeof(double)*nat*nat))==NULL){
    fprintf(stderr,"** error : can't malloc sigm **\n");
    return 1;
  }
  if((ipotro=(double *)malloc(sizeof(double)*nat*nat))==NULL){
    fprintf(stderr,"** error : can't malloc ipotro **\n");
    return 1;
  }
  if((pc=(double *)malloc(sizeof(double)*nat*nat))==NULL){
    fprintf(stderr,"** error : can't malloc pc **\n");
    return 1;
  }
  if((pd=(double *)malloc(sizeof(double)*nat*nat))==NULL){
    fprintf(stderr,"** error : can't malloc pd **\n");
    return 1;
  }
  if((zz=(double *)malloc(sizeof(double)*nat*nat))==NULL){
    fprintf(stderr,"** error : can't malloc zz **\n");
    return 1;
  }
  if((ki=(int *)malloc(sizeof(int)*knum*3))==NULL){
    fprintf(stderr,"** error : can't malloc ki **\n");
    return 1;
  }
  if((bsbc=(double *)malloc(sizeof(double)*knum*3))==NULL){
    fprintf(stderr,"** error : can't malloc bsbc **\n");
    return 1;
  }


  // set positions and types
  for(i=0;i<n;i++){
    for(j=0;j<3;j++){
      x[i*3+j]=drand48()*xmax;
    }
    atype[i]=drand48()*nat;
    q[i]=drand48();
  }

#if 0
  x[0]=1.0;x[1]=0.0;x[2]=0.0;
  x[3]=0.0;x[4]=0.0;x[5]=0.0;
  for(i=2;i<n;i++) for(j=0;j<3;j++) {x[i*3+j]=x[j];atype[i]=atype[0];}
  for(i=0;i<n;i++) atype[i]=0;
#endif

  // set parameters between atoms
  for(i=0;i<nat;i++){
    for(j=0;j<nat;j++){
      pol[i*nat+j]=1.0+drand48();
      sigm[i*nat+j]=2.0+drand48();
      ipotro[i*nat+j]=3.0+drand48();
      pc[i*nat+j]=5.0+drand48();
      pd[i*nat+j]=4.0+drand48();
      zz[i*nat+j]=-1.0+2.0*drand48();
    }
  }

  for(i=0;i<knum;i++){
    for(j=0;j<3;j++){
      ki[i*3+j]=drand48()*ksize+1;
      bsbc[i*3+j]=0.0;
    }
  }

  // print Exp() next to expf() for the same argument as a quick comparison
  float y=1.0;
  InitExp();
  printf("%e %e\n",Exp(y),expf(y));

  MR3init();

  // timer start
  get_cputime(&ltime,&stime);

  // calc with target routine
  for(i=0;i<nstep;i++){
    memset(a2,0,sizeof(double)*n*3);
    switch(calcmode){
    case 0:
      MR3calcnacl_CPU_double(x,n,atype,nat,pol,sigm,ipotro,pc,pd,zz,0,xmax,1,a2);
      if(i==0) printf("original routine is used\n");
      break;
    case 1:
      MR3calcnacl_CPU_optimized(x,n,atype,nat,pol,sigm,ipotro,pc,pd,zz,0,xmax,1,a2);
      if(i==0) printf("optimized routine is used\n");
      break;
    case 2:
      MR3calcnacl(x,n,atype,nat,pol,sigm,ipotro,pc,pd,zz,0,xmax,1,a2);
      if(i==0) printf("GPU routine is used\n");
      break;
    case 10:
      MR3calccoulomb(x,n,q,1.0,0,xmax,1,0,a2);
      if(i==0) printf("GPU routine is used for coulomb force\n");
      break;
    case 11:
      MR3calccoulomb(x,n,q,1.0,1,xmax,1,0,a2);
      if(i==0) printf("GPU routine is used for coulomb pot\n");
      break;
    case 16:
      MR3calccoulomb(x,n,q,1.0,6,xmax,1,0,a2);
      if(i==0) printf("GPU routine is used for real force\n");
      break;
    case 17:
      MR3calccoulomb(x,n,q,1.0,7,xmax,1,0,a2);
      if(i==0) printf("GPU routine is used for real pot\n");
      break;
    case 20:
      MR3calcvdw(x,n,atype,nat,pol,sigm,2,xmax,1,0,a2);
      if(i==0) printf("GPU routine is used for vdw force\n");
      break;
    case 21:
      MR3calcvdw(x,n,atype,nat,pol,sigm,3,xmax,1,0,a2);
      if(i==0) printf("GPU routine is used for vdw pot\n");
      break;
    case 25:
      MR3calcvdw(x,n,atype,nat,pol,sigm,5,xmax,1,0,a2);
      if(i==0) printf("GPU routine is used for vdw r6 force\n");
      break;
    case 40:
      MR3calcewald(ki,knum,x,n,q,1.0,1.0,cell,a2,&tpot2,stress);
      if(i==0) printf("GPU routine is used for ewald force\n");
      break;
    case 41:
      MR3calcewald(ki,-knum,x,n,q,1.0,1.0,cell,a2,&tpot2,stress);
      if(i==0) printf("GPU routine is used for ewald potential\n");
      break;
    default:
      fprintf(stderr,"** error : cal_mode=%d is not supported **\n",calcmode);
      return 1;
    }
  }

  // timer stop
  get_cputime(&ltime,&stime);

  // operation count per pair (or per particle/k-vector pair) depends on the mode family
  double speed=0.0;
  if(calcmode<10)      speed=((double)n)*((double)n)*((double)nstep)*78.0/stime/1e9;
  else if(calcmode<12) speed=((double)n)*((double)n)*((double)nstep)*38.0/stime/1e9;
  else if(calcmode<20) speed=((double)n)*((double)n)*((double)nstep)*59.0/stime/1e9;
  else if(calcmode<30) speed=((double)n)*((double)n)*((double)nstep)*32.0/stime/1e9;
  else if(calcmode<50) speed=((double)n)*((double)knum)*((double)nstep)*64.0/stime/1e9;
  printf("Time=%f seconds, %f Gflops\n",stime,speed);

  // check result against the matching host/reference routine
  if(calcmode<10){
    MR3calcnacl_CPU_double(x,n,atype,nat,pol,sigm,ipotro,pc,pd,zz,0,xmax,1,a1);
  }
  else if(calcmode<20){
    MR3calccoulomb_host(x,n,q,1.0,0+(calcmode % 10),xmax,1,0,a1);
  }
  else if(calcmode<30){
    MR3calcvdw_host(x,n,atype,nat,pol,sigm,(calcmode % 10),xmax,1,0,a1);
  }
  else if(calcmode<50){
    MR3calcewald_host(ki,((calcmode % 10)==1 ? -knum:knum),x,n,q,1.0,1.0,cell,a1,&tpot1,stress);
  }

  // average magnitude of the reference result, used to normalise the error
  avr=0.0;
  for(i=0;i<n;i++){
    aone=0.0;
    for(j=0;j<3;j++){
      aone+=a1[i*3+j]*a1[i*3+j];
    }
    aone=sqrt(aone);
    avr+=aone;
  }
  avr/=(double)n;

  // show the first few particles; never read past the n allocated entries
  for(i=0;i<3 && i<n;i++){
    printf("a1[%d]=%10.3e %e %10.3e a2=%10.3e %10.3e %10.3e\n",
	   i,a1[i*3+0],a1[i*3+1],a1[i*3+2],a2[i*3+0],a2[i*3+1],a2[i*3+2]);
  }
  printf("Average size of force is %10.3e\n",avr);

  // mean of |a1-a2| / avr over all particles
  err=0.0;
  for(i=0;i<n;i++){
    eone=0.0;
    for(j=0;j<3;j++){
      eone+=(a1[i*3+j]-a2[i*3+j])*(a1[i*3+j]-a2[i*3+j]);
    }
    eone=sqrt(eone)/avr;
    err+=eone;
  }
  err/=(double)n;
  printf("Average error is %10.3e\n",err);

  // deallocate variables
  free(x);
  free(q);
  free(a1);
  free(a2);
  free(atype);
  free(pol);
  free(sigm);
  free(ipotro);
  free(pc);
  free(pd);
  free(zz);
  free(ki);
  free(bsbc);
  
  return 0;
}
