#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cutil.h>

#include "mr3_core.h"

// Per-pair-type parameter tables held in constant memory.
// d_matrix[t] supplies the rscale and gscale fields read by the vdw*_inter
// routines; d_matrix_nacl[t] supplies pol, sigm, ipotro, pc, pd and zz read by
// nacl_inter.  In both cases t is the combined index atype_i * nat + atype_j
// formed in the corresponding kernel.
// NOTE(review): presumably filled from the host side (mr3_core.c) before any
// kernel launch - confirm.
__device__ __constant__ VG_MATRIX      d_matrix[ATYPE2];
__device__ __constant__ VG_MATRIX_NACL d_matrix_nacl[ATYPE_NACL2];

// Add to fi[] the Coulomb force term of a single j-particle acting on a
// single i-particle.
//
//   xj, xi   : integer (fixed-point) coordinates of the j / i particle
//   qj       : charge of the j-particle
//   fi       : accumulator; receives qj * inr^3 * dr[k], where
//              dr[k] = (xi[k]-xj[k]) * al2 and inr = 1/sqrt(rscale2f*|dr|^2)
//   rscale2f : factor applied to the squared distance
//   al2      : converts an integer coordinate difference to float
//
// A pair at zero distance (dn2 == 0) contributes nothing.
__device__ __inline__ 
void coulombforce_inter(int xj[3], float qj, int xi[3], float fi[3], float rscale2f, float al2)
{
  int k;
  float dn2,inr,dr[3],dphir;

  dn2 = 0.0f;
  for(k=0; k<3; k++){
    // The callers scale by xmax * 2^-32, so the 32-bit range spans one full
    // period and the difference is expected to wrap.  Subtract as unsigned
    // (well-defined modulo 2^32) instead of relying on signed overflow.
    dr[k]  = (int)((unsigned int)xi[k] - (unsigned int)xj[k]);
    dr[k] *= al2;
    dn2   += dr[k] * dr[k];
  }
  dn2  *= rscale2f;
  inr   = rsqrtf(dn2);
  dphir = qj * inr * inr * inr;
  // rsqrtf(0) is infinite; discard the self / coincident pair explicitly.
  if(dn2==0.0f) dphir = 0.0f;
  for(k=0; k<3; k++) fi[k] += dphir * dr[k];
}

// Coulomb force on every i-particle from all nj j-particles.
//
// Thread mapping: i = blockIdx.x * NTHRE + threadIdx.x, one i-particle per
// thread (the indexing assumes 1-D blocks of NTHRE threads - confirm at the
// launch site).  j-particles are staged through a shared tile of NLOAD
// entries; only threads with tid < NLOAD load, so a block needs at least
// NLOAD threads for the tile to be filled completely.
//
//   ni, xivec : number of i-particles and their records
//   nj, xjvec : number of j-particles and their records
//   rscale2f  : factor applied to the squared distance
//   xmax      : al2 = xmax * 2^-32 scales integer coordinate differences
//   fvec      : output, fvec[i*3+k] for k = 0..2, written only when i < ni
__global__ 
void coulombforce_kernel(int ni, VG_XVEC *xivec, int nj, VG_XVEC *xjvec,
			 float rscale2f, float xmax, float *fvec)
{
  int tid = threadIdx.x;
  int i = blockIdx.x * NTHRE + tid;
  int j,k;
  float fi[3],al2;
  int js;
  __shared__ VG_XVEC s_xj[NLOAD];
  int xi[3];

  al2=scalbnf(xmax,-32);
  for(k=0; k<3; k++) fi[k] = 0.0f;
  // Threads past the end of the i-array must not read xivec[i]; they still
  // have to take part in the tile loads and barriers below, so they carry on
  // with dummy coordinates and their result is never stored.
  for(k=0; k<3; k++) xi[k] = (i<ni) ? xivec[i].r[k] : 0;
  // Full tiles of NLOAD j-particles.
  for (j = 0; j < nj - NLOAD; j+=NLOAD){
    __syncthreads();   // previous tile fully consumed before overwriting
    if(tid < NLOAD) s_xj[tid] = xjvec[j + tid];
    __syncthreads();   // tile visible to the whole block
#pragma unroll 16
    for (js = 0; js < NLOAD; js++) coulombforce_inter(s_xj[js].r,s_xj[js].qatype.q,xi,fi,rscale2f,al2);
  }
  // Remaining nj - j (at most NLOAD) j-particles.
  __syncthreads();
  if(tid < nj - j) s_xj[tid] = xjvec[j + tid];
  __syncthreads();
  for (js = 0; js < nj - j; js++) coulombforce_inter(s_xj[js].r,s_xj[js].qatype.q,xi,fi,rscale2f,al2);
  if(i<ni) for(k=0; k<3; k++) fvec[i*3+k] = fi[k];
}


// Add to fi[] the Coulomb potential term of a single j-particle at the
// position of a single i-particle.
//
//   xj, xi   : integer (fixed-point) coordinates of the j / i particle
//   qj       : charge of the j-particle
//   fi       : accumulator; qj * inr is added to all three components, with
//              inr = 1/sqrt(rscale2f * |(xi-xj)*al2|^2)
//   rscale2f : factor applied to the squared distance
//   al2      : converts an integer coordinate difference to float
//
// A pair at zero distance (dn2 == 0) contributes nothing.
__device__ __inline__ 
void coulombpot_inter(int xj[3], float qj, int xi[3], float fi[3], float rscale2f, float al2)
{
  int k;
  float dn2,inr,dr[3],dphir;

  dn2 = 0.0f;
  for(k=0; k<3; k++){
    // The callers scale by xmax * 2^-32, so the 32-bit range spans one full
    // period and the difference is expected to wrap.  Subtract as unsigned
    // (well-defined modulo 2^32) instead of relying on signed overflow.
    dr[k]  = (int)((unsigned int)xi[k] - (unsigned int)xj[k]);
    dr[k] *= al2;
    dn2   += dr[k] * dr[k];
  }
  dn2  *= rscale2f;
  inr   = rsqrtf(dn2);
  dphir = qj * inr;
  // rsqrtf(0) is infinite; discard the self / coincident pair explicitly.
  if(dn2==0.0f) dphir = 0.0f;
  // The same scalar is stored in every component of the accumulator.
  for(k=0; k<3; k++) fi[k] += dphir;
}

// Coulomb potential at every i-particle from all nj j-particles.
//
// Thread mapping: i = blockIdx.x * NTHRE + threadIdx.x, one i-particle per
// thread (the indexing assumes 1-D blocks of NTHRE threads - confirm at the
// launch site).  j-particles are staged through a shared tile of NLOAD
// entries; only threads with tid < NLOAD load, so a block needs at least
// NLOAD threads for the tile to be filled completely.
//
//   ni, xivec : number of i-particles and their records
//   nj, xjvec : number of j-particles and their records
//   rscale2f  : factor applied to the squared distance
//   xmax      : al2 = xmax * 2^-32 scales integer coordinate differences
//   fvec      : output, fvec[i*3+k] for k = 0..2 (all three components carry
//               the same sum), written only when i < ni
__global__ 
void coulombpot_kernel(int ni, VG_XVEC *xivec, int nj, VG_XVEC *xjvec,
		       float rscale2f, float xmax, float *fvec)
{
  int tid = threadIdx.x;
  int i = blockIdx.x * NTHRE + tid;
  int j,k;
  float fi[3],al2;
  int js;
  __shared__ VG_XVEC s_xj[NLOAD];
  int xi[3];

  al2=scalbnf(xmax,-32);
  for(k=0; k<3; k++) fi[k] = 0.0f;
  // Threads past the end of the i-array must not read xivec[i]; they still
  // have to take part in the tile loads and barriers below, so they carry on
  // with dummy coordinates and their result is never stored.
  for(k=0; k<3; k++) xi[k] = (i<ni) ? xivec[i].r[k] : 0;
  // Full tiles of NLOAD j-particles.
  for (j = 0; j < nj - NLOAD; j+=NLOAD){
    __syncthreads();   // previous tile fully consumed before overwriting
    if(tid < NLOAD) s_xj[tid] = xjvec[j + tid];
    __syncthreads();   // tile visible to the whole block
#pragma unroll 16
    for (js = 0; js < NLOAD; js++) coulombpot_inter(s_xj[js].r,s_xj[js].qatype.q,xi,fi,rscale2f,al2);
  }
  // Remaining nj - j (at most NLOAD) j-particles.
  __syncthreads();
  if(tid < nj - j) s_xj[tid] = xjvec[j + tid];
  __syncthreads();
  for (js = 0; js < nj - j; js++) coulombpot_inter(s_xj[js].r,s_xj[js].qatype.q,xi,fi,rscale2f,al2);
  if(i<ni) for(k=0; k<3; k++) fvec[i*3+k] = fi[k];
}


// Add to fi[] the erfc-screened (real-space) force term of a single
// j-particle acting on a single i-particle.
//
//   xj, xi       : integer (fixed-point) coordinates of the j / i particle
//   qj           : charge of the j-particle
//   fi           : accumulator; receives dphir * dr[k] with
//                  dr[k] = (xi[k]-xj[k]) * al2,
//                  dn2   = rscale2f * |dr|^2,  r = sqrt(dn2),
//                  dphir = qj * (2/sqrt(pi) * exp(-dn2) + erfc(r)/r) / dn2
//   rscale2f     : factor applied to the squared distance
//   al2          : converts an integer coordinate difference to float
//   r2min, r2max : the pair contributes only when r2min <= dn2 < r2max
//
// Note: a zero-distance pair is suppressed only through the r2min test, i.e.
// only when r2min > 0.
__device__ __inline__ 
void realforce_inter(int xj[3], float qj, int xi[3], float fi[3], float rscale2f, float al2,
		     float r2min, float r2max)
{
  int k;
  float dn2,r,inr,dr[3],dphir;

  dn2 = 0.0f;
  for(k=0; k<3; k++){
    // The callers scale by xmax * 2^-32, so the 32-bit range spans one full
    // period and the difference is expected to wrap.  Subtract as unsigned
    // (well-defined modulo 2^32) instead of relying on signed overflow.
    dr[k]  = (int)((unsigned int)xi[k] - (unsigned int)xj[k]);
    dr[k] *= al2;
    dn2   += dr[k] * dr[k];
  }
  dn2  *= rscale2f;
  inr   = rsqrtf(dn2);
  r     = inr * dn2;   // sqrt(dn2) obtained from the reciprocal root
  dphir = qj * ((float)(M_2_SQRTPI)*expf(-dn2) + erfcf(r)*inr)*inr*inr;
  if(dn2<r2min || dn2>=r2max) dphir = 0.0f;
  for(k=0; k<3; k++) fi[k] += dphir * dr[k];
}

// erfc-screened (real-space) force on every i-particle from all nj
// j-particles, restricted to pairs with r2min <= scaled r^2 < r2max.
//
// Thread mapping: i = blockIdx.x * NTHRE + threadIdx.x, one i-particle per
// thread (the indexing assumes 1-D blocks of NTHRE threads - confirm at the
// launch site).  j-particles are staged through a shared tile of NLOAD
// entries; only threads with tid < NLOAD load, so a block needs at least
// NLOAD threads for the tile to be filled completely.
//
//   ni, xivec    : number of i-particles and their records
//   nj, xjvec    : number of j-particles and their records
//   rscale2f     : factor applied to the squared distance
//   xmax         : al2 = xmax * 2^-32 scales integer coordinate differences
//   r2min, r2max : bounds on the scaled squared distance
//   fvec         : output, fvec[i*3+k] for k = 0..2, written only when i < ni
__global__ 
void realforce_kernel(int ni, VG_XVEC *xivec, int nj, VG_XVEC *xjvec,
		      float rscale2f, float xmax, float r2min, float r2max, float *fvec)
{
  int tid = threadIdx.x;
  int i = blockIdx.x * NTHRE + tid;
  int j,k;
  float fi[3],al2;
  int js;
  __shared__ VG_XVEC s_xj[NLOAD];
  int xi[3];

  al2=scalbnf(xmax,-32);
  for(k=0; k<3; k++) fi[k] = 0.0f;
  // Threads past the end of the i-array must not read xivec[i]; they still
  // have to take part in the tile loads and barriers below, so they carry on
  // with dummy coordinates and their result is never stored.
  for(k=0; k<3; k++) xi[k] = (i<ni) ? xivec[i].r[k] : 0;
  // Full tiles of NLOAD j-particles.
  for (j = 0; j < nj - NLOAD; j+=NLOAD){
    __syncthreads();   // previous tile fully consumed before overwriting
    if(tid < NLOAD) s_xj[tid] = xjvec[j + tid];
    __syncthreads();   // tile visible to the whole block
#pragma unroll 16
    for (js = 0; js < NLOAD; js++) realforce_inter(s_xj[js].r,s_xj[js].qatype.q,xi,fi,rscale2f,al2,r2min,r2max);
  }
  // Remaining nj - j (at most NLOAD) j-particles.
  __syncthreads();
  if(tid < nj - j) s_xj[tid] = xjvec[j + tid];
  __syncthreads();
  for (js = 0; js < nj - j; js++) realforce_inter(s_xj[js].r,s_xj[js].qatype.q,xi,fi,rscale2f,al2,r2min,r2max);
  if(i<ni) for(k=0; k<3; k++) fvec[i*3+k] = fi[k];
}


// Add to fi[] the erfc-screened (real-space) potential term of a single
// j-particle at the position of a single i-particle.
//
//   xj, xi       : integer (fixed-point) coordinates of the j / i particle
//   qj           : charge of the j-particle
//   fi           : accumulator; qj * erfc(r) / r is added to all three
//                  components, with r = sqrt(rscale2f * |(xi-xj)*al2|^2)
//   rscale2f     : factor applied to the squared distance
//   al2          : converts an integer coordinate difference to float
//   r2min, r2max : the pair contributes only when r2min <= r^2 < r2max
//
// Note: a zero-distance pair is suppressed only through the r2min test, i.e.
// only when r2min > 0.
__device__ __inline__ 
void realpot_inter(int xj[3], float qj, int xi[3], float fi[3], float rscale2f, float al2,
		   float r2min, float r2max)
{
  int k;
  float dn2,r,inr,dr[3],dphir;

  dn2 = 0.0f;
  for(k=0; k<3; k++){
    // The callers scale by xmax * 2^-32, so the 32-bit range spans one full
    // period and the difference is expected to wrap.  Subtract as unsigned
    // (well-defined modulo 2^32) instead of relying on signed overflow.
    dr[k]  = (int)((unsigned int)xi[k] - (unsigned int)xj[k]);
    dr[k] *= al2;
    dn2   += dr[k] * dr[k];
  }
  dn2  *= rscale2f;
  inr   = rsqrtf(dn2);
  r     = inr * dn2;   // sqrt(dn2) obtained from the reciprocal root
  dphir = qj * erfcf(r) * inr;
  if(dn2<r2min || dn2>=r2max) dphir = 0.0f;
  // The same scalar is stored in every component of the accumulator.
  for(k=0; k<3; k++) fi[k] += dphir;
}

// erfc-screened (real-space) potential at every i-particle from all nj
// j-particles, restricted to pairs with r2min <= scaled r^2 < r2max.
//
// Thread mapping: i = blockIdx.x * NTHRE + threadIdx.x, one i-particle per
// thread (the indexing assumes 1-D blocks of NTHRE threads - confirm at the
// launch site).  j-particles are staged through a shared tile of NLOAD
// entries; only threads with tid < NLOAD load, so a block needs at least
// NLOAD threads for the tile to be filled completely.
//
//   ni, xivec    : number of i-particles and their records
//   nj, xjvec    : number of j-particles and their records
//   rscale2f     : factor applied to the squared distance
//   xmax         : al2 = xmax * 2^-32 scales integer coordinate differences
//   r2min, r2max : bounds on the scaled squared distance
//   fvec         : output, fvec[i*3+k] for k = 0..2 (all three components
//                  carry the same sum), written only when i < ni
__global__ 
void realpot_kernel(int ni, VG_XVEC *xivec, int nj, VG_XVEC *xjvec,
		    float rscale2f, float xmax, float r2min, float r2max, float *fvec)
{
  int tid = threadIdx.x;
  int i = blockIdx.x * NTHRE + tid;
  int j,k;
  float fi[3],al2;
  int js;
  __shared__ VG_XVEC s_xj[NLOAD];
  int xi[3];

  al2=scalbnf(xmax,-32);
  for(k=0; k<3; k++) fi[k] = 0.0f;
  // Threads past the end of the i-array must not read xivec[i]; they still
  // have to take part in the tile loads and barriers below, so they carry on
  // with dummy coordinates and their result is never stored.
  for(k=0; k<3; k++) xi[k] = (i<ni) ? xivec[i].r[k] : 0;
  // Full tiles of NLOAD j-particles.
  for (j = 0; j < nj - NLOAD; j+=NLOAD){
    __syncthreads();   // previous tile fully consumed before overwriting
    if(tid < NLOAD) s_xj[tid] = xjvec[j + tid];
    __syncthreads();   // tile visible to the whole block
#pragma unroll 16
    for (js = 0; js < NLOAD; js++) realpot_inter(s_xj[js].r,s_xj[js].qatype.q,xi,fi,rscale2f,al2,r2min,r2max);
  }
  // Remaining nj - j (at most NLOAD) j-particles.
  __syncthreads();
  if(tid < nj - j) s_xj[tid] = xjvec[j + tid];
  __syncthreads();
  for (js = 0; js < nj - j; js++) realpot_inter(s_xj[js].r,s_xj[js].qatype.q,xi,fi,rscale2f,al2,r2min,r2max);
  if(i<ni) for(k=0; k<3; k++) fvec[i*3+k] = fi[k];
}


// Add to fi[] the van der Waals force term of a single j-particle acting on
// a single i-particle.
//
//   xj, xi       : integer (fixed-point) coordinates of the j / i particle
//   fi           : accumulator; receives dphir * dr[k] with
//                  dr[k] = (xi[k]-xj[k]) * al2,
//                  dn2   = d_matrix[t].rscale * |dr|^2,  dn6 = dn2^-3,
//                  dphir = d_matrix[t].gscale * dn6 / dn2 * (2*dn6 - 1)
//   t            : index of the pair-type entry in d_matrix
//   al2          : converts an integer coordinate difference to float
//   r2min, r2max : the pair contributes only when r2min <= dn2 < r2max
//
// Note: a zero-distance pair is suppressed only through the r2min test, i.e.
// only when r2min > 0.
__device__ __inline__ 
void vdwforce_inter(int xj[3], int xi[3], float fi[3], int t, float al2,
		    float r2min, float r2max)
{
  int k;
  float dn2,inr2,dn6,dr[3],dphir;

  dn2 = 0.0f;
  for(k=0; k<3; k++){
    // The callers scale by xmax * 2^-32, so the 32-bit range spans one full
    // period and the difference is expected to wrap.  Subtract as unsigned
    // (well-defined modulo 2^32) instead of relying on signed overflow.
    dr[k]  = (int)((unsigned int)xi[k] - (unsigned int)xj[k]);
    dr[k] *= al2;
    dn2   += dr[k] * dr[k];
  }
  dn2  *= d_matrix[t].rscale;
  inr2  = 1.0f/dn2;
  dn6   = inr2*inr2*inr2;
  dphir = d_matrix[t].gscale * dn6 * inr2 * (2.0f * dn6 - 1.0f);
  if(dn2<r2min || dn2>=r2max) dphir = 0.0f;
  for(k=0; k<3; k++) fi[k] += dphir * dr[k];
}

// van der Waals force on every i-particle from all nj j-particles,
// restricted to pairs with r2min <= scaled r^2 < r2max.
//
// Thread mapping: i = blockIdx.x * NTHRE + threadIdx.x, one i-particle per
// thread (the indexing assumes 1-D blocks of NTHRE threads - confirm at the
// launch site).  j-particles are staged through a shared tile of NLOAD
// entries; only threads with tid < NLOAD load, so a block needs at least
// NLOAD threads for the tile to be filled completely.
//
//   ni, xivec    : number of i-particles and their records
//   nj, xjvec    : number of j-particles and their records
//   nat          : row width of the pair-type table; the d_matrix index is
//                  atype_i * nat + atype_j
//   xmax         : al2 = xmax * 2^-32 scales integer coordinate differences
//   r2min, r2max : bounds on the scaled squared distance
//   fvec         : output, fvec[i*3+k] for k = 0..2, written only when i < ni
__global__ 
void vdwforce_kernel(int ni, VG_XVEC *xivec, int nj, VG_XVEC *xjvec,
		     int nat, float xmax, float r2min, float r2max, float *fvec)
{
  int tid = threadIdx.x;
  int i = blockIdx.x * NTHRE + tid;
  int j,k;
  float fi[3],al2;
  int js,atypei;
  __shared__ VG_XVEC s_xj[NLOAD];
  int xi[3];

  al2=scalbnf(xmax,-32);
  for(k=0; k<3; k++) fi[k] = 0.0f;
  // Threads past the end of the i-array must not read xivec[i]: besides the
  // out-of-range load, a garbage atype would index d_matrix out of range.
  // They still take part in the tile loads and barriers below, using dummy
  // values, and their result is never stored.
  for(k=0; k<3; k++) xi[k] = (i<ni) ? xivec[i].r[k] : 0;
  atypei = (i<ni) ? xivec[i].qatype.atype * nat : 0;
  // Full tiles of NLOAD j-particles.
  for (j = 0; j < nj - NLOAD; j+=NLOAD){
    __syncthreads();   // previous tile fully consumed before overwriting
    if(tid < NLOAD) s_xj[tid] = xjvec[j + tid];
    __syncthreads();   // tile visible to the whole block
#pragma unroll 16
    for (js = 0; js < NLOAD; js++) vdwforce_inter(s_xj[js].r,xi,fi,atypei+s_xj[js].qatype.atype,al2,r2min,r2max);
  }
  // Remaining nj - j (at most NLOAD) j-particles.
  __syncthreads();
  if(tid < nj - j) s_xj[tid] = xjvec[j + tid];
  __syncthreads();
  for (js = 0; js < nj - j; js++) vdwforce_inter(s_xj[js].r,xi,fi,atypei+s_xj[js].qatype.atype,al2,r2min,r2max);
  if(i<ni) for(k=0; k<3; k++) fvec[i*3+k] = fi[k];
}


// Add to fi[] the r^-6-only van der Waals term of a single j-particle acting
// on a single i-particle.
//
//   xj, xi       : integer (fixed-point) coordinates of the j / i particle
//   fi           : accumulator; receives dphir * dr[k] with
//                  dr[k] = (xi[k]-xj[k]) * al2,
//                  dn2   = d_matrix[t].rscale * |dr|^2,  dn6 = dn2^-3,
//                  dphir = -d_matrix[t].gscale * dn6
//   t            : index of the pair-type entry in d_matrix
//   al2          : converts an integer coordinate difference to float
//   r2min, r2max : the pair contributes only when r2min <= dn2 < r2max
//
// NOTE(review): unlike vdwforce_inter there is no additional 1/dn2 factor in
// dphir - confirm this is intended.
// Note: a zero-distance pair is suppressed only through the r2min test, i.e.
// only when r2min > 0.
__device__ __inline__ 
void vdwforce_r6_inter(int xj[3], int xi[3], float fi[3], int t, float al2,
		       float r2min, float r2max)
{
  int k;
  float dn2,inr2,dn6,dr[3],dphir;

  dn2 = 0.0f;
  for(k=0; k<3; k++){
    // The callers scale by xmax * 2^-32, so the 32-bit range spans one full
    // period and the difference is expected to wrap.  Subtract as unsigned
    // (well-defined modulo 2^32) instead of relying on signed overflow.
    dr[k]  = (int)((unsigned int)xi[k] - (unsigned int)xj[k]);
    dr[k] *= al2;
    dn2   += dr[k] * dr[k];
  }
  dn2  *= d_matrix[t].rscale;
  inr2  = 1.0f/dn2;
  dn6   = inr2*inr2*inr2;
  dphir = - d_matrix[t].gscale * dn6;
  if(dn2<r2min || dn2>=r2max) dphir = 0.0f;
  for(k=0; k<3; k++) fi[k] += dphir * dr[k];
}

// r^-6-only van der Waals term on every i-particle from all nj j-particles,
// restricted to pairs with r2min <= scaled r^2 < r2max.
//
// Thread mapping: i = blockIdx.x * NTHRE + threadIdx.x, one i-particle per
// thread (the indexing assumes 1-D blocks of NTHRE threads - confirm at the
// launch site).  j-particles are staged through a shared tile of NLOAD
// entries; only threads with tid < NLOAD load, so a block needs at least
// NLOAD threads for the tile to be filled completely.
//
//   ni, xivec    : number of i-particles and their records
//   nj, xjvec    : number of j-particles and their records
//   nat          : row width of the pair-type table; the d_matrix index is
//                  atype_i * nat + atype_j
//   xmax         : al2 = xmax * 2^-32 scales integer coordinate differences
//   r2min, r2max : bounds on the scaled squared distance
//   fvec         : output, fvec[i*3+k] for k = 0..2, written only when i < ni
__global__ 
void vdwforce_r6_kernel(int ni, VG_XVEC *xivec, int nj, VG_XVEC *xjvec,
			int nat, float xmax, float r2min, float r2max, float *fvec)
{
  int tid = threadIdx.x;
  int i = blockIdx.x * NTHRE + tid;
  int j,k;
  float fi[3],al2;
  int js,atypei;
  __shared__ VG_XVEC s_xj[NLOAD];
  int xi[3];

  al2=scalbnf(xmax,-32);
  for(k=0; k<3; k++) fi[k] = 0.0f;
  // Threads past the end of the i-array must not read xivec[i]: besides the
  // out-of-range load, a garbage atype would index d_matrix out of range.
  // They still take part in the tile loads and barriers below, using dummy
  // values, and their result is never stored.
  for(k=0; k<3; k++) xi[k] = (i<ni) ? xivec[i].r[k] : 0;
  atypei = (i<ni) ? xivec[i].qatype.atype * nat : 0;
  // Full tiles of NLOAD j-particles.
  for (j = 0; j < nj - NLOAD; j+=NLOAD){
    __syncthreads();   // previous tile fully consumed before overwriting
    if(tid < NLOAD) s_xj[tid] = xjvec[j + tid];
    __syncthreads();   // tile visible to the whole block
#pragma unroll 16
    for (js = 0; js < NLOAD; js++) vdwforce_r6_inter(s_xj[js].r,xi,fi,atypei+s_xj[js].qatype.atype,al2,r2min,r2max);
  }
  // Remaining nj - j (at most NLOAD) j-particles.
  __syncthreads();
  if(tid < nj - j) s_xj[tid] = xjvec[j + tid];
  __syncthreads();
  for (js = 0; js < nj - j; js++) vdwforce_r6_inter(s_xj[js].r,xi,fi,atypei+s_xj[js].qatype.atype,al2,r2min,r2max);
  if(i<ni) for(k=0; k<3; k++) fvec[i*3+k] = fi[k];
}


// Add to fi[] the van der Waals potential term of a single j-particle at the
// position of a single i-particle.
//
//   xj, xi       : integer (fixed-point) coordinates of the j / i particle
//   fi           : accumulator; dphir is added to all three components, with
//                  dn2   = d_matrix[t].rscale * |(xi-xj)*al2|^2,
//                  dn6   = dn2^-3,
//                  dphir = d_matrix[t].gscale * dn6 * (dn6 - 1)
//   t            : index of the pair-type entry in d_matrix
//   al2          : converts an integer coordinate difference to float
//   r2min, r2max : the pair contributes only when r2min <= dn2 < r2max
//
// Note: a zero-distance pair is suppressed only through the r2min test, i.e.
// only when r2min > 0.
__device__ __inline__ 
void vdwpot_inter(int xj[3], int xi[3], float fi[3], int t, float al2,
		  float r2min, float r2max)
{
  int k;
  float dn2,inr2,dn6,dr[3],dphir;

  dn2 = 0.0f;
  for(k=0; k<3; k++){
    // The callers scale by xmax * 2^-32, so the 32-bit range spans one full
    // period and the difference is expected to wrap.  Subtract as unsigned
    // (well-defined modulo 2^32) instead of relying on signed overflow.
    dr[k]  = (int)((unsigned int)xi[k] - (unsigned int)xj[k]);
    dr[k] *= al2;
    dn2   += dr[k] * dr[k];
  }
  dn2  *= d_matrix[t].rscale;
  inr2  = 1.0f/dn2;
  dn6   = inr2*inr2*inr2;
  dphir = d_matrix[t].gscale * dn6 * (dn6 - 1.0f);
  if(dn2<r2min || dn2>=r2max) dphir = 0.0f;
  // The same scalar is stored in every component of the accumulator.
  for(k=0; k<3; k++) fi[k] += dphir;
}

// van der Waals potential at every i-particle from all nj j-particles,
// restricted to pairs with r2min <= scaled r^2 < r2max.
//
// Thread mapping: i = blockIdx.x * NTHRE + threadIdx.x, one i-particle per
// thread (the indexing assumes 1-D blocks of NTHRE threads - confirm at the
// launch site).  j-particles are staged through a shared tile of NLOAD
// entries; only threads with tid < NLOAD load, so a block needs at least
// NLOAD threads for the tile to be filled completely.
//
//   ni, xivec    : number of i-particles and their records
//   nj, xjvec    : number of j-particles and their records
//   nat          : row width of the pair-type table; the d_matrix index is
//                  atype_i * nat + atype_j
//   xmax         : al2 = xmax * 2^-32 scales integer coordinate differences
//   r2min, r2max : bounds on the scaled squared distance
//   fvec         : output, fvec[i*3+k] for k = 0..2 (all three components
//                  carry the same sum), written only when i < ni
__global__ 
void vdwpot_kernel(int ni, VG_XVEC *xivec, int nj, VG_XVEC *xjvec,
		   int nat, float xmax, float r2min, float r2max, float *fvec)
{
  int tid = threadIdx.x;
  int i = blockIdx.x * NTHRE + tid;
  int j,k;
  float fi[3],al2;
  int js,atypei;
  __shared__ VG_XVEC s_xj[NLOAD];
  int xi[3];

  al2=scalbnf(xmax,-32);
  for(k=0; k<3; k++) fi[k] = 0.0f;
  // Threads past the end of the i-array must not read xivec[i]: besides the
  // out-of-range load, a garbage atype would index d_matrix out of range.
  // They still take part in the tile loads and barriers below, using dummy
  // values, and their result is never stored.
  for(k=0; k<3; k++) xi[k] = (i<ni) ? xivec[i].r[k] : 0;
  atypei = (i<ni) ? xivec[i].qatype.atype * nat : 0;
  // Full tiles of NLOAD j-particles.
  for (j = 0; j < nj - NLOAD; j+=NLOAD){
    __syncthreads();   // previous tile fully consumed before overwriting
    if(tid < NLOAD) s_xj[tid] = xjvec[j + tid];
    __syncthreads();   // tile visible to the whole block
#pragma unroll 16
    for (js = 0; js < NLOAD; js++) vdwpot_inter(s_xj[js].r,xi,fi,atypei+s_xj[js].qatype.atype,al2,r2min,r2max);
  }
  // Remaining nj - j (at most NLOAD) j-particles.
  __syncthreads();
  if(tid < nj - j) s_xj[tid] = xjvec[j + tid];
  __syncthreads();
  for (js = 0; js < nj - j; js++) vdwpot_inter(s_xj[js].r,xi,fi,atypei+s_xj[js].qatype.atype,al2,r2min,r2max);
  if(i<ni) for(k=0; k<3; k++) fvec[i*3+k] = fi[k];
}


// Add one particle's contribution to the sine / cosine sums of one wave
// vector.
//
//   phase     = 2*pi * sum_d (xj[d] * al2) * ki[d]
//   bsbci[0] += qj * sin(phase) * factor1
//   bsbci[1] += qj * cos(phase) * factor1
//
// bsbci[2] is not modified.
__device__ __inline__ 
void ewald_dft_inter(int xj[3], float qj, float ki[3], float factor1,
		     float bsbci[3], float al2)
{
  float phase = 0.0f;

  for(int d=0; d<3; d++){
    float pos = xj[d] * al2;   // integer coordinate scaled to float
    phase    += pos * ki[d];
  }
  phase *= (float)(2.0 * M_PI);

  float qsin = qj * sinf(phase);
  float qcos = qj * cosf(phase);
  bsbci[0] += qsin * factor1;
  bsbci[1] += qcos * factor1;
}

// For every wave vector, sum the sine / cosine terms over all n particles.
//
// Thread mapping: i = blockIdx.x * NTHRE + threadIdx.x, one wave vector per
// thread (the indexing assumes 1-D blocks of NTHRE threads - confirm at the
// launch site).  Particles are staged through a shared tile of NLOAD
// entries; only threads with tid < NLOAD load, so a block needs at least
// NLOAD threads for the tile to be filled completely.
//
//   x, n       : particle records and their count
//   kvec, knum : wave-vector records (k[], factor1) and their count
//   bsbc       : output, bsbc[i*3+k] for k = 0..2, written only when
//                i < knum; component 2 is always 0 because ewald_dft_inter
//                never adds to it
//
// Integer coordinates are scaled by 2^-32 (al2 = scalbnf(1.0f,-32)).
__global__ 
void ewald_dft_kernel(VG_XVEC *x, int n, VG_KVEC *kvec, int knum, float *bsbc)
{
  int tid = threadIdx.x;
  int i = blockIdx.x * NTHRE + tid;
  int j,k;
  float bsbci[3],al2;
  int js;
  __shared__ VG_XVEC s_xj[NLOAD];
  float ki[3],factor1;

  al2=scalbnf(1.0f,-32);
  for(k=0; k<3; k++) bsbci[k] = 0.0f;
  // Threads past the end of the wave-vector array must not read kvec[i];
  // they still take part in the tile loads and barriers below, using dummy
  // values, and their result is never stored.
  for(k=0; k<3; k++) ki[k] = (i<knum) ? kvec[i].k[k] : 0.0f;
  factor1                  = (i<knum) ? kvec[i].factor1 : 0.0f;
  // Full tiles of NLOAD particles.
  for (j = 0; j < n - NLOAD; j+=NLOAD){
    __syncthreads();   // previous tile fully consumed before overwriting
    if(tid < NLOAD) s_xj[tid] = x[j + tid];
    __syncthreads();   // tile visible to the whole block
#pragma unroll 16
    for (js = 0; js < NLOAD; js++) ewald_dft_inter(s_xj[js].r,s_xj[js].qatype.q,ki,factor1,bsbci,al2);
  }
  // Remaining n - j (at most NLOAD) particles.
  __syncthreads();
  if(tid < n - j) s_xj[tid] = x[j + tid];
  __syncthreads();
  for (js = 0; js < n - j; js++) ewald_dft_inter(s_xj[js].r,s_xj[js].qatype.q,ki,factor1,bsbci,al2);
  if(i<knum) for(k=0; k<3; k++) bsbc[i*3+k] = bsbci[k];
}


// Add one wave vector's contribution to the force accumulator of one
// particle.
//
//   phase  = 2*pi * sum_d (xi[d] * al2) * kj[d]
//   fi[d] += (bsbc[1]*sin(phase) - bsbc[0]*cos(phase)) * kj[d]
//
// Only bsbc[0] and bsbc[1] are read.
__device__ __inline__ 
void ewaldforce_idft_inter(float kj[3], float bsbc[3], int xi[3], float fi[3], float al2)
{
  float phase = 0.0f;

  for(int d=0; d<3; d++){
    float pos = xi[d] * al2;   // integer coordinate scaled to float
    phase    += pos * kj[d];
  }
  phase *= (float)(2.0 * M_PI);

  float sn = sinf(phase);
  float cs = cosf(phase);
  for(int d=0; d<3; d++){
    fi[d] += (bsbc[1] * sn - bsbc[0] * cs) * kj[d];
  }
}

// For every particle, sum the force contributions of all knum wave vectors.
//
// Thread mapping: i = blockIdx.x * NTHRE + threadIdx.x, one particle per
// thread (the indexing assumes 1-D blocks of NTHRE threads - confirm at the
// launch site).  Wave vectors and their bsbc values are staged through
// shared tiles of NLOAD entries; only threads with tid < NLOAD load, so a
// block needs at least NLOAD threads for the tiles to be filled completely.
//
//   x, n       : particle records and their count
//   kvec, knum : wave-vector records and their count
//   bsbc       : input, 3 floats per wave vector; only components 0 and 1
//                are loaded and used
//   force      : output, force[i*3+k] for k = 0..2, written only when i < n
//
// Integer coordinates are scaled by 2^-32 (al2 = scalbnf(1.0f,-32)).
__global__ 
void ewaldforce_idft_kernel(VG_XVEC *x, int n, VG_KVEC *kvec, int knum, 
			    float *bsbc, float *force)
{
  int tid = threadIdx.x;
  int i = blockIdx.x * NTHRE + tid;
  int j,k;
  float al2;
  int js,xi[3];
  __shared__ VG_KVEC s_kj[NLOAD];
  __shared__ float   s_bsbcj[NLOAD][3];
  float fi[3];

  al2=scalbnf(1.0f,-32);
  for(k=0; k<3; k++) fi[k] = 0.0f;
  // Threads past the end of the particle array must not read x[i]; they
  // still take part in the tile loads and barriers below, using dummy
  // coordinates, and their result is never stored.
  for(k=0; k<3; k++) xi[k] = (i<n) ? x[i].r[k] : 0;
  // Full tiles of NLOAD wave vectors.
  for (j = 0; j < knum - NLOAD; j+=NLOAD){
    __syncthreads();   // previous tile fully consumed before overwriting
    if(tid < NLOAD) s_kj[tid] = kvec[j + tid];
    if(tid < NLOAD) for(k=0;k<2;k++) s_bsbcj[tid][k] = bsbc[(j + tid)*3 + k];
    __syncthreads();   // tiles visible to the whole block
#pragma unroll 16
    for (js = 0; js < NLOAD; js++) ewaldforce_idft_inter(s_kj[js].k,s_bsbcj[js],xi,fi,al2);
  }
  // Remaining knum - j (at most NLOAD) wave vectors.
  __syncthreads();
  if(tid < knum - j) s_kj[tid] = kvec[j + tid];
  if(tid < knum - j) for(k=0;k<2;k++) s_bsbcj[tid][k] = bsbc[(j + tid)*3 + k];
  __syncthreads();
  for (js = 0; js < knum - j; js++) ewaldforce_idft_inter(s_kj[js].k,s_bsbcj[js],xi,fi,al2);
  if(i<n) for(k=0; k<3; k++) force[i*3+k] = fi[k];
}


// Add one wave vector's contribution to the potential accumulator of one
// particle.
//
//   phase  = 2*pi * sum_d (xi[d] * al2) * kj[d]
//   fi[0] += bsbc[1]*cos(phase) + bsbc[0]*sin(phase)
//
// fi[1] and fi[2] are not modified; only bsbc[0] and bsbc[1] are read.
__device__ __inline__ 
void ewaldpot_idft_inter(float kj[3], float bsbc[3], int xi[3], float fi[3], float al2)
{
  float phase = 0.0f;

  for(int d=0; d<3; d++){
    float pos = xi[d] * al2;   // integer coordinate scaled to float
    phase    += pos * kj[d];
  }
  phase *= (float)(2.0 * M_PI);

  float sn = sinf(phase);
  float cs = cosf(phase);
  fi[0] += bsbc[1] * cs + bsbc[0] * sn;
}

// For every particle, sum the potential contributions of all knum wave
// vectors.
//
// Thread mapping: i = blockIdx.x * NTHRE + threadIdx.x, one particle per
// thread (the indexing assumes 1-D blocks of NTHRE threads - confirm at the
// launch site).  Wave vectors and their bsbc values are staged through
// shared tiles of NLOAD entries; only threads with tid < NLOAD load, so a
// block needs at least NLOAD threads for the tiles to be filled completely.
//
//   x, n       : particle records and their count
//   kvec, knum : wave-vector records and their count
//   bsbc       : input, 3 floats per wave vector; only components 0 and 1
//                are loaded and used
//   force      : output, force[i*3+k] for k = 0..2, written only when i < n;
//                the sum is in component 0, components 1 and 2 are 0
//
// Integer coordinates are scaled by 2^-32 (al2 = scalbnf(1.0f,-32)).
__global__ 
void ewaldpot_idft_kernel(VG_XVEC *x, int n, VG_KVEC *kvec, int knum, 
			  float *bsbc, float *force)
{
  int tid = threadIdx.x;
  int i = blockIdx.x * NTHRE + tid;
  int j,k;
  float al2;
  int js,xi[3];
  __shared__ VG_KVEC s_kj[NLOAD];
  __shared__ float   s_bsbcj[NLOAD][3];
  float fi[3];

  al2=scalbnf(1.0f,-32);
  for(k=0; k<3; k++) fi[k] = 0.0f;
  // Threads past the end of the particle array must not read x[i]; they
  // still take part in the tile loads and barriers below, using dummy
  // coordinates, and their result is never stored.
  for(k=0; k<3; k++) xi[k] = (i<n) ? x[i].r[k] : 0;
  // Full tiles of NLOAD wave vectors.
  for (j = 0; j < knum - NLOAD; j+=NLOAD){
    __syncthreads();   // previous tile fully consumed before overwriting
    if(tid < NLOAD) s_kj[tid] = kvec[j + tid];
    if(tid < NLOAD) for(k=0;k<2;k++) s_bsbcj[tid][k] = bsbc[(j + tid)*3 + k];
    __syncthreads();   // tiles visible to the whole block
#pragma unroll 16
    for (js = 0; js < NLOAD; js++) ewaldpot_idft_inter(s_kj[js].k,s_bsbcj[js],xi,fi,al2);
  }
  // Remaining knum - j (at most NLOAD) wave vectors.
  __syncthreads();
  if(tid < knum - j) s_kj[tid] = kvec[j + tid];
  if(tid < knum - j) for(k=0;k<2;k++) s_bsbcj[tid][k] = bsbc[(j + tid)*3 + k];
  __syncthreads();
  for (js = 0; js < knum - j; js++) ewaldpot_idft_inter(s_kj[js].k,s_bsbcj[js],xi,fi,al2);
  if(i<n) for(k=0; k<3; k++) force[i*3+k] = fi[k];
}


// Add to fi[] the NaCl pair-potential force term of a single j-particle
// acting on a single i-particle.
//
//   xj, xi : integer (fixed-point) coordinates of the j / i particle
//   fi     : accumulator; receives dphir * dr[k], dr[k] = (xi[k]-xj[k])*al2
//   t      : index of the pair-type entry in d_matrix_nacl
//   al2    : converts an integer coordinate difference to float
//
// With r = |dr| and p = d_matrix_nacl[t]:
//   d3    = pb * p.pol * exp((p.sigm - r) * p.ipotro)
//   dphir = d3 * p.ipotro / r - 6 * p.pc / r^8 - 8 * p.pd / r^10 + p.zz / r^3
//
// A pair at zero distance (dn2 == 0) contributes nothing.
__device__ __inline__ 
void nacl_inter(int xj[3], int xi[3], float fi[3], int t, float al2)
{
  int k;
  float dn2,r,inr,inr2,inr4,inr8,d3,dr[3];
  float pb=(float)(0.338e-19/(14.39*1.60219e-19)),dphir;

  dn2 = 0.0f;
  for(k=0; k<3; k++){
    // The caller scales by xmax * 2^-32, so the 32-bit range spans one full
    // period and the difference is expected to wrap.  Subtract as unsigned
    // (well-defined modulo 2^32) instead of relying on signed overflow.
    dr[k]  = (int)((unsigned int)xi[k] - (unsigned int)xj[k]);
    dr[k] *= al2;
    dn2   += dr[k] * dr[k];
  }
  r     = sqrtf(dn2);
  inr   = 1.0f / r;
  inr2  = inr  * inr;
  inr4  = inr2 * inr2;
  inr8  = inr4 * inr4;
  d3    = pb * d_matrix_nacl[t].pol * expf( (d_matrix_nacl[t].sigm - r) * d_matrix_nacl[t].ipotro);
  dphir = ( d3 * d_matrix_nacl[t].ipotro * inr
	    - 6.0f * d_matrix_nacl[t].pc * inr8
	    - 8.0f * d_matrix_nacl[t].pd * inr8 * inr2
	    + inr2 * inr * d_matrix_nacl[t].zz );
  // 1/r is infinite for a coincident pair; discard it explicitly.
  if(dn2 == 0.0f) dphir = 0.0f;
  for(k=0; k<3; k++) fi[k] += dphir * dr[k];
}

// NaCl pair-potential force on every particle from all n particles of the
// same array (the self pair is dropped inside nacl_inter via its zero
// distance test).
//
// Thread mapping: i = blockIdx.x * NTHRE + threadIdx.x, one particle per
// thread (the indexing assumes 1-D blocks of NTHRE threads - confirm at the
// launch site).  Particles are staged through a shared tile of NLOAD
// entries; only threads with tid < NLOAD load, so a block needs at least
// NLOAD threads for the tile to be filled completely.
//
//   x, n : particle records and their count (used for both i and j)
//   nat  : row width of the pair-type table; the d_matrix_nacl index is
//          atype_i * nat + atype_j
//   xmax : al2 = xmax * 2^-32 scales integer coordinate differences
//   fvec : output, fvec[i*3+k] for k = 0..2, written only when i < n
__global__ 
void nacl_kernel(VG_XVEC *x, int n, int nat, float xmax, float *fvec)
{
  int tid = threadIdx.x;
  int i = blockIdx.x * NTHRE + tid;
  int j,k;
  float fi[3],al2;
  int js,atypei;
  __shared__ VG_XVEC s_xj[NLOAD];
  int xi[3];

  al2=scalbnf(xmax,-32);
  for(k=0; k<3; k++) fi[k] = 0.0f;
  // Threads past the end of the particle array must not read x[i]: besides
  // the out-of-range load, a garbage atype would index d_matrix_nacl out of
  // range.  They still take part in the tile loads and barriers below,
  // using dummy values, and their result is never stored.
  for(k=0; k<3; k++) xi[k] = (i<n) ? x[i].r[k] : 0;
  atypei = (i<n) ? x[i].qatype.atype * nat : 0;
  // Full tiles of NLOAD particles.
  for (j = 0; j < n - NLOAD; j+=NLOAD){
    __syncthreads();   // previous tile fully consumed before overwriting
    if(tid < NLOAD) s_xj[tid] = x[j + tid];
    __syncthreads();   // tile visible to the whole block
#pragma unroll 16
    for (js = 0; js < NLOAD; js++) nacl_inter(s_xj[js].r,xi,fi,atypei+s_xj[js].qatype.atype,al2);
  }
  // Remaining n - j (at most NLOAD) particles.
  __syncthreads();
  if(tid < n - j) s_xj[tid] = x[j + tid];
  __syncthreads();
  for (js = 0; js < n - j; js++) nacl_inter(s_xj[js].r,xi,fi,atypei+s_xj[js].qatype.atype,al2);
  if(i<n) for(k=0; k<3; k++) fvec[i*3+k] = fi[k];
}


// Textually include the C source mr3_core.c with C linkage so that it is
// compiled as part of this translation unit, after the kernels and constant
// tables defined above.
// NOTE(review): presumably this holds the host-side code that launches the
// kernels - confirm.
extern "C" {
#include "mr3_core.c"
}

