From de34df7706dd75e3ddc0e3646e553125c1164db5 Mon Sep 17 00:00:00 2001
From: Paul Zimmerman <paulzim@umich.edu>
Date: Thu, 7 May 2026 15:28:08 -0400
Subject: [PATCH 1/4] various updates

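integrals.cpp now has a long-range barrier potential option.
Becke weight routines now take the iteration order (mu_order) as an argument,
and OpenACC variants (becke_weight_ncg, becke_weight_ncgd) are added.
Also various changes to support Gaussian basis operations.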
---
 include/becke.h                 |  13 +-
 include/cintwrapper.h           |  13 +
 include/gauss.h                 |  22 +-
 include/integrals.h             |   4 +-
 src/integrals/becke.cpp         | 753 +++++++++++++++++++-------------
 src/integrals/gauss.cpp         | 540 +++++++++++++++++------
 src/integrals/integrals.cpp     |  60 ++-
 src/integrals/integrals_aux.cpp |  58 ++-
 src/libcintw/cintwrapper.cpp    | 394 +++++++++++++++--
 9 files changed, 1359 insertions(+), 498 deletions(-)
diff --git a/include/becke.h b/include/becke.h
index 54d349a..71b2362 100644
--- a/include/becke.h
+++ b/include/becke.h
@@ -17,6 +17,10 @@ void atomic_domain_cell_wt(const int ta, const float alpha, const float beta, co
 
 void becke_weight_nc(int natoms, float* grid1, float* wt1, double* coords);
 void becke_weight_nc(int natoms, float* grid1, float* wt1, float* coordsf);
+void becke_weight_nc(const int mu_order, const int natoms, float* Rs, const int gc, const int gs, float* grid1, float* wt1, int* atno, float* coords);
+void becke_weight_ncg(const int mu_order, const int natoms, float* Rs, const int gc, const int gs, float* grid1, float* wt1, int* atno, float* coords);
+void becke_weight_ncgd(const int mu_order, const int natoms, double* Rs, const int gc, const int gs, double* grid1, double* wt1, int* atno, double* coords);
+void becke_weight_nc(const int mu_order, const int natoms, double* Rs, const int gc, const int gs, double* grid1, double* wt1, int* atno, double* coords);
 
 void becke_weight_3c(int gs, float* grid1, float* wt1, float* grid2, float* wt2, float* grid3, float* wt3,
                      int Z1, int Z2, int Z3, float A2, float B2, float C2, float A3, float B3, float C3);
@@ -25,10 +29,15 @@ void becke_weight_2c(int gs, float* grid1, float* wt1, float* grid2, float* wt2,
 void becke_weight_2d(int gs, double* grid1, double* wt1, double* grid2, double* wt2,
                      double zeta1, double zeta2, double A2, double B2, double C2);
 
+float becke_ar(float r1, float r2);
+double becke_ard(double r1, double r2);
+
 void get_becke_grid_full(float alpha, int natoms, int* atno, float* coords, int nrad, int nang, float* ang_g, float* ang_w, const int gc, float* grid, float* wt);
+void get_becke_grid_full(int mu_order, float alpha, int natoms, int* atno, float* coords, int nrad, int nang, float* ang_g, float* ang_w, const int gc, float* grid, float* wt);
 void get_becke_grid_full(int natoms, int* atno, double* coords, int nrad, int nang, float* ang_g, float* ang_w, const int gc, float* grid, float* wt);
 void get_becke_grid_full(int natoms, int* atno, double* coords, int nrad, int nang, double* ang_g, double* ang_w, const int gc, float* grid, float* wt);
 void get_becke_grid_full(int natoms, int* atno, double* coords, int nrad, int nang, double* ang_g, double* ang_w, const int gc, double* grid, double* wt);
+void get_becke_grid_full(int mu_order, int natoms, int* atno, double* coords, int nrad, int nang, double* ang_g, double* ang_w, const int gc, double* grid, double* wt);
 void get_becke_grid_sparse(double rmax, int natoms, int* atno, double* coords, int nrad, int nang, double* ang_g, double* ang_w, const int gc, double* grid, double* wt);
 
 void compute_rho(int natoms, int* atno, double* coords, vector<vector<double> > &basis, double* Pao, int nrad, int gsa, float* grid, double* rho, double* drho, int prl);
@@ -87,8 +96,8 @@ void density_in_basis2(int natoms, int* atno, double* coords, vector<vector<doub
 
 double becke_ad(int Z1, int Z2);
 float becke_a(int Z1, int Z2);
-float bf3(float f1);
-double bf3d(double f1);
+float bf3(int mu_order, float f1);
+double bf3d(int mu_order, double f1);
 
 //stretch atom 1's radius
 float becke_a(float alpha, int Z1, int Z2);
diff --git a/include/cintwrapper.h b/include/cintwrapper.h
index ef600a7..91baf23 100644
--- a/include/cintwrapper.h
+++ b/include/cintwrapper.h
@@ -79,6 +79,11 @@ extern "C" {
   int64_t int1e_grids_cart(double *out, FINT *dims, FINT *shls,
                                 FINT *atm, FINT natm, FINT *bas, FINT nbas,
                                 double *env, CINTOpt *opt, double *cache);
+
+  int cint4c1e_cart(double *buf, int *shls, int *atm, int natm,
+                    int *bas, int nbas, double *env, CINTOpt *opt);
+  int cint4c1e_sph(double *buf, int *shls, int *atm, int natm,
+                   int *bas, int nbas, double *env, CINTOpt *opt);
 }
 
 using namespace std;
@@ -128,6 +133,14 @@ void gen_pvp(double *pvp, int N,
                int natm, int nbas, int nenv,
                int *atm, int *bas, double *env);
 
+void gen_4c_overlap_m4(double* ovlp4, size_t N,
+                    int natm, int nbas, int nenv,
+                    int* atm, int* bas, double* env);
+
+void gen_4c_overlap(double* ovlp4, size_t N,
+                    int natm, int nbas, int nenv,
+                    int* atm, int* bas, double* env);
+
 void gen_eri(double **eri, int N,
              int natm, int nbas, int nenv,
              int *atm, int *bas, double *env);
diff --git a/include/gauss.h b/include/gauss.h
index 42c0f59..571f9ec 100644
--- a/include/gauss.h
+++ b/include/gauss.h
@@ -3,6 +3,7 @@
 
 #include "integrals.h"
 //#include "vxc.h"
+#include "gamma.h"
 #include "read.h"
 #include "gto_grid.h"
 #include "print.h"
@@ -10,18 +11,29 @@
 
 void eval_gh(int gs, float* grid, float* val1, int l1, int m1, const float norm1, const float zeta1);
 void eval_ghd(int gs, double* grid, double* val1, int l1, int m1, const double norm1, const double zeta1);
+void eval_gh_spd(bool include_r, int gs, double* grid, double* val1, int l1, int m1);
+void eval_gh_spd(int gs, double* grid, double* val1, int l1, int m1);
+
 void eval_gh_ke(int gs, float* grid, float* val1, int n1, int l1, const float norm1, const float zeta1);
 void eval_ghd_ke(int gs, double* grid, double* val1, int n1, int l1, const double norm1, const double zeta1);
 //void eval_pdke_gh(int gs, double* grid, double* val1, int n1, int l1, int m1, double norm1, double zeta1);
 void eval_p_gh(int gs, float* grid, float* val, int n1, int l1, int m1, float norm1, float zeta1);
 void eval_pd_gh(int gs, double* grid, double* val, int n1, int l1, int m1, double norm1, double zeta1);
+
+//Gaussian single-center potentials
+void eval_vghd(int gsa, double* grid, double* val, int n, int l, int m, double norm, double zt);
+
 void eval_hess_ghd(int gs, double* grid, double* val, int n1, int l1, int m1, double norm1, double zeta1);
 int eval_gh_full(int gs, float* grid, float** val1, int i1, int natoms, int nbas, int nenv, int N, int* atm, int* bas, double* env);
-void wf_to_grid_gh_ke(int natoms, int* atno, double* coords, vector<vector<double> > basis, double* jCA, int gs, float* grid, float* wt, double* TL, int prl);
-void wf_to_grid_gh_ke_2(int natoms, int* atno, double* coords, vector<vector<double> > basis, int nbas, int nenv, int N, int* atm, int* bas, double* env,
-                        double* jCA, int gs, float* grid, float* wt, double* Td, int prl);
-void integrate_hole_para_gh(double* rdm, bool full_rdm, bool hfx_on, int Nc, int No, int M, int natoms, int* atno, double* coords, int gs, int gsb, vector<vector<double> >& basis,
-                    double* Pao, double* Pmo, double* jCA, float* grid, float* gridb, float* wt, float* wtb, float* rho, float* vxch, int prl);
+
+void wf_to_grid_gh(bool divide_by_rho, bool calc_rho, int natoms, int* atno, double* coords, vector<vector<double> > basis,
+                   double* Pao, double* gfao, int gs, double* grid, double* wt,
+                   double* rho, double* gf, double* Td, double* TL, int prl);
+//void wf_to_grid_gh_ke(int natoms, int* atno, double* coords, vector<vector<double> > basis, double* jCA, int gs, float* grid, float* wt, double* TL, int prl);
+//void wf_to_grid_gh_ke_2(int natoms, int* atno, double* coords, vector<vector<double> > basis, int nbas, int nenv, int N, int* atm, int* bas, double* env,
+//                        double* jCA, int gs, float* grid, float* wt, double* Td, int prl);
+//void integrate_hole_para_gh(double* rdm, bool full_rdm, bool hfx_on, int Nc, int No, int M, int natoms, int* atno, double* coords, int gs, int gsb, vector<vector<double> >& basis,
+//                    double* Pao, double* Pmo, double* jCA, float* grid, float* gridb, float* wt, float* wtb, float* rho, float* vxch, int prl);
 
 
 #endif
diff --git a/include/integrals.h b/include/integrals.h
index ae15936..3b1b342 100644
--- a/include/integrals.h
+++ b/include/integrals.h
@@ -78,7 +78,8 @@ void compute_Enp(int natoms, int* atno, float* coords, vector<vector<double> > &
 void compute_Enp_para(int ngpu, int natoms, int* atno, float* coords, vector<vector<double> > &basis, int nrad, int nang, double* ang_g0, double* ang_w0, double* En, double* pVp, int prl);
 void compute_Enp_para(int ngpu, int natoms, int* atno, float* coords, vector<vector<double> > &basis, int nrad, int nang, double* ang_g0, double* ang_w0, float* En, float* pVp, int prl);
 
-//electric fields in x,y,z (centered at origin)
+//electric fields in x,y,z (centered at origin) and confining potential
+void compute_Exyz(double rconf, double pconf, int natoms, int* atno, double* coords, bool gbasis, vector<vector<double> > &basis, int nrad, int nang, double* ang_g, double* ang_w, double* S, double* E, int prl);
 void compute_Exyz(int natoms, int* atno, double* coords, bool gbasis, vector<vector<double> > &basis, int nrad, int nang, double* ang_g, double* ang_w, double* S, double* E, int prl);
 
 void compute_ST(int natoms, int* atno, float* coords, vector<vector<double> > &basis, int nrad, int nang, double* ang_g0, double* ang_w0, double* S, double* T, int prl);
@@ -151,6 +152,7 @@ int get_imax_n2ip(int Nmax, int natoms, int N, vector<vector<double> >& basis, v
 
 int find_center_of_grid(float Z1, int nrad);
 void get_angular_grid(int size_ang, double* ang_g, double* ang_w);
+void get_angular_grid(int size_ang, float* ang_g, float* ang_w);
 void acc_assign(int size, float* vec, float v1);
 void acc_assign(int size, double* vec, double v1);
 void acc_assign(int tid, int size, double* vec, double v1);
diff --git a/src/integrals/becke.cpp b/src/integrals/becke.cpp
index 39e3b0e..d489847 100644
--- a/src/integrals/becke.cpp
+++ b/src/integrals/becke.cpp
@@ -23,7 +23,7 @@ void print_vec(int gsa, float* grid, float* A, float* B);
 
 using namespace std;
 
-//#pragma acc routine seq
+#pragma acc routine seq
 int get_atom_grid(int n, int natoms, int gsa)
 {
   int f1 = gsa/natoms;
@@ -37,7 +37,7 @@ int get_atom_grid(int n, int natoms, int gsa)
 
 double sigmoid(double v1)
 {
-  double val = 1./(1.f+exp(-v1));
+  double val = 1./(1.+exp(-v1));
   return 1. - val;
 }
 
@@ -60,6 +60,8 @@ float tanhh(float a)
 
 void atomic_domain_cell_wt(const int ta, const float alpha, const float beta, const int natoms, const int gc, const int gs, double* grid1, double* wt1, int* atno, double* coords0)
 {
+  int mu_order = 3;
+
   int gsa = gs*natoms;
   if (natoms==1)
   {
@@ -112,6 +114,7 @@ void atomic_domain_cell_wt(const int ta, const float alpha, const float beta, co
   int nat2 = natoms*natoms;
 
   double Ra[nat2];
+  for (int i=0;i<natoms;i++) Ra[i*natoms+i] = 10000.;
  //relative atomic positions
   for (int i=0;i<natoms;i++)
   for (int j=0;j<i;j++)
@@ -121,11 +124,13 @@ void atomic_domain_cell_wt(const int ta, const float alpha, const float beta, co
     double dax = xa2-xa1; double day = ya2-ya1; double daz = za2-za1;
     double ra1 = sqrt(dax*dax+day*day+daz*daz);
 
+    //printf("  %i %i Ra: %5.2f \n",i,j,ra1);
     Ra[i*natoms+j] = ra1;
     Ra[j*natoms+i] = ra1;
   }
 
   double aij[nat2];
+  for (int i=0;i<natoms;i++) aij[i*natoms+i] = 1.;
   for (int i=0;i<natoms;i++)
   for (int j=0;j<i;j++)
   {
@@ -153,6 +158,7 @@ void atomic_domain_cell_wt(const int ta, const float alpha, const float beta, co
     bij[i*natoms+j] = b12;
     bij[j*natoms+i] = b21;
   }
+  for (int i=0;i<natoms;i++) bij[i*natoms+i] = 1.;
 
  #if 0
   printf("  b: \n");
@@ -201,8 +207,8 @@ void atomic_domain_cell_wt(const int ta, const float alpha, const float beta, co
 
       double nu12 =  mu12 + a12*omm12;
       double nu21 = -mu12 + a21*omm12;
-      double f12 = bf3d(nu12);
-      double f21 = bf3d(nu21);
+      double f12 = bf3d(mu_order,nu12);
+      double f21 = bf3d(mu_order,nu21);
 
       if (alpha>0.)
       {
@@ -221,8 +227,8 @@ void atomic_domain_cell_wt(const int ta, const float alpha, const float beta, co
 
      //"b" terms
 
-      //const double nu0 = beta*oR12;
-      const double nu0 = beta;
+      //const double nu0 = beta;
+      const double nu0 = 0.;
       if (alpha>0.)
       {
         //mu12 = r1-r2; //sigmoid not dependent on R12
@@ -246,8 +252,8 @@ void atomic_domain_cell_wt(const int ta, const float alpha, const float beta, co
         nu12 =  mu12 + b12*omm12;
         nu21 = -mu12 + b21*omm12;
 
-        f12 = bf3d(nu12);
-        f21 = bf3d(nu21);
+        f12 = bf3d(mu_order,nu12);
+        f21 = bf3d(mu_order,nu21);
       }
 
       fpb[i*natoms+j] = f12;
@@ -348,8 +354,9 @@ void atomic_domain_cell_wt(const int ta, const float alpha, const float beta, co
 }
 
 //n-center grid wts (float)
-void becke_weight_nc(const int natoms, const int gc, const int gs, float* grid1, float* wt1, int* atno, float* coords)
+void becke_weight_nc(const int mu_order, const int natoms, float* Rs, const int gc, const int gs, float* grid1, float* wt1, int* atno, float* coords)
 {
+  if (mu_order<1) { printf("  mu_order is zero. no Becke wts \n"); return; }
   if (natoms<2) return;
 
   int gsa = gs*natoms;
@@ -376,8 +383,8 @@ void becke_weight_nc(const int natoms, const int gc, const int gs, float* grid1,
   for (int j=0;j<i;j++)
   {
     int Z1 = atno[i]; int Z2 = atno[j];
-    float a12 = becke_a(Z1,Z2);
-    float a21 = becke_a(Z2,Z1);
+    float a12 = becke_ar(Rs[Z1],Rs[Z2]);
+    float a21 = becke_ar(Rs[Z2],Rs[Z1]);
     aij[i*natoms+j] = a12;
     aij[j*natoms+i] = a21;
   }
@@ -415,8 +422,8 @@ void becke_weight_nc(const int natoms, const int gc, const int gs, float* grid1,
       float omm12 = 1.-mu12*mu12;
       float nu12 =  mu12 + a12*omm12;
       float nu21 = -mu12 + a21*omm12;
-      float f12 = bf3(nu12);
-      float f21 = bf3(nu21);
+      float f12 = bf3(mu_order,nu12);
+      float f21 = bf3(mu_order,nu21);
 
       fp[i*natoms+j] = f12;
       fp[j*natoms+i] = f21;
@@ -449,20 +456,244 @@ void becke_weight_nc(const int natoms, const int gc, const int gs, float* grid1,
   return;
 }
 
-void becke_weight_nc(const int natoms, const int gc, const int gs, float* grid1, float* wt1, int* atno, double* coords)
+//GPU (OpenACC) version of the n-center Becke weights, double precision
+void becke_weight_ncgd(const int mu_order, const int natoms, double* Rs, const int gc, const int gs, double* grid1, double* wt1, int* atno, double* coords)
+{
+ //scales wt1[n] in place by s1/norm; s1 is the product of pair terms for atom get_atom_grid(n,natoms,gsa)
+ // fp1 is a fixed-size array declared inside the acc loop, hence the natoms*natoms<=1000 limit below
+ // arrays named only in present() clauses (coords,atno,Rs,grid1,wt1) must already be on the device
+  if (mu_order<1) return;
+  if (natoms<2) return;
+  if (natoms*natoms>1000) { printf("\n ERROR: static array allocation in becke_weight_ncgd has a problem (natoms: %2i) \n",natoms); return; }
+
+  int gsa = gs*natoms;
+  int nat2 = natoms*natoms;
+
+  double Ra[nat2]; //distance between atoms i and j
+  double aij[nat2]; //becke_ard(Rs[atno[i]],Rs[atno[j]]) for each ordered pair
+  #pragma acc enter data create(Ra[0:nat2],aij[0:nat2])
+
+ //pairwise interatomic distances
+ #pragma acc parallel loop collapse(2) present(Ra[0:nat2],coords[0:3*natoms])
+  for (int i=0;i<natoms;i++)
+  for (int j=0;j<natoms;j++)
+  {
+    double xa1 = coords[3*i+0]; double ya1 = coords[3*i+1]; double za1 = coords[3*i+2];
+    double xa2 = coords[3*j+0]; double ya2 = coords[3*j+1]; double za2 = coords[3*j+2];
+    double dax = xa2-xa1; double day = ya2-ya1; double daz = za2-za1;
+    double ra1 = sqrt(dax*dax+day*day+daz*daz);
+
+    Ra[i*natoms+j] = ra1;
+  }
+ #pragma acc parallel loop present(Ra[0:nat2])
+  for (int i=0;i<natoms;i++)
+    Ra[i*natoms+i] = 1.; //diagonal distance is zero; avoids 1/0 in the grid loop
+ //Rs is indexed by the atno[] values; the present clause below assumes a 36-entry table
+ #pragma acc parallel loop collapse(2) present(aij[0:nat2],atno[0:natoms],coords[0:3*natoms],Rs[0:36])
+  for (int i=0;i<natoms;i++)
+  for (int j=0;j<natoms;j++)
+  {
+    int Z1 = atno[i]; int Z2 = atno[j];
+    double a12 = becke_ard(Rs[Z1],Rs[Z2]);
+    aij[i*natoms+j] = a12;
+  }
+
+  #pragma acc parallel loop present(Ra[0:nat2],aij[0:nat2],coords[0:3*natoms],grid1[0:gc*gsa],wt1[0:gsa])
+  for (int n=0;n<gsa;n++)
+  {
+    double x = grid1[gc*n+0]; double y = grid1[gc*n+1]; double z = grid1[gc*n+2];
+
+    int wa = get_atom_grid(n,natoms,gsa);
+
+    double fp1[1000]; //natoms*natoms entries are used, so natoms can be at most 31
+    #pragma acc loop collapse(2)
+    for (int i=0;i<natoms;i++)
+    for (int j=0;j<natoms;j++)
+    {
+      double a12 = aij[i*natoms+j];
+      double oR12 = 1./Ra[i*natoms+j];
+
+      double dx1 = x-coords[3*i+0]; double dy1 = y-coords[3*i+1]; double dz1 = z-coords[3*i+2];
+      double dx2 = x-coords[3*j+0]; double dy2 = y-coords[3*j+1]; double dz2 = z-coords[3*j+2];
+      double r1 = sqrt(dx1*dx1+dy1*dy1+dz1*dz1);
+      double r2 = sqrt(dx2*dx2+dy2*dy2+dz2*dz2);
+
+      double mu12 = (r1-r2)*oR12;
+      double omm12 = 1.-mu12*mu12;
+      double nu12 =  mu12 + a12*omm12;
+      double f12 = bf3d(mu_order,nu12);
+
+      fp1[i*natoms+j] = f12;
+    }
+ //i==j entries are set to one so they leave the products below unchanged
+   #pragma acc loop
+    for (int i=0;i<natoms;i++)
+      fp1[i*natoms+i] = 1.;
+
+    double s1; //product over j of fp1 for atom wa
+    {
+      double s0 = 1.;
+     #pragma acc loop reduction(*:s0)
+      for (int j=0;j<natoms;j++)
+        s0 *= fp1[wa*natoms+j];
+      s1 = s0;
+    }
+
+    double norm = 1.e-15; //sum of the products over all atoms; the offset keeps it nonzero
+   #pragma acc loop reduction(+:norm)
+    for (int i=0;i<natoms;i++)
+    {
+      double s0 = 1.;
+     #pragma acc loop reduction(*:s0)
+      for (int j=0;j<natoms;j++)
+        s0 *= fp1[i*natoms+j];
+      norm += s0;
+    }
+
+    wt1[n] *= s1/norm;
+  }
+
+  #pragma acc exit data delete(Ra[0:nat2],aij[0:nat2])
+
+  return;
+}
+
+void becke_weight_ncg(const int mu_order, const int natoms, float* Rs, const int gc, const int gs, float* grid1, float* wt1, int* atno, float* coords)
 {
+ //OpenACC n-center Becke weights (float): scales wt1[n] in place by s1/norm
+ // fp1 is a fixed-size array declared inside the acc loop, hence the natoms*natoms<=1000 limit below
+ // arrays named only in present() clauses (coords,atno,Rs,grid1,wt1) must already be on the device
+  if (mu_order<1) return;
+  if (natoms<2) return;
+  if (natoms*natoms>1000) { printf("\n ERROR: static array allocation in becke_weight_ncg has a problem (natoms: %2i) \n",natoms); return; }
+
+  int gsa = gs*natoms;
+  int nat2 = natoms*natoms;
+
+  float Ra[nat2]; //distance between atoms i and j
+  float aij[nat2]; //becke_ar(Rs[atno[i]],Rs[atno[j]]) for each ordered pair
+  #pragma acc enter data create(Ra[0:nat2],aij[0:nat2])
+
+ //pairwise interatomic distances
+ #pragma acc parallel loop collapse(2) present(Ra[0:nat2],coords[0:3*natoms])
+  for (int i=0;i<natoms;i++)
+  for (int j=0;j<natoms;j++)
+  {
+    float xa1 = coords[3*i+0]; float ya1 = coords[3*i+1]; float za1 = coords[3*i+2];
+    float xa2 = coords[3*j+0]; float ya2 = coords[3*j+1]; float za2 = coords[3*j+2];
+    float dax = xa2-xa1; float day = ya2-ya1; float daz = za2-za1;
+    float ra1 = sqrtf(dax*dax+day*day+daz*daz);
+
+    Ra[i*natoms+j] = ra1;
+  }
+ #pragma acc parallel loop present(Ra[0:nat2])
+  for (int i=0;i<natoms;i++)
+    Ra[i*natoms+i] = 1.f; //diagonal distance is zero; avoids 1/0 in the grid loop
+ //Rs is indexed by the atno[] values; the present clause below assumes a 36-entry table
+ #pragma acc parallel loop collapse(2) present(aij[0:nat2],atno[0:natoms],coords[0:3*natoms],Rs[0:36])
+  for (int i=0;i<natoms;i++)
+  for (int j=0;j<natoms;j++)
+  {
+    int Z1 = atno[i]; int Z2 = atno[j];
+    float a12 = becke_ar(Rs[Z1],Rs[Z2]);
+    aij[i*natoms+j] = a12;
+  }
+
+  #pragma acc parallel loop present(Ra[0:nat2],aij[0:nat2],coords[0:3*natoms],grid1[0:gc*gsa],wt1[0:gsa])
+  for (int n=0;n<gsa;n++)
+  {
+    float x = grid1[gc*n+0]; float y = grid1[gc*n+1]; float z = grid1[gc*n+2];
+
+    int wa = get_atom_grid(n,natoms,gsa);
+
+    float fp1[1000]; //natoms*natoms entries are used, so natoms can be at most 31
+    #pragma acc loop collapse(2)
+    for (int i=0;i<natoms;i++)
+    for (int j=0;j<natoms;j++)
+    {
+      float a12 = aij[i*natoms+j];
+      float oR12 = 1./Ra[i*natoms+j];
+
+      float dx1 = x-coords[3*i+0]; float dy1 = y-coords[3*i+1]; float dz1 = z-coords[3*i+2];
+      float dx2 = x-coords[3*j+0]; float dy2 = y-coords[3*j+1]; float dz2 = z-coords[3*j+2];
+      float r1 = sqrtf(dx1*dx1+dy1*dy1+dz1*dz1);
+      float r2 = sqrtf(dx2*dx2+dy2*dy2+dz2*dz2);
+
+      float mu12 = (r1-r2)*oR12;
+      float omm12 = 1.-mu12*mu12;
+      float nu12 =  mu12 + a12*omm12;
+      float f12 = bf3(mu_order,nu12);
+
+      fp1[i*natoms+j] = f12;
+    }
+ //i==j entries are set to one so they leave the products below unchanged
+   #pragma acc loop
+    for (int i=0;i<natoms;i++)
+      fp1[i*natoms+i] = 1.;
+
+    float s1; //product over j of fp1 for atom wa
+    {
+      float s0 = 1.;
+     #pragma acc loop reduction(*:s0)
+      for (int j=0;j<natoms;j++)
+        s0 *= fp1[wa*natoms+j];
+      s1 = s0;
+    }
+
+    float norm = 1.e-15; //sum of the products over all atoms; the offset keeps it nonzero
+   #pragma acc loop reduction(+:norm)
+    for (int i=0;i<natoms;i++)
+    {
+      float s0 = 1.;
+     #pragma acc loop reduction(*:s0)
+      for (int j=0;j<natoms;j++)
+        s0 *= fp1[i*natoms+j];
+      norm += s0;
+    }
+
+    wt1[n] *= s1/norm;
+  }
+
+  #pragma acc exit data delete(Ra[0:nat2],aij[0:nat2])
+
+  return;
+}
+
+void becke_weight_nc(const int mu_order, const int natoms, const int gc, const int gs, float* grid1, float* wt1, int* atno, float* coords)
+{
+ //overload without Rs: tabulates get_radii_2 for every index up to max(atno), then forwards
+  if (mu_order<1) { printf("  mu_order is zero. no Becke wts \n"); return; }
+
+ //table is value-initialized, so Rs[0] remains zero
+  int top_Z = 0;
+  for (int a=0;a<natoms;a++)
+    top_Z = (atno[a]>top_Z) ? atno[a] : top_Z;
+  float* Rs = new float[top_Z+1]();
+  for (int Z=1;Z<=top_Z;Z++)
+    Rs[Z] = get_radii_2(Z);
+
+  becke_weight_nc(mu_order,natoms,Rs,gc,gs,grid1,wt1,atno,coords);
+
+  delete [] Rs;
+}
+
+void becke_weight_nc(const int mu_order, const int natoms, const int gc, const int gs, float* grid1, float* wt1, int* atno, double* coords)
+{
+  if (mu_order<1) { printf("  mu_order is zero. no Becke wts \n"); return; }
+
   float coordsf[3*natoms];
   for (int n=0;n<3*natoms;n++)
     coordsf[n] = coords[n];
 
-  becke_weight_nc(natoms,gc,gs,grid1,wt1,atno,coordsf);
+  becke_weight_nc(mu_order,natoms,gc,gs,grid1,wt1,atno,coordsf);
 
   return;
 }
 
 //n-center grid wts (double)
-void becke_weight_nc(const int natoms, const int gc, const int gs, double* grid1, double* wt1, int* atno, double* coords)
+void becke_weight_nc(const int mu_order, const int natoms, double* Rs, const int gc, const int gs, double* grid1, double* wt1, int* atno, double* coords)
 {
+  if (mu_order<1) { printf("  mu_order is zero. no Becke wts \n"); return; }
   if (natoms<2) return;
 
   int gsa = gs*natoms;
@@ -489,8 +720,8 @@ void becke_weight_nc(const int natoms, const int gc, const int gs, double* grid1
   for (int j=0;j<i;j++)
   {
     int Z1 = atno[i]; int Z2 = atno[j];
-    double a12 = becke_ad(Z1,Z2);
-    double a21 = becke_ad(Z2,Z1);
+    double a12 = becke_ard(Rs[Z1],Rs[Z2]);
+    double a21 = becke_ard(Rs[Z2],Rs[Z1]);
     aij[i*natoms+j] = a12;
     aij[j*natoms+i] = a21;
   }
@@ -528,8 +759,8 @@ void becke_weight_nc(const int natoms, const int gc, const int gs, double* grid1
       double omm12 = 1.-mu12*mu12;
       double nu12 =  mu12 + a12*omm12;
       double nu21 = -mu12 + a21*omm12;
-      double f12 = bf3d(nu12);
-      double f21 = bf3d(nu21);
+      double f12 = bf3d(mu_order,nu12);
+      double f21 = bf3d(mu_order,nu21);
 
       fp[i*natoms+j] = f12;
       fp[j*natoms+i] = f21;
@@ -561,9 +792,27 @@ void becke_weight_nc(const int natoms, const int gc, const int gs, double* grid1
   return;
 }
 
+void becke_weight_nc(const int mu_order, const int natoms, const int gc, const int gs, double* grid1, double* wt1, int* atno, double* coords)
+{
+ //overload without Rs: tabulates get_radii_2 for every index up to max(atno), then forwards
+  int top_Z = 0;
+  for (int a=0;a<natoms;a++)
+    if (atno[a]>top_Z) top_Z = atno[a];
+ //value-initialized, so Rs[0] remains zero
+  double* Rs = new double[top_Z+1]();
+  for (int Z=1;Z<=top_Z;Z++)
+    Rs[Z] = get_radii_2(Z);
+
+  becke_weight_nc(mu_order,natoms,Rs,gc,gs,grid1,wt1,atno,coords);
+
+  delete [] Rs;
+}
+
 #pragma acc routine seq
 float get_3c_bw(float r1, float r2, float r3, float a12, float a13, float a23, float a21, float a31, float a32, float oR12, float oR13, float oR23)
 {
+  int mu_order = 3;
+
   float mu12 = (r1-r2)*oR12;
   float mu13 = (r1-r3)*oR13;
   float mu23 = (r2-r3)*oR23;
@@ -579,12 +828,12 @@ float get_3c_bw(float r1, float r2, float r3, float a12, float a13, float a23, f
   float nu23 =  mu23 + a23*omm23;
   float nu32 = -mu23 + a32*omm23;
 
-  float f12 = bf3(nu12);
-  float f21 = bf3(nu21);
-  float f13 = bf3(nu13);
-  float f31 = bf3(nu31);
-  float f23 = bf3(nu23);
-  float f32 = bf3(nu32);
+  float f12 = bf3(mu_order,nu12);
+  float f21 = bf3(mu_order,nu21);
+  float f13 = bf3(mu_order,nu13);
+  float f31 = bf3(mu_order,nu31);
+  float f23 = bf3(mu_order,nu23);
+  float f32 = bf3(mu_order,nu32);
 
   float norm = f12*f13 + f21*f23 + f31*f32 + 1.e-10f;
   float s1 = f12*f13/norm;
@@ -592,7 +841,7 @@ float get_3c_bw(float r1, float r2, float r3, float a12, float a13, float a23, f
   return s1;
 }
 
-void becke_weight_3c(int gs, float* grid1, float* wt1, float* grid2, float* wt2, float* grid3, float* wt3, 
+void becke_weight_3c(int gs, float* grid1, float* wt1, float* grid2, float* wt2, float* grid3, float* wt3,
                      int Z1, int Z2, int Z3, float A2, float B2, float C2, float A3, float B3, float C3)
 {
   float A23 = A3-A2; float B23 = B3-B2; float C23 = C3-C2;
@@ -650,6 +899,8 @@ void becke_weight_3c(int gs, float* grid1, float* wt1, float* grid2, float* wt2,
 void becke_weight_2c(int gs, float* grid1, float* wt1, float* grid2, float* wt2,
                      int Z1, int Z2, float A2, float B2, float C2)
 {
+  int mu_order = 3;
+
  //first center is at 0,0,0 and second at A2,B2,C2
   float R  = sqrtf(A2*A2+B2*B2+C2*C2);
   const float oR = 1./R;
@@ -669,8 +920,8 @@ void becke_weight_2c(int gs, float* grid1, float* wt1, float* grid2, float* wt2,
     float nu1 =  mu + a1*omm2;
     float nu2 = -mu + a2*omm2;
 
-    float f1 = bf3(nu1);
-    float f2 = bf3(nu2);
+    float f1 = bf3(mu_order,nu1);
+    float f2 = bf3(mu_order,nu2);
 
     float norm = f1 + f2 + 1.e-10f;
     float s1 = f1/norm;
@@ -689,8 +940,8 @@ void becke_weight_2c(int gs, float* grid1, float* wt1, float* grid2, float* wt2,
     float nu1 =  mu + a2*omm2;
     float nu2 = -mu + a1*omm2;
 
-    float f1 = bf3(nu1);
-    float f2 = bf3(nu2);
+    float f1 = bf3(mu_order,nu1);
+    float f2 = bf3(mu_order,nu2);
 
     float norm = f1 + f2 + 1.e-10f;
     float s1 = f1/norm;
@@ -702,9 +953,11 @@ void becke_weight_2c(int gs, float* grid1, float* wt1, float* grid2, float* wt2,
   return;
 }
 
-void becke_weight_2d(int gs, double* grid1, double* wt1, double* grid2, double* wt2, 
+void becke_weight_2d(int gs, double* grid1, double* wt1, double* grid2, double* wt2,
                      double zeta1, double zeta2, double A2, double B2, double C2)
 {
+  int mu_order = 3;
+
  //first center is at 0,0,0 and second at A2,B2,C2
   double R  = sqrt(A2*A2+B2*B2+C2*C2);
   const double oR = 1./R;
@@ -724,8 +977,8 @@ void becke_weight_2d(int gs, double* grid1, double* wt1, double* grid2, double*
     double nu1 =  mu + a1*omm2;
     double nu2 = -mu + a2*omm2;
 
-    double f1 = bf3d(nu1);
-    double f2 = bf3d(nu2);
+    double f1 = bf3d(mu_order,nu1);
+    double f2 = bf3d(mu_order,nu2);
 
     double norm = f1 + f2 + 1.e-10f;
     double s1 = f1/norm;
@@ -744,8 +997,8 @@ void becke_weight_2d(int gs, double* grid1, double* wt1, double* grid2, double*
     double nu1 =  mu + a2*omm2;
     double nu2 = -mu + a1*omm2;
 
-    double f1 = bf3d(nu1);
-    double f2 = bf3d(nu2);
+    double f1 = bf3d(mu_order,nu1);
+    double f2 = bf3d(mu_order,nu2);
 
     double norm = f1 + f2 + 1.e-10f;
     double s1 = f1/norm;
@@ -799,7 +1052,6 @@ void test_2c_becke()
     }
 
     becke_weight_2c(gs,grid1,wt1,grid2,wt2,Z1,Z2,A12,B12,C12);
-    
 
     if (0)
     {
@@ -865,10 +1117,10 @@ float becke_a(float alpha, int Z1, int Z2)
   return a;
 }
 
-float becke_a(int Z1, int Z2)
+#pragma acc routine seq
+__attribute__((noinline))
+float becke_a(float x)
 {
-  float x = get_radii_2(Z1)/get_radii_2(Z2);
-
   float u = (x-1.f)/(x+1.f);
   float a = u/(u*u-1.f);
 
@@ -877,10 +1129,26 @@ float becke_a(int Z1, int Z2)
   return a;
 }
 
-double becke_ad(int Z1, int Z2)
+float becke_a(int Z1, int Z2)
 {
-  double x = get_radii_2(Z1)/get_radii_2(Z2);
+  float x = get_radii_2(Z1)/get_radii_2(Z2);
+
+  return becke_a(x);
+}
+
+#pragma acc routine seq
+__attribute__((noinline))
+float becke_ar(float r1, float r2)
+{
+ //takes two radii directly and passes their ratio r1/r2 to becke_a(float)
+  const float ratio = r1/r2;
+  return becke_a(ratio);
+}
 
+#pragma acc routine seq
+__attribute__((noinline))
+double becke_ad(double x)
+{
   double u = (x-1.)/(x+1.);
   double a = u/(u*u-1.);
 
@@ -890,17 +1158,29 @@ double becke_ad(int Z1, int Z2)
 }
 
 #pragma acc routine seq
-float bf3(float f1)
+__attribute__((noinline))
+double becke_ad(int Z1, int Z2)
 {
-  //default is order 3
-  #define ORDER4 0
-
-  f1 = 1.5f*f1 - 0.5f*f1*f1*f1;
-  f1 = 1.5f*f1 - 0.5f*f1*f1*f1;
-  f1 = 1.5f*f1 - 0.5f*f1*f1*f1;
- #if ORDER4
-  f1 = 1.5f*f1 - 0.5f*f1*f1*f1;
- #endif
+  double x = get_radii_2(Z1)/get_radii_2(Z2);
+
+  return becke_ad(x);
+}
+
+#pragma acc routine seq
+__attribute__((noinline))
+double becke_ard(double r1, double r2)
+{
+ //takes two radii directly and passes their ratio r1/r2 to becke_ad(double)
+  const double ratio = r1/r2;
+  return becke_ad(ratio);
+}
+
+#pragma acc routine seq
+__attribute__((noinline))
+float bf3(int mu_order, float f1)
+{
+  for (int n=0;n<mu_order;n++)
+    f1 = 1.5f*f1 - 0.5f*f1*f1*f1;
   f1 = 0.5f*(1.f-f1);
   f1 = f1*f1;
 
@@ -908,17 +1188,11 @@ float bf3(float f1)
 }
 
 #pragma acc routine seq
-double bf3d(double f1)
+__attribute__((noinline))
+double bf3d(int mu_order, double f1)
 {
-  //default is order 3
-  #define ORDER4 0
-
-  f1 = 1.5*f1 - 0.5*f1*f1*f1;
-  f1 = 1.5*f1 - 0.5*f1*f1*f1;
-  f1 = 1.5*f1 - 0.5*f1*f1*f1;
- #if ORDER4
-  f1 = 1.5*f1 - 0.5*f1*f1*f1;
- #endif
+  for (int n=0;n<mu_order;n++)
+    f1 = 1.5*f1 - 0.5*f1*f1*f1;
   f1 = 0.5*(1.-f1);
   f1 = f1*f1;
 
@@ -926,8 +1200,10 @@ double bf3d(double f1)
 }
 
 //all-float version
-void get_becke_grid_full(float alpha, int natoms, int* atno, float* coords, int nrad, int nang, float* ang_g, float* ang_w, const int gc, float* grid, float* wt)
+void get_becke_grid_full(int mu_order, float alpha, int natoms, int* atno, float* coords, int nrad, int nang, float* ang_g, float* ang_w, const int gc, float* grid, float* wt)
 {
+  if (mu_order!=3) printf("\n WARNING: testing mu_order in get_becke_grid_full (float) \n");
+
   int gs = nrad*nang;
   int gsa = gs*natoms;
   int gsc = gc*gs;
@@ -989,7 +1265,7 @@ void get_becke_grid_full(float alpha, int natoms, int* atno, float* coords, int
   if (alpha!=0.)
     atomic_domain_cell_wt(-1,alpha,beta,natoms,gc,gs,grid,wt,atno,coords);
   else
-    becke_weight_nc(natoms,gc,gs,grid,wt,atno,coords);
+    becke_weight_nc(mu_order,natoms,gc,gs,grid,wt,atno,coords);
 
   //distance from center (arbitrary center)
   #pragma acc parallel loop present(grid[0:gsac])
@@ -1005,6 +1281,12 @@ void get_becke_grid_full(float alpha, int natoms, int* atno, float* coords, int
   return;
 }
 
+void get_becke_grid_full(float alpha, int natoms, int* atno, float* coords, int nrad, int nang, float* ang_g, float* ang_w, const int gc, float* grid, float* wt)
+{
+  const int default_order = 3; //the order that was fixed before mu_order became an argument
+  get_becke_grid_full(default_order,alpha,natoms,atno,coords,nrad,nang,ang_g,ang_w,gc,grid,wt);
+}
+
 void get_becke_grid_full(int natoms, int* atno, double* coords, int nrad, int nang, float* ang_g, float* ang_w, const int gc, float* grid, float* wt)
 {
   float coordsf[3*natoms];
@@ -1015,18 +1297,21 @@ void get_becke_grid_full(int natoms, int* atno, double* coords, int nrad, int na
   return;
 }
 
-void get_becke_grid_full(int natoms, int* atno, double* coords, int nrad, int nang, double* ang_g, double* ang_w, const int gc, double* grid, double* wt)
+void get_becke_grid_full(int mu_order, int natoms, int* atno, double* coords, int nrad, int nang, double* ang_g, double* ang_w, const int gc, double* grid, double* wt)
 {
+  if (mu_order!=3) printf("\n WARNING: testing mu_order in get_becke_grid_full (double) \n");
+
   int gs = nrad*nang;
   int gsa = gs*natoms;
   int gsc = gc*gs;
   int gsac = gsc*natoms;
 
+  double r1[nrad]; double w1[nrad];
+  #pragma acc enter data create(r1[0:nrad],w1[0:nrad])
+
   for (int n=0;n<natoms;n++)
   {
-    double r1[nrad]; double w1[nrad];
     int Z1 = atno[n];
-    #pragma acc enter data create(r1[0:nrad],w1[0:nrad])
     get_murak_grid(nrad,r1,w1,Z1,3);
 
     #pragma acc parallel loop present(r1[0:nrad],w1[0:nrad],ang_g[0:3*nang],ang_w[0:nang],grid[0:gsac],wt[0:gsa])
@@ -1058,11 +1343,12 @@ void get_becke_grid_full(int natoms, int* atno, double* coords, int nrad, int na
       grid1[gc*m+2] += C0;
     }
 
-    #pragma acc exit data delete(r1[0:nrad],w1[0:nrad])
   } //loop n over natoms
 
+  #pragma acc exit data delete(r1[0:nrad],w1[0:nrad])
+
   #pragma acc update self(grid[0:gsac],wt[0:gsa])
-  becke_weight_nc(natoms,gc,gs,grid,wt,atno,coords);
+  becke_weight_nc(mu_order,natoms,gc,gs,grid,wt,atno,coords);
 
   //distance from center (arbitrary center)
   #pragma acc parallel loop present(grid[0:gsac])
@@ -1078,8 +1364,16 @@ void get_becke_grid_full(int natoms, int* atno, double* coords, int nrad, int na
   return;
 }
 
+//all-double convenience overload: forwards to the mu_order overload with mu_order fixed at 3
+void get_becke_grid_full(int natoms, int* atno, double* coords, int nrad, int nang, double* ang_g, double* ang_w, const int gc, double* grid, double* wt)
+{
+  get_becke_grid_full(3,natoms,atno,coords,nrad,nang,ang_g,ang_w,gc,grid,wt);
+}
+
 void get_becke_grid_sparse(double rmax, int natoms, int* atno, double* coords, int nrad, int nang, double* ang_g, double* ang_w, const int gc, double* grid, double* wt)
 {
+  int mu_order = 3;
+
   int gs = nrad*nang;
   int gsa = gs*natoms;
   int gsc = gc*gs;
@@ -1126,7 +1420,7 @@ void get_becke_grid_sparse(double rmax, int natoms, int* atno, double* coords, i
   } //loop n over natoms
 
   #pragma acc update self(grid[0:gsac],wt[0:gsa])
-  becke_weight_nc(natoms,gc,gs,grid,wt,atno,coords);
+  becke_weight_nc(mu_order,natoms,gc,gs,grid,wt,atno,coords);
 
   //distance from center (arbitrary center)
   #pragma acc parallel loop present(grid[0:gsac])
@@ -1183,198 +1477,6 @@ void add_r1_to_grid4(int gs, float* grid1, float A2, float B2, float C2)
   return;
 }
 
-//CPMZ this ftn doesn't handle vxcs(GGA) part yet
-//CPMZ deprecating soon
-#if 0
-void compute_fxc(bool gbasis, int natoms, int* atno, double* coords, vector<vector<double> > &basis, bool need_wt, int gsa, float* grid, float* wt, double* vxc, double* vxcs, double* fxc, int prl)
-{
-  if (!gbasis)
-  { printf("\n ERROR shouldn't be here in compute_fxc (Gaussian basis version) \n"); exit(-1); }
-
-  if (prl>1) printf("  compute_fxc: Gaussian basis \n");
-
-  //int gsa = gs*natoms;
-  int gsa6 = gsa*6;
-
-  int N = basis.size();
-  int N2 = N*N;
-  int* n2i = new int[natoms];
-  int imaxN = get_imax_n2i(natoms,N,basis,n2i);
-
-  int iN = imaxN;
-  float** val1 = new float*[iN];
-  float** val2 = new float*[iN];
-  for (int i=0;i<iN;i++)
-    val1[i] = new float[gsa];
-  for (int i=0;i<iN;i++)
-    val2[i] = new float[gsa];
-
-  float* val0 = new float[gsa];
-
-  float* grid1 = new float[gsa6];
-  float* grid2 = new float[gsa6];
-
-  #pragma acc enter data create(grid1[0:gsa6],grid2[0:gsa6])
-  #pragma acc enter data create(val0[0:gsa],val1[0:iN][0:gsa],val2[0:iN][0:gsa])
-
-  for (int j=0;j<N2;j++)
-    fxc[j] = 0.;
-
-  const int ig = 10; //first index to exp in Gaussian basis
-
-  for (int m=0;m<natoms;m++)
-  {
-   //working on this block of the matrix
-    int s1 = 0; if (m>0) s1 = n2i[m-1]; int s2 = n2i[m];
-
-    float Z1 = (float)atno[m];
-    double A1 = coords[3*m+0]; double B1 = coords[3*m+1]; double C1 = coords[3*m+2];
-
-    copy_grid(gsa,grid1,grid);
-    recenter_grid_zero(gsa,grid1,-A1,-B1,-C1);
-
-    //#pragma acc update self(grid1[0:gsa6])
-
-    #pragma acc parallel loop collapse(2) present(val1[0:iN][0:gsa])
-    for (int i1=0;i1<s2-s1;i1++)
-    for (int j=0;j<gsa;j++)
-      val1[i1][j] = 0.f;
-
-    for (int i1=s1;i1<s2;i1++)
-    {
-      int ii1 = i1-s1;
-
-      vector<double> basis1 = basis[i1];
-      int l1 = basis1[1]; int m1 = basis1[2]; int ng = basis1[3];
-      int in = ig + ng; //index to find norm
-
-      float* valm = val1[ii1];
-      for (int j=0;j<ng;j++)
-      {
-        double zeta1 = basis1[ig+j]; double norm1 = basis1[in+j];
-
-        eval_gh(gsa,grid1,val0,l1,m1,norm1,zeta1); //overwrites val0
-
-       #pragma acc parallel loop present(val0[0:gsa],valm[0:gsa])
-        for (int k=0;k<gsa;k++)
-          valm[k] += val0[k];
-      }
-    }
-
-   //compute all
-    for (int i1=s1;i1<s2;i1++)
-    {
-      float* valm = val1[i1-s1];
-
-      for (int i2=s1;i2<s2;i2++)
-      {
-        float* valn = val1[i2-s1];
-
-        double f1 = 0.;
-        if (need_wt)
-       #pragma acc parallel loop present(vxc[0:gsa],valm[0:gsa],valn[0:gsa],wt[0:gsa]) reduction(+:f1)
-        for (int j=0;j<gsa;j++)
-          f1 += vxc[j]*valm[j]*valn[j]*wt[j];
-        else
-       #pragma acc parallel loop present(vxc[0:gsa],valm[0:gsa],valn[0:gsa]) reduction(+:f1)
-        for (int j=0;j<gsa;j++)
-          f1 += vxc[j]*valm[j]*valn[j];
-
-        fxc[i1*N+i2] = f1;
-      }
-    }
-
-   //second atom
-    for (int n=m+1;n<natoms;n++)
-    {
-      int s3 = 0; if (n>0) s3 = n2i[n-1]; int s4 = n2i[n];
-
-      float Z2 = (float)atno[m];
-      double A2 = coords[3*n+0]; double B2 = coords[3*n+1]; double C2 = coords[3*n+2];
-
-      #pragma acc parallel loop collapse(2) present(val2[0:iN][0:gsa])
-      for (int i2=0;i2<s4-s3;i2++)
-      for (int j=0;j<gsa;j++)
-        val2[i2][j] = 0.f;
-
-      copy_grid(gsa,grid2,grid);
-      recenter_grid_zero(gsa,grid2,-A2,-B2,-C2);
-
-      //#pragma acc update self(grid2[0:gsa6])
-
-      for (int i2=s3;i2<s4;i2++)
-      {
-        int ii2 = i2-s3;
-
-        vector<double> basis2 = basis[i2];
-        int l2 = basis2[1]; int m2 = basis2[2]; int ng = basis2[3];
-        int in = ig + ng; //index to find norm
-
-        float* valn = val2[ii2];
-        for (int j=0;j<ng;j++)
-        {
-          double zeta2 = basis2[ig+j]; double norm2 = basis2[in+j];
-
-          eval_gh(gsa,grid2,val0,l2,m2,norm2,zeta2);
-
-         #pragma acc parallel loop present(val0[0:gsa],valn[0:gsa])
-          for (int k=0;k<gsa;k++)
-            valn[k] += val0[k];
-        }
-      }
-
-     //two-atom Pao elements added to grid
-      for (int i1=s1;i1<s2;i1++)
-      {
-        float* valn = val1[i1-s1];
-
-        for (int i2=s3;i2<s4;i2++)
-        {
-          float* valm = val2[i2-s3];
-
-          double f1 = 0.;
-          if (need_wt)
-         #pragma acc parallel loop present(vxc[0:gsa],valm[0:gsa],valn[0:gsa],wt[0:gsa]) reduction(+:f1)
-          for (int j=0;j<gsa;j++)
-            f1 += vxc[j]*valm[j]*valn[j]*wt[j];
-          else
-         #pragma acc parallel loop present(vxc[0:gsa],valm[0:gsa],valn[0:gsa]) reduction(+:f1)
-          for (int j=0;j<gsa;j++)
-            f1 += vxc[j]*valm[j]*valn[j];
-
-          fxc[i1*N+i2] = fxc[i2*N+i1] = f1;
-        }
-      }
-    } //loop n over second atom
-  }
-
-  #pragma acc update device(fxc[0:N2])
-
-  if (prl>2)
-  {
-    printf("\n fxc: \n");
-    print_square(N,fxc);
-  }
-
-  #pragma acc exit data delete(grid1[0:gsa6],grid2[0:gsa6])
-  #pragma acc exit data delete(val0[0:gsa],val1[0:iN][0:gsa],val2[0:iN][0:gsa])
-
-  delete [] grid1;
-  delete [] grid2;
-  delete [] n2i;
-
-  delete [] val0;
-  for (int i=0;i<iN;i++)
-    delete [] val1[i];
-  for (int i=0;i<iN;i++)
-    delete [] val2[i];
-  delete [] val1;
-  delete [] val2;
-
-  return;
-}
-#endif
-
 //borrowed from Slater version
 void compute_fxcd(bool gbasis, int natoms, int* atno, double* coords, vector<vector<double> > &basis, bool need_wt, bool gga,
                  double* Pao, int gs, double* grid, double* wt, double* vxc, double* vxcs, double* fxc, int prl)
@@ -1387,6 +1489,10 @@ void compute_fxcd(bool gbasis, int natoms, int* atno, double* coords, vector<vec
     printf("\n ERROR: gga functionals require Pao in compute_fxc \n");
     exit(1);
   }
+  if (gga)
+    printf("\n WARNING: need to debug compute_fxcd (gbasis) \n");
+  if (need_wt && gga)
+    printf("\n ERROR: cannot use need_wt with gga in compute_fxcd \n\n");
 
   int gs3 = 3*gs;
   int gs6 = 6*gs;
@@ -1453,7 +1559,6 @@ void compute_fxcd(bool gbasis, int natoms, int* atno, double* coords, vector<vec
    //working on this block of the matrix
     int s1 = 0; if (m>0) s1 = n2i[m-1]; int s2 = n2i[m];
 
-    float Z1 = (float)atno[m];
     double A1 = coords[3*m+0]; double B1 = coords[3*m+1]; double C1 = coords[3*m+2];
 
     copy_grid(gs,grid1,grid);
@@ -1489,14 +1594,11 @@ void compute_fxcd(bool gbasis, int natoms, int* atno, double* coords, vector<vec
         for (int k=0;k<gs;k++)
           valm[k] += val0[k];
 
-    	if (gga)
-        {
-          eval_pd_gh(gs,grid1,val0g,l1+1,l1,m1,norm1,zeta1);
+        eval_pd_gh(gs,grid1,val0g,l1+1,l1,m1,norm1,zeta1);
 
-         #pragma acc parallel loop present(val0g[0:gs3],valmg[0:gs3])
-          for (int k=0;k<gs3;k++)
-            valmg[k] += val0g[k];
-        }
+       #pragma acc parallel loop present(val0g[0:gs3],valmg[0:gs3])
+        for (int k=0;k<gs3;k++)
+          valmg[k] += val0g[k];
       }
     }
 
@@ -1512,6 +1614,7 @@ void compute_fxcd(bool gbasis, int natoms, int* atno, double* coords, vector<vec
         double* valm = val1[ii2];
         double* valpm = val1p[ii2];
 
+       //CPMZ check this
         double d1 = 4.*Paon[i1*N+i2];
 
        #pragma acc parallel loop present(grho[0:gs3],valn[0:gs],valpn[0:gs3],valm[0:gs],valpm[0:gs3])
@@ -1528,7 +1631,7 @@ void compute_fxcd(bool gbasis, int natoms, int* atno, double* coords, vector<vec
     {
       int s3 = 0; if (n>0) s3 = n2i[n-1]; int s4 = n2i[n];
 
-      float Z2 = (float)atno[m];
+      //float Z2 = (float)atno[n];
       double A2 = coords[3*n+0]; double B2 = coords[3*n+1]; double C2 = coords[3*n+2];
 
       #pragma acc parallel loop collapse(2) present(val2[0:iN][0:gs])
@@ -1564,14 +1667,11 @@ void compute_fxcd(bool gbasis, int natoms, int* atno, double* coords, vector<vec
           for (int k=0;k<gs;k++)
             valn[k] += val0[k];
 
-          if (gga)
-          {
-            eval_pd_gh(gs,grid2,val0g,l2+1,l2,m2,norm2,zeta2);
+          eval_pd_gh(gs,grid2,val0g,l2+1,l2,m2,norm2,zeta2);
 
-           #pragma acc parallel loop present(val0g[0:gs3],valng[0:gs3])
-            for (int k=0;k<gs3;k++)
-              valng[k] += val0g[k];
-          }
+         #pragma acc parallel loop present(val0g[0:gs3],valng[0:gs3])
+          for (int k=0;k<gs3;k++)
+            valng[k] += val0g[k];
         }
       }
 
@@ -1587,6 +1687,7 @@ void compute_fxcd(bool gbasis, int natoms, int* atno, double* coords, vector<vec
           double* valm = val2[ii2];
           double* valpm = val2p[ii2];
 
+         //CPMZ check this
           double d1 = 4.*Paon[i1*N+i2];
 
          #pragma acc parallel loop present(grho[0:gs3],valn[0:gs],valpn[0:gs3],valm[0:gs],valpm[0:gs3])
@@ -1617,7 +1718,6 @@ void compute_fxcd(bool gbasis, int natoms, int* atno, double* coords, vector<vec
    //working on this block of the matrix
     int s1 = 0; if (m>0) s1 = n2i[m-1]; int s2 = n2i[m];
 
-    float Z1 = (float)atno[m];
     double A1 = coords[3*m+0]; double B1 = coords[3*m+1]; double C1 = coords[3*m+2];
 
     copy_grid(gs,grid1,grid);
@@ -1642,7 +1742,6 @@ void compute_fxcd(bool gbasis, int natoms, int* atno, double* coords, vector<vec
       int l1 = basis1[1]; int m1 = basis1[2]; int ng = basis1[3];
       int in = ig + ng; //index to find norm
 
-      //printf("  (1) basis %i has %2i terms \n",i1,ng);
       double* valm = val1[ii1];
       double* valmg = NULL; if (gga) valmg = val1p[ii1];
       for (int j=0;j<ng;j++)
@@ -1713,7 +1812,7 @@ void compute_fxcd(bool gbasis, int natoms, int* atno, double* coords, vector<vec
     {
       int s3 = 0; if (n>0) s3 = n2i[n-1]; int s4 = n2i[n];
 
-      float Z2 = (float)atno[m];
+      //float Z2 = (float)atno[n];
       double A2 = coords[3*n+0]; double B2 = coords[3*n+1]; double C2 = coords[3*n+2];
 
      #pragma acc parallel loop collapse(2) present(val2[0:iN][0:gs])
@@ -1924,7 +2023,6 @@ void compute_fxc(bool gbasis, int natoms, int* atno, double* coords, vector<vect
    //working on this block of the matrix
     int s1 = 0; if (m>0) s1 = n2i[m-1]; int s2 = n2i[m];
 
-    float Z1 = (float)atno[m];
     double A1 = coords[3*m+0]; double B1 = coords[3*m+1]; double C1 = coords[3*m+2];
 
     copy_grid(gs,grid1,grid);
@@ -1999,7 +2097,7 @@ void compute_fxc(bool gbasis, int natoms, int* atno, double* coords, vector<vect
     {
       int s3 = 0; if (n>0) s3 = n2i[n-1]; int s4 = n2i[n];
 
-      float Z2 = (float)atno[m];
+      //float Z2 = (float)atno[n];
       double A2 = coords[3*n+0]; double B2 = coords[3*n+1]; double C2 = coords[3*n+2];
 
       #pragma acc parallel loop collapse(2) present(val2[0:iN][0:gs])
@@ -2088,7 +2186,6 @@ void compute_fxc(bool gbasis, int natoms, int* atno, double* coords, vector<vect
    //working on this block of the matrix
     int s1 = 0; if (m>0) s1 = n2i[m-1]; int s2 = n2i[m];
 
-    float Z1 = (float)atno[m];
     double A1 = coords[3*m+0]; double B1 = coords[3*m+1]; double C1 = coords[3*m+2];
 
     copy_grid(gs,grid1,grid);
@@ -2184,7 +2281,7 @@ void compute_fxc(bool gbasis, int natoms, int* atno, double* coords, vector<vect
     {
       int s3 = 0; if (n>0) s3 = n2i[n-1]; int s4 = n2i[n];
 
-      float Z2 = (float)atno[m];
+      //float Z2 = (float)atno[n];
       double A2 = coords[3*n+0]; double B2 = coords[3*n+1]; double C2 = coords[3*n+2];
 
      #pragma acc parallel loop collapse(2) present(val2[0:iN][0:gs])
@@ -2695,7 +2792,6 @@ void compute_rho(bool gbasis, int natoms, int* atno, double* coords, vector<vect
    //working on this block of the matrix
     int s1 = 0; if (m>0) s1 = n2i[m-1]; int s2 = n2i[m];
 
-    float Z1 = (float)atno[m];
     double A1 = coords[3*m+0]; double B1 = coords[3*m+1]; double C1 = coords[3*m+2];
 
     copy_grid(gsa,grid1,grid);
@@ -2782,7 +2878,7 @@ void compute_rho(bool gbasis, int natoms, int* atno, double* coords, vector<vect
     {
       int s3 = 0; if (n>0) s3 = n2i[n-1]; int s4 = n2i[n];
 
-      float Z2 = (float)atno[m];
+      //float Z2 = (float)atno[n];
       double A2 = coords[3*n+0]; double B2 = coords[3*n+1]; double C2 = coords[3*n+2];
 
       #pragma acc parallel loop collapse(2) present(val2[0:iN][0:gsa])
@@ -3024,7 +3120,6 @@ void compute_lap_hessg(bool gbasis, int natoms, int* atno, double* coords, vecto
      //working on this block of the matrix
       int s1 = 0; if (m>0) s1 = n2i[m-1]; int s2 = n2i[m];
 
-      double Z1 = (double)atno[m];
       double A1 = coords[3*m+0]; double B1 = coords[3*m+1]; double C1 = coords[3*m+2];
 
       if (lowmem)
@@ -3123,7 +3218,6 @@ void compute_lap_hessg(bool gbasis, int natoms, int* atno, double* coords, vecto
       {
         int s3 = 0; if (n>0) s3 = n2i[n-1]; int s4 = n2i[n];
 
-        double Z2 = (double)atno[m];
         double A2 = coords[3*n+0]; double B2 = coords[3*n+1]; double C2 = coords[3*n+2];
 
         #pragma acc parallel loop collapse(2) present(val2[0:iN][0:gsaa])
@@ -3347,7 +3441,6 @@ void compute_rhodg(bool gbasis, int natoms, int* atno, double* coords, vector<ve
    //working on this block of the matrix
     int s1 = 0; if (m>0) s1 = n2i[m-1]; int s2 = n2i[m];
 
-    double Z1 = (double)atno[m];
     double A1 = coords[3*m+0]; double B1 = coords[3*m+1]; double C1 = coords[3*m+2];
 
     copy_grid(gsa,grid1,grid);
@@ -3434,7 +3527,6 @@ void compute_rhodg(bool gbasis, int natoms, int* atno, double* coords, vector<ve
     {
       int s3 = 0; if (n>0) s3 = n2i[n-1]; int s4 = n2i[n];
 
-      double Z2 = (double)atno[m];
       double A2 = coords[3*n+0]; double B2 = coords[3*n+1]; double C2 = coords[3*n+2];
 
       #pragma acc parallel loop collapse(2) present(val2[0:iN][0:gsa])
@@ -3650,7 +3742,6 @@ void compute_rho(int natoms, int* atno, double* coords, vector<vector<double> >
    //working on this block of the matrix
     int s1 = 0; if (m>0) s1 = n2i[m-1]; int s2 = n2i[m];
 
-    float Z1 = (float)atno[m];
     double A1 = coords[3*m+0]; double B1 = coords[3*m+1]; double C1 = coords[3*m+2];
 
     copy_grid(gsa,grid1,grid);
@@ -3767,7 +3858,7 @@ void compute_rho(int natoms, int* atno, double* coords, vector<vector<double> >
     {
       int s3 = 0; if (n>0) s3 = n2i[n-1]; int s4 = n2i[n];
 
-      float Z2 = (float)atno[m];
+      //float Z2 = (float)atno[n];
       double A2 = coords[3*n+0]; double B2 = coords[3*n+1]; double C2 = coords[3*n+2];
 
       #pragma acc parallel loop collapse(2) present(val2[0:iN][0:gsa])
@@ -4033,7 +4124,6 @@ void compute_rhod(int natoms, int* atno, double* coords, vector<vector<double> >
    //working on this block of the matrix
     int s1 = 0; if (m>0) s1 = n2i[m-1]; int s2 = n2i[m];
 
-    double Z1 = (double)atno[m];
     double A1 = coords[3*m+0]; double B1 = coords[3*m+1]; double C1 = coords[3*m+2];
 
     copy_grid(gsa,grid1,grid);
@@ -4115,7 +4205,6 @@ void compute_rhod(int natoms, int* atno, double* coords, vector<vector<double> >
     {
       int s3 = 0; if (n>0) s3 = n2i[n-1]; int s4 = n2i[n];
 
-      double Z2 = (double)atno[m];
       double A2 = coords[3*n+0]; double B2 = coords[3*n+1]; double C2 = coords[3*n+2];
 
       #pragma acc parallel loop collapse(2) present(val2[0:iN][0:gsa])
@@ -4368,7 +4457,6 @@ void compute_lap_hess(int natoms, int* atno, double* coords, vector<vector<doubl
   {
     int s1 = 0; if (m>0) s1 = n2i[m-1]; int s2 = n2i[m];
 
-    double Z1 = (double)atno[m];
     double A1 = coords[3*m+0]; double B1 = coords[3*m+1]; double C1 = coords[3*m+2];
 
     copy_grid(gsa,grid1,grid);
@@ -4513,7 +4601,6 @@ void compute_lap_hess(int natoms, int* atno, double* coords, vector<vector<doubl
     {
       int s3 = 0; if (n>0) s3 = n2i[n-1]; int s4 = n2i[n];
 
-      double Z2 = (double)atno[m];
       double A2 = coords[3*n+0]; double B2 = coords[3*n+1]; double C2 = coords[3*n+2];
 
       #pragma acc parallel loop collapse(2) present(val2[0:iN][0:gsa])
@@ -6077,7 +6164,7 @@ void compute_fxc(int natoms, int* atno, double* coords, vector<vector<double> >
    //working on this block of the matrix
     int s1 = 0; if (m>0) s1 = n2i[m-1]; int s2 = n2i[m];
 
-    float Z1 = (float)atno[m];
+    //float Z1 = (float)atno[m];
     double A1 = coords[3*m+0]; double B1 = coords[3*m+1]; double C1 = coords[3*m+2];
 
     copy_grid(gs,grid1,grid);
@@ -6132,7 +6219,7 @@ void compute_fxc(int natoms, int* atno, double* coords, vector<vector<double> >
     {
       int s3 = 0; if (n>0) s3 = n2i[n-1]; int s4 = n2i[n];
 
-      float Z2 = (float)atno[m];
+      //float Z2 = (float)atno[n];
       double A2 = coords[3*n+0]; double B2 = coords[3*n+1]; double C2 = coords[3*n+2];
 
       #pragma acc parallel loop collapse(2) present(val2[0:iN][0:gs])
@@ -6312,7 +6399,7 @@ void compute_fxc(int natoms, int* atno, double* coords, vector<vector<double> >
     {
       int s3 = 0; if (n>0) s3 = n2i[n-1]; int s4 = n2i[n];
 
-      float Z2 = (float)atno[m];
+      //float Z2 = (float)atno[n];
       double A2 = coords[3*n+0]; double B2 = coords[3*n+1]; double C2 = coords[3*n+2];
 
      #pragma acc parallel loop collapse(2) present(val2[0:iN][0:gs])
@@ -6578,7 +6665,7 @@ void compute_fxcd(int natoms, int* atno, double* coords, vector<vector<double> >
    //working on this block of the matrix
     int s1 = 0; if (m>0) s1 = n2i[m-1]; int s2 = n2i[m];
 
-    double Z1 = (double)atno[m];
+    //double Z1 = (double)atno[m];
     double A1 = coords[3*m+0]; double B1 = coords[3*m+1]; double C1 = coords[3*m+2];
 
     copy_grid(gs,grid1,grid);
@@ -6633,7 +6720,6 @@ void compute_fxcd(int natoms, int* atno, double* coords, vector<vector<double> >
     {
       int s3 = 0; if (n>0) s3 = n2i[n-1]; int s4 = n2i[n];
 
-      double Z2 = (double)atno[m];
       double A2 = coords[3*n+0]; double B2 = coords[3*n+1]; double C2 = coords[3*n+2];
 
       #pragma acc parallel loop collapse(2) present(val2[0:iN][0:gs])
@@ -6702,7 +6788,7 @@ void compute_fxcd(int natoms, int* atno, double* coords, vector<vector<double> >
    //working on this block of the matrix
     int s1 = 0; if (m>0) s1 = n2i[m-1]; int s2 = n2i[m];
 
-    double Z1 = (double)atno[m];
+    //double Z1 = (double)atno[m];
     double A1 = coords[3*m+0]; double B1 = coords[3*m+1]; double C1 = coords[3*m+2];
 
     copy_grid(gs,grid1,grid);
@@ -6793,7 +6879,6 @@ void compute_fxcd(int natoms, int* atno, double* coords, vector<vector<double> >
     {
       int s3 = 0; if (n>0) s3 = n2i[n-1]; int s4 = n2i[n];
 
-      double Z2 = (double)atno[m];
       double A2 = coords[3*n+0]; double B2 = coords[3*n+1]; double C2 = coords[3*n+2];
 
       #pragma acc parallel loop collapse(2) present(val2[0:iN][0:gs])
@@ -7023,7 +7108,6 @@ void compute_dft_grad(int natoms, int* atno, double* coords, vector<vector<doubl
    //working on this block of the matrix
     int s1 = 0; if (m>0) s1 = n2i[m-1]; int s2 = n2i[m];
 
-    float Z1 = (float)atno[m];
     double A1 = coords[3*m+0]; double B1 = coords[3*m+1]; double C1 = coords[3*m+2];
 
     copy_grid(gsa,grid1,grid);
@@ -7099,7 +7183,7 @@ void compute_dft_grad(int natoms, int* atno, double* coords, vector<vector<doubl
     {
       int s3 = 0; if (n>0) s3 = n2i[n-1]; int s4 = n2i[n];
 
-      float Z2 = (float)atno[m];
+      //float Z2 = (float)atno[n];
       double A2 = coords[3*n+0]; double B2 = coords[3*n+1]; double C2 = coords[3*n+2];
 
       #pragma acc parallel loop collapse(2) present(val2[0:iN][0:gsa])
@@ -7516,3 +7600,60 @@ void test_becke_weight(int natoms, int* atno, double* coords, vector<vector<doub
   return;
 }
 
+
+
+
+
+ #if 0 //disabled scratch fragment at file scope; x, y, z, wa, n, fp, aij, Ra are not declared within it
+  for (int i=0;i<natoms;i++)
+  {
+    fp[i*natoms+i] = 1.;
+
+   // #pragma acc parallel loop collapse(2) present(Ra[0:nat2],aij[0:nat2],fp[0:nat2],coords[0:3*natoms],grid1[0:gc*gsa],wt1[0:gsa])
+    for (int j=0;j<natoms;j++)
+    {
+      float a12 = aij[i*natoms+j];
+      //float a21 = aij[j*natoms+i];
+      float oR12 = 1./Ra[i*natoms+j];
+
+      float dx1 = x-coords[3*i+0]; float dy1 = y-coords[3*i+1]; float dz1 = z-coords[3*i+2];
+      float dx2 = x-coords[3*j+0]; float dy2 = y-coords[3*j+1]; float dz2 = z-coords[3*j+2];
+      float r1 = sqrtf(dx1*dx1+dy1*dy1+dz1*dz1);
+      float r2 = sqrtf(dx2*dx2+dy2*dy2+dz2*dz2);
+
+      float mu12 = (r1-r2)*oR12;
+      float omm12 = 1.-mu12*mu12;
+      float nu12 =  mu12 + a12*omm12;
+      //float nu21 = -mu12 + a21*omm12;
+      float f12 = bf3(mu_order,nu12);
+      //float f21 = bf3(mu_order,nu21);
+
+      fp[i*natoms+j] = f12; //NOTE(review): for j==i this overwrites the 1. stored on the diagonal above
+      //fp[j*natoms+i] = f21;
+    }
+
+    float s1; //product of row wa of fp
+    {
+      float s0 = 1.;
+     #pragma acc loop reduction(*:s0)
+      for (int j=0;j<natoms;j++)
+        s0 *= fp[wa*natoms+j];
+      s1 = s0;
+    }
+
+    float norm = 1.e-15; //sum over rows of the row products, started from a small nonzero value
+   #pragma acc loop reduction(+:norm)
+    for (int i=0;i<natoms;i++) //NOTE(review): redeclares i, shadowing the outer loop index
+    {
+      float s0 = 1.;
+     #pragma acc loop reduction(*:s0)
+      for (int j=0;j<natoms;j++)
+        s0 *= fp[i*natoms+j];
+      norm += s0;
+    }
+
+    wt1[n] *= s1/norm;
+
+  }
+ #endif
+
diff --git a/src/integrals/gauss.cpp b/src/integrals/gauss.cpp
index c2a2d6d..040f317 100644
--- a/src/integrals/gauss.cpp
+++ b/src/integrals/gauss.cpp
@@ -9,8 +9,60 @@
 //eval_gh: check factor of sqrt(2)?
 
 
+void eval_vghd(int gsa, double* grid, double* val, int n, int l, int m, double norm, double zt)
+{
+  int gsa6 = 6*gsa; //extent of grid used in the present() clauses (6 values per point)
+  double nph = l+0.5; //only read by the disabled (#if 0) branch below
+ //overwrites val[0:gsa] using the radius stored at grid[6*j+3]; n and m are not read; exits if l>5
+  if (l>5) { printf("\n ERROR: l>5 in eval_vghd \n"); exit(-1); }
+ //active branch: val = normf*(norm1*r^(-l-1)*igam(gf1,zt*r^2) + norm2*r^l*igamc(gf2,zt*r^2))
+ #if 1
+  double normf = 2.*PI/(2.*l+1.)*norm;
+  double norm1 = pow(zt,-0.5*(2.*l+3.));
+  double norm2 = pow(zt,-1.-0.5*l);
+  double gf1 = 0.5*(2.*l+3.);
+  double gf2 = 1.-0.5*l; //NOTE(review): <=0 for l>=2; confirm igamc accepts a non-positive first argument
+
+  #pragma acc parallel loop present(grid[0:gsa6],val[0:gsa])
+  for (int j=0;j<gsa;j++)
+  {
+    double r = grid[6*j+3];
+    double zr2 = zt*r*r;
+    double rl1 = pow(r,-l-1.); //NOTE(review): infinite at r==0; confirm the grid never contains r==0
+    double rl  = pow(r,l);
+    val[j] = normf*(norm1*rl1*igam(gf1,zr2) + norm2*rl*igamc(gf2,zr2));
+  }
+ #endif
 
+ #if 0
+  double normf = 0.5*PI*norm/zt;
+  //normf *= 11.136655993664;
+ //CPMZ check all of these
+ /*
+  if (l==1) normf *= 0.5;
+  if (l==2) normf *= 0.75;
+  if (l==3) normf *= 1.875;
+  if (l==4) normf *= 6.5625;
+  if (l==5) normf *= 29.53125;
+  if (l==6) normf *= 162.421875;
+ */
 
+ #pragma acc parallel loop present(grid[0:gsa6],val[0:gsa])
+  for (int j=0;j<gsa;j++)
+  {
+    double r = grid[6*j+3];
+    double zr2 = zt*r*r;
+    //double rl = pow(r,l);
+
+    double Fn = pow(zr2,-nph)*igam(nph,zr2);
+    val[j] = normf*Fn;
+  }
+ #endif
+
+  return;
+}
+
+#if 0
 void integrate_hole_para_gh(double* rdm, bool full_rdm, bool hfx_on, int Nc, int No, int M, int natoms, int* atno, double* coords, int gs, int gsb, vector<vector<double> >& basis,
                     double* Pao, double* Pmo, double* jCA, float* grid, float* gridb, float* wt, float* wtb, float* rho, float* vxch, int prl)
 {
@@ -440,216 +492,402 @@ void integrate_hole_para_gh(double* rdm, bool full_rdm, bool hfx_on, int Nc, int
 
   return;
 }
+#endif
 
-void wf_to_grid_gh_ke_2(int natoms, int* atno, double* coords, vector<vector<double> > basis, int nbas, int nenv, int N, int* atm, int* bas, double* env,
-                        double* jCA, int gs, float* grid, float* wt, double* Td, int prl)
+void wf_to_grid_gh(bool divide_by_rho, bool calc_rho, int natoms, int* atno, double* coords, vector<vector<double> > basis,
+                   double* Pao, double* gfao, int gs, double* grid, double* wt,
+                   double* rho, double* gf, double* Td, double* TL, int prl)
 {
-  if (prl>-1) printf("  compute_ke: Gaussian basis (testing) \n");
-  printf("\n WARNING: updated eval_gh_ke ftns to new procedure \n");
+  if (prl>1) printf("  wf_to_grid_gh \n");
 
   int gsa = gs*natoms;
   int gsa3 = gsa*3;
   int gsa6 = gsa*6;
 
-  int N0 = basis.size();
-  if (N0!=N) { printf("\n ERROR: mismatch in basis sizes \n"); exit(-1); }
-  //int N2 = N*N;
+  bool need_g = 1;
+  if (Td==NULL) need_g = 0;
+  bool need_gf = 1;
+  if (gf==NULL || gfao==NULL) need_gf = 0;
+  bool need_L = 1;
+  if (TL==NULL) need_L = 0;
+
+  int N = basis.size();
+  int N2 = N*N;
+
+  if (prl>1)
+  {
+    printf("\n Pao: \n");
+    print_square(N,Pao);
+  }
+
+ //normalization convention
+  for (int j=0;j<N2;j++)
+    Pao[j] *= 2.;
+  if (need_gf)
+  for (int j=0;j<N2;j++)
+    gfao[j] *= 2.;
+
   int* n2i = new int[natoms];
   int imaxN = get_imax_n2i(natoms,N,basis,n2i);
 
-  printf("  imaxN: %2i \n",imaxN);
-
   int iN = imaxN;
-  float** val1  = new float*[iN];
-  float** val1d = new float*[iN];
+  double* val0 = new double[gsa];
+  double* val0p = new double[gsa3];
+  double** val1  = new double*[iN];
+  double** val1p = new double*[iN];
+  double** val1L = new double*[iN];
   for (int i=0;i<iN;i++)
-    val1[i]  = new float[gsa];
+    val1[i]  = new double[gsa];
+  if (need_g)
   for (int i=0;i<iN;i++)
-    val1d[i] = new float[gsa3];
+    val1p[i] = new double[gsa3];
+  if (need_L)
+  for (int i=0;i<iN;i++)
+    val1L[i] = new double[gsa];
 
-  float** val2 = new float*[iN];
+  double** val2 = new double*[iN];
+  double** val2p = new double*[iN];
+  double** val2L = new double*[iN];
   for (int i=0;i<iN;i++)
-    val2[i] = new float[gsa];
-  float** val2d = new float*[iN];
+    val2[i] = new double[gsa];
+  if (need_g)
+  for (int i=0;i<iN;i++)
+    val2p[i] = new double[gsa3];
+  if (need_L)
   for (int i=0;i<iN;i++)
-    val2d[i] = new float[gsa3];
+    val2L[i] = new double[gsa];
 
-  float* valmo = new float[gsa];
-
-  float* grid1 = new float[gsa6];
-  float* grid2 = new float[gsa6];
+  double* grid1 = new double[gsa6];
+  double* grid2 = new double[gsa6];
 
   #pragma acc enter data create(grid1[0:gsa6],grid2[0:gsa6])
-  #pragma acc enter data create(valmo[0:gsa],val1[0:iN][0:gsa],val1d[0:iN][0:gsa3],val2[0:iN][0:gsa],val2d[0:iN][0:gsa3])
-
-  if (prl>1)
+  #pragma acc enter data create(val0[0:gsa],val1[0:iN][0:gsa],val2[0:iN][0:gsa])
+  if (need_g)
   {
-    printf("\n jCA: \n");
-    print_square(N,jCA);
+    #pragma acc enter data create(val0p[0:gsa3],val1p[0:iN][0:gsa3],val2p[0:iN][0:gsa3])
+  }
+  if (need_L)
+  {
+    #pragma acc enter data create(val1L[0:iN][0:gsa],val2L[0:iN][0:gsa])
   }
 
-  #pragma acc parallel loop present(valmo[0:gsa])
+  if (calc_rho)
+  #pragma acc parallel loop present(rho[0:gsa])
   for (int j=0;j<gsa;j++)
-    valmo[j] = 0.f;
+    rho[j] = 0.;
 
+  if (need_gf)
+  #pragma acc parallel loop present(gf[0:gsa])
+  for (int j=0;j<gsa;j++)
+    gf[j] = 0.;
+
+  if (need_g)
   #pragma acc parallel loop present(Td[0:gsa])
   for (int j=0;j<gsa;j++)
     Td[j] = 0.;
 
+  if (need_L)
+  #pragma acc parallel loop present(TL[0:gsa])
+  for (int j=0;j<gsa;j++)
+    TL[j] = 0.;
+
+  const int ig = 10;
+
   for (int m=0;m<natoms;m++)
   {
    //working on this block of the matrix
     int s1 = 0; if (m>0) s1 = n2i[m-1]; int s2 = n2i[m];
 
-    float Z1 = (float)atno[m];
+    //double Z1 = (double)atno[m];
     double A1 = coords[3*m+0]; double B1 = coords[3*m+1]; double C1 = coords[3*m+2];
 
     copy_grid(gsa,grid1,grid);
     recenter_grid_zero(gsa,grid1,-A1,-B1,-C1);
 
-   //tricky due to non-constant jumps in loop
-   //#pragma omp parallel for
-    for (int i1=s1;i1<s2;i1++)
-    {
-      int ii1 = i1-s1;
-
-      int nshl = eval_gh_full(gs,grid,&val1[ii1],i1,natoms,nbas,nenv,N,atm,bas,env);
-
-      //eval_gh_ke_full(gs,grid,&val1d[ii1],i1,natoms,nbas,nenv,N,atm,bas,env);
-      printf("  i1: %2i  nshl: %2i \n",i1,nshl);
-
-      i1 += nshl-1;
-    }
-    #pragma acc update device(val1[0:iN][0:gsa])
+    #pragma acc parallel loop collapse(2) present(val1[0:iN][0:gsa])
+    for (int i1=0;i1<s2-s1;i1++)
+    for (int j=0;j<gsa;j++)
+      val1[i1][j] = 0.;
 
+    if (need_g)
+    #pragma acc parallel loop collapse(2) present(val1p[0:iN][0:gsa3])
+    for (int i1=0;i1<s2-s1;i1++)
+    for (int j=0;j<gsa3;j++)
+      val1p[i1][j] = 0.;
 
+    if (need_L)
+    #pragma acc parallel loop collapse(2) present(val1L[0:iN][0:gsa])
+    for (int i1=0;i1<s2-s1;i1++)
+    for (int j=0;j<gsa;j++)
+      val1L[i1][j] = 0.;
 
-   #if 0
     for (int i1=s1;i1<s2;i1++)
     {
       int ii1 = i1-s1;
-
       vector<double> basis1 = basis[i1];
-      int n1 = basis1[0]; int l1 = basis1[1]; int m1 = basis1[2]; int ng = basis1[3];
+      int l1 = basis1[1]; int m1 = basis1[2]; int ng = basis1[3];
       int in = ig + ng; //index to find norm
 
-      float* valm = val1[ii1];
-      float* valL = val1L[ii1];
+      double* valm = val1[ii1];
+      double* valmp = val1p[ii1];
+      double* valml = val1L[ii1];
       for (int j=0;j<ng;j++)
       {
         double zeta1 = basis1[ig+j]; double norm1 = basis1[in+j];
 
-        eval_gh(gsa,grid1,val0,l1,m1,norm1,zeta1);
+        eval_ghd(gsa,grid1,val0,l1,m1,norm1,zeta1);
 
        #pragma acc parallel loop present(val0[0:gsa],valm[0:gsa])
         for (int k=0;k<gsa;k++)
           valm[k] += val0[k];
 
-        eval_gh_ke(gsa,grid1,val0,n1,l1,norm1,zeta1);
+        if (need_g)
+        {
+          eval_pd_gh(gsa,grid1,val0p,l1+1,l1,m1,norm1,zeta1);
 
-       #pragma acc parallel loop present(val0[0:gsa],valL[0:gsa])
-        for (int k=0;k<gsa;k++)
-          valL[k] += val0[k];
+         #pragma acc parallel loop present(val0p[0:gsa3],valmp[0:gsa3])
+          for (int k=0;k<gsa3;k++)
+            valmp[k] += val0p[k];
+        }
+
+        if (need_L)
+        {
+          eval_ghd_ke(gsa,grid1,val0p,l1+1,l1,1.,zeta1);
+
+         #pragma acc parallel loop present(val0[0:gsa],val0p[0:gsa3],valml[0:gsa])
+          for (int k=0;k<gsa;k++)
+            valml[k] += val0[k]*val0p[k];
+        }
       }
+
     }
-   #endif
 
    //compute all
     for (int i1=s1;i1<s2;i1++)
     {
-      float* valm = val1[i1-s1];
-      //float* vald = val1d[i1-s1];
+      int ii1 = i1-s1;
+      double* valm = val1[ii1];
+      double* valmp = val1p[ii1];
+      double* valml = val1L[ii1];
 
-      float c1 = jCA[i1*N+0];
+      for (int i2=s1;i2<s2;i2++)
+      {
+        int ii2 = i2-s1;
+        double* valn = val1[ii2];
+        double* valnp = val1p[ii2];
+        double* valnl = val1L[ii2];
 
-     #pragma acc parallel loop present(valmo[0:gsa],valm[0:gsa])
-      for (int j=0;j<gsa;j++)
-        valmo[j] += c1*valm[j];
+        double d12 = Pao[i1*N+i2];
+
+        if (calc_rho)
+       #pragma acc parallel loop present(rho[0:gsa],valm[0:gsa],valn[0:gsa])
+        for (int j=0;j<gsa;j++)
+          rho[j] += d12*valm[j]*valn[j];
 
-    // #pragma acc parallel loop present(Td[0:gsa],vald[0:gsa])
-    //  for (int j=0;j<gsa;j++)
-    //    TL[j] += c1*vald[j];
+        if (need_g)
+       #pragma acc parallel loop present(Td[0:gsa],valmp[0:gsa3],valnp[0:gsa3])
+        for (int j=0;j<gsa;j++)
+          Td[j] += d12*(valmp[3*j+0]*valnp[3*j+0] + valmp[3*j+1]*valnp[3*j+1] + valmp[3*j+2]*valnp[3*j+2]);
+
+        if (need_L)
+       #pragma acc parallel loop present(TL[0:gsa],valm[0:gsa],valn[0:gsa],valml[0:gsa],valnl[0:gsa])
+        for (int j=0;j<gsa;j++)
+          TL[j] += 0.5*d12*(valml[j]*valn[j]+valm[j]*valnl[j]);
+
+        if (need_gf)
+        {
+          double g12 = gfao[i1*N+i2];
+         #pragma acc parallel loop present(gf[0:gsa],valm[0:gsa],valn[0:gsa])
+          for (int j=0;j<gsa;j++)
+            gf[j] += g12*valm[j]*valn[j];
+        }
+      }
     }
 
-    if (0)
     for (int n=0;n<natoms;n++)
+    if (m!=n)
     {
      //working on this block of the matrix
       int s3 = 0; if (n>0) s3 = n2i[n-1]; int s4 = n2i[n];
 
-      float Z2 = (float)atno[n];
+      //double Z2 = (double)atno[n];
       double A2 = coords[3*n+0]; double B2 = coords[3*n+1]; double C2 = coords[3*n+2];
 
       copy_grid(gsa,grid2,grid);
-      recenter_grid_zero(gsa,grid1,-A2,-B2,-C2);
+      recenter_grid_zero(gsa,grid2,-A2,-B2,-C2);
+
+      #pragma acc parallel loop collapse(2) present(val2[0:iN][0:gsa])
+      for (int i1=0;i1<s4-s3;i1++)
+      for (int j=0;j<gsa;j++)
+        val2[i1][j] = 0.;
+
+      if (need_g)
+      #pragma acc parallel loop collapse(2) present(val2p[0:iN][0:gsa3])
+      for (int i1=0;i1<s4-s3;i1++)
+      for (int j=0;j<gsa3;j++)
+        val2p[i1][j] = 0.;
+
+      if (need_L)
+      #pragma acc parallel loop collapse(2) present(val2L[0:iN][0:gsa])
+      for (int i1=0;i1<s4-s3;i1++)
+      for (int j=0;j<gsa;j++)
+        val2L[i1][j] = 0.;
 
-      for (int i2=s3;i2<s4;i2++)
+      for (int i1=s3;i1<s4;i1++)
       {
-        int ii2 = i2-s3;
+        int ii1 = i1-s3;
+        vector<double> basis2 = basis[i1];
+        int l2 = basis2[1]; int m2 = basis2[2]; int ng = basis2[3];
+        int in = ig + ng; //index to find norm
+
+        double* valm = val2[ii1];
+        double* valmp = val2p[ii1];
+        double* valml = val2L[ii1];
+        for (int j=0;j<ng;j++)
+        {
+          double zeta2 = basis2[ig+j]; double norm2 = basis2[in+j];
 
-        int nshl = eval_gh_full(gs,grid,&val2[ii2],i2,natoms,nbas,nenv,N,atm,bas,env);
-        printf("  i2: %2i  nshl: %2i \n",i2,nshl);
+          eval_ghd(gsa,grid2,val0,l2,m2,norm2,zeta2);
 
-        i2 += nshl-1;
-      }
-      #pragma acc update device(val2[0:iN][0:gsa])
-    }
+         #pragma acc parallel loop present(val0[0:gsa],valm[0:gsa])
+          for (int k=0;k<gsa;k++)
+            valm[k] += val0[k];
 
-  }
+          if (need_g)
+          {
+            eval_pd_gh(gsa,grid2,val0p,l2+1,l2,m2,norm2,zeta2);
 
+           #pragma acc parallel loop present(val0p[0:gsa3],valmp[0:gsa3])
+            for (int k=0;k<gsa3;k++)
+              valmp[k] += val0p[k];
+          }
 
-  {
-   //check total density
-    float* rho = new float[gsa];
-    #pragma acc enter data create(rho[0:gsa])
+          if (need_L)
+          {
+            eval_ghd_ke(gsa,grid2,val0p,l2+1,l2,1.,zeta2);
 
-    #pragma acc parallel loop present(rho[0:gsa],valmo[0:gsa])
-    for (int j=0;j<gsa;j++)
-      rho[j] = valmo[j]*valmo[j];
+           #pragma acc parallel loop present(val0[0:gsa],val0p[0:gsa],valml[0:gsa])
+            for (int k=0;k<gsa;k++)
+              valml[k] += val0[k]*val0p[k];
+          }
+        }
 
-    double d1 = 0.;
-    #pragma acc parallel loop present(rho[0:gsa],wt[0:gsa]) reduction(+:d1)
-    for (int j=0;j<gsa;j++)
-      d1 += rho[j]*wt[j];
+      }
 
-    printf("   total density (v2): %9.6f \n",d1);
+     //compute all
+      for (int i1=s1;i1<s2;i1++)
+      {
+        int ii1 = i1-s1;
+        double* valm = val1[ii1];
+        double* valmp = val1p[ii1];
+        double* valml = val1L[ii1];
 
-    #pragma acc exit data delete(rho[0:gsa])
-    delete [] rho;
-  }
+        for (int i2=s3;i2<s4;i2++)
+        {
+          int ii2 = i2-s3;
+          double* valn = val2[ii2];
+          double* valnp = val2p[ii2];
+          double* valnl = val2L[ii2];
+
+          double d12 = Pao[i1*N+i2];
+
+          if (calc_rho)
+         #pragma acc parallel loop present(rho[0:gsa],valm[0:gsa],valn[0:gsa])
+          for (int j=0;j<gsa;j++)
+            rho[j] += d12*valm[j]*valn[j];
+
+          if (need_g)
+         #pragma acc parallel loop present(Td[0:gsa],valmp[0:gsa3],valnp[0:gsa3])
+          for (int j=0;j<gsa;j++)
+            Td[j] += d12*(valmp[3*j+0]*valnp[3*j+0] + valmp[3*j+1]*valnp[3*j+1] + valmp[3*j+2]*valnp[3*j+2]);
+
+          if (need_L)
+         #pragma acc parallel loop present(TL[0:gsa],valm[0:gsa],valn[0:gsa],valml[0:gsa],valnl[0:gsa])
+          for (int j=0;j<gsa;j++)
+            TL[j] += 0.5*d12*(valml[j]*valn[j]+valm[j]*valnl[j]);
+
+          if (need_gf)
+          {
+            double g12 = gfao[i1*N+i2];
+           #pragma acc parallel loop present(gf[0:gsa],valm[0:gsa],valn[0:gsa])
+            for (int j=0;j<gsa;j++)
+              gf[j] += g12*valm[j]*valn[j];
+          }
+        }
+      }
+    }
 
-  float ket = 0.;
-  #pragma acc parallel loop present(Td[0:gsa],valmo[0:gsa],wt[0:gsa]) reduction(+:ket)
-  for (int j=0;j<gsa;j++)
-    ket += valmo[j]*Td[j]*wt[j];
-  printf("\n total KE: %9.6f \n",ket);
+  }
 
+  if (prl>0 || calc_rho)
+  { //report the grid quadrature sum of rho[j]*wt[j]
+    double dtot = 0.; //scalar accumulator, combined across the parallel loop by the reduction clause
+   #pragma acc parallel loop present(rho[0:gsa],wt[0:gsa]) reduction(+:dtot)
+    for (int j=0;j<gsa;j++)
+      dtot += rho[j]*wt[j];
+    printf("  integrated density: %9.6f \n",dtot);
+  }
 
-  float thresh = 1.e-12;
-  #pragma acc parallel loop present(valmo[0:gsa])
+  if (need_g)
+  #pragma acc parallel loop present(Td[0:gsa])
   for (int j=0;j<gsa;j++)
-  if (fabs(valmo[j])<thresh)
-    valmo[j] = thresh;
+    Td[j] *= 0.5;
 
-  #pragma acc parallel loop present(Td[0:gsa],valmo[0:gsa])
+  if (need_L)
+  #pragma acc parallel loop present(TL[0:gsa])
   for (int j=0;j<gsa;j++)
-    Td[j] *= -0.5/valmo[j];
+    TL[j] *= -0.5;
 
-  if (prl>-2)
+  if (divide_by_rho)
   {
-    #pragma acc update self(valmo[0:gsa])
-    printf("\n MO: \n");
-    print_vec(gsa,grid,valmo);
+    const double den = 1.e-30;
+
+    if (need_g)
+    #pragma acc parallel loop present(Td[0:gsa],rho[0:gsa])
+    for (int j=0;j<gsa;j++)
+      Td[j] /= (rho[j]+den);
+
+    if (need_L)
+    #pragma acc parallel loop present(TL[0:gsa],rho[0:gsa])
+    for (int j=0;j<gsa;j++)
+      TL[j] /= (rho[j]+den);
+
+    if (need_gf)
+    #pragma acc parallel loop present(gf[0:gsa],rho[0:gsa])
+    for (int j=0;j<gsa;j++)
+      gf[j] /= (rho[j]+den);
   }
-  if (0)
+
+  if (prl>2 && gsa<10000)
+  {
+    #pragma acc update self(gf[0:gsa])
+    printf("\n gf: \n");
+    print_vec(gsa,grid,gf);
+  }
+  if (prl>2 && need_g && gsa<10000)
   {
     #pragma acc update self(Td[0:gsa])
-    printf("\n ke: \n");
+    printf("\n Td: \n");
     print_vec(gsa,grid,Td);
   }
 
+ //revert normalization convention
+  for (int j=0;j<N2;j++)
+    Pao[j] *= 0.5;
+  if (need_gf)
+  for (int j=0;j<N2;j++)
+    gfao[j] *= 0.5;
+
   #pragma acc exit data delete(grid1[0:gsa6],grid2[0:gsa6])
-  #pragma acc exit data delete(valmo[0:gsa],val1[0:iN][0:gsa],val1d[0:iN][0:gsa3],val2[0:iN][0:gsa],val2d[0:iN][0:gsa3])
+  #pragma acc exit data delete(val0[0:gsa],val1[0:iN][0:gsa],val2[0:iN][0:gsa])
+  if (need_g)
+  {
+    #pragma acc exit data delete(val0p[0:gsa3],val1p[0:iN][0:gsa3],val2p[0:iN][0:gsa3])
+  }
+  if (need_L)
+  {
+    #pragma acc exit data delete(val1L[0:iN][0:gsa],val2L[0:iN][0:gsa])
+  }
 
   delete [] grid1;
   delete [] grid2;
@@ -657,18 +895,30 @@ void wf_to_grid_gh_ke_2(int natoms, int* atno, double* coords, vector<vector<dou
 
   for (int i=0;i<iN;i++)
     delete [] val1[i];
+  if (need_g)
+  for (int i=0;i<iN;i++)
+    delete [] val1p[i];
+  if (need_L)
   for (int i=0;i<iN;i++)
-    delete [] val1d[i];
+    delete [] val1L[i];
   delete [] val1;
-  delete [] val1d;
+  delete [] val1p;
+  delete [] val1L;
 
   for (int i=0;i<iN;i++)
     delete [] val2[i];
-  delete [] val2;
+  if (need_g)
   for (int i=0;i<iN;i++)
-    delete [] val2d[i];
-  delete [] val2d;
+    delete [] val2p[i];
+  if (need_L)
+  for (int i=0;i<iN;i++)
+    delete [] val2L[i];
+  delete [] val2;
+  delete [] val2p;
+  delete [] val2L;
 
+  delete [] val0;
+  delete [] val0p;
 
   return;
 }
@@ -852,12 +1102,13 @@ void wf_to_grid_gh_ke(int natoms, int* atno, double* coords, vector<vector<doubl
   return;
 }
 
-void eval_gh_ke(int gs, double* grid, double* val1, int n1, int l1, double norm1, const double zeta1)
+void eval_ghd_ke(int gs, double* grid, double* val1, int n1, int l1, double norm1, const double zeta1)
 {
   int gs6 = 6*gs;
 
   int nnm = n1*(n1-1);
   int llp = l1*(l1+1);
+  int nml = nnm-llp;
   double fz2 = 4.*zeta1*zeta1;
   double ta = 2.*zeta1*(1.+2.*n1);
 
@@ -866,7 +1117,7 @@ void eval_gh_ke(int gs, double* grid, double* val1, int n1, int l1, double norm1
   {
     double r = grid[6*j+3];
     double r2 = r*r;
-    val1[j] = -ta + (nnm-llp)/r2 + fz2*r2;
+    val1[j] = -ta + nml/r2 + fz2*r2;
   }
 
   if (norm1!=1.)
@@ -3407,19 +3658,14 @@ void eval_gh(int gs, float* grid, float* val1, int l1, int m1, const float norm1
   return;
 }
 
-void eval_ghd(int gs, double* grid, double* val1, int l1, int m1, const double norm1, const double zeta1)
+//spherical harmonic part of Gaussian ftn
+void eval_gh_spd(bool include_r, int gs, double* grid, double* val1, int l1, int m1)
 {
-  #pragma acc parallel loop present(grid[0:6*gs],val1[0:gs])
-  for (int i=0;i<gs;i++)
-  {
-    double r = grid[6*i+3];
-    double g1 = exp(-r*r*zeta1);
-    val1[i] = norm1*g1;
-  }
-
  //most l1 values are untested
  //especially l1>=3
 
+  //printf("  eval_gh_spd. inc_r: %i gs: %4i l1: %i m1: %i \n",(int)include_r,gs,l1,m1);
+
   if (l1==0) return;
 
   if (l1==1)
@@ -3706,7 +3952,7 @@ void eval_ghd(int gs, double* grid, double* val1, int l1, int m1, const double n
         double y = grid[6*i+1];
         double x2 = x*x;
         double y2 = y*y;
-        val1[i] *= -(5.*x2*x2 - 10.*x2*y2 + y2*y2)*y;
+        val1[i] *= (5.*x2*x2 - 10.*x2*y2 + y2*y2)*y; //changed sign
       }
     }
     else if (m1==-4)      //h (y2 - x2)*x*y*z
@@ -3719,7 +3965,7 @@ void eval_ghd(int gs, double* grid, double* val1, int l1, int m1, const double n
         double z = grid[6*i+2];
         double x2 = x*x;
         double y2 = y*y;
-        val1[i] *= (y2 - x2)*x*y*z;
+        val1[i] *= -(y2 - x2)*x*y*z; //changed sign
       }
     }
    //ordering differed (lower should be +5)
@@ -3736,7 +3982,7 @@ void eval_ghd(int gs, double* grid, double* val1, int l1, int m1, const double n
         double y2 = y*y;
         double z2 = z*z;
         double r2 = r*r;
-        val1[i] *= y * (r2*(3.*x2-y2) + 9.*z2*(y2-3.*x2));
+        val1[i] *= -y * (r2*(3.*x2-y2) + 9.*z2*(y2-3.*x2)); //changed sign
         //val1[i] *= (x2*x2 - 10.*x2*y2 + 5.*y2*y2)*x;
       }
     }
@@ -3769,7 +4015,7 @@ void eval_ghd(int gs, double* grid, double* val1, int l1, int m1, const double n
         double r = grid[6*i+3];
         double z2 = z*z;
         double r2 = r*r;
-        val1[i] *= y * (14.*r2*z2 - 21.*z2*z2 - r2*r2);
+        val1[i] *= -y * (14.*r2*z2 - 21.*z2*z2 - r2*r2); //changed sign
         //val1[i] *= z * (x2*x2 - 6.*x2*y2 + y2*y2);
       }
     }
@@ -3798,7 +4044,7 @@ void eval_ghd(int gs, double* grid, double* val1, int l1, int m1, const double n
         double r = grid[6*i+3];
         double z2 = z*z;
         double r2 = r*r;
-        val1[i] *= x*(-r2*r2 + 14.*r2*z2 - 21.*z2*z2);
+        val1[i] *= -x*(-r2*r2 + 14.*r2*z2 - 21.*z2*z2); //changed sign
         //val1[i] *= x * (x2-3.*y2)*(x2 + y2 - 8.*z2);
       }
     }
@@ -3814,7 +4060,7 @@ void eval_ghd(int gs, double* grid, double* val1, int l1, int m1, const double n
         double r = grid[6*i+3];
         double z2 = z*z;
         double r2 = r*r;
-        val1[i] *= z * (r2 - 3.*z2)*(x-y)*(x+y);
+        val1[i] *= -z * (r2 - 3.*z2)*(x-y)*(x+y); //changed sign
         //val1[i] *= y * (x2*x2 + y2*y2 - 12.*y2*z2 + 8.*z2*z2 + 2.*x2 * (y2-6.*z2));
       }
     }
@@ -3832,7 +4078,7 @@ void eval_ghd(int gs, double* grid, double* val1, int l1, int m1, const double n
         double y2 = y*y;
         double z2 = z*z;
         double r2 = r*r;
-        val1[i] *= x * (r2*(x2-3.*y2) + 9.*z2*(3.*y2-x2));
+        val1[i] *= -x * (r2*(x2-3.*y2) + 9.*z2*(3.*y2-x2)); //changed sign
         //val1[i] *= (x2 - y2) * (x2 + y2 - 2.*z2) * z;
       }
     }
@@ -3861,7 +4107,7 @@ void eval_ghd(int gs, double* grid, double* val1, int l1, int m1, const double n
         double y = grid[6*i+1];
         double x2 = x*x;
         double y2 = y*y;
-        val1[i] *= x * (10.*x2*y2 - x2*x2 - 5.*y2*y2);
+        val1[i] *= -x * (10.*x2*y2 - x2*x2 - 5.*y2*y2); //changed sign
         //val1[i] *= x * (x2*x2 + y2*y2 - 12.*y2*z2 + 8.*z2*z2 + 2.*x2* (y2 - 6.*z2));
       }
     }
@@ -4079,5 +4325,35 @@ void eval_ghd(int gs, double* grid, double* val1, int l1, int m1, const double n
     }
   }
 
+  if (include_r)
+  {
+   #pragma acc parallel loop present(grid[0:6*gs],val1[0:gs])
+    for (int i=0;i<gs;i++)
+    {
+      double r = grid[6*i+3];
+      val1[i] *= pow(r,-l1);
+    }
+  }
+
+  return;
+}
+
+void eval_gh_spd(int gs, double* grid, double* val1, int l1, int m1)
+{
+ //forward with include_r switched off, so val1 is not scaled by r^(-l1)
+  eval_gh_spd(false,gs,grid,val1,l1,m1);
+}
+
+void eval_ghd(int gs, double* grid, double* val1, int l1, int m1, const double norm1, const double zeta1)
+{
+ //radial part: val1 = norm1*exp(-zeta1*r^2), with r taken from grid[6*k+3]
+  #pragma acc parallel loop present(grid[0:6*gs],val1[0:gs])
+  for (int k=0;k<gs;k++)
+  {
+    const double rk = grid[6*k+3];
+    val1[k] = norm1*exp(-rk*rk*zeta1);
+  }
+ //angular (l1,m1) factor is multiplied into val1 in place
+  eval_gh_spd(gs,grid,val1,l1,m1);
   return;
 }
diff --git a/src/integrals/integrals.cpp b/src/integrals/integrals.cpp
index 757f92e..ffaf609 100644
--- a/src/integrals/integrals.cpp
+++ b/src/integrals/integrals.cpp
@@ -4237,11 +4237,13 @@ void compute_Enp(int natoms, int* atno, float* coords, vector<vector<double> > &
   return;
 }
 
-void reduce_Exyz(int i1, int i2, int N, int gs, double* val1m, double* val2m, double* grid1m, double A1, double B1, double C1, double* E)
+void reduce_Exyz(int i1, int i2, int N, int gs, double* val1m, double* val2m, double* grid1m, double A1, double B1, double C1, double rconf, double pconf, double* E)
 {
   int gs6 = 6*gs;
   int N2 = N*N;
 
+  const double a = 11.; //33, 66 also viable?
+
   double valx = 0.; double valy = 0.; double valz = 0.;
   #pragma acc parallel loop present(val1m[0:gs],val2m[0:gs],grid1m[0:gs6]) reduction(+:valx,valy,valz)
   for (int j=0;j<gs;j++)
@@ -4251,9 +4253,18 @@ void reduce_Exyz(int i1, int i2, int N, int gs, double* val1m, double* val2m, do
     double y = grid1m[6*j+1]+B1;
     double z = grid1m[6*j+2]+C1;
 
-    valx += val1m[j]*val2m[j]*x;
-    valy += val1m[j]*val2m[j]*y;
-    valz += val1m[j]*val2m[j]*z;
+   //wave centered at zero
+    double xs = x/(1.+exp(x*x/a));
+    double ys = y/(1.+exp(y*y/a));
+    double zs = z/(1.+exp(z*z/a));
+
+   //confining potential
+    double r = sqrt(x*x+y*y+z*z);
+    if (r>rconf) { xs += pconf*(r-rconf); ys += pconf*(r-rconf); zs += pconf*(r-rconf); }
+
+    valx += val1m[j]*val2m[j]*xs;
+    valy += val1m[j]*val2m[j]*ys;
+    valz += val1m[j]*val2m[j]*zs;
   }
 
   #pragma acc serial present(E[0:3*N2])
@@ -4265,11 +4276,13 @@ void reduce_Exyz(int i1, int i2, int N, int gs, double* val1m, double* val2m, do
   return;
 }
 
-void reduce_Exyz_2(int i1, int i2, int N, int gs, double* val1m, double* val1n, double* val2m, double* val2n, double* grid1m, double* grid2n, double A1, double B1, double C1, double A2, double B2, double C2, double* E)
+void reduce_Exyz_2(int i1, int i2, int N, int gs, double* val1m, double* val1n, double* val2m, double* val2n, double* grid1m, double* grid2n, double A1, double B1, double C1, double A2, double B2, double C2, double rconf, double pconf, double* E)
 {
   int gs6 = 6*gs;
   int N2 = N*N;
 
+  const double a = 11.;
+
   double valx = 0.; double valy = 0.; double valz = 0.;
   #pragma acc parallel loop present(val1m[0:gs],val1n[0:gs],val2m[0:gs],val2n[0:gs],grid1m[0:gs6],grid2n[0:gs6]) reduction(+:valx,valy,valz)
   for (int j=0;j<gs;j++)
@@ -4280,9 +4293,23 @@ void reduce_Exyz_2(int i1, int i2, int N, int gs, double* val1m, double* val1n,
     double x2 = grid2n[6*j+0]+A2;
     double y2 = grid2n[6*j+1]+B2;
     double z2 = grid2n[6*j+2]+C2;
-    valx += val1m[j]*val2m[j]*x1 + val1n[j]*val2n[j]*x2;
-    valy += val1m[j]*val2m[j]*y1 + val1n[j]*val2n[j]*y2;
-    valz += val1m[j]*val2m[j]*z1 + val1n[j]*val2n[j]*z2;
+
+   //wave centered at zero
+    double xs1 = x1/(1.+exp(x1*x1/a));
+    double ys1 = y1/(1.+exp(y1*y1/a));
+    double zs1 = z1/(1.+exp(z1*z1/a));
+    double xs2 = x2/(1.+exp(x2*x2/a));
+    double ys2 = y2/(1.+exp(y2*y2/a));
+    double zs2 = z2/(1.+exp(z2*z2/a));
+
+    double r1 = sqrt(x1*x1+y1*y1+z1*z1);
+    double r2 = sqrt(x2*x2+y2*y2+z2*z2);
+    if (r1>rconf) { xs1 += pconf*(r1-rconf); ys1 += pconf*(r1-rconf); zs1 += pconf*(r1-rconf); }
+    if (r2>rconf) { xs2 += pconf*(r2-rconf); ys2 += pconf*(r2-rconf); zs2 += pconf*(r2-rconf); }
+
+    valx += val1m[j]*val2m[j]*xs1 + val1n[j]*val2n[j]*xs2;
+    valy += val1m[j]*val2m[j]*ys1 + val1n[j]*val2n[j]*ys2;
+    valz += val1m[j]*val2m[j]*zs1 + val1n[j]*val2n[j]*zs2;
   }
 
   #pragma acc serial present(E[0:N2])
@@ -4297,9 +4324,9 @@ void reduce_Exyz_2(int i1, int i2, int N, int gs, double* val1m, double* val1n,
 //applied electric fields
 // x,y,z directions only
 // could expand this to include higher order terms
-void compute_Exyz(int natoms, int* atno, double* coords, bool gbasis, vector<vector<double> > &basis, int nrad, int nang, double* ang_g, double* ang_w, double* S, double* E, int prl)
+void compute_Exyz(double rconf, double pconf, int natoms, int* atno, double* coords, bool gbasis, vector<vector<double> > &basis, int nrad, int nang, double* ang_g, double* ang_w, double* S, double* E, int prl)
 {
-  if (prl>1) printf(" beginning compute_E (double precision) \n");
+  if (prl>1) printf(" beginning compute_E (double precision). rpconfine: %8.5f %8.5f \n",rconf,pconf);
 
   int N = basis.size();
   int N2 = N*N;
@@ -4417,7 +4444,7 @@ void compute_Exyz(int natoms, int* atno, double* coords, bool gbasis, vector<vec
         for (int j=0;j<gs;j++)
           val1m[j] *= wt1[j];
 
-        reduce_Exyz(i1,i2,N,gs,val1m,val2m,grid1m,A1,B1,C1,E);
+        reduce_Exyz(i1,i2,N,gs,val1m,val2m,grid1m,A1,B1,C1,rconf,pconf,E);
 
       } //i1,i2 over gbasis
     }
@@ -4449,7 +4476,7 @@ void compute_Exyz(int natoms, int* atno, double* coords, bool gbasis, vector<vec
       eval_shd(ii1,gs,grid1m,val1m,n1,l1,m1,zeta1); //basis 1
       eval_shd(ii1,gs,grid1m,val2m,n2,l2,m2,zeta2); //basis 2
 
-      reduce_Exyz(i1,i2,N,gs,val1m,val2m,grid1m,A1,B1,C1,E);
+      reduce_Exyz(i1,i2,N,gs,val1m,val2m,grid1m,A1,B1,C1,rconf,pconf,E);
 
     } //pairs of basis on single atoms
 
@@ -4543,7 +4570,7 @@ void compute_Exyz(int natoms, int* atno, double* coords, bool gbasis, vector<vec
             val1n[j] *= wt2[j];
           }
 
-          reduce_Exyz_2(i1,i2,N,gs,val1m,val1n,val2m,val2n,grid1m,grid2n,A1,B1,C1,A2,B2,C2,E);
+          reduce_Exyz_2(i1,i2,N,gs,val1m,val1n,val2m,val2n,grid1m,grid2n,A1,B1,C1,A2,B2,C2,rconf,pconf,E);
         } //i1,i2 over gbasis
       }
 
@@ -4596,7 +4623,7 @@ void compute_Exyz(int natoms, int* atno, double* coords, bool gbasis, vector<vec
         eval_shd(ii1,gs,grid1n,val2m,n2,l2,m2,zeta2); //basis 2 on center 1
         eval_shd(ii1,gs,grid2n,val2n,n2,l2,m2,zeta2); //basis 2 on center 2
 
-        reduce_Exyz_2(i1,i2,N,gs,val1m,val1n,val2m,val2n,grid1m,grid2n,A1,B1,C1,A2,B2,C2,E);
+        reduce_Exyz_2(i1,i2,N,gs,val1m,val1n,val2m,val2n,grid1m,grid2n,A1,B1,C1,A2,B2,C2,rconf,pconf,E);
 
       } //loop i1,i2
 
@@ -4693,6 +4720,11 @@ void compute_Exyz(int natoms, int* atno, double* coords, bool gbasis, vector<vec
   return;
 }
 
+void compute_Exyz(int natoms, int* atno, double* coords, bool gbasis, vector<vector<double> > &basis, int nrad, int nang, double* ang_g, double* ang_w, double* S, double* E, int prl)
+{ //keeps the previous signature: forwards with rconf = pconf = 0
+  compute_Exyz(0.,0.,natoms,atno,coords,gbasis,basis,nrad,nang,ang_g,ang_w,S,E,prl);
+}
+
 //high-precision overlap integrals
 void compute_Sd(int natoms, int* atno, float* coords, vector<vector<double> > &basis, int nrad, int nang, double* ang_g, double* ang_w, double* S, int prl)
 {
diff --git a/src/integrals/integrals_aux.cpp b/src/integrals/integrals_aux.cpp
index 63320b9..d337609 100644
--- a/src/integrals/integrals_aux.cpp
+++ b/src/integrals/integrals_aux.cpp
@@ -609,6 +609,24 @@ void get_angular_grid(int size_ang, double* ang_g, double* ang_w)
   return;
 }
 
+void get_angular_grid(int size_ang, float* ang_g, float* ang_w)
+{
+ //build the grid with the double overload, then narrow into the float outputs
+  const int ncoord = 3*size_ang;
+  double* gtmp = new double[ncoord];
+  double* wtmp = new double[size_ang];
+  get_angular_grid(size_ang,gtmp,wtmp);
+
+  for (int k=0;k<ncoord;k++)
+    ang_g[k] = gtmp[k];
+
+  for (int k=0;k<size_ang;k++)
+    ang_w[k] = wtmp[k];
+
+  delete [] wtmp;
+  delete [] gtmp;
+}
+
 void rgrid_one_atom(int nrad, int Z1, float* r1)
 {
   float w1[nrad];
@@ -1531,6 +1549,7 @@ vector<vector<double> > setup_integrals_gsgpu(vector<vector<double> >& basis_aux
   env = inp.get_env();
 
   map< int, basis_t > basmap = inp.basmap;
+  map< int, basis_t > basmap_ri = inp.basmap_ri;
 
   vector<vector<double> > basis;
   for (int n=0;n<natoms;n++)
@@ -1607,12 +1626,43 @@ vector<vector<double> > setup_integrals_gsgpu(vector<vector<double> >& basis_aux
     }
   }
 
- //just to make basis_aux's size correct
   basis_aux.clear();
-  for (int i=0;i<Naux;i++)
+  for (int n=0;n<natoms;n++)
   {
-    vector<double> b1; b1.push_back(1);
-    basis_aux.push_back(b1);
+    int i1 = atno[n];
+    basis_t basis1 = basmap_ri[i1];
+    int Z = basis1.nuc;
+    int nshell = basis1.shells.size();
+
+    for (int j=0;j<nshell;j++)
+    {
+      int size1 = basis1.exps[j].size();
+      int l1 = basis1.shells[j];
+      int lmin = -l1; int lmax = l1;
+
+      for (int m=lmin;m<=lmax;m++)
+      {
+        //int m1 = get_gauss_m(l1,m);
+        vector<double> b1; for (int p=0;p<10;p++) b1.push_back(0);
+        b1[0] = l1+1; b1[1] = l1; b1[2] = m;
+       //using b1[3]==b1[4] to store # of gaussians
+        b1[3] = b1[4] = size1;
+        b1[5] = coords[3*n+0]; b1[6] = coords[3*n+1]; b1[7] = coords[3*n+2];
+        b1[8] = Z; b1[9] = n;
+
+       //zeta then normalization coeffs
+        for (int k=0;k<size1;k++)
+          b1.push_back(basis1.exps[j][k]);
+        for (int k=0;k<size1;k++)
+        {
+          double norm1 = get_gto_norm(l1,basis1.exps[j][k])*basis1.coef[j][k];
+          if (l1>0) norm1 *= norm_sh(l1,m)/n0;
+          b1.push_back(norm1);
+        }
+
+        basis_aux.push_back(b1);
+      }
+    }
   }
 
   return basis;
diff --git a/src/libcintw/cintwrapper.cpp b/src/libcintw/cintwrapper.cpp
index db8ddd8..d845257 100644
--- a/src/libcintw/cintwrapper.cpp
+++ b/src/libcintw/cintwrapper.cpp
@@ -56,12 +56,14 @@ void get_vri(double** val, int gs, int N,
     printf("  using ri basis in get_vri \n");
   }
 
+ /*
  //CPMZ something is off about the cache size estimate
   int cache_size = int1e_grids_sph(NULL, NULL, shls, atm, natm, bas, nbas+nbas_ri, env, NULL, NULL);
   //printf(" cache_size: %4i \n",cache_size);
   cache_size += gs;
   cache_size *= 10;
   //double* cache = new double[cache_size]();
+ */
 
  //CPMZ just let libcint allocate for itself
   double* cache = NULL;
@@ -271,10 +273,13 @@ void get_hcore(double *hcore, double *En, double *T, int N,
   int di, dj;
   int shls[2];
 
-  double *tmp = new double[N * N];
+  int N2 = N*N;
+  double *tmp = new double[N2];
 
-  if (BT::DO_CART) {
-    for (int i = 0; i < nbas; i++) {
+  if (BT::DO_CART)
+  {
+    for (int i = 0; i < nbas; i++)
+    {
       int idx_j = 0;
       shls[0] = i;
       di = CINTcgto_cart(i, bas);
@@ -297,12 +302,12 @@ void get_hcore(double *hcore, double *En, double *T, int N,
       idx_i += di;
     }// for i
 
-    for (int i = 0; i < N * N; i++) {
+    for (int i = 0; i < N2; i++)
       hcore[i] = tmp[i];
-    }
 
     idx_i = 0;
-    for (int i = 0; i < nbas; i++) {
+    for (int i = 0; i < nbas; i++)
+    {
       int idx_j = 0;
       shls[0] = i;
       di = CINTcgto_cart(i, bas);
@@ -325,8 +330,10 @@ void get_hcore(double *hcore, double *En, double *T, int N,
       idx_i += di;
     }// for i
   } // if BT::DO_CART
-  else {
-    for (int i = 0; i < nbas; i++) {
+  else
+  {
+    for (int i = 0; i < nbas; i++)
+    {
       int idx_j = 0;
       shls[0] = i;
       di = CINTcgto_spheric(i, bas);
@@ -350,15 +357,14 @@ void get_hcore(double *hcore, double *En, double *T, int N,
     }// for i
 
     if (En!=NULL)
-    for (int i = 0; i < N * N; i++) {
+    for (int i = 0; i < N2; i++)
       En[i] = tmp[i];
-    }
-    for (int i = 0; i < N * N; i++) {
+    for (int i = 0; i < N2; i++)
       hcore[i] = tmp[i];
-    }
 
     idx_i = 0;
-    for (int i = 0; i < nbas; i++) {
+    for (int i = 0; i < nbas; i++)
+    {
       int idx_j = 0;
       shls[0] = i;
       di = CINTcgto_spheric(i, bas);
@@ -383,19 +389,18 @@ void get_hcore(double *hcore, double *En, double *T, int N,
   }
 
   if (T!=NULL)
-  for (int i = 0; i < N * N; i++) {
+  for (int i = 0; i < N2; i++)
     T[i] = tmp[i];
-  }
-  for (int i = 0; i < N * N; i++) {
+  for (int i = 0; i < N2; i++)
     hcore[i] += tmp[i];
-  }
 
   delete [] tmp;
 }
 
 void get_hcore(double *hcore, int N,
                int natm, int nbas, int nenv,
-               int *atm, int *bas, double *env) {
+               int *atm, int *bas, double *env)
+{ //forwards to the full overload with En and T passed as NULL, so only hcore is filled
   return get_hcore(hcore,NULL,NULL,N,natm,nbas,nenv,atm,bas,env);
 }
 
@@ -409,8 +414,8 @@ void get_tcore(double *tcore, int N,
   int N2 = N*N;
   double *tmp = new double[N2];
 
-  if (BT::DO_CART) {
-
+  if (BT::DO_CART)
+  {
     idx_i = 0;
     for (int i = 0; i < nbas; i++) {
       int idx_j = 0;
@@ -435,7 +440,8 @@ void get_tcore(double *tcore, int N,
       idx_i += di;
     }// for i
   } // if BT::DO_CART
-  else {
+  else
+  {
     idx_i = 0;
     for (int i = 0; i < nbas; i++) {
       int idx_j = 0;
@@ -544,12 +550,320 @@ void gen_pvp(double *pvp, int N,
     pvp[i] = tmp[i];
   }
 
-  delete [] tmp; 
+  delete [] tmp;
+}
+
+void gen_4c_overlap_m4(double* ovlp4, size_t N,
+                    int natm, int nbas, int nenv,
+                    int* atm, int* bas, double* env)
+{
+  size_t N2 = N*N;
+  size_t Npairs = N*(N+1)/2;
+  CINTOpt *no_opt = NULL;
+
+ #define USE_PARA_4C 1
+
+ #if USE_PARA_4C
+ //when j==i, (i1,j1) and (j1,i1) map to the same packed ij; both writes come from the one thread running shell i, so no cross-thread race
+  int nomp = 1;
+ #pragma omp parallel
+  nomp = omp_get_num_threads();
+  if (nbas<nomp) nomp = nbas;
+
+  printf("  gen_4c_overlap on %i threads \n",nomp);
+ #endif
+
+
+  int dmax = 1;
+  for (int i=0;i<nbas;i++)
+  {
+    int di = 0;
+    if (BT::DO_CART)
+      di = CINTcgto_cart(i, bas);
+    else
+      di = CINTcgto_spheric(i, bas);
+    if (dmax<di)
+      dmax = di;
+  }
+
+ #if USE_PARA_4C
+ //will need to use these offsets in the parallel loops
+  size_t* shell_offset = new size_t[nbas+1];
+  shell_offset[0] = 0;
+  if (BT::DO_CART)
+  {
+    for (int i=0;i<nbas;i++)
+      shell_offset[i+1] = shell_offset[i] + CINTcgto_cart(i,bas);
+  }
+  else
+  {
+    for (int i=0;i<nbas;i++)
+      shell_offset[i+1] = shell_offset[i] + CINTcgto_spheric(i,bas);
+  }
+ #endif
+
+  size_t dm4 = 2*dmax*dmax*dmax*dmax;
+  double* bufall = new double[dm4*nomp]();
+
+  if (BT::DO_CART)
+  {
+    //size_t idxi = 0, idxj = 0, idxk = 0, idxl = 0;
+
+   #pragma omp parallel for schedule(static,1) num_threads(nomp)
+    for (size_t i=0;i<nbas;i++)
+    {
+      size_t di, dj, dk, dl;
+      int tid = omp_get_thread_num();
+      double* buf = &bufall[tid*dm4];
+      //double* buf = bufall;
+
+      di = CINTcgto_cart(i, bas);
+      size_t idxi = shell_offset[i];
+      for (size_t j=0;j<=i;j++) //j<=i
+      {
+        dj = CINTcgto_cart(j, bas);
+        size_t idxj = shell_offset[j];
+        for (size_t k=0;k<nbas;k++)
+        {
+          dk = CINTcgto_cart(k, bas);
+          size_t idxk = shell_offset[k];
+          for (size_t l=0;l<=k;l++) //l<=k
+          {
+            dl = CINTcgto_cart(l, bas);
+            size_t idxl = shell_offset[l];
+
+            int shls[4] = {i, j, k, l};
+            cint4c1e_cart(buf, shls, atm, natm, bas, nbas, env, no_opt);
+
+            for (size_t l1=0;l1<dl;l1++)
+            for (size_t k1=0;k1<dk;k1++)
+            for (size_t j1=0;j1<dj;j1++)
+            for (size_t i1=0;i1<di;i1++)
+            {
+              size_t oia = i1 + idxi;
+              size_t oja = j1 + idxj;
+              size_t oi = max(oia,oja);
+              size_t oj = min(oia,oja);
+
+              size_t oka = k1 + idxk;
+              size_t ola = l1 + idxl;
+              size_t ok = max(oka,ola);
+              size_t ol = min(oka,ola);
+
+              size_t ij = oi*(oi+1)/2 + oj;
+              size_t kl = ok*(ok+1)/2 + ol;
+              size_t bdx = l1*(dk*dj*di) + k1*(dj*di) + j1*di + i1;
+
+              ovlp4[ij*Npairs+kl] = buf[bdx];
+            }
+
+            //idxl += dl;
+          }
+          //idxl = 0;
+          //idxk += dk;
+        }
+        //idxk = 0;
+        //idxj += dj;
+      }
+      //idxj = 0;
+      //idxi += di;
+    }
+  }
+  else
+  {
+    //size_t idxi = 0, idxj = 0, idxk = 0, idxl = 0;
+
+    // Same structure but use CINTcgto_spheric + cint4c1e_sph
+   #pragma omp parallel for schedule(static,1) num_threads(nomp)
+    for (size_t i=0;i<nbas;i++)
+    {
+      size_t di, dj, dk, dl;
+      int tid = omp_get_thread_num();
+      double* buf = &bufall[tid*dm4];
+      //double* buf = bufall;
+
+      di = CINTcgto_spheric(i, bas);
+      size_t idxi = shell_offset[i];
+      for (size_t j=0;j<=i;j++) //j<=i
+      {
+        dj = CINTcgto_spheric(j, bas);
+        size_t idxj = shell_offset[j];
+        for (size_t k=0;k<nbas;k++)
+        {
+          dk = CINTcgto_spheric(k, bas);
+          size_t idxk = shell_offset[k];
+          for (size_t l=0;l<=k;l++) //l<=k
+          {
+            dl = CINTcgto_spheric(l, bas);
+            size_t idxl = shell_offset[l];
+
+            int shls[4] = {i, j, k, l};
+            cint4c1e_sph(buf, shls, atm, natm, bas, nbas, env, no_opt);
+
+            for (size_t l1=0;l1<dl;l1++)
+            for (size_t k1=0;k1<dk;k1++)
+            for (size_t j1=0;j1<dj;j1++)
+            for (size_t i1=0;i1<di;i1++)
+            {
+              size_t oia = i1 + idxi;
+              size_t oja = j1 + idxj;
+              size_t oi = max(oia,oja);
+              size_t oj = min(oia,oja);
+
+              size_t oka = k1 + idxk;
+              size_t ola = l1 + idxl;
+              size_t ok = max(oka,ola);
+              size_t ol = min(oka,ola);
+
+              size_t ij = oi*(oi+1)/2 + oj;
+              size_t kl = ok*(ok+1)/2 + ol;
+              size_t bdx = l1*(dk*dj*di) + k1*(dj*di) + j1*di + i1;
+
+              ovlp4[ij*Npairs+kl] = buf[bdx];
+            }
+            //idxl += dl;
+          }
+          //idxl = 0;
+          //idxk += dk;
+        }
+        //idxk = 0;
+        //idxj += dj;
+      }
+      //idxj = 0;
+      //idxi += di;
+    }
+  }
+
+  delete [] bufall;
+ #if USE_PARA_4C
+  delete [] shell_offset;
+ #endif
+
+  return;
 }
 
-void gen_eri(double **eri, int N, 
+
+void gen_4c_overlap(double* ovlp4, size_t N,
+                    int natm, int nbas, int nenv,
+                    int* atm, int* bas, double* env)
+{
+  size_t N2 = N*N;
+  size_t idxi = 0, idxj = 0, idxk = 0, idxl = 0;
+  size_t di, dj, dk, dl;
+  CINTOpt *no_opt = NULL;
+
+  int dmax = 1;
+  for (int i=0;i<nbas;i++)
+  {
+    di = CINTcgto_cart(i, bas);
+    if (dmax<di)
+      dmax = di;
+    dj = CINTcgto_spheric(i, bas);
+    if (dmax<dj)
+      dmax = dj;
+  }
+  double* buf = new double[dmax*dmax*dmax*dmax]();
+
+  if (BT::DO_CART)
+  {
+    for (size_t i=0;i<nbas;i++)
+    {
+      di = CINTcgto_cart(i, bas);
+      for (size_t j=0;j<nbas;j++)
+      {
+        dj = CINTcgto_cart(j, bas);
+        for (size_t k=0;k<nbas;k++)
+        {
+          dk = CINTcgto_cart(k, bas);
+          for (size_t l=0;l<nbas;l++)
+          {
+            dl = CINTcgto_cart(l, bas);
+            int shls[4] = {i, j, k, l};
+
+            cint4c1e_cart(buf, shls, atm, natm, bas, nbas, env, no_opt);
+
+            for (size_t l1=0;l1<dl;l1++)
+            for (size_t k1=0;k1<dk;k1++)
+            for (size_t j1=0;j1<dj;j1++)
+            for (size_t i1=0;i1<di;i1++)
+            {
+              size_t oi = i1 + idxi;
+              size_t oj = j1 + idxj;
+              size_t ok = k1 + idxk;
+              size_t ol = l1 + idxl;
+              size_t ij = oi * N + oj;
+              size_t kl = ok * N + ol;
+              size_t bdx = l1*(dk*dj*di) + k1*(dj*di) + j1*di + i1;
+              ovlp4[ij*N2+kl] = buf[bdx];
+            }
+
+            idxl += dl;
+          }
+          idxl = 0;
+          idxk += dk;
+        }
+        idxk = 0;
+        idxj += dj;
+      }
+      idxj = 0;
+      idxi += di;
+    }
+  }
+  else
+  {
+    // Same structure but use CINTcgto_spheric + cint4c1e_sph
+    for (size_t i=0;i<nbas;i++)
+    {
+      di = CINTcgto_spheric(i, bas);
+      for (size_t j=0;j<nbas;j++)
+      {
+        dj = CINTcgto_spheric(j, bas);
+        for (size_t k=0;k<nbas;k++)
+        {
+          dk = CINTcgto_spheric(k, bas);
+          for (size_t l=0;l<nbas;l++)
+          {
+            dl = CINTcgto_spheric(l, bas);
+            int shls[4] = {i, j, k, l};
+
+            cint4c1e_sph(buf, shls, atm, natm, bas, nbas, env, no_opt);
+
+            for (size_t l1=0;l1<dl;l1++)
+            for (size_t k1=0;k1<dk;k1++)
+            for (size_t j1=0;j1<dj;j1++)
+            for (size_t i1=0;i1<di;i1++)
+            {
+              size_t oi = i1 + idxi;
+              size_t oj = j1 + idxj;
+              size_t ok = k1 + idxk;
+              size_t ol = l1 + idxl;
+              size_t ij = oi * N + oj;
+              size_t kl = ok * N + ol;
+              size_t bdx = l1*(dk*dj*di) + k1*(dj*di) + j1*di + i1;
+              ovlp4[ij*N2+kl] = buf[bdx];
+            }
+            idxl += dl;
+          }
+          idxl = 0;
+          idxk += dk;
+        }
+        idxk = 0;
+        idxj += dj;
+      }
+      idxj = 0;
+      idxi += di;
+    }
+  }
+
+  delete [] buf;
+
+  return;
+}
+
+void gen_eri(double **eri, int N,
              int natm, int nbas, int nenv,
-             int *atm, int *bas, double *env) {
+             int *atm, int *bas, double *env)
+{
   //int N2 = N*N;
 
   int idxi = 0;
@@ -561,14 +875,19 @@ void gen_eri(double **eri, int N,
   int dk = 0;
   int dl = 0;
   CINTOpt *no_opt = NULL;
-  if (BT::DO_CART) {
-    for (int i = 0; i < nbas; i++) {
+  if (BT::DO_CART)
+  {
+    for (int i = 0; i < nbas; i++)
+    {
       di = CINTcgto_cart(i,bas);
-      for (int j = 0; j < nbas; j++) {
+      for (int j = 0; j < nbas; j++)
+      {
         dj = CINTcgto_cart(j,bas);
-        for (int k = 0; k < nbas; k++) {
+        for (int k = 0; k < nbas; k++)
+        {
           dk = CINTcgto_cart(k,bas);
-          for (int l = 0; l < nbas; l++) {
+          for (int l = 0; l < nbas; l++)
+          {
             dl = CINTcgto_cart(l,bas);
             double * buf = new double[di*dj*dk*dl];
             int shls[4];
@@ -608,14 +927,19 @@ void gen_eri(double **eri, int N,
       idxi += di;
     } // for i
   } // if BT::DO_CART
-  else {
-    for (int i = 0; i < nbas; i++) {
+  else
+  {
+    for (int i = 0; i < nbas; i++)
+    {
       di = CINTcgto_spheric(i,bas);
-      for (int j = 0; j < nbas; j++) {
+      for (int j = 0; j < nbas; j++)
+      {
         dj = CINTcgto_spheric(j,bas);
-        for (int k = 0; k < nbas; k++) {
+        for (int k = 0; k < nbas; k++)
+        {
           dk = CINTcgto_spheric(k,bas);
-          for (int l = 0; l < nbas; l++) {
+          for (int l = 0; l < nbas; l++)
+          {
             dl = CINTcgto_spheric(l,bas);
             double * buf = new double[di*dj*dk*dl];
             int shls[4];
@@ -655,6 +979,8 @@ void gen_eri(double **eri, int N,
       idxi += di;
     } // for i
   } // else
+
+  return;
 }
 
 void gen_jMOI_gto(double **eri, int N,

From ff2b4d1669c337efada12f772033f3fa4daadff9 Mon Sep 17 00:00:00 2001
From: Paul Zimmerman <paulzim@umich.edu>
Date: Wed, 13 May 2026 15:57:06 -0400
Subject: [PATCH 2/4] compiler directive for 4c ints

---
 src/libcintw/cintwrapper.cpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/libcintw/cintwrapper.cpp b/src/libcintw/cintwrapper.cpp
index d845257..9072d6f 100644
--- a/src/libcintw/cintwrapper.cpp
+++ b/src/libcintw/cintwrapper.cpp
@@ -557,6 +557,7 @@ void gen_4c_overlap_m4(double* ovlp4, size_t N,
                     int natm, int nbas, int nenv,
                     int* atm, int* bas, double* env)
 {
+  #ifdef COMPILE_CINTW_4C
   size_t N2 = N*N;
   size_t Npairs = N*(N+1)/2;
   CINTOpt *no_opt = NULL;
@@ -738,7 +739,12 @@ void gen_4c_overlap_m4(double* ovlp4, size_t N,
  #if USE_PARA_4C
   delete [] shell_offset;
  #endif
-
+  
+  #else
+    printf("\n\n\n ERROR: this SlaterGPU is not set up for 4c center integrals gbasis \n");
+    exit(-1);
+  #endif
+  
   return;
 }
 
@@ -747,6 +753,7 @@ void gen_4c_overlap(double* ovlp4, size_t N,
                     int natm, int nbas, int nenv,
                     int* atm, int* bas, double* env)
 {
+ #ifdef COMPILE_CINTW_4C
   size_t N2 = N*N;
   size_t idxi = 0, idxj = 0, idxk = 0, idxl = 0;
   size_t di, dj, dk, dl;
@@ -857,6 +864,10 @@ void gen_4c_overlap(double* ovlp4, size_t N,
 
   delete [] buf;
 
+  #else
+    printf("\n\n\n ERROR: this SlaterGPU is not set up for 4c center integrals gbasis \n");
+    exit(-1);
+  #endif
   return;
 }
 

From 6f90ff37009f35da65906e7d8a808c6bf6578afe Mon Sep 17 00:00:00 2001
From: Joshua Kammeraad <joshkamm@umich.edu>
Date: Thu, 14 May 2026 11:26:27 -0700
Subject: [PATCH 3/4] add COMPILE_CINTW_4C build option

Adds a CMake variable that, when ON, defines the COMPILE_CINTW_4C
preprocessor macro for the cintw target. This macro gates the bodies
of gen_4c_overlap_m4 and gen_4c_overlap in cintwrapper.cpp, which
require libcint's cint4c1e_* functions (compiled only with a
non-default flag and affected by the known bug in issue #82).

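Defaults to OFF. Mirrors the existing USE_ACC env-var pattern: the value
is taken only from the environment at configure time (the unconditional
set(COMPILE_CINTW_4C OFF) overrides a -DCOMPILE_CINTW_4C=ON given on the
cmake command line), so users opt in before running cmake via: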

    export COMPILE_CINTW_4C=ON

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CMakeLists.txt              | 6 ++++++
 src/libcintw/CMakeLists.txt | 3 +++
 2 files changed, 9 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b6fc849..64aa9dc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,6 +32,12 @@ endif()
 
 set(DO_GTO ${DO_GTO})
 
+set(COMPILE_CINTW_4C OFF)
+if("$ENV{COMPILE_CINTW_4C}" STREQUAL "ON")
+    set(COMPILE_CINTW_4C ON)
+endif()
+message(STATUS "COMPILE_CINTW_4C: ${COMPILE_CINTW_4C}")
+
 if(NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release)
 endif()
diff --git a/src/libcintw/CMakeLists.txt b/src/libcintw/CMakeLists.txt
index defb754..ae64cfa 100644
--- a/src/libcintw/CMakeLists.txt
+++ b/src/libcintw/CMakeLists.txt
@@ -18,6 +18,9 @@ set(CINTW_COMP_FLAGS ${CMAKE_CXX_FLAGS}
 add_definitions("-w")
 
 target_compile_options(cintw PRIVATE ${CINTW_COMP_FLAGS})
+if(COMPILE_CINTW_4C)
+    target_compile_definitions(cintw PRIVATE COMPILE_CINTW_4C)
+endif()
 target_include_directories(cintw PRIVATE
     "${CMAKE_SOURCE_DIR}/include"
     "${LIBCINT_PATH}/include"

From f7afd4b9b717ae83adeef8752fb15cba90bc6d85 Mon Sep 17 00:00:00 2001
From: Joshua Kammeraad <joshkamm@umich.edu>
Date: Thu, 14 May 2026 11:27:43 -0700
Subject: [PATCH 4/4] fix indent on 4c integral compile guard

The #ifdef COMPILE_CINTW_4C inside gen_4c_overlap (cintwrapper.cpp:756)
was 1-space indented; the matching one inside gen_4c_overlap_m4 and
both #else / #endif lines are 2-space. Normalize to match.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/libcintw/cintwrapper.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libcintw/cintwrapper.cpp b/src/libcintw/cintwrapper.cpp
index 9072d6f..1a5f565 100644
--- a/src/libcintw/cintwrapper.cpp
+++ b/src/libcintw/cintwrapper.cpp
@@ -753,7 +753,7 @@ void gen_4c_overlap(double* ovlp4, size_t N,
                     int natm, int nbas, int nenv,
                     int* atm, int* bas, double* env)
 {
- #ifdef COMPILE_CINTW_4C
+  #ifdef COMPILE_CINTW_4C
   size_t N2 = N*N;
   size_t idxi = 0, idxj = 0, idxk = 0, idxl = 0;
   size_t di, dj, dk, dl;