/*******************************************************************************
* Copyright (C) 2014 Intel Corporation
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra     (dongarra@eecs.utk.edu)
// Piotr Luszczek    (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER

/*!
 @file SparseMatrix.hpp

 HPCG data structures for the sparse matrix
 */

#ifndef SPARSEMATRIX_HPP
#define SPARSEMATRIX_HPP

#include <vector>
#include <cassert>
#include <complex>
#include "Geometry.hpp"
#include "Vector.hpp"
#include "MGData.hpp"
#include "VeryBasicProfiler.hpp"
#include "CustomKernels.hpp"

//SYCL includes
#include <sycl/sycl.hpp>

#if __cplusplus < 201103L
// for C++03
#include <map>
typedef std::map< global_int_t, local_int_t > GlobalToLocalMap;
#else
// for C++11 or greater
#include <unordered_map>
using GlobalToLocalMap = std::unordered_map< global_int_t, local_int_t >;
#endif

#include "stdio.h"

// Implementation-specific data attached to a SparseMatrix through its
// optimizationData pointer; the functions in this file cast that void* back to
// struct optData*.  Unless noted otherwise, DeleteMatrix() releases the pointer
// members below with sycl::free().
struct optData
{
    local_int_t nVectors;
    local_int_t nBlocks;            // number of row blocks; used as the parallel_for range of the diagonal kernels
    local_int_t target_block_size;  // rows per block: block b starts at row b * target_block_size

    local_int_t nColors;
    local_int_t *xcolors_dev;
    local_int_t *xcolors_host;      // released with sycl::free() like its _dev twin

    // NOTE(review): names suggest per-block boundaries of the lower / upper /
    // non-local parts of the blocked storage -- confirm against OptimizeProblem.
    local_int_t *esblastLower;
    local_int_t *esbfirstUpper;
    local_int_t *esblastUpper;
    local_int_t *esbfirstNonloc;
    // Blocked storage read by the diagonal kernels in this file: the entries of
    // block b occupy offsets [esbblockptr[b], esbblockptr[b+1]) * target_block_size
    // of esbcolind / esbvalues, one row per position inside each
    // target_block_size-wide slice.
    local_int_t *esbblockptr;
    local_int_t *esbcolind;         // column index of each stored entry
    double *esbvalues;              // value of each stored entry

    // arrays for reordering blocks
    local_int_t *mv_reorder;
    local_int_t *trmv_l_reorder;
    local_int_t *trmv_u_reorder;

// #if defined(HPCG_TEST_CUSTOM_KERNELS)
// TODO: put these in a separate data structure or something
#if 1
    // NOTE(review): presumably CSR-style (row pointer / column index / value)
    // arrays of the local matrix A -- confirm.
    local_int_t *ia;
    local_int_t *ja;
    double *a;

    // NOTE(review): presumably the same layout for the non-local matrix B -- confirm.
    local_int_t *ib;
    local_int_t *jb;
    double *b;

    local_int_t nrow_b;
    local_int_t *bmap;
#endif

    local_int_t *invperm;
    local_int_t *perm;
    local_int_t *perm_coarse; // pointer to A.Ac->optData->perm for restriction/prolongation; not owned, never freed here

    double *diags;   // diagonal values; when non-NULL, CopyMatrixDiagonal() memcpy's straight from it
    double *dtmp;    // released by DeleteMatrix()
    double *dtmp2;   // dtmp2..dtmp4 are not released by DeleteMatrix()
    double *dtmp3;
    double *dtmp4;

    // Of the following scalar-result pointers, DeleteMatrix() releases only
    // normr_dev (sycl::free) and normr_host (plain free).
    // NOTE(review): ownership of the remaining ones is not visible in this file.
    double *normr_dev;
    double *pAp_loc_dev;
    double *rtz_loc_dev;
    double *pAp_dev;
    double *rtz_dev;
    double *oldrtz_dev;

    double *normr_host;
    double *rtz_loc_host;
    double *pAp_loc_host;
    double *global_result_host;

    double *halo_host_vector; // released with plain free() in DeleteMatrix()

    void *esbM;     // actually a custom::sparseMatrix*; DeleteMatrix() deletes it through that type
    void *devInfo;  // actually a custom::deviceInfo*; DeleteMatrix() deletes it through that type

    VeryBasicProfiler *profiler; // DeleteMatrix() deletes it only for the matrix whose Ac is 0
};


// Main matrix structure
//
//  nonzerosInRow, mtxIndG, mtxIndL, matrixValues, matrixDiagonal are pointers into mtxL,
//  mtxG, mtxA data and are used to construct the global problem.  Then in OptimizeProblem
//  they are used to construct the optimized local A/nonlocal B matrices.  Finally, they
//  are deleted before main CG() as early as they can safely be deleted so as to not
//  have redundant copies of the matrices floating around.
//
//  optimizationData contains the usable copy of the matrix and other relevant data for
//  timed runs.
//
//  The MPI structures are also used for the HaloExchanges.

struct SparseMatrix_STRUCT {
  char  * title; //!< name of the sparse matrix
  Geometry * geom; //!< geometry associated with this matrix
  global_int_t totalNumberOfRows; //!< total number of matrix rows across all processes
  global_int_t totalNumberOfNonzeros; //!< total number of matrix nonzeros across all processes
  local_int_t localNumberOfRows; //!< number of rows local to this process
  local_int_t localNumberOfColumns;  //!< number of columns local to this process
  local_int_t localNumberOfNonzeros;  //!< number of nonzeros local to this process
  char  * nonzerosInRow;  //!< The number of nonzeros in a row will always be 27 or fewer
  global_int_t ** mtxIndG; //!< matrix indices as global values
  local_int_t ** mtxIndL; //!< matrix indices as local values
  double ** matrixValues; //!< values of matrix entries
  double ** matrixDiagonal; //!< pointers to the diagonal entry of each row
  GlobalToLocalMap globalToLocalMap; //!< global-to-local mapping
  global_int_t * localToGlobalMap;  //!< local-to-global mapping

  // Host-side mirrors of the arrays above.  AllocateAndFillReferenceData()
  // creates most of them and FreeReferenceData() / DeleteMatrix() release them.
  char  * nonzerosInRow_host;  //!< host copy of nonzerosInRow
  global_int_t ** mtxIndG_host; //!< host copy of mtxIndG
  local_int_t ** mtxIndL_host; //!< per-row pointers into mtxL_host
  double ** matrixValues_host; //!< per-row pointers into mtxA_host
  double ** matrixDiagonal_host; //!< per-row pointers to the diagonal entry inside mtxA_host
  GlobalToLocalMap globalToLocalMap_host; //!< global-to-local mapping (host)
  global_int_t * localToGlobalMap_host;  //!< host copy of localToGlobalMap

  // Optimization flags; InitializeSparseMatrix() sets all of them to true.
  mutable bool isDotProductOptimized;
  mutable bool isSpmvOptimized;
  mutable bool isMgOptimized;
  mutable bool isWaxpbyOptimized;
  mutable struct SparseMatrix_STRUCT * Ac; // Coarse grid matrix
  mutable MGData * mgData; // Pointer to the coarse level data for this fine matrix
  mutable MGData * mgData_host; // Pointer to the coarse level data for this fine matrix on Host
  /*!
   This is for storing optimized data structures created in OptimizeProblem and
   used inside optimized ComputeSPMV().  The functions in this file cast it to
   struct optData*.
   */
  void * optimizationData;  // pointer that can be used to store implementation-specific data

#ifndef HPCG_NO_MPI
  local_int_t numberOfExternalValues; //!< number of entries that are external to this process
  int numberOfSendNeighbors; //!< number of neighboring processes that will be sent local data
  local_int_t totalToBeSent; //!< total number of entries to be sent
  local_int_t * elementsToSend_h; //!< elements to send to neighboring processes (released with plain free())
  local_int_t * elementsToSend_d; //!< elements to send to neighboring processes (released with sycl::free())
  int * neighbors; //!< neighboring processes
  local_int_t * receiveLength; //!< lengths of messages received from neighboring processes
  local_int_t * sendLength; //!< lengths of messages sent to neighboring processes
  double * sendBuffer; //!< send buffer for non-blocking sends (released with sycl::free())
  double * sendBuffer_h; //!< send buffer for non-blocking sends (released with plain free())
#endif
  local_int_t numOfBoundaryRows; //!< number of entries in boundaryRows
  local_int_t *boundaryRows; //!< rows that contain less than 27 nonzeros
  // Contiguous backing storage that the per-row pointer arrays above point into.
  local_int_t *mtxL;
  global_int_t *mtxG;
  double *mtxA;

  local_int_t *boundaryRows_host; //!< rows that contain less than 27 nonzeros (host copy)
  local_int_t *mtxL_host;
  global_int_t *mtxG_host;
  double *mtxA_host;

  // NOTE(review): the purpose of the following arrays is not visible in this
  // file; the names suggest MPI all-to-all send/receive counts and
  // displacements plus a scratch buffer -- confirm.  DeleteMatrix() does not
  // release them.
  local_int_t *work;
  local_int_t *scounts;
  local_int_t *rcounts;
  local_int_t *sdispls;
  local_int_t *rdispls;
};
typedef struct SparseMatrix_STRUCT SparseMatrix;

/*!
  Initializes the known system matrix data structure members to 0.

  All counters are zeroed, all pointer members (device, host mirror, MPI and
  scratch arrays) are nulled, and the optimization flags are switched on.
  The two GlobalToLocalMap members are left untouched.

  @param[out] A    the known system matrix to reset
  @param[in]  geom geometry to associate with the matrix; the pointer is stored as-is
 */
inline void InitializeSparseMatrix(SparseMatrix & A, Geometry * geom) {
  A.title = nullptr;
  A.geom = geom;
  A.totalNumberOfRows = 0;
  A.totalNumberOfNonzeros = 0;
  A.localNumberOfRows = 0;
  A.localNumberOfColumns = 0;
  A.localNumberOfNonzeros = 0;

  A.nonzerosInRow = nullptr;
  A.mtxIndG = nullptr;
  A.mtxIndL = nullptr;
  A.matrixValues = nullptr;
  A.matrixDiagonal = nullptr;
  A.localToGlobalMap = nullptr;

  A.nonzerosInRow_host = nullptr;
  A.mtxIndG_host = nullptr;
  A.mtxIndL_host = nullptr;
  A.matrixValues_host = nullptr;
  A.matrixDiagonal_host = nullptr;
  A.localToGlobalMap_host = nullptr;

  // Optimization is ON by default. The code that switches it OFF is in the
  // functions that are meant to be optimized.
  A.isDotProductOptimized = true;
  A.isSpmvOptimized       = true;
  A.isMgOptimized         = true;
  A.isWaxpbyOptimized     = true;

#ifndef HPCG_NO_MPI
  A.numberOfExternalValues = 0;
  A.numberOfSendNeighbors = 0;
  A.totalToBeSent = 0;
  A.elementsToSend_d = nullptr;
  A.elementsToSend_h = nullptr;
  A.neighbors = nullptr;
  A.receiveLength = nullptr;
  A.sendLength = nullptr;
  A.sendBuffer = nullptr;
  A.sendBuffer_h = nullptr;
#endif

  A.numOfBoundaryRows = 0;
  A.boundaryRows = nullptr;
  A.mtxL = nullptr;
  A.mtxG = nullptr;
  A.mtxA = nullptr;

  A.boundaryRows_host = nullptr;
  A.mtxL_host = nullptr;
  A.mtxG_host = nullptr;
  A.mtxA_host = nullptr;

  // Scratch / count / displacement arrays: previously left indeterminate, which
  // made any later "is it allocated?" test on them undefined behaviour.
  A.work = nullptr;
  A.scounts = nullptr;
  A.rcounts = nullptr;
  A.sdispls = nullptr;
  A.rdispls = nullptr;

  A.mgData = nullptr; // Fine-to-coarse grid transfer initially not defined.
  A.mgData_host = nullptr; // Fine-to-coarse grid transfer initially not defined.
  A.Ac = nullptr;

  A.optimizationData = nullptr;
}

/*!
  Copy values from matrix diagonal into user-provided vector.

  If the optimization data carries a diagonal array (optData::diags) it is
  copied with one queue memcpy.  Otherwise a kernel scans the blocked
  esbcolind / esbvalues arrays and stores, for every row, the entry whose
  column index equals the row index.

  @param[in] A the known system matrix; A.optimizationData must point to a struct optData.
  @param[inout] diagonal  Vector of diagonal values (must be allocated before call to this function).
  @param[in] main_queue  queue the memcpy or kernel is submitted to.
  @return the event of the submitted memcpy or kernel.
 */
inline sycl::event CopyMatrixDiagonal(SparseMatrix & A, Vector & diagonal, sycl::queue& main_queue) {
#ifdef HPCG_LOCAL_LONG_LONG
    throw std::runtime_error("LOCAL_LONG_LONG not implemented here");
#else
    struct optData *opt = static_cast<struct optData *>(A.optimizationData);
    double *cachedDiag = opt->diags;
    double *dst = diagonal.values;

    // Fast path: the diagonal is already available as a flat array.
    if (cachedDiag != NULL) {
        return main_queue.memcpy(dst, cachedDiag, diagonal.localLength * sizeof(double));
    }

    // Slow path: extract the diagonal from the blocked storage.
    const local_int_t numBlocks    = opt->nBlocks;
    const local_int_t rowsPerBlock = opt->target_block_size;
    local_int_t *blockPtr = opt->esbblockptr;
    local_int_t *colInd   = opt->esbcolind;
    double      *values   = opt->esbvalues;

    return main_queue.submit([&](sycl::handler& cgh) {
        cgh.parallel_for(sycl::range<1>(numBlocks), [=](sycl::id<1> blk) {
            const local_int_t firstRow   = blk * rowsPerBlock;
            const local_int_t sliceBegin = blockPtr[blk] * rowsPerBlock;
            const local_int_t sliceEnd   = blockPtr[blk+1] * rowsPerBlock;

            // Each slice holds one entry per row of the block.
            for (local_int_t slice = sliceBegin; slice < sliceEnd; slice += rowsPerBlock) {
                for (int lane = 0; lane < rowsPerBlock; ++lane) {
                    const local_int_t row = firstRow + lane;
                    if (colInd[slice + lane] != row) {
                        continue;
                    }
                    dst[row] = values[slice + lane];
                }
            }
        });
    });
#endif
}

/*!
  Replace specified matrix diagonal value.

  Writes diagonal.values[i] through A.matrixDiagonal[i] for every local row.
  Asserts that the vector length equals the local row count.

  @param[inout] A The system matrix.
  @param[in] diagonal  Vector of diagonal values that will replace existing matrix diagonal values.
 */
inline void ReplaceMatrixDiagonal(SparseMatrix & A, Vector & diagonal) {
    assert(A.localNumberOfRows==diagonal.localLength);

    double ** diagSlots = A.matrixDiagonal;
    const double * newValues = diagonal.values;

    local_int_t row = 0;
    while (row < A.localNumberOfRows) {
        *diagSlots[row] = newValues[row];
        ++row;
    }
}

/*!
  Replace the diagonal of the optimized (blocked) matrix representation.

  For every row the new value is written to the diags array of the
  custom::sparseMatrix stored in optData::esbM and to the first entry of the
  blocked esbvalues array whose column index equals the row index.

  @param[inout] A The system matrix; A.optimizationData must point to a struct optData.
  @param[in] diagonal  Vector of diagonal values that will replace existing matrix diagonal values.
  @param[in] main_queue  queue the kernel is submitted to.
  @return the event of the submitted kernel.
 */
inline sycl::event ReplaceSparseMatrixDiagonal(SparseMatrix &A, const Vector &diagonal,
                                               sycl::queue &main_queue)
{
    struct optData *opt = static_cast<struct optData *>(A.optimizationData);
    custom::sparseMatrix *esb = static_cast<custom::sparseMatrix *>(opt->esbM);

    // esbM and diags is all we need
    double       *matrixDiags = esb->diags;
    const double *newValues   = diagonal.values;

    const local_int_t numBlocks    = opt->nBlocks;
    const local_int_t rowsPerBlock = opt->target_block_size;
    local_int_t *blockPtr = opt->esbblockptr;
    local_int_t *colInd   = opt->esbcolind;
    double      *values   = opt->esbvalues;

    return main_queue.submit([&](sycl::handler& cgh) {
        cgh.parallel_for(sycl::range<1>(numBlocks), [=](sycl::id<1> blk) {
            const local_int_t firstRow   = blk * rowsPerBlock;
            const local_int_t sliceBegin = blockPtr[blk] * rowsPerBlock;
            const local_int_t sliceEnd   = blockPtr[blk+1] * rowsPerBlock;

            for (int lane = 0; lane < rowsPerBlock; ++lane) {
                const local_int_t row = firstRow + lane;
                const double replacement = newValues[row];
                matrixDiags[row] = replacement;

                // Advance slice by slice until this row's diagonal entry is found.
                local_int_t slice = sliceBegin;
                while (slice < sliceEnd && colInd[slice + lane] != row) {
                    slice += rowsPerBlock;
                }
                if (slice < sliceEnd) {
                    values[slice + lane] = replacement;
                }
            }
        });
    });
}

/*!
  Debug helper: prints the leading entries (at most 10) of optData::diags.

  The values are staged through a host USM buffer filled by a kernel and then
  written to std::cout.  Nothing is read when the optimization data or its
  diagonal array is missing, and no more than A.localNumberOfRows entries are
  touched.
  NOTE(review): assumes diags holds at least A.localNumberOfRows values -- confirm.

  @param[in] A the system matrix whose optimizationData is inspected.
  @param[in] main_queue queue used for the allocation, fill and kernel.
 */
inline void DebugDiag(SparseMatrix& A, sycl::queue& main_queue) {
    struct optData *optData = (struct optData *)A.optimizationData;
    double *diags = (optData != NULL) ? optData->diags : NULL;
    if (diags == NULL) {
        // CopyMatrixDiagonal() treats a NULL diags as a valid state, so do not crash here.
        std::cout << "DebugDiag: no diagonal array available" << std::endl;
        return;
    }

    constexpr int max_check_count = 10;
    // Never read past the end of a matrix that has fewer than 10 local rows.
    const int check_count = (A.localNumberOfRows < max_check_count)
                                ? static_cast<int>(A.localNumberOfRows)
                                : max_check_count;
    if (check_count <= 0) {
        return;
    }

    auto csr_diag_buffer = sycl::malloc_host<double>(check_count, main_queue);
    if (csr_diag_buffer == nullptr) {
        std::cout << "DebugDiag: host allocation failed" << std::endl;
        return;
    }
    // Sentinel value makes entries the kernel failed to write easy to spot.
    main_queue.fill(csr_diag_buffer, -2.7, check_count).wait();

    auto ev = main_queue.submit([&](sycl::handler& cgh) {
        cgh.parallel_for(
            sycl::range<1>(check_count),
            [=](sycl::id<1> id) {
                csr_diag_buffer[id] = diags[id];
            });
    });
    ev.wait();

    for (int i = 0; i < check_count; i++) {
        std::cout << i << " " << "csr_diag_buffer=" << csr_diag_buffer[i] << std::endl;
    }

    sycl::free(csr_diag_buffer, main_queue);
}

/*!
  Debug helper: prints the leading entries (at most 10) of a vector.

  The values are staged through a host USM buffer filled by a kernel and then
  written to std::cout.  No more than vec.localLength entries are read, and
  nothing is read when vec.values is null.

  @param[in] vec vector whose values are printed.
  @param[in] main_queue queue used for the allocation and kernel.
 */
inline void DebugVector(Vector& vec, sycl::queue& main_queue) {
    double * data = vec.values;
    if (data == nullptr) {
        std::cout << "DebugVector: vector has no values" << std::endl;
        return;
    }

    constexpr int max_check_count = 10;
    // Never read past the end of a vector shorter than 10 entries.
    const int check_count = (vec.localLength < max_check_count)
                                ? static_cast<int>(vec.localLength)
                                : max_check_count;
    if (check_count <= 0) {
        return;
    }

    auto vec_buffer = sycl::malloc_host<double>(check_count, main_queue);
    if (vec_buffer == nullptr) {
        std::cout << "DebugVector: host allocation failed" << std::endl;
        return;
    }

    auto ev = main_queue.submit([&](sycl::handler& cgh) {
        cgh.parallel_for(
            sycl::range<1>(check_count),
            [=](sycl::id<1> id) {
                vec_buffer[id] = data[id];
            });
    });
    ev.wait();

    for (int i = 0; i < check_count; i++) {
        std::cout << i << " " << "vec_buffer=" << vec_buffer[i] << std::endl;
    }
    sycl::free(vec_buffer, main_queue);
}


 /*!
   Creates the *_host mirrors of the reference matrix arrays.

   For each flat array (nonzerosInRow, localToGlobalMap, mtxL, mtxA,
   boundaryRows) whose *_host pointer is still null, a buffer is obtained from
   sparse_malloc_host() and filled with a queue memcpy from the original array.
   The per-row pointer arrays (matrixValues_host, matrixDiagonal_host,
   mtxIndL_host) are then rebuilt by a kernel that re-bases every row pointer
   from mtxA / mtxL onto mtxA_host / mtxL_host.  The function waits for that
   kernel, which itself depends on all the copies, before returning.

   NOTE(review): the source arrays in A are assumed to be still allocated, and
   the results of sparse_malloc_host() are used unchecked -- confirm with callers.

   @param[inout] A matrix whose *_host members are created and filled.
   @param[in] main_queue queue used for allocations, copies and the kernel.
  */
 inline void AllocateAndFillReferenceData(SparseMatrix &A, sycl::queue &main_queue)
 {

    const local_int_t numberOfNonzerosPerRow = 27;
    const local_int_t localNumberOfRows = A.localNumberOfRows;
    const local_int_t numOfBoundaryRows = A.numOfBoundaryRows;
    // Widen BEFORE multiplying: 27 * localNumberOfRows evaluated in local_int_t
    // can overflow for large local problems.
    const size_t nnz = static_cast<size_t>(numberOfNonzerosPerRow) * static_cast<size_t>(localNumberOfRows);
    std::vector<sycl::event> dependencies;

    // create and fill arrays that can be easily copied over directly
    char *nonzerosInRow_host = A.nonzerosInRow_host;
    if (nonzerosInRow_host == nullptr) {
        nonzerosInRow_host = (char *)sparse_malloc_host(sizeof(char)*localNumberOfRows, main_queue);
        dependencies.push_back(main_queue.memcpy(nonzerosInRow_host, A.nonzerosInRow, sizeof(char)*localNumberOfRows));
        A.nonzerosInRow_host = nonzerosInRow_host;
    }

    global_int_t *localToGlobalMap_host = A.localToGlobalMap_host;
    if (localToGlobalMap_host == nullptr) {
        localToGlobalMap_host = (global_int_t *)sparse_malloc_host(sizeof(global_int_t)*localNumberOfRows, main_queue);
        dependencies.push_back(main_queue.memcpy(localToGlobalMap_host, A.localToGlobalMap, sizeof(global_int_t)*localNumberOfRows));
        A.localToGlobalMap_host = localToGlobalMap_host;
    }

    local_int_t *mtxL_host = A.mtxL_host;
    if (mtxL_host == nullptr) {
        mtxL_host = (local_int_t *)sparse_malloc_host(sizeof(local_int_t)*nnz, main_queue);
        dependencies.push_back(main_queue.memcpy(mtxL_host, A.mtxL, sizeof(local_int_t)*nnz));
        A.mtxL_host = mtxL_host;
    }

    double *mtxA_host = A.mtxA_host;
    if (mtxA_host == nullptr) {
        mtxA_host = (double *)sparse_malloc_host(sizeof(double)*nnz, main_queue);
        dependencies.push_back(main_queue.memcpy(mtxA_host, A.mtxA, sizeof(double)*nnz));
        A.mtxA_host = mtxA_host;
    }

    local_int_t *boundaryRows_host = A.boundaryRows_host;
    if (boundaryRows_host == nullptr) {
        boundaryRows_host = (local_int_t*) sparse_malloc_host(sizeof(local_int_t)*numOfBoundaryRows, main_queue);
        dependencies.push_back(main_queue.memcpy(boundaryRows_host, A.boundaryRows, sizeof(local_int_t)*numOfBoundaryRows));
        A.boundaryRows_host = boundaryRows_host;
    }


    // create and fill arrays that are pointers into the new arrays above, more work to
    // be done to fill since it is related to offsets of the underlying arrays
    double **matrixValues_host = A.matrixValues_host;
    if (matrixValues_host == nullptr) {
        matrixValues_host = (double **)sparse_malloc_host(sizeof(double*)*localNumberOfRows, main_queue);
        A.matrixValues_host = matrixValues_host;
    }

    double **matrixDiagonal_host = A.matrixDiagonal_host;
    if (matrixDiagonal_host == nullptr) {
        matrixDiagonal_host = (double **)sparse_malloc_host(sizeof(double*)*localNumberOfRows, main_queue);
        A.matrixDiagonal_host = matrixDiagonal_host;
    }

    local_int_t **mtxIndL_host = A.mtxIndL_host;
    if (mtxIndL_host == nullptr) {
        mtxIndL_host = (local_int_t **)sparse_malloc_host(sizeof(local_int_t*)*localNumberOfRows, main_queue);
        A.mtxIndL_host = mtxIndL_host;
    }

    main_queue.submit([&](sycl::handler& cgh) {
      // The kernel must not start before the flat copies above have completed.
      cgh.depends_on(dependencies);
      double      * mtxA           = A.mtxA;
      local_int_t * mtxL           = A.mtxL;
      double      **matrixValues   = A.matrixValues;
      double      **matrixDiagonal = A.matrixDiagonal;
      local_int_t **mtxIndL        = A.mtxIndL;
      auto kernel = [=](sycl::item<1> item) {
        local_int_t row = item.get_id(0);
        std::ptrdiff_t offset = 0;

        // Re-base each row pointer: same offset, host backing array.
        offset = matrixValues[row] - mtxA;
        matrixValues_host[row] = mtxA_host + offset;

        offset = matrixDiagonal[row] - mtxA;
        matrixDiagonal_host[row] = mtxA_host + offset;

        offset = mtxIndL[row] - mtxL;
        mtxIndL_host[row] = mtxL_host + offset;

      };
      cgh.parallel_for(sycl::range<1>(localNumberOfRows), kernel);
    }).wait();

 }

 /*!
   Releases the host mirrors created by AllocateAndFillReferenceData().

   Every non-null *_host pointer handled here is passed to sycl::free() and
   then nulled, so the function can be called repeatedly.

   @param[inout] A matrix whose *_host members are released.
   @param[in] main_queue queue whose context the buffers are freed against.
  */
 inline void FreeReferenceData(SparseMatrix &A, sycl::queue &main_queue) {

   // Free one USM pointer if it is set, and reset it.
   auto release = [&main_queue](auto *&slot) {
     if (!slot) {
       return;
     }
     sycl::free(slot, main_queue);
     slot = NULL;
   };

   release(A.nonzerosInRow_host);
   release(A.boundaryRows_host);
   release(A.localToGlobalMap_host);
   release(A.mtxA_host);
   release(A.mtxL_host);
   release(A.matrixValues_host);
   release(A.matrixDiagonal_host);
   release(A.mtxIndL_host);

 }


/*!
  Deallocates the members of the data structure of the known system matrix provided they are not 0.

  @param[in] A the known system matrix
 */
inline void DeleteMatrix(SparseMatrix & A, sycl::queue & main_queue) {
#ifdef HPCG_LOCAL_LONG_LONG
# ifndef HPCG_CONTIGUOUS_ARRAYS
  for (local_int_t i = 0; i< A.localNumberOfRows; ++i) {
    delete [] A.matrixValues[i];
    delete [] A.mtxIndG[i];
    delete [] A.mtxIndL[i];
  }
# else // HPCG_CONTIGUOUS_ARRAYS
  if (A.matrixValues)      delete [] A.matrixValues;
  if (A.mtxIndG)           delete [] A.mtxIndG;
  if (A.mtxIndL)           delete [] A.mtxIndL;
  if (A.matrixValues_host) delete [] A.matrixValues_host;
  if( A.mtxIndG_host)      delete [] A.mtxIndG_host;
  if( A.mtxIndL_host)      delete [] A.mtxIndL_host;

# endif // HPCG_CONTIGUOUS_ARRAYS
  if (A.title)                  delete [] A.title;
  if (A.nonzerosInRow)          delete [] A.nonzerosInRow;
  if (A.matrixDiagonal)         delete [] A.matrixDiagonal;

# ifndef HPCG_NO_MPI
  if (A.elementsToSend_d)       delete [] A.elementsToSend_d;
  if (A.elementsToSend_h)       delete [] A.elementsToSend_h;
  if (A.neighbors)              delete [] A.neighbors;
  if (A.receiveLength)          delete [] A.receiveLength;
  if (A.sendLength)             delete [] A.sendLength;
  if (A.sendBuffer)             delete [] A.sendBuffer;
# endif // HPCG_NO_MPI
  if (A.geom!=0)   { DeleteGeometry(*A.geom); delete A.geom; A.geom = 0;}
  if (A.Ac!=0)     { DeleteMatrix(*A.Ac, main_queue); delete A.Ac; A.Ac = 0;} // Delete coarse matrix
  if (A.mgData!=0) { DeleteMGData(*A.mgData, main_queue); delete A.mgData; A.mgData = 0;} // Delete MG data

#else // not defined HPCG_LOCAL_LONG_LONG
  if (A.title)                  delete [] A.title;
  if (A.nonzerosInRow)    {sycl::free(A.nonzerosInRow,    main_queue); A.nonzerosInRow  = NULL; }
  if (A.matrixDiagonal)   {sycl::free(A.matrixDiagonal,   main_queue); A.matrixDiagonal = NULL; }
  if (A.boundaryRows)     {sycl::free(A.boundaryRows,     main_queue); A.boundaryRows   = NULL; }
  if (A.localToGlobalMap) {sycl::free(A.localToGlobalMap, main_queue); A.localToGlobalMap = NULL;}
  if (A.mtxIndG)          {sycl::free(A.mtxIndG,          main_queue); A.mtxIndG = NULL; }
  if (A.mtxIndL)          {sycl::free(A.mtxIndL,          main_queue); A.mtxIndL = NULL; }
  if (A.matrixValues)     {sycl::free(A.matrixValues,     main_queue); A.matrixValues = NULL; }
  if (A.mtxA)             {sycl::free(A.mtxA,             main_queue); A.mtxA = NULL; }
  if (A.mtxL)             {sycl::free(A.mtxL,             main_queue); A.mtxL = NULL; }
  if (A.mtxG)             {sycl::free(A.mtxG,             main_queue); A.mtxG = NULL; }

  FreeReferenceData(A, main_queue); // if any _host versions are still around, free them
  if (A.mtxIndG_host)          {sycl::free(A.mtxIndG_host,          main_queue); A.mtxIndG_host = NULL; }
  if (A.mtxG_host)             {sycl::free(A.mtxG_host,             main_queue); A.mtxG_host = NULL; }

# ifndef HPCG_NO_MPI
  if (A.elementsToSend_d){sycl::free(A.elementsToSend_d, main_queue); A.elementsToSend_d = NULL; }
  if (A.elementsToSend_h){      free(A.elementsToSend_h);             A.elementsToSend_h = NULL; }
  if (A.neighbors)     {sycl::free(A.neighbors,      main_queue); A.neighbors = NULL; }
  if (A.receiveLength) {sycl::free(A.receiveLength,  main_queue); A.receiveLength = NULL; }
  if (A.sendLength)    {sycl::free(A.sendLength,     main_queue); A.sendLength = NULL; }
  if (A.sendBuffer)    {sycl::free(A.sendBuffer,     main_queue); A.sendBuffer = NULL; }
  if (A.sendBuffer_h)  {      free(A.sendBuffer_h);               A.sendBuffer_h = NULL;} 
# endif // HPCG_NO_MPI
  struct optData *optData = (struct optData *)A.optimizationData;
  if ( optData != NULL )
  {

      if (optData->esblastLower  != NULL)  { sycl::free(optData->esblastLower, main_queue); optData->esblastLower = NULL;}
      if (optData->esbfirstUpper != NULL)  { sycl::free(optData->esbfirstUpper, main_queue); optData->esbfirstUpper = NULL;}
      if (optData->esblastUpper  != NULL)  { sycl::free(optData->esblastUpper, main_queue); optData->esblastUpper = NULL;}
      if (optData->esbfirstNonloc != NULL) { sycl::free(optData->esbfirstNonloc, main_queue); optData->esbfirstNonloc = NULL;}

      if (optData->esbblockptr   != NULL) { sycl::free(optData->esbblockptr, main_queue); optData->esbblockptr = NULL;}
      if (optData->esbcolind     != NULL) { sycl::free(optData->esbcolind,   main_queue); optData->esbcolind = NULL;}
      if (optData->esbvalues     != NULL) { sycl::free(optData->esbvalues,   main_queue); optData->esbvalues = NULL;}

      if (optData->devInfo       != NULL) { delete (custom::deviceInfo *)optData->devInfo; optData->devInfo = NULL; }

      if (optData->xcolors_dev   != NULL) { sycl::free(optData->xcolors_dev, main_queue); optData->xcolors_dev = NULL; }
      if (optData->xcolors_host  != NULL) { sycl::free(optData->xcolors_host, main_queue); optData->xcolors_host = NULL; }

      if (optData->esbM          != NULL) { delete (custom::sparseMatrix *)optData->esbM; optData->esbM = NULL; }

      if (optData->mv_reorder    != NULL) { sycl::free(optData->mv_reorder,   main_queue); optData->mv_reorder = NULL;}
      if (optData->trmv_l_reorder    != NULL) { sycl::free(optData->trmv_l_reorder,   main_queue); optData->trmv_l_reorder = NULL;}
      if (optData->trmv_u_reorder    != NULL) { sycl::free(optData->trmv_u_reorder,   main_queue); optData->trmv_u_reorder = NULL;}
      

// TODO: put these in a separate data structure or something
// #if defined(HPCG_TEST_CUSTOM_KERNELS)
#if 1

      if (optData->ia         != NULL) { sycl::free(optData->ia, main_queue);  optData->ia = NULL; }
      if (optData->ja         != NULL) { sycl::free(optData->ja, main_queue);  optData->ja = NULL; }
      if (optData->a          != NULL) { sycl::free(optData->a, main_queue);  optData->a = NULL; }

      if (optData->ib         != NULL) { sycl::free(optData->ib, main_queue);  optData->ib = NULL; }
      if (optData->jb         != NULL) { sycl::free(optData->jb, main_queue);  optData->jb = NULL; }
      if (optData->b          != NULL) { sycl::free(optData->b, main_queue);  optData->b = NULL; }

      if (optData->bmap       != NULL) { sycl::free(optData->bmap, main_queue);  optData->bmap = NULL; }
#endif

      if (optData->diags      != NULL) { sycl::free(optData->diags, main_queue);    optData->diags = NULL; }
      if (optData->invperm    != NULL) { sycl::free(optData->invperm, main_queue);  optData->invperm = NULL; }
      if (optData->perm       != NULL) { sycl::free(optData->perm, main_queue);     optData->perm = NULL; }
      if (optData->perm_coarse!= NULL) { optData->perm_coarse = NULL; } // only a reference to another Ac->perm, so don't free, just erase

      if (optData->dtmp       != NULL) { sycl::free(optData->dtmp, main_queue);      optData->dtmp = NULL; }
      if (optData->normr_dev  != NULL) { sycl::free(optData->normr_dev, main_queue); optData->normr_dev = NULL; }
      if (optData->normr_host != NULL) {       free(optData->normr_host);            optData->normr_host = NULL; }

      if (optData->halo_host_vector != NULL) { free(optData->halo_host_vector);  optData->halo_host_vector = NULL; }
      if (A.Ac == 0 && optData->profiler != NULL) { delete optData->profiler; }

      sycl::free(optData, main_queue);
  }
  if (A.geom!=0)   { DeleteGeometry(*A.geom, main_queue); delete A.geom; A.geom = 0;}
  if (A.Ac!=0)     { DeleteMatrix(*A.Ac, main_queue);     delete A.Ac; A.Ac = 0;} // Delete coarse matrix
  if (A.mgData!=0) { DeleteMGData(*A.mgData, main_queue); delete A.mgData; A.mgData = 0;} // Delete MG data
  if (A.mgData_host!=0) { DeleteMGData(*A.mgData_host);   delete A.mgData_host; A.mgData_host = 0;} // Delete MG host data
#endif // HPCG_LOCAL_LONG_LONG
  return;
}

/*!
  Resets every member of a struct optData: counters to 0, pointers to NULL.

  DeleteMatrix() decides what to free by testing these pointers against NULL,
  so every pointer member it inspects must be reset here, including the block
  reordering arrays.

  @param[out] optData structure to reset; must point to valid storage.
 */
inline void init_optData(struct optData *optData)
{
    optData->dtmp      = NULL;
    optData->dtmp2     = NULL;
    optData->dtmp3     = NULL;
    optData->dtmp4     = NULL;

    optData->diags     = NULL;

//#if defined(HPCG_TEST_CUSTOM_KERNELS)
#if 1
    optData->ia        = NULL;
    optData->ja        = NULL;
    optData->a         = NULL;

    optData->nrow_b    = 0;
    optData->ib        = NULL;
    optData->jb        = NULL;
    optData->b         = NULL;
    optData->bmap      = NULL;
#endif

    optData->invperm      = NULL;
    optData->perm         = NULL;
    optData->perm_coarse  = NULL;

    optData->normr_dev    = NULL;
    optData->normr_host   = NULL;
    optData->pAp_loc_dev  = NULL;
    optData->pAp_loc_host = NULL;
    optData->rtz_loc_dev  = NULL;
    optData->rtz_loc_host = NULL;
    optData->pAp_dev      = NULL;
    optData->rtz_dev      = NULL;
    optData->oldrtz_dev   = NULL;

    optData->nVectors           = 0;
    optData->nBlocks            = 0;
    optData->target_block_size  = 0;
    optData->esbblockptr        = NULL;
    optData->esbcolind          = NULL;
    optData->esbvalues          = NULL;

    optData->esblastLower       = NULL;
    optData->esbfirstUpper      = NULL;
    optData->esblastUpper       = NULL;
    optData->esbfirstNonloc     = NULL;

    // Previously left uninitialized although DeleteMatrix() frees them when
    // they are non-NULL.
    optData->mv_reorder         = NULL;
    optData->trmv_l_reorder     = NULL;
    optData->trmv_u_reorder     = NULL;

    optData->nColors            = 0;
    optData->xcolors_dev        = NULL;
    optData->xcolors_host       = NULL;

    optData->devInfo            = NULL;
    optData->esbM               = NULL;

    optData->global_result_host = NULL;
    optData->halo_host_vector   = NULL;

    optData->profiler  = NULL;
}

#endif // SPARSEMATRIX_HPP
