/*******************************************************************************
* Copyright (C) 2014 Intel Corporation
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra     (dongarra@eecs.utk.edu)
// Piotr Luszczek    (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER

/*!
 @file ComputeRestriction_ref.cpp

 HPCG routine
 */

#include "ComputeRestriction_ref.hpp"
#include "UsmUtil.hpp"

/*!
  Routine to compute the coarse residual vector.

  @param[inout]  A - Sparse matrix object containing pointers to mgData->Axf, the fine grid matrix-vector product and mgData->rc the coarse residual vector.
  @param[in]    rf - Fine grid RHS.


  Note that the fine grid residual is never explicitly constructed.
  We only compute it for the fine grid points that will be injected into corresponding coarse grid points.

  @return Returns zero on success and a non-zero value otherwise.
*/
int ComputeRestriction_ref(const SparseMatrix & A, const Vector & rf) {

  double * Axfv = A.mgData_host->Axf->values;
  double * rfv = rf.values;
  double * rcv = A.mgData_host->rc->values;
  local_int_t * f2c = A.mgData_host->f2cOperator;
  local_int_t nc = A.mgData_host->rc->localLength;

#pragma ivdep
  for (local_int_t i=0; i<nc; ++i) rcv[i] = rfv[f2c[i]] - Axfv[f2c[i]];

  return 0;
}

/*!
  Routine to compute the coarse residual vector with a kernel submitted to a SYCL queue.

  For each coarse grid point id the kernel evaluates rf - Axf at the fine grid point
  perm[f2c[id]] and stores the result at coarse position perm_coarse[id]. The f2c operator
  maps unpermuted indices, so the permutations stored in A.optimizationData are applied
  on both the fine and the coarse side.

  @param[inout]  A          - Sparse matrix object; its mgData supplies Axf, f2cOperator and rc, and its
                              optimizationData supplies the perm and perm_coarse arrays.
  @param[in]     rf         - Fine grid RHS.
  @param[inout]  main_queue - Queue the kernel is submitted to.
  @param[inout]  ierr       - Not read or written by this routine.
  @param[in]     deps       - Events the submitted kernel depends on.

  @return The event returned by the queue submission.
*/
sycl::event ComputeRestriction(const SparseMatrix & A, const Vector & rf, sycl::queue & main_queue,
                               int& ierr, const std::vector<sycl::event> & deps) {

  const double * const fineAx = A.mgData->Axf->values;
  const double * const fineRhs = rf.values;
  double * const coarseRes = A.mgData->rc->values;
  const local_int_t * const coarseToFine = A.mgData->f2cOperator;
  const local_int_t numCoarse = A.mgData->rc->localLength;

  struct optData * const opt = static_cast<struct optData *>(A.optimizationData);
  const local_int_t * const finePerm = opt->perm;
  const local_int_t * const coarsePerm = opt->perm_coarse;

  //DPCPP ComputeRestrictionKernel
  return main_queue.submit([&](sycl::handler &cgh) {
    constexpr int workGroupSize = 256;
    // Global range is numCoarse padded by round_up_next_multiple so it can be split into
    // work-groups of workGroupSize; surplus work-items are filtered out in the kernel.
    const local_int_t globalSize = round_up_next_multiple(numCoarse, workGroupSize);

    cgh.depends_on(deps);
    cgh.parallel_for<class ComputeRestrictionClass>(
        sycl::nd_range<1>(globalSize, workGroupSize),
        [=](sycl::nd_item<1> item) {
          const local_int_t id = item.get_global_id(0);
          if (id >= numCoarse) return; // padding work-item, nothing to compute

          // f2c maps unpermuted fine to coarse, so both sides go through their permutation
          const local_int_t fineRow = finePerm[coarseToFine[id]];
          const local_int_t coarseRow = coarsePerm[id];
          coarseRes[coarseRow] = fineRhs[fineRow] - fineAx[fineRow];
        });
  });

}

