/*******************************************************************************
* Copyright (C) 2018 Intel Corporation
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

/*
 *
 *  Content:
 *            oneMKL VM different accuracies usage example
 *            and accuracy vs performance tradeoff demonstration:
 *
 *            Call the single precision acosf function through scalar LIBM
 *            and through the different oneMKL VM vector APIs:
 *            host (classic) and device from host: buffer, USM, span.
 *            Three accuracy oneMKL VM flavors used:
 *                HA (High Accuracy)
 *                LA (Low Accuracy)
 *                EP (Enhanced Performance)
 *            Compare maximum observed relative errors, ulps (units in last place)
 *            and performance measured in geval/sec (giga evaluations per second)
 *
 *******************************************************************************/

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <iomanip>
#include <iostream>
#include <memory>
#include <numeric>
#include <random>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <vector>

#include <sycl/sycl.hpp>
#include "oneapi/mkl.hpp"

#include "common_for_examples.hpp"

/**
 * Benchmark configuration: problem sizes, repeat counts and argument range.
 */
// Elements processed per VM call on the host and on the device.
// Building with VM_QUICK_RUN defined selects the smaller problem sizes.
#ifdef VM_QUICK_RUN
static constexpr int64_t host_length   = 100'000;
static constexpr int64_t device_length = 1'000'000;
#else
static constexpr int64_t host_length   = 1'000'000;
static constexpr int64_t device_length = 100'000'000;
#endif
// How many timed calls are issued per accuracy on the host
static constexpr int host_repeats = 1000;
// How many timed calls are issued per accuracy on the device
static constexpr int device_repeats = 100;
// Lower bound of the generated arguments: -1.0 moved inwards by 1.1e-07
static constexpr float beg = -1.0f + 1.1e-07f;
// Upper bound of the generated arguments: +1.0 moved inwards by 1.1e-07
static constexpr float end = +1.0f - 1.1e-07f;

/**
 * Lookup tables translating an accuracy index into the matching
 * oneMKL VM accuracy mode constant. Entries are ordered HA, LA, EP.
 */
// Mode values handed to the oneapi::mkl::vm (SYCL) calls
static constexpr oneapi::mkl::vm::mode oneapi_accuracy_mode[] = {
    oneapi::mkl::vm::mode::ha,  // High Accuracy
    oneapi::mkl::vm::mode::la,  // Low Accuracy
    oneapi::mkl::vm::mode::ep   // Enhanced Performance
};
// Mode values handed to the classic (host) VM calls, ordered HA, LA, EP
static constexpr unsigned int classic_accuracy_mode[] = { VML_HA, VML_LA, VML_EP };

/**
 * Accuracy flavors exercised by the example.
 *
 * The enumerators double as array indices, so they are kept as a plain
 * (implicitly convertible to int) enumeration starting at zero.
 */
enum VmAccuracy
{
    kHA     = 0,  // High Accuracy
    kLA     = 1,  // Low Accuracy
    kEP     = 2,  // Enhanced Performance
    kAccNum = 3   // Count of the accuracies above
};

/**
 * @brief Testing preamble.
 *
 * Print device and driver info to stdout.
 *
 * @param[in] dev    Sycl device
 *
 */
void own_preamble (sycl::device & dev)
{
    const std::string dev_name       = dev.get_info<sycl::info::device::name>();
    // The line below is labelled "driver version", so query the driver
    // version descriptor rather than the device version one.
    const std::string driver_version = dev.get_info<sycl::info::device::driver_version>();
    fprintf (stdout, "\t       device name: %s\n", dev_name.c_str());
    fprintf (stdout, "\t    driver version: %s\n\n", driver_version.c_str());
    fflush (stdout);
}

/**
 * @brief Asynchronous error handler.
 *
 * Reports every SYCL exception stored in the list to stderr.
 * Only sycl::exception is handled here; an exception of any other type
 * leaves this function through the rethrow.
 *
 * @param[in] el Exceptions list
 *
 */
void own_async_sycl_error (sycl::exception_list el)
{
    fprintf (stderr, "async exceptions caught: \n");

    // Re-raise each stored exception in turn so it can be examined by type
    for (auto const & pending : el)
    {
        try
        {
            std::rethrow_exception (pending);
        }
        catch (sycl::exception const & ex)
        {
            const int code = ex.code().value();
            fprintf (stderr, "SYCL exception occured with code %d with %s\n", code, ex.what());
        }
    }
} // own_async_sycl_error

/**
 * @brief Clock timer.
 *
 * Reads a monotonic clock and converts the reading to nanoseconds.
 * The clock's epoch is unspecified, so only the difference between two
 * readings is meaningful.
 *
 * @return Timestamp in nanoseconds.
 *
 */
static uint64_t own_get_nano ()
{
    // steady_clock never goes backwards, which makes interval measurements safe
    const auto since_epoch = std::chrono::steady_clock::now().time_since_epoch();
    // Convert explicitly: the clock's native tick is not guaranteed to be 1 ns
    const auto nanos = std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch);
    return static_cast<uint64_t>(nanos.count());
}

/**
 * @brief Computation of maximum relative error and ulp.
 *
 * Scans the result vector against the reference vector and reports the
 * largest relative error and the largest (simplified) ulp error observed.
 * A result that is NaN where the reference is not (or the other way round)
 * is reported as an infinite error instead of being ignored.
 * For len <= 0 both outputs are set to 0.
 *
 * @param[in] len            Vectors length
 * @param[in] res            Resulted vector
 * @param[in] ref            Reference vector
 * @param[out] err           Computed relative error
 * @param[out] ulp           Computed ulp
 *
 */
static void own_compute_err (int64_t len, const float* res, const double* ref, double* err, double* ulp)
{
    double maxerr = 0.0;
    double maxulp = 0.0;

    for (int64_t i = 0; i < len; i++)
    {
        const double got      = static_cast<double>(res[i]);
        const double expected = ref[i];

        // fmax() below discards NaN operands, so a NaN mismatch must be
        // detected explicitly or it would be reported as "no error".
        if (std::isnan (got) != std::isnan (expected))
        {
            maxerr = INFINITY;
            maxulp = INFINITY;
            break; // nothing can exceed an infinite error
        }

        /**
         * Simplified ulp formula: |res-ref| / 2^(ex-24).
         * frexp() yields ex with 2^(ex-1) <= |ref| < 2^ex, and single
         * precision carries a 24-bit significand, so the spacing of floats
         * around ref is 2^(ex-24).
         */
        int ex = 0;
        std::frexp (expected, &ex);
        double den = std::ldexp (1.0, ex - 24);
        // Guard against a zero denominator by substituting 2^-149
        if (den == 0.0)
        {
            den = 0x1.p-149;
        }

        const double diff = got - expected;
        // max ulp = |res-ref|/2^(ex-24)
        maxulp = std::fmax (maxulp, std::fabs (diff / den));
        // max relative error = |res-ref|/|ref|; a 0/0 quotient is NaN and is skipped by fmax
        maxerr = std::fmax (maxerr, std::fabs (diff / expected));
    }

    *err = maxerr;
    *ulp = maxulp;
}

/**
 * @brief Run scalar function on host
 *
 * Evaluates acosf element by element, times one full pass over the data
 * and derives the accuracy figures. Results are stored in the kHA slot
 * of the output arrays.
 *
 * @param[in]  len         Vector length
 * @param[in]  arg         Arguments array
 * @param[out] res         Results array
 * @param[in]  ref         Reference results array
 * @param[out] err         Resulted relative error
 * @param[out] ulp         Resulted ulp
 * @param[out] gev         Resulted performance (GEval/sec)
 *
 */
void own_libm_scalar (int64_t len, float* arg, float* res, double* ref, double* err, double* ulp, double* gev)
{
    // Untimed pass first (warmup); the pragma is a compiler-specific request
    // to keep the loop scalar.
    #pragma novector
    for (int64_t k = 0; k < len; ++k)
    {
        res[k] = acosf (arg[k]);
    }

    // Timed pass
    const uint64_t t_start = own_get_nano ();
    #pragma novector
    for (int64_t k = 0; k < len; ++k)
    {
        res[k] = acosf (arg[k]);
    }
    const uint64_t t_stop = own_get_nano ();

    // Evaluations per nanosecond, i.e. giga evaluations per second
    gev[kHA] = static_cast<double>(len) / static_cast<double>(t_stop - t_start);

    // Accuracy of the scalar results versus the reference
    own_compute_err (len, res, ref, err + kHA, ulp + kHA);
}

/**
 * @brief Run VM host (classic) API
 *
 * For every accuracy flavor: one untimed vmsAcos call, then 'rep' timed
 * calls, then the accuracy of the last result against the reference.
 *
 * @param[in]  len         Vector length
 * @param[in]  rep         Number of repeats
 * @param[in]  arg         Arguments array
 * @param[out] res         Results array
 * @param[in]  ref         Reference results array
 * @param[out] err         Resulted relative errors
 * @param[out] ulp         Resulted ulps
 * @param[out] gev         Resulted performance (GEval/sec)
 *
 */
void own_vm_host (int64_t len, int64_t rep, float* arg, float* res, double* ref, double* err, double* ulp, double* gev)
{
    // Total number of evaluations performed inside one timed interval
    const double evals = static_cast<double>(len * rep);

    for (int acc = kHA; acc < kAccNum; ++acc)
    {
        const unsigned int mode = classic_accuracy_mode[acc];

        // Untimed warmup call
        vmsAcos (len, arg, res, mode);

        // Timed repeats
        const uint64_t t_start = own_get_nano ();
        for (int pass = 0; pass < rep; ++pass)
        {
            vmsAcos (len, arg, res, mode);
        }
        const uint64_t t_stop = own_get_nano ();

        // Evaluations per nanosecond, i.e. giga evaluations per second
        gev[acc] = evals / static_cast<double>(t_stop - t_start);

        // Accuracy of the results for this flavor
        own_compute_err (len, res, ref, err + acc, ulp + acc);
    }
}

/**
 * @brief Run VM buffer device API
 *
 * Measure performance and accuracy for the VM buffer device API.
 * For every accuracy flavor: one untimed warmup call, 'rep' timed calls
 * (each followed by a queue wait), then the accuracy of the last result.
 *
 * @param[in]  queue       Sycl queue
 * @param[in]  len         Vector length
 * @param[in]  rep         Number of repeats
 * @param[in]  arg         Arguments array
 * @param[out] res         Results array
 * @param[in]  ref         Reference results array
 * @param[out] err         Resulted relative errors
 * @param[out] ulp         Resulted ulps
 * @param[out] gev         Resulted performance (GEval/sec)
 *
 */
void own_vm_buffer (sycl::queue & queue, int64_t len, int64_t rep, float* arg, float* res, double* ref, double* err, double* ulp, double* gev)
{
    // Sycl buffer initialized with a copy of the range [arg, arg + len)
    sycl::buffer<float, 1> buff_arg { arg , arg + len };
    // Sycl buffer of len elements created over the caller's 'res' array
    // NOTE(review): a buffer built from a host pointer normally writes its
    // contents back to that pointer on destruction - confirm this is intended.
    sycl::buffer<float, 1> buff_res { res , len };
    // Loop by accuracies
    for (int a = kHA; a < kAccNum; a++)
    {
        // Warmup call
        oneapi::mkl::vm::acos (queue, len, buff_arg, buff_res, oneapi_accuracy_mode[a]);
        // The call above is asynchronous: wait for it so that the warmup
        // is not counted inside the timed interval
        queue.wait_and_throw ();
        uint64_t ns = own_get_nano ();
        // Do several repeats
        for (int j = 0; j < rep; j++)
        {
            oneapi::mkl::vm::acos (queue, len, buff_arg, buff_res, oneapi_accuracy_mode[a]);
            queue.wait_and_throw ();
        }
        gev[a] = (double)(len * rep) / (own_get_nano () - ns);
        // Get host accessor to the result buffer; it lives until the end of
        // this iteration only
        auto acc_res   = buff_res.get_host_access();
        // Get host pointer to the buffer contents
        float* dev_res = acc_res.get_pointer();
        // Compute relative error & ulp
        own_compute_err (len, dev_res, ref, &(err[a]), &(ulp[a]));
    }
}

/**
 * @brief Run VM USM device API
 *
 * Measure performance and accuracy for the VM USM device API.
 * For every accuracy flavor: one untimed warmup call, 'rep' timed calls
 * (each followed by a queue wait), then the results are copied back to
 * 'res' and compared with the reference.
 *
 * @param[in]  queue       Sycl queue
 * @param[in]  len         Vector length
 * @param[in]  rep         Number of repeats
 * @param[in]  arg         Arguments array
 * @param[out] res         Results array
 * @param[in]  ref         Reference results array
 * @param[out] err         Resulted relative errors
 * @param[out] ulp         Resulted ulps
 * @param[out] gev         Resulted performance (GEval/sec)
 *
 * @throws std::runtime_error if device memory cannot be allocated
 *
 */
void own_vm_usm (sycl::queue & queue, int64_t len, int64_t rep, float* arg, float* res, double* ref, double* err, double* ulp, double* gev)
{
    // Owning handles for device memory: the allocations are released on every
    // exit path, including when a SYCL call below throws
    auto usm_free = [&queue] (float* ptr) { sycl::free (ptr, queue); };
    using usm_ptr = std::unique_ptr<float, decltype(usm_free)>;

    // Allocate memory on device
    usm_ptr arg_mem { sycl::malloc_device<float>(len, queue), usm_free };
    usm_ptr res_mem { sycl::malloc_device<float>(len, queue), usm_free };
    if (len > 0 && (!arg_mem || !res_mem))
    {
        throw std::runtime_error ("own_vm_usm: device memory allocation failed");
    }
    float * dev_arg = arg_mem.get ();
    float * dev_res = res_mem.get ();

    // Copy arguments from host to device
    queue.memcpy(dev_arg, arg, len * sizeof(float));
    // Wait until async memcpy finished
    queue.wait_and_throw();
    // Loop by accuracies
    for (int a = kHA; a < kAccNum; a++)
    {
        // Warmup call
        oneapi::mkl::vm::acos (queue, len, dev_arg, dev_res, {}, oneapi_accuracy_mode[a]);
        // The call above is asynchronous: wait for it so that the warmup
        // is not counted inside the timed interval
        queue.wait_and_throw();
        uint64_t ns = own_get_nano ();
        // Do several repeats
        for (int j = 0; j < rep; j++)
        {
            oneapi::mkl::vm::acos (queue, len, dev_arg, dev_res, {}, oneapi_accuracy_mode[a]);
            // Disable async multiple calls to VM
            queue.wait_and_throw();
        }
        gev[a] = (double)(len * rep) / (own_get_nano () - ns);
        // Copy results from device to host
        queue.memcpy(res, dev_res, len * sizeof(float));
        // Wait until async memcpy finished
        queue.wait_and_throw();
        // Compute relative error & ulp
        own_compute_err (len, res, ref, &(err[a]), &(ulp[a]));
    }
    // Device memory is freed by the owning handles (results first, then arguments)
}

/**
 * @brief Run VM span device API
 *
 * Measure performance and accuracy for the VM span device API.
 * For every accuracy flavor: one untimed warmup call, 'rep' timed calls
 * (each followed by a queue wait), then the results are copied back to
 * 'res' and compared with the reference.
 *
 * @param[in]  queue       Sycl queue
 * @param[in]  len         Vector length
 * @param[in]  rep         Number of repeats
 * @param[in]  arg         Arguments array
 * @param[out] res         Results array
 * @param[in]  ref         Reference results array
 * @param[out] err         Resulted relative errors
 * @param[out] ulp         Resulted ulps
 * @param[out] gev         Resulted performance (GEval/sec)
 *
 * @throws std::runtime_error if device memory cannot be allocated
 *
 */
void own_vm_span (sycl::queue & queue, int64_t len, int64_t rep, float* arg, float* res, double* ref, double* err, double* ulp, double* gev)
{
    // Owning handles for device memory: the allocations are released on every
    // exit path, including when a SYCL call below throws
    auto usm_free = [&queue] (float* ptr) { sycl::free (ptr, queue); };
    using usm_ptr = std::unique_ptr<float, decltype(usm_free)>;

    // Allocate memory on device
    usm_ptr arg_mem { sycl::malloc_device<float>(len, queue), usm_free };
    usm_ptr res_mem { sycl::malloc_device<float>(len, queue), usm_free };
    if (len > 0 && (!arg_mem || !res_mem))
    {
        throw std::runtime_error ("own_vm_span: device memory allocation failed");
    }
    float * dev_arg = arg_mem.get ();
    float * dev_res = res_mem.get ();

    // Copy arguments from host to device
    queue.memcpy(dev_arg, arg, len * sizeof(float));
    // Wait until async memcpy finished
    queue.wait_and_throw();
    // Create span containers (non-owning views) for device memory
    sycl::span span_arg(dev_arg, len);
    sycl::span span_res(dev_res, len);
    // Loop by accuracies
    for (int a = kHA; a < kAccNum; a++)
    {
        // Warmup call
        oneapi::mkl::vm::acos(queue, span_arg, span_res, {}, oneapi_accuracy_mode[a]);
        // The call above is asynchronous: wait for it so that the warmup
        // is not counted inside the timed interval
        queue.wait_and_throw();
        uint64_t ns = own_get_nano ();
        // Do several repeats
        for (int j = 0; j < rep; j++)
        {
            oneapi::mkl::vm::acos(queue, span_arg, span_res, {}, oneapi_accuracy_mode[a]);
            // Disable async multiple calls to VM
            queue.wait_and_throw();
        }
        gev[a] = (double)(len * rep) / (own_get_nano () - ns);
        // Copy results from device to host
        queue.memcpy(res, dev_res, len * sizeof(float));
        // Wait until async memcpy finished
        queue.wait_and_throw();
        // Compute relative error & ulp
        own_compute_err (len, res, ref, &(err[a]), &(ulp[a]));
    }
    // Device memory is freed by the owning handles (results first, then arguments)
}

/**
 * @brief Function to run VM accuracy example on device
 *
 * Performs accuracy vs performance tradeoff demonstration: generates the
 * arguments and the double precision reference, runs the scalar, VM host,
 * VM buffer, VM USM and VM span measurements and prints the result tables.
 *
 * @param[in] dev    Sycl device
 *
 * @return           Always 0; failures are reported through exceptions
 *                   thrown by the calls made here.
 *
 */
int own_run_on (sycl::device & dev)
{
    const int64_t max_length = std::max (device_length, host_length);
    /**
     * Relative error, ulp and gevals results for scalar, host, buffer, USM and span API's.
     */
    double scal_err[kAccNum] = {0}, scal_ulp[kAccNum] = {0}, scal_gev[kAccNum] = {0};
    double host_err[kAccNum] = {0}, host_ulp[kAccNum] = {0}, host_gev[kAccNum] = {0};
    double buff_err[kAccNum] = {0}, buff_ulp[kAccNum] = {0}, buff_gev[kAccNum] = {0};
    double usm_err [kAccNum] = {0}, usm_ulp [kAccNum] = {0}, usm_gev [kAccNum] = {0};
    double span_err[kAccNum] = {0}, span_ulp[kAccNum] = {0}, span_gev[kAccNum] = {0};

    own_preamble (dev); // Print sycl device info

    /**
     * Sycl device queue
     */
    sycl::queue queue { dev, own_async_sycl_error };

    /**
     * Storage for argument, result and reference vectors.
     * Containers own the memory, so it is released even when one of the
     * measurements below throws.
     */
    const size_t count = static_cast<size_t>(max_length);
    std::vector<float>  arg (count);
    std::vector<float>  res (count);
    std::vector<double> ref (count);

    /**
     * Fill source vector by random numbers uniformly distributed on [beg, end) range.
     * The engine seed is fixed, so every run uses the same arguments.
     */
    std::mt19937 eng (777);
    std::uniform_real_distribution<float> distr (beg, end);
    std::generate (arg.begin (), arg.end (), [&distr, &eng] () { return distr (eng); });

    /**
     * Fill reference array computed with scalar double precision acos() for generated arguments.
     */
    std::transform (arg.begin (), arg.end (), ref.begin (),
                    [] (float x) { return std::acos (static_cast<double>(x)); });

    /**
     * Run different API's:
     */
    // Scalar LIBM
    own_libm_scalar (host_length, arg.data (), res.data (), ref.data (), scal_err, scal_ulp, scal_gev);
    // VM host (classic) API
    own_vm_host (host_length, host_repeats, arg.data (), res.data (), ref.data (), host_err, host_ulp, host_gev);
    // VM device buffer API
    own_vm_buffer (queue, device_length, device_repeats, arg.data (), res.data (), ref.data (), buff_err, buff_ulp, buff_gev);
    // VM device USM API
    own_vm_usm (queue, device_length, device_repeats, arg.data (), res.data (), ref.data (), usm_err, usm_ulp, usm_gev);
    // VM device span API
    own_vm_span (queue, device_length, device_repeats, arg.data (), res.data (), ref.data (), span_err, span_ulp, span_gev);

    /**
     * Result printouts.
     * The scalar column only has an HA row, hence the empty cell in LA/EP rows.
     */
    fprintf(stdout, "\t=================================================================================\n");
    fprintf(stdout, "\t%15s,%12s,%12s,%12s,%12s,%12s\n", "<acosf>","Scalar", "VM Host", "VM Buffer", "VM USM", "VM Span");
    fprintf(stdout, "\t=================================================================================\n");
    fprintf(stdout, "\t%15s,%12.3le,%12.3le,%12.3le,%12.3le,%12.3le\n", "Relative err HA",  scal_err[kHA], host_err[kHA], buff_err[kHA], usm_err[kHA], span_err[kHA]);
    fprintf(stdout, "\t%15s,%12s,%12.3le,%12.3le,%12.3le,%12.3le\n", "Relative err LA", "", host_err[kLA], buff_err[kLA], usm_err[kLA], span_err[kLA]);
    fprintf(stdout, "\t%15s,%12s,%12.3le,%12.3le,%12.3le,%12.3le\n", "Relative err EP", "", host_err[kEP], buff_err[kEP], usm_err[kEP], span_err[kEP]);
    fprintf(stdout, "\t---------------------------------------------------------------------------------\n");
    fprintf(stdout, "\t%15s,%12.3lg,%12.3lg,%12.3lg,%12.3lg,%12.3lg\n", "Ulp err HA",  scal_ulp[kHA], host_ulp[kHA], buff_ulp[kHA], usm_ulp[kHA], span_ulp[kHA]);
    fprintf(stdout, "\t%15s,%12s,%12.3lg,%12.3lg,%12.3lg,%12.3lg\n", "Ulp err LA",  "", host_ulp[kLA], buff_ulp[kLA], usm_ulp[kLA], span_ulp[kLA]);
    fprintf(stdout, "\t%15s,%12s,%12.3lg,%12.3lg,%12.3lg,%12.3lg\n", "Ulp err EP",  "", host_ulp[kEP], buff_ulp[kEP], usm_ulp[kEP], span_ulp[kEP]);
    fprintf(stdout, "\t---------------------------------------------------------------------------------\n");
    fprintf(stdout, "\t%15s,%12.3lf,%12.3lf,%12.3lf,%12.3lf,%12.3lf\n", "GEval/sec HA",  scal_gev[kHA], host_gev[kHA], buff_gev[kHA], usm_gev[kHA], span_gev[kHA]);
    fprintf(stdout, "\t%15s,%12s,%12.3lf,%12.3lf,%12.3lf,%12.3lf\n", "GEval/sec LA",  "", host_gev[kLA], buff_gev[kLA], usm_gev[kLA], span_gev[kLA]);
    fprintf(stdout, "\t%15s,%12s,%12.3lf,%12.3lf,%12.3lf,%12.3lf\n", "GEval/sec EP",  "", host_gev[kEP], buff_gev[kEP], usm_gev[kEP], span_gev[kEP]);
    fprintf(stdout, "\t=================================================================================\n");
    fprintf(stdout, "\t GEval/sec performance comparisons:\n");
    fprintf(stdout, "\t=================================================================================\n");
    fprintf(stdout, "\t%15s,%12s,%11.2lfx,%11.2lfx,%11.2lfx,%11.2lfx\n", "HA vs Scalar",  "", host_gev[kHA]/scal_gev[kHA], buff_gev[kHA]/scal_gev[kHA], usm_gev[kHA]/scal_gev[kHA], span_gev[kHA]/scal_gev[kHA]);
    fprintf(stdout, "\t%15s,%12s,%11.2lfx,%11.2lfx,%11.2lfx,%11.2lfx\n", "LA vs Scalar",  "", host_gev[kLA]/scal_gev[kHA], buff_gev[kLA]/scal_gev[kHA], usm_gev[kLA]/scal_gev[kHA], span_gev[kLA]/scal_gev[kHA]);
    fprintf(stdout, "\t%15s,%12s,%11.2lfx,%11.2lfx,%11.2lfx,%11.2lfx\n", "EP vs Scalar",  "", host_gev[kEP]/scal_gev[kHA], buff_gev[kEP]/scal_gev[kHA], usm_gev[kEP]/scal_gev[kHA], span_gev[kEP]/scal_gev[kHA]);
    fprintf(stdout, "\t---------------------------------------------------------------------------------\n");
    fprintf(stdout, "\t%15s,%12s,%11.2lfx,%11.2lfx,%11.2lfx,%11.2lfx\n", "LA vs HA",  "", host_gev[kLA]/host_gev[kHA], buff_gev[kLA]/buff_gev[kHA], usm_gev[kLA]/usm_gev[kHA], span_gev[kLA]/span_gev[kHA]);
    fprintf(stdout, "\t%15s,%12s,%11.2lfx,%11.2lfx,%11.2lfx,%11.2lfx\n", "EP vs LA",  "", host_gev[kEP]/host_gev[kLA], buff_gev[kEP]/buff_gev[kLA], usm_gev[kEP]/usm_gev[kLA], span_gev[kEP]/span_gev[kLA]);
    fprintf(stdout, "\t%15s,%12s,%11.2lfx,%11.2lfx,%11.2lfx,%11.2lfx\n", "EP vs HA",  "", host_gev[kEP]/host_gev[kHA], buff_gev[kEP]/buff_gev[kHA], usm_gev[kEP]/usm_gev[kHA], span_gev[kEP]/span_gev[kHA]);
    fprintf(stdout, "\t=================================================================================\n");

    // The vectors release their memory automatically on return
    return 0;
}

//
// Main entry point for example.
//
// Dispatches to appropriate device types as set at build time with flag:
// -DSYCL_DEVICES_cpu -- only runs SYCL CPU device
// -DSYCL_DEVICES_gpu -- only runs SYCL GPU device
// -DSYCL_DEVICES_all (default) -- runs on all: CPU and GPU devices
//
//  For each selected device that is found, the single precision acosf
//  measurements are run; a device type that is not found is skipped.
//  The exit status is non-zero when any run reported a failure.
//
int main (int argc, char **argv)
{
    int ret = 0; // accumulated status, 0 means success
    fprintf (stdout, "sycl vm_host_perf_accuracy: started...\n"); fflush (stdout);

    // Device types selected for this build
    std::list<my_sycl_device_types> list_of_devices;
    set_list_of_devices (list_of_devices);

    // Visit every selected device type
    for (auto dev_type : list_of_devices)
    {
        sycl::device my_dev;
        bool my_dev_is_found = false;
        get_sycl_device (my_dev, my_dev_is_found, dev_type);

        // Nothing to run when the device is not available
        if (!my_dev_is_found)
        {
            fprintf (stderr, "No %s devices found; skipping %s tests.\n",
                sycl_device_names[dev_type].c_str(), sycl_device_names[dev_type].c_str());
            continue;
        }

        fprintf (stdout, "Running tests on %s.\n", sycl_device_names[dev_type].c_str()); fflush (stdout);
        try
        {
            ret |= own_run_on (my_dev);
        }
        catch (sycl::exception const& e)
        {
            fprintf (stderr, "sycl::exception caught. %s\n", e.what());
            ret = 1;
        }
        catch (std::exception const& e)
        {
            fprintf (stderr, "std::exception caught. %s\n", e.what());
            ret = 1;
        }
    }

    const char * verdict = (ret == 0) ? "PASS" : "FAIL";
    fflush (stdout); fprintf (stdout, "sycl vm_host_perf_accuracy: %s\n\n", verdict);
    return ret;
}
