/*******************************************************************************
* Copyright (C) 2018 Intel Corporation
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

/*
 *
 *  Content:
 *            oneMKL VM different accuracies usage example
 *            and accuracy vs performance tradeoff demonstration:
 *
 *            Call scalar LIBM acosf and oneMKL VM vmsAcos
 *            single precision functions on randomly distributed (-1.0f, 1.0f)
 *            vector with length = 1M using three accuracy oneMKL VM flavors:
 *                HA (High Accuracy)
 *                LA (Low Accuracy)
 *                EP (Enhanced Performance)
 *            Compare maximum observed relative errors, ulps (units in last place)
 *            and performance measured in geval/sec (giga evaluations per second)
 *
 *
 *******************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>

#ifdef _WIN32
  #include <windows.h>
#else
  #include <time.h>
#endif

#include "mkl.h"

/**
 * General input constants shared by the scalar and VM measurements.
 */
/* Number of elements in every argument, result and reference vector */
static const int64_t vm_length = 1000000;
/* Number of timed repetitions of each VM call (passed to own_vm_host as rep) */
static const int vm_repeats = 1000;
/* Input range begin: -1.0+EPS, with EPS = 1.1e-07 */
static const float beg   = -1.0f + 1.1e-07f;
/* Input range end: 1.0-EPS, with EPS = 1.1e-07 */
static const float end   = +1.0f - 1.1e-07f;

/**
 * Mapping to oneMKL VM accuracy mode constants.
 *
 * The table is indexed with VmAccuracy values (kHA, kLA, kEP) in own_vm_host,
 * so the entry order here must stay in sync with that enumeration.
 */
static const unsigned int vm_accuracy_mode[] =
{
    VML_HA,
    VML_LA,
    VML_EP
};

/**
 * Available function accuracies.
 *
 * The enumerators double as indices into vm_accuracy_mode and into the
 * per-accuracy result arrays (err, ulp, gev); kAccNum is the array length.
 */
enum VmAccuracy
{
    kHA = 0,  /* HA: High Accuracy */
    kLA,      /* LA: Low Accuracy */
    kEP,      /* EP: Enhanced Performance */
    kAccNum   /* Number of accuracies */
};

/**
 * @brief Safe malloc
 *
 * own_safe_malloc allocates memory and checks the resulting pointer.
 * Reports an error on stderr and exits the application if unsuccessful.
 *
 * The size is taken as size_t: callers compute it as count * sizeof(type),
 * which is a size_t expression, and a narrower parameter type would silently
 * truncate large requests before they reach malloc.
 *
 * @param[in] size          Size in bytes
 * @return                  Pointer to allocated memory (never NULL)
 *
 */
static void* own_safe_malloc(size_t size)
{
    void* ptr = malloc (size);
    if (ptr == NULL)
    {
       /* %zu is the conversion matching size_t */
       fprintf (stderr, "\t\tERROR: %zu bytes allocated unsuccesfully\n", size);
       exit(-1);
    }

    return ptr;
}

/**
 * @brief Safe free
 *
 * own_safe_free releases a block of memory.
 * A NULL pointer is treated as a fatal error: a message is written to
 * stderr and the application exits.
 *
 * @param[in] ptr          Pointer to memory
 *
 */
static void own_safe_free(void *ptr)
{
    /* Guard clause: a NULL argument is reported and terminates the run */
    if (ptr == NULL)
    {
       fprintf (stderr, "\t\tERROR: NULL pointer cannot be deallocated\n");
       exit(-1);
    }

    free (ptr);
}

/**
 * @brief Clock timer.
 *
 * own_get_nano reads a monotonic clock and returns its value in nanoseconds.
 * Callers use only the difference between two readings.
 *
 * On Windows the performance counter reports ticks, not nanoseconds, so the
 * tick count is scaled by the counter frequency (ticks per second).
 * Elsewhere clock_gettime(CLOCK_MONOTONIC) is used.
 *
 * @return Timestamp in nanoseconds, or 0 if the clock could not be read.
 *
 */
static uint64_t own_get_nano(void)
{
    uint64_t timer = 0;
#ifdef _WIN32
    LARGE_INTEGER ticks;
    LARGE_INTEGER freq;
    if (QueryPerformanceFrequency (&freq) && (freq.QuadPart > 0) &&
        QueryPerformanceCounter (&ticks))
    {
        const uint64_t f = (uint64_t)freq.QuadPart;
        const uint64_t t = (uint64_t)ticks.QuadPart;
        /* Split into whole seconds and remainder so that the multiplication */
        /* by 1e9 cannot overflow 64 bits for realistic counter frequencies. */
        timer = (t / f) * 1000000000ull + ((t % f) * 1000000000ull) / f;
    }
#else
    struct timespec ts;
    if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0)
    {
        timer = (uint64_t)ts.tv_sec * 1000 * 1000 * 1000 + ts.tv_nsec;
    }
#endif
    return timer;
}

/**
 * @brief Computation of maximum relative error and ulp.
 *
 * own_compute_err scans the result and reference vectors element by element
 * and records the largest relative error and the largest simplified ulp
 * error that it observes.
 *
 * The ulp size for an element is taken as 2^(ex-24), where ex is the binary
 * exponent that frexp reports for the reference value (mantissa in [0.5, 1)).
 * No guard is applied to the relative-error division by the reference value.
 *
 * @param[in] len            Vectors length
 * @param[in] res            Resulted vector
 * @param[in] ref            Reference vector
 * @param[out] err           Computed maximum relative error
 * @param[out] ulp           Computed maximum ulp error
 *
 */
static void own_compute_err (int64_t len, float* res, double* ref, double* err, double* ulp)
{
    int    bin_exp   = 0;
    double worst_rel = 0.0;
    double worst_ulp = 0.0;

    for (int64_t k = 0; k < len; k++)
    {
        const double expected = ref[k];
        const double diff     = (double)res[k] - expected;

        /* Binary exponent of the reference value; the mantissa is not needed */
        frexp (expected, &bin_exp);

        /* Size of one single-precision ulp at the reference value */
        double ulp_size = ldexp (1.0, bin_exp - 24);
        if (ulp_size == 0.0)
        {
            /* Avoid division by zero: fall back to 2^-149 */
            ulp_size = 0x1.p-149;
        }

        /* Error of this element in ulps and relative to the reference */
        const double cur_ulp = fabs (diff / ulp_size);
        const double cur_rel = fabs (diff / expected);

        /* Running maxima; a NaN candidate compares false and is skipped */
        if (cur_ulp > worst_ulp) { worst_ulp = cur_ulp; }
        if (cur_rel > worst_rel) { worst_rel = cur_rel; }
    }

    *err = worst_rel;
    *ulp = worst_ulp;
}

/**
 * @brief Run scalar function on host
 *
 * Measures performance and accuracy of the scalar acosf function.
 * The vector is processed once untimed as a warmup and once timed; the
 * outcome is stored in the kHA slot of the err, ulp and gev arrays.
 *
 * @param[in]  len         Vector length
 * @param[in]  arg         Arguments array
 * @param[out] res         Results array
 * @param[in]  ref         Reference results array
 * @param[out] err         Resulted relative error (written at index kHA)
 * @param[out] ulp         Resulted ulp (written at index kHA)
 * @param[out] gev         Resulted performance (GEval/sec, written at index kHA)
 *
 */
void own_libm_scalar (int64_t len, float* arg, float* res, double* ref, double* err, double* ulp, double* gev)
{
    int64_t k;

    /* Untimed warmup pass */
    /* NOTE(review): novector presumably keeps the scalar baseline from being auto-vectorized - confirm for the compiler in use */
    #pragma novector
    for (k = 0; k < len; k++)
    {
        res[k] = acosf (arg[k]);
    }

    /* Timed pass */
    const uint64_t start = own_get_nano ();
    #pragma novector
    for (k = 0; k < len; k++)
    {
        res[k] = acosf (arg[k]);
    }
    const uint64_t elapsed = own_get_nano () - start;

    /* Evaluations per nanosecond, i.e. giga evaluations per second */
    gev[kHA] = (double)len / elapsed;

    /* Compute relative error & ulp against the reference vector */
    own_compute_err (len, res, ref, &err[kHA], &ulp[kHA]);
}

/**
 * @brief Run VM host (classic) API
 *
 * Measures performance and accuracy of vmsAcos for every accuracy mode in
 * vm_accuracy_mode. For each mode one untimed warmup call is made, then the
 * call is repeated rep times under the timer, and the last result vector is
 * compared with the reference.
 *
 * @param[in]  len         Vector length
 * @param[in]  rep         Number of repeats
 * @param[in]  arg         Arguments array
 * @param[out] res         Results array
 * @param[in]  ref         Reference results array
 * @param[out] err         Resulted relative errors (one per accuracy)
 * @param[out] ulp         Resulted ulps (one per accuracy)
 * @param[out] gev         Resulted performance (GEval/sec, one per accuracy)
 *
 */
void own_vm_host (int64_t len, int64_t rep, float* arg, float* res, double* ref, double* err, double* ulp, double* gev)
{
    /* Loop by accuracies */
    for (int a = kHA; a < kAccNum; a++)
    {
        /* Warmup call */
        vmsAcos (len, arg, res, vm_accuracy_mode[a]);
        uint64_t ns = own_get_nano ();
        /* Do several repeats; the counter has the same width as rep */
        for (int64_t j = 0; j < rep; j++)
        {
            vmsAcos (len, arg, res, vm_accuracy_mode[a]);
        }
        /* Form the evaluation count in double so len * rep cannot overflow */
        gev[a] = ((double)len * (double)rep) / (own_get_nano () - ns);
        /* Compute relative error & ulp */
        own_compute_err (len, res, ref, &(err[a]), &(ulp[a]));
    }
}


/**
 * @brief Main function for VM accuracy example
 *
 * Generates a vector of pseudo-random arguments, computes a double precision
 * acos reference, runs the scalar and the VM measurements and prints the
 * relative errors, ulp errors, GEval/sec figures and speedup ratios.
 * The command line arguments are not used.
 *
 * @param[in] argc         Number of arguments
 * @param[in] argv         Pointer to argument strings
 * @return                 -1 for FAIL or 0 for PASS
 *
 */
int main(int argc, char **argv)
{
    /* Table rulers used by the report below */
    static const char heavy_rule[] = "\t=========================================\n";
    static const char light_rule[] = "\t-----------------------------------------\n";

    /* Relative error, ulp and GEval/sec results for scalar LIBM and for VM */
    double libm_err[kAccNum] = {0}, libm_ulp[kAccNum] = {0}, libm_gev[kAccNum] = {0};
    double vml_err[kAccNum]  = {0}, vml_ulp[kAccNum]  = {0}, vml_gev[kAccNum]  = {0};

    /* Argument, result and reference vectors */
    float  *inputs   = (float*) own_safe_malloc (vm_length * sizeof(float));
    float  *outputs  = (float*) own_safe_malloc (vm_length * sizeof(float));
    double *expected = (double*)own_safe_malloc (vm_length * sizeof(double));

    printf ("classic c vm_perf_accuracy: started...\n");
    fflush (stdout);

    /**
     * Fill the source vector with pseudo-random numbers scaled into the
     * beg..end range (fixed seed), and compute the double precision acos()
     * reference for each generated argument.
     */
    srand (777);
    for (int64_t k = 0; k < vm_length; k++)
    {
        inputs[k]   = (float)(beg + (end - beg) * (float)(rand()) / (float)(RAND_MAX));
        expected[k] = acos ((double)inputs[k]);
    }

    /* Scalar LIBM measurement */
    own_libm_scalar (vm_length, inputs, outputs, expected, libm_err, libm_ulp, libm_gev);
    /* VM measurement for all accuracies */
    own_vm_host (vm_length, vm_repeats, inputs, outputs, expected, vml_err, vml_ulp, vml_gev);

    /* Report: header */
    fputs (heavy_rule, stdout);
    printf ("\t%15s,%12s,%12s\n", "<acosf>","Scalar", "VM");
    fputs (heavy_rule, stdout);

    /* Report: relative errors */
    printf ("\t%15s,%12.3le,%12.3le\n", "Relative err HA",  libm_err[kHA], vml_err[kHA]);
    printf ("\t%15s,%12s,%12.3le\n", "Relative err LA", "", vml_err[kLA]);
    printf ("\t%15s,%12s,%12.3le\n", "Relative err EP", "", vml_err[kEP]);
    fputs (light_rule, stdout);

    /* Report: ulp errors */
    printf ("\t%15s,%12.3lg,%12.3lg\n", "Ulp err HA",   libm_ulp[kHA], vml_ulp[kHA]);
    printf ("\t%15s,%12s,%12.3lg\n", "Ulp err LA",  "", vml_ulp[kLA]);
    printf ("\t%15s,%12s,%12.3lg\n", "Ulp err EP",  "", vml_ulp[kEP]);
    fputs (light_rule, stdout);

    /* Report: throughput */
    printf ("\t%15s,%12.3lf,%12.3lf\n", "GEval/sec HA",   libm_gev[kHA], vml_gev[kHA]);
    printf ("\t%15s,%12s,%12.3lf\n", "GEval/sec LA",  "", vml_gev[kLA]);
    printf ("\t%15s,%12s,%12.3lf\n", "GEval/sec EP",  "", vml_gev[kEP]);
    fputs (heavy_rule, stdout);

    /* Report: speedups of VM over scalar and between VM accuracies */
    printf ("\t GEval/sec performance comparisons:\n");
    fputs (heavy_rule, stdout);
    printf ("\t%15s,%12s,%11.2lfx\n", "HA vs Scalar",  "", vml_gev[kHA]/libm_gev[kHA]);
    printf ("\t%15s,%12s,%11.2lfx\n", "LA vs Scalar",  "", vml_gev[kLA]/libm_gev[kHA]);
    printf ("\t%15s,%12s,%11.2lfx\n", "EP vs Scalar",  "", vml_gev[kEP]/libm_gev[kHA]);
    fputs (light_rule, stdout);
    printf ("\t%15s,%12s,%11.2lfx\n", "LA vs HA",  "", vml_gev[kLA]/vml_gev[kHA]);
    printf ("\t%15s,%12s,%11.2lfx\n", "EP vs LA",  "", vml_gev[kEP]/vml_gev[kLA]);
    printf ("\t%15s,%12s,%11.2lfx\n", "EP vs HA",  "", vml_gev[kEP]/vml_gev[kHA]);
    fputs (heavy_rule, stdout);

    /* Free allocated memory */
    own_safe_free (inputs);
    own_safe_free (outputs);
    own_safe_free (expected);

    /* A VML error status below VML_STATUS_OK marks the run as failed */
    const int failed = (vmlGetErrStatus () < VML_STATUS_OK);
    printf ("classic c vm_perf_accuracy: %s\n\n", failed ? "FAIL" : "PASS");

    return failed ? -1 : 0;
}

