
//===========================================================================
//
// This example from a prerelease of the Scalable HeterOgeneous Computing
// (SHOC) Benchmark Suite Alpha v1.1.1i for Intel MIC architecture
// Contact: Kyle Spafford <kys@ornl.gov>
//         Steve Sylvester <steve.s.sylvester@intel.com>
//
// Copyright (c) 2011, UT-Battelle, LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of Oak Ridge National Laboratory, nor UT-Battelle, LLC, nor
//    the names of its contributors may be used to endorse or promote products
//    derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// ==============================================================================


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifndef _WIN32
#include <sys/mman.h>
#else
#include <Windows.h>
#endif

// Compiler compatibility helpers.
//
// With MSVC, sprintf is redirected to sprintf_s using a fixed destination size
// of SZ_DEFAULT_BUF, so every buffer passed to sprintf in this file must be at
// least SZ_DEFAULT_BUF bytes long.
//
// report_fence_error(buf) prints the text for the current errno to stderr after
// a failed scif_fence_signal. The MSVC variant formats the text into buf with
// strerror_s; the other variant ignores its argument and uses strerror.
// Both variants are wrapped in do { } while (0) so that the macro expands to a
// single statement and is safe as the body of an unbraced if/else.
#ifdef _MSC_VER
#define sprintf(X, Y, ...)  sprintf_s((X), SZ_DEFAULT_BUF, (Y), __VA_ARGS__)
#define report_fence_error(fence_err_buf)  \
        do { \
                strerror_s(fence_err_buf, SZ_DEFAULT_BUF, errno); \
                fprintf(stderr, "self scif_fence_signal failed with err %s\n", fence_err_buf); \
        } while (0)
#else
#define report_fence_error(X)  \
        do { \
                fprintf(stderr, "self scif_fence_signal failed with err %s\n", strerror(errno)); \
        } while (0)
#endif

#include <scif.h>
#include "OptionParser.h"
#include "ResultDatabase.h"
#include "cross_timer.h"

// ****************************************************************************
// Function: addBenchmarkSpecOptions
//
// Purpose:
//   Registers the command line options that are specific to this benchmark.
//
//   -nopinned
//   Boolean switch. When given, the data buffer is not registered with SCIF
//   and the transfers use the virtual-address calls (scif_vwriteto /
//   scif_vreadfrom) instead of the registered-window calls.
//
//   -fence
//   Integer, default 0. A value of 0 makes the benchmark detect transfer
//   completion with scif_fence_signal; any other value selects
//   scif_fence_mark followed by scif_fence_wait.
//
// Arguments:
//   op: the options parser / parameter database that receives the options
//
// Returns:  nothing
//
// Programmer: Jeremy Meredith
// Creation: September 08, 2009
//
// Modifications:
//
// ****************************************************************************
void addBenchmarkSpecOptions(OptionParser &op)
{
    // Page-locked (registered) memory is used unless this switch is present.
    op.addOption("nopinned",
                 OPT_BOOL,
                 "",
                 "disable usage of pinned (pagelocked) memory",
                 'p');

    // Selects the completion-detection mechanism used in the timing loop.
    op.addOption("fence",
                 OPT_INT,
                 "0",
                 "toggle fence_mark vs fence_signal (default)",
                 'f');
}

// ****************************************************************************
// Function: RunBenchmark
//
// Purpose:
//   Measures the readback bandwidth of the SCIF connection behind the given
//   endpoint.  After writing a test pattern to the peer, this benchmark
//   repeatedly reads data chunks of various sizes back (scif_readfrom or
//   scif_vreadfrom) and records the bandwidth and time of each transfer.
//
//
// Arguments:
//  resultDB: the benchmark stores its results in this ResultDatabase
//  op: the options parser / parameter database
//  epd: the SCIF endpoint used for all messages and transfers
//
// Returns:  nothing
//
// Programmer: Jeremy Meredith
// Creation: September 08, 2009
//
// Modifications:
//
// ****************************************************************************


// Registered-address-space offset requested for the data buffer in
// RunBenchmark; also used as the buffer alignment on the Windows
// allocation path.
#define START_OFFSET 0x200000
// Size in bytes of the signal page that RunBenchmark allocates and registers.
#define PAGE_SIZE 4096
// Size of the character buffers used for error text and result labels, and the
// destination size passed to sprintf_s / strerror_s in the MSVC macros.
#define SZ_DEFAULT_BUF 256
// Synchronization point with the peer on endpoint epd: sends control_msg and
// then receives a control_msg-sized reply.
//
// The macro is not self-contained. It expects the expanding scope to provide
//   - an int variable named err (receives the scif_send / scif_recv result),
//   - a variable named control_msg (used as send and receive buffer),
//   - a label named close (jumped to when either call returns <= 0).
// The string argument is accepted but never used in the expansion.
// NOTE(review): the flag value 1 passed to scif_send / scif_recv presumably
// selects blocking mode (SCIF_SEND_BLOCK / SCIF_RECV_BLOCK) - confirm against
// scif.h.
#define SCIFBARRIER(epd, string) { \
        if ((err = scif_send(epd, &control_msg, sizeof(control_msg), 1)) <= 0) { \
                printf("scif_send failed with err %d\n", err); \
                fflush(stdout); \
                goto close; \
        } \
        if ((err = scif_recv(epd, &control_msg, sizeof(control_msg), 1)) <= 0) { \
                printf("scif_recv failed with err %d\n", err); \
                fflush(stdout); \
                goto close; \
        } \
}

// Completion detection based on scif_fence_signal.
//
// Clears the int at signal_addr, asks SCIF to store 0xdeadbeef at the local
// registered offset signal_offset (flags SCIF_FENCE_INIT_SELF |
// SCIF_SIGNAL_LOCAL; the remote offset/value arguments are passed as 0), and
// then busy-waits until that value becomes visible at signal_addr.
//
// Arguments:
//   newepd:        SCIF endpoint the fence is issued on
//   signal_addr:   local address of the signal word (must correspond to
//                  signal_offset in the registered address space)
//   signal_offset: registered offset of the signal word
//   err:           lvalue that receives errno if scif_fence_signal fails
//   fence_err_buf: scratch buffer handed to report_fence_error
//
// If scif_fence_signal fails, the error is reported and the busy-wait is
// skipped: the signal value would never be written in that case, so spinning
// on it would hang the caller.
#define FENCE(newepd, signal_addr, signal_offset, err, fence_err_buf) { \
        *(volatile int*)signal_addr = 0; \
        if (scif_fence_signal(newepd, signal_offset, 0xdeadbeef, 0, 0, \
                    SCIF_FENCE_INIT_SELF | SCIF_SIGNAL_LOCAL) < 0) { \
            err = errno; \
            report_fence_error(fence_err_buf); \
        } else { \
            while (*(volatile int*)signal_addr != 0xdeadbeef); \
        } \
}

// Describes one locally allocated buffer and its SCIF registration as used by
// RunBenchmark.
struct window_info {
    // Local address of the buffer (allocated with posix_memalign or
    // _aligned_malloc in RunBenchmark and released there).
    void *self_addr;
    // Not referenced by RunBenchmark other than being zeroed by memset.
    void *vself_addr;
    // Offset returned by scif_register for self_addr; remains 0 when the
    // buffer is not registered (the "nopinned" case).
    off_t offset;
};

// Measures SCIF readback bandwidth on endpoint epd.
//
// The function allocates a data buffer large enough for the biggest transfer
// (1 GiB) plus a one-page signal buffer, registers the signal page (and, unless
// "nopinned" is set, the data buffer) with SCIF, tells the peer how many
// transfers to expect, pushes a test pattern to the peer, and then times
// passes * nSizes read transfers of 1 kB .. 1 GiB. For every transfer it adds
// a "ReadbackSpeed" (GiB/sec) and a "ReadbackTime" (ms) entry to resultDB.
//
// Arguments:
//   op:       option database; reads "verbose", "nopinned", "fence", "passes"
//   resultDB: receives the measured results
//   epd:      SCIF endpoint used for messages, RMA transfers and fences
//
// Returns: nothing. Allocation failures and a failed signal-page registration
// terminate the process with exit(-1); failures of the data-buffer
// registration, the barriers or the repetition-count send abort the benchmark
// and fall through to the cleanup code.
//
// Note: because of the `goto close` statements (also hidden inside
// SCIFBARRIER), no initialized variable may be declared at function scope
// between the first goto and the close label.
void RunBenchmark(OptionParser &op, ResultDatabase &resultDB, scif_epd_t epd)
{
    // Payload exchanged by SCIFBARRIER; initialized so that no indeterminate
    // bytes are handed to scif_send.
    int control_msg = 0;
    off_t offset;

    cross_timer_t timeIn;
    cross_timer_t timeOut;
    char err_buf[SZ_DEFAULT_BUF];
    memset(&timeIn, '\0', sizeof(timeIn));
    memset(&timeOut, '\0', sizeof(timeOut));
    int err = 0;
    int reps = 0;
    int mark = -1;
    off_t signal_offset = 0x1000;
    void* signal_addr = 0;
    unsigned int signal_msg_size = PAGE_SIZE;
    float *devMem;

    const bool verbose = op.getOptionBool("verbose");
    const bool pinned = !op.getOptionBool("nopinned");
    const unsigned int use_fence_mark = (unsigned int)op.getOptionInt("fence");
    const unsigned int passes = (unsigned int)op.getOptionInt("passes");

    // Sizes are in kB
    const int nSizes  = 21;
    int sizes[nSizes] = {1, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, 1<<8, 1<<9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, 1<<15, 1<<16, 1<<17, 1<<18, 1<<19, 1<<20};
    // Number of floats needed to hold the largest transfer.
    long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;

    struct window_info buffer;
    memset(&buffer, '\0', sizeof(buffer));

#ifdef _WIN32
    buffer.self_addr = _aligned_malloc(numMaxFloats*sizeof(float), START_OFFSET);
    if(!buffer.self_addr)
    {
        printf("SHOC Win32 Allocation Error\n");
        exit(-1);
    }

    signal_addr = _aligned_malloc(PAGE_SIZE, PAGE_SIZE);
    if(!signal_addr)
    {
        printf("SHOC Win32 Signal Page Error\n");
        exit(-1);
    }
#else
    // posix_memalign reports failure through its return value, so each call
    // has to be checked on its own; otherwise a failed data-buffer allocation
    // would be masked by a successful signal-page allocation.
    err = posix_memalign(&(buffer.self_addr), START_OFFSET, numMaxFloats*sizeof(float));
    if(err)
    {
        printf("SCIF memory allocation failed\n");
        exit(-1);
    }

    err = posix_memalign(&(signal_addr), PAGE_SIZE, PAGE_SIZE);
    if(err)
    {
        printf("SCIF memory allocation failed\n");
        exit(-1);
    }
#endif
    memset(signal_addr, 0x0, signal_msg_size);

    // Register the signal page at a fixed offset so that FENCE can target it.
    if((signal_offset = scif_register(epd,
                signal_addr,
                signal_msg_size,
                signal_offset,
                SCIF_PROT_READ | SCIF_PROT_WRITE,
                SCIF_MAP_FIXED)) < 0)
    {
        printf("signal register failed\n");
        exit(-1);
    }

    // Fill the host buffer with a repeating test pattern.
    float* hostBuf = (float*)buffer.self_addr;
    offset = START_OFFSET;

    for (int i = 0; i < numMaxFloats; i++)
    {
        hostBuf[i] = (float)(i % 77);
    }

    if (pinned)
    {
        // Expose the whole data buffer as a registered window at START_OFFSET.
        buffer.offset = scif_register(epd,
                        buffer.self_addr,
                        numMaxFloats*sizeof(float),
                        offset,
                        SCIF_PROT_READ | SCIF_PROT_WRITE,
                        SCIF_MAP_FIXED);

        if(buffer.offset < 1)
        {
            printf("scif_register buffer 0 failed with error: ");
            fflush(NULL);
            perror(NULL);
            goto close;
        }
    }

    // Total number of timed transfers; sent to the peer below.
    reps = passes * nSizes;

    SCIFBARRIER(epd, "Client: Window Registered\n");

    err = scif_send(epd, &reps, sizeof(reps), 1);
    if (err < 0)
    {
        printf("error sending repetition count\n");
        perror(NULL);
        goto close;
    }

    // Re-apply the test pattern (same values as written above) right before
    // it is pushed to the peer.
    devMem = (float*)buffer.self_addr;

    for (int i = 0; i < numMaxFloats; i++)
    {
        devMem[i] = (float)(i % 77);
    }

    // Get the data into the device side.
    if (pinned)
    {
        err = scif_writeto(epd,
                           buffer.offset,
                           numMaxFloats*sizeof(float),
                           buffer.offset,
                           0);
    }
    else
    {
        err = scif_vwriteto(epd,
                           buffer.self_addr,
                           numMaxFloats*sizeof(float),
                           offset,
                           0);
    }
    if(err != 0)
    {
        printf("scif_writeto failed with err %d\n", errno);
        perror(NULL);
    }
    // Wait until the initial write has completed before timing anything.
    FENCE(epd, signal_addr, signal_offset, err, err_buf);

    // Run the requested number of passes over all transfer sizes.
    for (unsigned int pass = 0; pass < passes; pass++)
    {
        // Step through sizes forward on even passes and backward on odd
        for (int i = 0; i < nSizes; i++)
        {
            int sizeIndex;
            if ((pass % 2) == 0)
                sizeIndex = i;
            else
                sizeIndex = (nSizes - 1) - i;

            int nbytes = sizes[sizeIndex] * 1024;

            //Time In
            cross_timer_sample(&timeIn);

            if (pinned)
            {
                err = scif_readfrom(epd,
                            buffer.offset,
                            nbytes,
                            buffer.offset,
                            0);
            }
            else
            {
                err = scif_vreadfrom(epd,
                            buffer.self_addr,
                            nbytes,
                            offset,
                            0);
            }
            if(err != 0)
            {
                printf("scif_readfrom failed with err %d\n", errno);
                perror(NULL);
            }

            //Completion Detection
            if(use_fence_mark == 0)
            {
                FENCE(epd, signal_addr, signal_offset, err, err_buf);
            }
            else
            {
                // Only wait when a mark was actually obtained; waiting on a
                // stale mark value would not track this transfer.
                if (scif_fence_mark(epd, SCIF_FENCE_INIT_SELF, &mark) < 0)
                {
                    printf("scif_fence_mark failed with err %d\n", errno);
                    perror(NULL);
                }
                else if (scif_fence_wait(epd, mark) < 0)
                {
                    printf("scif_fence_wait failed with err %d\n", errno);
                    perror(NULL);
                }
            }
            //Time Out
            cross_timer_sample(&timeOut);

            long double t = cross_timer_diff(timeIn, timeOut);
            if (verbose)
            {
                cerr << "size " << sizes[sizeIndex] << "k took " << t <<
                        " sec\n";
            }
            // sizes[] is in kB, so dividing by 1024*1024 yields GiB.
            double speed = ((long double)(sizes[sizeIndex]) / ((long double)1024. * (long double)1024.)) / t;
            char sizeStr[SZ_DEFAULT_BUF];
            sprintf(sizeStr, "% 8dkB", sizes[sizeIndex]);
            resultDB.AddResult("ReadbackSpeed", sizeStr, "GiB/sec", speed);
            resultDB.AddResult("ReadbackTime", sizeStr, "ms", (double)(t*1e3));
        }
    }
    SCIFBARRIER(epd, "Client: Done\n");

close:
    // Cleanup
#ifndef _WIN32
    free(buffer.self_addr);
    buffer.self_addr = NULL;
    free(signal_addr);
    signal_addr = NULL;
#else
    if(buffer.self_addr)
    {
        _aligned_free(buffer.self_addr);
        buffer.self_addr = NULL;
    }

    if(signal_addr)
    {
        _aligned_free(signal_addr);
        signal_addr = NULL;
    }
#endif
}
