/*******************************************************************************
!   Copyright(C) 2011-2012 Intel Corporation. All Rights Reserved.
!
!   The source code, information  and  material ("Material") contained herein is
!   owned  by Intel Corporation or its suppliers or licensors, and title to such
!   Material remains  with Intel Corporation  or its suppliers or licensors. The
!   Material  contains proprietary information  of  Intel or  its  suppliers and
!   licensors. The  Material is protected by worldwide copyright laws and treaty
!   provisions. No  part  of  the  Material  may  be  used,  copied, reproduced,
!   modified, published, uploaded, posted, transmitted, distributed or disclosed
!   in any way  without Intel's  prior  express written  permission. No  license
!   under  any patent, copyright  or  other intellectual property rights  in the
!   Material  is  granted  to  or  conferred  upon  you,  either  expressly,  by
!   implication, inducement,  estoppel or  otherwise.  Any  license  under  such
!   intellectual  property  rights must  be express  and  approved  by  Intel in
!   writing.
!
!   *Third Party trademarks are the property of their respective owners.
!
!   Unless otherwise  agreed  by Intel  in writing, you may not remove  or alter
!   this  notice or  any other notice embedded  in Materials by Intel or Intel's
!   suppliers or licensors in any way.
!
!*******************************************************************************
!   Content:
!       Utility functions for SGEMM, DGEMM, CGEMM, ZGEMM benchmarks
!******************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <limits.h>
#include <sys/mman.h>

#include <immintrin.h>

#include "utils.h"

#if defined(MCDRAM_KNLSB)

#include <memkind.h>

/* Memkind kind selected by the most recent bench_malloc() call; bench_free() passes it to memkind_free(). */
memkind_t kind;

/*
 * Allocates size bytes, aligned to 4096 bytes, through memkind.
 *
 * MEMKIND_HBW is used when memkind_check_available() reports it as available;
 * otherwise a warning is printed to stderr and MEMKIND_DEFAULT is used. The
 * chosen kind is stored in the global `kind` on every call.
 *
 * On allocation failure an error is printed to stderr and the process exits
 * with the negated memkind_posix_memalign() return code. On success the
 * buffer is passed to advise_huge_pages() and returned.
 */
void *bench_malloc(size_t size)
{
    void *buffer = NULL;
    int status;

    if (memkind_check_available(MEMKIND_HBW) != 0) {
        fprintf(stderr, "WARNING: There's NO high bandwidth memory using standard memory\n");
        kind = MEMKIND_DEFAULT;
    } else {
        kind = MEMKIND_HBW;
    }

    status = memkind_posix_memalign(kind, &buffer, 4096, size);
    if (status != 0) {
        fprintf(stderr, "ERROR: Unable to allocate MCDRAM memory\n");
        exit(-status);
    }

    advise_huge_pages(buffer, size);
    return buffer;
}

/*
 * Releases a buffer through memkind_free(), using the kind currently stored
 * in the global `kind`. The size argument is not used by this variant.
 */
void bench_free(void *ptr, size_t size)
{
    (void)size;
    memkind_free(kind, ptr);
}

#elif defined(KNL_SNC_MODE)

#include <numa.h>
#include <string.h>

/*
 * Allocates size bytes for the benchmark matrices through libnuma.
 *
 * If the environment variable MKL_FAST_MEMORY_LIMIT is exactly "0", the memory
 * is requested with numa_alloc_local(); otherwise it is requested on NUMA node
 * hbw_memory_numa_node with numa_alloc_onnode(). A successful allocation is
 * passed to advise_huge_pages() before being returned.
 *
 * Returns the allocated pointer, or NULL (after printing an error to stderr)
 * when the allocation fails. Release the buffer with bench_free(), passing
 * the same size.
 */
void *bench_malloc(size_t size, int hbw_memory_numa_node)
{
	void *ptr = NULL;
	char *mkl_fast_memory_limit = getenv("MKL_FAST_MEMORY_LIMIT");

	// environment variable MKL_FAST_MEMORY_LIMIT=0 means use DDR memory only
	// (comparing 2 characters includes the terminator, so only "0" matches)
	if (mkl_fast_memory_limit != NULL && strncmp("0", mkl_fast_memory_limit, 2) == 0) {
		ptr = numa_alloc_local(size);
	} else {
		ptr = numa_alloc_onnode(size, hbw_memory_numa_node);
	}

	if (ptr == NULL) {
		fprintf(stderr, "ERROR: unable to allocate memory for matrices.\n");
		// Nothing to advise: passing NULL on to advise_huge_pages() would
		// only add a second, misleading huge-page warning.
		return NULL;
	}

	advise_huge_pages(ptr, size);
	return ptr;
}

/*
 * Releases a buffer with numa_free(). A NULL ptr is ignored. size is
 * forwarded unchanged to numa_free().
 */
void bench_free(void *ptr, size_t size)
{
	if (ptr == NULL) {
		return;
	}

	numa_free(ptr, size);
}

/*
 * Sorts the NUMA nodes 0..numa_max_node() into two lists. Returns 0 on
 * success or 1 if any argument is NULL.
 *
 *  - cpu_nodes, num_cpu_nodes: array that receives the IDs of the nodes whose
 *    CPU mask is non-empty, and the counter of entries stored in it.
 *
 *  - hbw_nodes, num_hbw_nodes: array that receives the IDs of the nodes whose
 *    CPU mask is empty (treated as HBW nodes), and the counter of entries
 *    stored in it.
 *
 * The counters are only incremented, never reset, so the caller must set them
 * to 0 beforehand. No bounds checking is done: cpu_nodes and hbw_nodes must be
 * big enough. A node whose CPU mask cannot be read is skipped with a warning.
 */
int get_numa_nodes(int *cpu_nodes, size_t *num_cpu_nodes, int *hbw_nodes, size_t *num_hbw_nodes)
{
	int last_node;
	int node;
	struct bitmask *node_cpus;

	if (!cpu_nodes || !num_cpu_nodes || !hbw_nodes || !num_hbw_nodes) {
		return 1;
	}

	last_node = numa_max_node();
	node_cpus = numa_allocate_cpumask();

	for (node = 0; node <= last_node; node++) {
		if (numa_node_to_cpus(node, node_cpus) == 0) {
			if (numa_bitmask_weight(node_cpus) != 0) {
				cpu_nodes[*num_cpu_nodes] = node;
				*num_cpu_nodes += 1;
			} else {
				hbw_nodes[*num_hbw_nodes] = node;
				*num_hbw_nodes += 1;
			}
		} else {
			printf("WARNING: Unable to determine CPUs for NUMA node %d\n", node);
		}

		/* Start every iteration from an empty mask, whatever happened above. */
		numa_bitmask_clearall(node_cpus);
	}

	numa_free_cpumask(node_cpus);
	return 0;
}


/*
 * Pairs every CPU NUMA node with its nearest HBW NUMA node.
 *
 * all_numa_neighbours must point to an array of numa_neighbour structs big
 * enough to hold one entry per CPU NUMA node; at most MAX_KNL_NUMA_NODES
 * entries are written. For each CPU node, hbw_node is set to the HBW node
 * with the smallest numa_distance() that is neither 0 (error) nor
 * NUMA_DISTANCE_TO_MYSELF; if no HBW node qualifies, hbw_node is the CPU node
 * itself.
 *
 * Returns the number of entries populated, or -1 on error (NULL argument or
 * get_numa_nodes() failure).
 */
int closer_numa_nodes(struct numa_neighbour *all_numa_neighbours)
{
	int cpu_nodes[MAX_KNL_NUMA_NODES], hbw_nodes[MAX_KNL_NUMA_NODES];
	size_t num_cpu_nodes = 0;
	size_t num_hbw_nodes = 0;

	int nodes_count = 0; // int, to match the return type
	size_t i, j;
	int min_distance, closer_hbw_node, distance;

	if (all_numa_neighbours == NULL)
		return -1;

	if (get_numa_nodes(cpu_nodes, &num_cpu_nodes, hbw_nodes, &num_hbw_nodes))
		return -1;

	// NOTE(review): get_numa_nodes() does no bounds checking, so by the time
	// this check runs the local arrays may already have been overrun.
	if (num_cpu_nodes > MAX_KNL_NUMA_NODES) {
		// Report the count that was actually tested (the CPU node count).
		printf("WARNING: Expected %d numa nodes, numa nodes found %zu,\n", MAX_KNL_NUMA_NODES, num_cpu_nodes);
		printf("         only the first %d nodes will be processed.\n", MAX_KNL_NUMA_NODES);
	}

	for (i = 0; i < num_cpu_nodes && i < (size_t)MAX_KNL_NUMA_NODES; i++) {
		min_distance = 2<<15; // a big number
		closer_hbw_node = cpu_nodes[i]; // initial condition, closest node to current node is the node itself

		// Bounded like the outer loop so hbw_nodes is never read past its end.
		for (j = 0; j < num_hbw_nodes && j < (size_t)MAX_KNL_NUMA_NODES; j++) {
			distance = numa_distance(cpu_nodes[i], hbw_nodes[j]);

			// numa_distance() returns 0 on error
			if (distance != 0 && distance != NUMA_DISTANCE_TO_MYSELF && distance < min_distance) {
				min_distance = distance;
				closer_hbw_node = hbw_nodes[j];
			}
		}

		all_numa_neighbours[nodes_count].hbw_node = closer_hbw_node;
		all_numa_neighbours[nodes_count].cpu_node = cpu_nodes[i];
		nodes_count++;
	}

	return nodes_count;
}

#else

/* Memory allocation functions that use Intel C compiler intrinsics */

/*
 * Allocates size bytes with _mm_malloc(), aligned to 2 * 1024 * 1024 bytes.
 *
 * On failure an error message is printed to stdout and the process exits
 * with status 1. On success the buffer is passed to advise_huge_pages() and
 * returned.
 */
void *bench_malloc(size_t size)
{
    const size_t alignment = 2 * 1024 * 1024;
    void *buffer = _mm_malloc(size, alignment);

    if (!buffer) {
        printf("Error: Could not allocate memory (%zu Mbytes)\n", size / (1024 * 1024));
        exit(1);
    }

    advise_huge_pages(buffer, size);
    return buffer;
}

/*
 * Releases a buffer with _mm_free(). The size argument is not used by this
 * variant.
 */
void bench_free(void *ptr, size_t size)
{
	(void)size;
	_mm_free(ptr);
}

#endif

// TODO: the fill_matrix functions will use the MKL larnv functions once
// MKL 2018 GOLD is released.
// It could also be tested whether the contents of the matrices matter; if not,
// this can be replaced by a simple memset.

/*
 * Fills the first nelems elements of A with a repeating ramp of short values.
 *
 * The array is processed in chunks of 2 * 1024 * 1024 elements, distributed
 * over threads by the OpenMP pragma. Within each chunk the values start at
 * SHRT_MIN, increase by one per element and wrap back to SHRT_MIN just before
 * reaching SHRT_MAX, so SHRT_MAX itself is never stored.
 */
void fill_matrix(fptype_mtx_t *A, size_t nelems)
{
	size_t start, chunk = 2 * 1024 * 1024;
#pragma omp parallel for
	for (start = 0; start < nelems; start += chunk) {
		fptype_mtx_t *dst = A + start;
		size_t count = chunk;
		size_t k;
		short value = SHRT_MIN;

		/* Shorten the last chunk; the second test catches wrap-around of start + chunk. */
		if (start + chunk > nelems || start + chunk < chunk) {
			count = nelems - start;
		}

		for (k = 0; k < count; k++) {
			dst[k] = value;
			/* Wrap one step early so the increment can never overflow a short. */
			value = (value + 1 == SHRT_MAX) ? SHRT_MIN : (short)(value + 1);
		}
	}
}

/*
 * Fills the first nelems elements of A (element type fptype_mtx_c_t) with a
 * repeating ramp of short values.
 *
 * The array is processed in chunks of 2 * 1024 * 1024 elements, distributed
 * over threads by the OpenMP pragma. Within each chunk the values start at
 * SHRT_MIN, increase by one per element and wrap back to SHRT_MIN just before
 * reaching SHRT_MAX, so SHRT_MAX itself is never stored.
 */
void fill_matrix_c(fptype_mtx_c_t *A, size_t nelems)
{
	size_t start, chunk = 2 * 1024 * 1024;
#pragma omp parallel for
	for (start = 0; start < nelems; start += chunk) {
		fptype_mtx_c_t *dst = A + start;
		size_t count = chunk;
		size_t k;
		short value = SHRT_MIN;

		/* Shorten the last chunk; the second test catches wrap-around of start + chunk. */
		if (start + chunk > nelems || start + chunk < chunk) {
			count = nelems - start;
		}

		for (k = 0; k < count; k++) {
			dst[k] = value;
			/* Wrap one step early so the increment can never overflow a short. */
			value = (value + 1 == SHRT_MAX) ? SHRT_MIN : (short)(value + 1);
		}
	}
}

/*
 * Calls madvise() with MADV_HUGEPAGE on the range [memptr, memptr + size).
 * A failure is not fatal: it only prints a warning to stderr.
 */
void advise_huge_pages(void *memptr, size_t size)
{
    const int rc = madvise(memptr, size, MADV_HUGEPAGE);

    if (rc != 0) {
        fprintf(stderr, "WARNING: Unable to advise huge pages\n");
    }
}
