/*******************************************************************************
!   Copyright(C) 2011-2012 Intel Corporation. All Rights Reserved.
!
!   The source code, information  and  material ("Material") contained herein is
!   owned  by Intel Corporation or its suppliers or licensors, and title to such
!   Material remains  with Intel Corporation  or its suppliers or licensors. The
!   Material  contains proprietary information  of  Intel or  its  suppliers and
!   licensors. The  Material is protected by worldwide copyright laws and treaty
!   provisions. No  part  of  the  Material  may  be  used,  copied, reproduced,
!   modified, published, uploaded, posted, transmitted, distributed or disclosed
!   in any way  without Intel's  prior  express written  permission. No  license
!   under  any patent, copyright  or  other intellectual property rights  in the
!   Material  is  granted  to  or  conferred  upon  you,  either  expressly,  by
!   implication, inducement,  estoppel or  otherwise.  Any  license  under  such
!   intellectual  property  rights must  be express  and  approved  by  Intel in
!   writing.
!
!   *Third Party trademarks are the property of their respective owners.
!
!   Unless otherwise  agreed  by Intel  in writing, you may not remove  or alter
!   this  notice or  any other notice embedded  in Materials by Intel or Intel's
!   suppliers or licensors in any way.
!
!*******************************************************************************
!   Content:
!       SGEMM, DGEMM, CGEMM, ZGEMM benchmarks
!******************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#ifndef _WIN32
#include <getopt.h>
#else
#include "getopt_win.h"
#endif

#include "utils.h"
#include "bench.h"

#ifdef KNL_SNC_MODE
#include <mpi.h>
#include <numa.h>
#endif

/*
 * Print the command-line help for the benchmark to stdout and terminate the
 * process with exit status 1.
 *
 * progname: program name shown in the "Usage:" line (main passes argv[0]).
 *
 * This function does not return.
 */
void usage(char *progname)
{
	printf("\n"
		"Usage: %s [options...]\n\n"
		"Supported options are:\n"
		"  -n <# of threads> : default value = -1 (autodetect)\n"
		"  -T <target coprocessor> : default value = -1 (autodetect)\n"
		"  -i <minimum # of bench iterations> : default value = %d\n"
		"  -t <minimum total bench time> : default value = %f sec.\n"
		"  -f <initial size of testing matrix> : default value = %d\n"
		"  -l <final size of testing matrix> : default value = %d\n"
		"  -s <size step> : default value = %d\n"
		"  -M <fixed M value> : default none\n"
		"  -N <fixed N value> : default none\n"
		"  -K <fixed K value> : default none\n"
		"  -m <transposition mode> : 0 = NN, 1 = NT, 2 = TN, 3 = TT;\n"
		"                            default: run all modes\n"
		"  -h : print this help message and exit\n\n",
		progname,
		DEF_MIN_NITERS, DEF_MIN_T, DEF_FIRST_IND, DEF_LAST_IND, DEF_STEP);
	exit(1);
}

/*
 * Two-level stringification. STRINGIFY(x) passes x through one extra macro
 * call so that x is macro-expanded before STRINGIFY1 applies the # operator;
 * using #x directly would stringify the unexpanded token.
 */
#define STRINGIFY1(x) #x
#define STRINGIFY(x) STRINGIFY1(x)

/*
 * Benchmark driver.
 *
 * Parses the command line, clamps the requested mode/size ranges, prints the
 * run configuration and MKL/CPU information, and then, for every requested
 * transposition mode, times xgemm_bench() over the requested problem sizes.
 * For each size it prints min/avg/max GFLOPS and the standard deviation of
 * the per-iteration GFLOPS; after each mode it prints a line starting with
 * '*' for the size that achieved the best average.
 *
 * When built with KNL_SNC_MODE, one MPI rank is expected per sub-NUMA
 * cluster; each rank binds itself to its NUMA node and configuration output
 * is printed by rank 0 only.
 *
 * Returns 0 on success and -1 on a setup failure. usage() terminates the
 * process with exit status 1 for -h and for unrecognised options.
 */
int main(int argc, char *argv[])
{
	MKLVersion Version;
	int i;
	int mode;
	int firstInd = DEF_FIRST_IND;
	int lastInd = DEF_LAST_IND;
	int step = DEF_STEP;
	int min_niters = DEF_MIN_NITERS;
	double min_t = DEF_MIN_T;
	int num_threads = DEF_NUM_THREADS;
	int coprocessor = DEF_COPROCESSOR;
	int min_mode = 0, max_mode = 3;
	int fixedM = -1, fixedN = -1, fixedK = -1;

#ifdef KNL_SNC_MODE
	int rank, world_size;
	struct numa_neighbour all_numa_neighbours[MAX_KNL_NUMA_NODES];
	int num, ii;
	int mpi_error = 0;
	int abort_execution = 0;


	if (numa_available() == -1) {
		fprintf(stderr, "ERROR: numa is not available\n");
		return -1;
	}

	MPI_Init(NULL, NULL);
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &world_size);

	/* From here on MPI is initialised: a failing rank must tear the whole
	 * job down instead of just returning, otherwise the remaining ranks
	 * could block forever in the next collective call. */
	num = closer_numa_nodes(all_numa_neighbours);
	if (num != world_size) {
		printf("ERROR: number of processes (%d) do not match the number of sub NUMA clusters (%d)\n", world_size, num);
		fflush(NULL);
		MPI_Abort(MPI_COMM_WORLD, 1);
		return -1;
	}

	if (numa_run_on_node(all_numa_neighbours[rank].cpu_node) == -1) {
		printf("ERROR: unable to bind rank %d to NUMA node %d\n",
				rank, all_numa_neighbours[rank].cpu_node);
		fflush(NULL);
		MPI_Abort(MPI_COMM_WORLD, 1);
		return -1;
	}
#endif

	int c;
	while ((c = getopt(argc, argv, "n:f:l:s:i:d:T:t:m:M:N:K:h")) != -1) {
		switch (c) {
		case 'f': firstInd = atoi(optarg); break;
		case 'l': lastInd = atoi(optarg); break;
		case 's': step = atoi(optarg); break;
		case 'i': min_niters = atoi(optarg); break;
		case 't': min_t = atof(optarg); break;
		case 'n': num_threads = atoi(optarg); break;
		case 'm': min_mode = max_mode = atoi(optarg); break;
		case 'M': fixedM = atoi(optarg); break;
		case 'N': fixedN = atoi(optarg); break;
		case 'K': fixedK = atoi(optarg); break;
		/* -T is advertised by usage(); the value is parsed but nothing
		 * in this function consumes it. */
		case 'T': coprocessor = atoi(optarg); break;
		/* -d is part of the option string; accept and ignore it. */
		case 'd': break;
		case 'h':
		default:
			/* Help request, unknown option or missing argument
			 * (getopt returns '?'): print the help and exit. */
#ifdef KNL_SNC_MODE
			MPI_Finalize();
#endif
			usage(argv[0]);
			break;
		}
	}
	(void)coprocessor;

	/* Check loop bounds for mode */
	min_mode = (min_mode < 0) ? 0 : min_mode;
	max_mode = (max_mode < 0) ? 0 : max_mode;
	min_mode = (min_mode > 3) ? 3 : min_mode;
	max_mode = (max_mode > 3) ? 3 : max_mode;
	max_mode = (max_mode < min_mode) ? min_mode : max_mode;
	/* Check loop bounds for matrix size */
	fixedM = (fixedM > MAX_ALLOWED_IND) ? MAX_ALLOWED_IND : fixedM;
	fixedN = (fixedN > MAX_ALLOWED_IND) ? MAX_ALLOWED_IND : fixedN;
	fixedK = (fixedK > MAX_ALLOWED_IND) ? MAX_ALLOWED_IND : fixedK;
	if (fixedM >= 0 && fixedN >=0 && fixedK >= 0) {
		/* All three dimensions are fixed: run the size loop exactly once. */
		firstInd = lastInd = step = 1;
	} else {
		firstInd = (firstInd < 1) ? 1 : firstInd;
		lastInd = (lastInd < 1) ? 1 : lastInd;
		firstInd = (firstInd > MAX_ALLOWED_IND) ? MAX_ALLOWED_IND : firstInd;
		lastInd = (lastInd > MAX_ALLOWED_IND) ? MAX_ALLOWED_IND : lastInd;
		lastInd = (lastInd < firstInd) ? firstInd : lastInd;
		step = (step < 1) ? 1 : step;
		step = (step > MAX_ALLOWED_IND) ? MAX_ALLOWED_IND : step;
	}

	if (min_niters <= 0)
	{
		printf("'min_niters' has been reset to default value.\n");
		min_niters = DEF_MIN_NITERS;
	}
	if (min_t <= 0) {
		printf("'min_t' has been reset to default value.\n");
		min_t = DEF_MIN_T;
	}

	/* Run configuration (rank 0 only in KNL_SNC_MODE). */
#ifdef KNL_SNC_MODE
	if (rank == 0)
#endif
	{
		printf("benchmarking: " STRINGIFY(xgemm) "\n");
		printf("timer       : native\n");
		printf("num_threads : %d%s\n", num_threads,
			(num_threads < 0 ? " (autodetect / environment)" : ""));
		printf("min_niters  : %d\n", min_niters);
		printf("min_t       : %f\n", min_t);
		printf("first index : %d\n", firstInd);
		printf("last  index : %d\n", lastInd);
		printf("step        : %d\n", step);
		printf("fixed M     : %d\n", fixedM);
		printf("fixed N     : %d\n", fixedN);
		printf("fixed K     : %d\n", fixedK);
		printf("data transf.: %s\n",
			"maybe (depends on MKL AO setting)");
	}

	/* MKL and CPU information (rank 0 only in KNL_SNC_MODE). */
#ifdef KNL_SNC_MODE
	if (rank == 0)
#endif
	{
		mkl_get_version(&Version);
		printf("MKL         : %d.%d.%d build %s (%s)\n",
				Version.MajorVersion, Version.MinorVersion, Version.UpdateVersion,
				Version.Build, Version.ProductStatus);
		printf("processor   : %s\n", Version.Processor);
		printf("CPU freq.   : %.2f (may float due to scaling)\n",
				mkl_get_cpu_frequency());
		printf("# cores aval: %d\n", mkl_get_max_threads());
		printf("max threads : %d\n", mkl_get_max_threads()*THREADS_PER_CORE);
	}

	for (mode = min_mode; mode < max_mode + 1; mode++) {
		int best_i = firstInd;
		double best_avg_gflops = 0;
		double best_min_gflops = 0;
		double best_max_gflops = 0;
		double best_stddev = 0;
		char transa, transb;

		/* mode: 0 = NN, 1 = NT, 2 = TN, 3 = TT */
		transa = (mode / 2 == 0 ? 'N' : 'T');
		transb = (mode % 2 == 0 ? 'N' : 'T');

		/* It is important to initialize the benchmark early so that OpenMP runtime
		 * is properly configured. */
#ifndef KNL_SNC_MODE
		initialize_bench(fixedM > 0 ? fixedM : lastInd,
			fixedN > 0 ? fixedN : lastInd,
			fixedK > 0 ? fixedK : lastInd,
			num_threads, transa, transb);
#else
		initialize_bench(fixedM > 0 ? fixedM : lastInd,
			fixedN > 0 ? fixedN : lastInd,
			fixedK > 0 ? fixedK : lastInd,
			num_threads, transa, transb,
			all_numa_neighbours[rank].hbw_node, rank, world_size);

		if (!is_bench_initialized()) {
			printf("ERROR: allocating matrices for " STRINGIFY(xgemm) " unable to continue\n" );
			mpi_error = 1;
		}

		// make sure all ranks were able to allocate resources, on failure terminate execution
		MPI_Allreduce(&mpi_error, &abort_execution, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
		if (abort_execution) {
			MPI_Bcast(&abort_execution, 1, MPI_INT, 0, MPI_COMM_WORLD);
			MPI_Finalize();
			return -1;
		}
#endif


#ifdef KNL_SNC_MODE
		if (rank == 0)
#endif
		{
			printf("\n#%d: %c%c\n", mode, transa, transb);

			printf("\ntesting XGEMM( '%c', '%c', n, n, ... )\n\n",
					transa, transb);
			printf("%11s %10s %10s %10s %10s\n",
					"n", "min", "avg", "max", "stddev");
			fflush(NULL);
		}
		for (i = firstInd; i <= lastInd; i += step) {
#if defined(SINGLE_PREC) || defined (DOUBLE_PREC) || defined(INTEGER)
			fptype_t alpha = 1.0;
			fptype_t beta = 1.0;
#else
			fptype_t alpha = {1.0, 0.0};
			fptype_t beta = {1.0, 0.0};
#endif
			int iter = 0;
			double min = DBL_MAX, max = -DBL_MAX, sum_time = 0.0;
			int M = fixedM > 0 ? fixedM : i;
			int N = fixedN > 0 ? fixedN : i;
			int K = fixedK > 0 ? fixedK : i;
			double avg_time;
			double ngops;
			double min_gflops;
			double avg_gflops;
			double max_gflops;
			volatile double warmup_t;
			double sum_perf = 0.0;
			double sum_sq_perf = 0.0;
			double avg_perf;
			double variance;
			double stddev;

			/*xGEMM number of operations*/
			ngops = 2.0 * M * N * K / 1E9;
#if !defined(SINGLE_PREC) && !defined (DOUBLE_PREC) && !defined(INTEGER)
			ngops *= 4.0;
#endif


#ifdef KNL_SNC_MODE
			MPI_Barrier(MPI_COMM_WORLD);
#endif
			/* Two warmup calls (aids stability on small sizes) */
			warmup_t = xgemm_bench(transa, transb, M, N, K, alpha, beta);
			warmup_t = xgemm_bench(transa, transb, M, N, K, alpha, beta);

			/* Run computations until both the minimum iteration count and
			 * the minimum accumulated time have been reached */

			while ((iter < min_niters || sum_time < min_t)) {
				double t = xgemm_bench(transa, transb, M, N, K, alpha, beta);
				double perf = ngops / t;

				min = (min < t) ? min : t;
				max = (max > t) ? max : t;
				sum_time += t;
				sum_perf += perf;
				sum_sq_perf += perf * perf;
				iter++;
			}
			avg_time = sum_time / iter;
			avg_perf = sum_perf / iter;
			/* E[x^2] - E[x]^2 can come out slightly negative through
			 * rounding when all samples are (nearly) equal; clamp so that
			 * sqrt() never produces NaN. */
			variance = sum_sq_perf / iter - avg_perf * avg_perf;
			stddev = sqrt(variance > 0.0 ? variance : 0.0);

			/* The slowest run gives the minimum rate and vice versa. */
			min_gflops = ngops / max;
			avg_gflops = ngops / avg_time;
			max_gflops = ngops / min;
#ifndef KNL_SNC_MODE
			printf("%11d %10.2f %10.2f %10.2f %10.3e\n",
					i, min_gflops, avg_gflops, max_gflops, stddev);
#else
			printf("%11d %10.2f %10.2f %10.2f %10.3e (MPI rank %d)\n",
				i, min_gflops, avg_gflops, max_gflops, stddev, rank);
#endif

			fflush(NULL);

			if (best_avg_gflops < avg_gflops) {
				best_i = i;
				best_avg_gflops = avg_gflops;
				best_min_gflops = min_gflops;
				best_max_gflops = max_gflops;
				best_stddev = stddev;
			}
		}

#ifndef KNL_SNC_MODE
		printf("*%10d %10.2f %10.2f %10.2f %10.3e\n", best_i,
				best_min_gflops, best_avg_gflops,
				best_max_gflops, best_stddev);
		fflush(NULL);
#else
		/* Ranks take turns, separated by barriers, to print their
		 * summary line. */
		MPI_Barrier(MPI_COMM_WORLD);
		for (ii=0; ii < world_size; ++ii) {
			MPI_Barrier( MPI_COMM_WORLD );
			if (ii == rank) {
				printf("*%10d %10.2f %10.2f %10.2f %10.3e\n", best_i,
						best_min_gflops, best_avg_gflops,
						best_max_gflops, best_stddev);
				fflush(NULL);
			}
		}
#endif
	}

	finalize_bench();
#ifdef KNL_SNC_MODE
	MPI_Finalize();
#endif

	return 0;
}

