#  Copyright 2012-2017, Intel Corporation, All Rights Reserved.
#
# This software is supplied under the terms of a license
# agreement or nondisclosure agreement with Intel Corp.
# and may not be copied or disclosed except in accordance
# with the terms of that agreement.
#
#  Author:  Christopher M. Cantalupo

import os
import math

import micp.kernel as micp_kernel
import micp.info as micp_info
import micp.common as micp_common
import micp.params as micp_params

class dgemm(micp_kernel.Kernel):
    def __init__(self):
        """Initialize the kernel name, parameter defaults and run categories.

        Queries micp.info for the device index, the core count (or the
        per-quadrant thread count in sub-NUMA cluster mode) and the memory
        size, and uses them to build the command lines stored in
        self._categoryParams for the 'test', 'scaling', 'scaling_quick',
        'optimal', 'optimal_quick' and 'scaling_core' categories.  Finally
        the parameter defaults are reset via _set_defaults_to_optimal().
        """
        self.name = 'dgemm'
        self.param_validator = micp_params.XGEMM_VALIDATOR
        self._paramNames = ['f_first_matrix_size',
                            'i_num_rep',
                            'T_device',
                            'n_num_thread',
                            'm_mode',
                            'l_last_matrix_size',
                            's_step']

        info = micp_info.Info()
        devIdx = info.get_device_index()

        self._paramDefaults = {'f_first_matrix_size':'1024',
                               'i_num_rep':'3',
                               'T_device':str(devIdx),
                               'n_num_thread':'228',
                               'm_mode':'NN',
                               'l_last_matrix_size':'1024',
                               's_step':'1024'}

        self._categoryParams = {}
        self._categoryParams['test'] = [' ']

        # In sub-NUMA cluster mode the thread count is limited to what a
        # single quadrant offers instead of the full core count.
        maxCount = info.num_cores()
        if info.is_in_sub_numa_cluster_mode():
            maxCount = info.snc_max_threads_per_quadrant()

        # Largest matrix dimension derived from the reported memory size
        # minus 1 GiB, capped at 10240.
        # NOTE(review): the 8.0 and 3.0 divisors presumably stand for 8-byte
        # doubles and the three DGEMM matrices -- confirm.
        maxMemory = info.mic_memory_size() - 1024**3
        maxMatrixSize = int(math.sqrt(maxMemory/8.0/3.0))
        if maxMatrixSize > 10240:
            maxMatrixSize = 10240

        args = '--n_num_thread {0} --f_first_matrix_size {1} --l_last_matrix_size {1} --s_step {1} --i_num_rep 3'
        # Floor division keeps maxStep an integer on Python 3 as well, where
        # "/" would produce a float that range() rejects.
        maxStep = maxMatrixSize // 512
        maxMatrixSize = maxStep*512
        matrixConfig = [512*ii for ii in range(1, maxStep+1)]

        self._categoryParams['scaling'] = [args.format(0, matrixSize)
                                           for matrixSize in matrixConfig]
        self._categoryParams['scaling_quick'] = [args.format(0, matrixSize)
                                                 for matrixSize in matrixConfig[:8]]
        if maxMatrixSize < 7680:
            self._categoryParams['optimal'] = [args.format(0, maxMatrixSize)]
        elif maxCount == 52:
            self._categoryParams['optimal'] = [args.format(0, 6656)]
        else:
            self._categoryParams['optimal'] = [args.format(0, 7680)]

        self._categoryParams['optimal_quick'] = [args.format(0, 4096)]

        # Round half up explicitly: round() resolves ties differently on
        # Python 2 and Python 3, which would change the step for counts such
        # as 45 or 65 depending on the interpreter.
        step = int(maxCount/10.0 + 0.5)
        if step < 4:
            step = 4
        # list() is required because range() is not a list on Python 3 and
        # therefore has no append().
        coreConfig = list(range(step, maxCount, step))
        coreConfig.append(maxCount)
        self._categoryParams['scaling_core'] = [args.format(coreCount, 8192)
                                                for coreCount in coreConfig]
        self._set_defaults_to_optimal()

    def _do_unit_test(self):
        return False

    def offload_methods(self):
        return['native', 'pragma', 'auto', 'local']

    def path_host_exec(self, offType):
        if offType == 'pragma':
            if micp_common.is_platform_windows():
                return self._path_exec(micp_kernel.LIBEXEC_HOST, 'dgemm_ofl.exe')
            else:
                return self._path_exec(micp_kernel.LIBEXEC_HOST, 'dgemm_ofl.x')

        if offType == 'auto' or offType == 'local':
            # binary name
            if micp_info.Info().is_processor_mcdram_available():
                dgemm_binary = 'dgemm_mcdram_cpu'
            else:
                dgemm_binary = 'dgemm_cpu'

            # In SNC mode use a different MPI based binary regardless of the
            # kind of memory to be used
            if micp_info.Info().is_in_sub_numa_cluster_mode():
                dgemm_binary = 'dgemm_mpi_snc_cpu'

            # extension
            if micp_common.is_platform_windows():
                dgemm_binary = '{0}.exe'.format(dgemm_binary)
            else:
                dgemm_binary = '{0}.x'.format(dgemm_binary)
            return self._path_exec(micp_kernel.LIBEXEC_HOST, dgemm_binary)

        return None

    def path_dev_exec(self, offType):
        if offType == 'native':
            return self._path_exec(micp_kernel.LIBEXEC_DEV, 'dgemm_mic.x')
        return None

    def path_aux_data(self, offType):
        result = []
        if offType == 'native':
            result.append(self.mic_library_find('libiomp5.so'))
        return result

    def param_type(self):
        return 'flag'

    def parse_desc(self, raw):
        line = [line for line in raw.splitlines() if line.startswith('*')][0]
        matrixSize = line.split()[1]

        dd = dict([tuple([ll.strip() for ll in line.split(':')])
                   for line in raw.splitlines()
                   if ':' in line and line.find(':') == line.rfind(':')])

        numThreads = dd['num_threads']
        numIt = dd['min_niters']

        result = '{0} x {0} MKL DGEMM with {1} threads and {2} iterations'.format(matrixSize, numThreads, numIt)
        return result

    def parse_perf(self, raw):
        """Parse xGEMM's raw output and extract performance results, expected
        line format (in SNC modes we also expect an avg for each NUMA node):
            xGEMM output...

                      n        min        avg        max     stddev
            *     10240     286.64     290.43     296.65  3.815e+00

            additional output...

        return results in dictionary as required by the micp/kernel.py interface.
        """
        line = [float(line.split()[3]) for line in raw.splitlines() if line.startswith('*')]
        speed = str(sum(line))

        dd = dict([tuple([ll.strip() for ll in line.split(':')])
                   for line in raw.splitlines()
                   if ':' in line and line.find(':') == line.rfind(':')])
        try:
            if dd['timer'] == 'native':
                tag = 'Task.Computation.Avg'
            elif dd['timer'] == 'invoke':
                tag = 'Device.Computation.Avg'
            elif dd['timer'] == 'full':
                tag = 'Host.Computation.Avg'
        except KeyError:
            tag = 'Computation.Avg'
        result = {}
        result[tag] = {'value':speed, 'units':'GFlops', 'rollup':True}
        return result

    def environment_dev(self):
        return {'LD_LIBRARY_PATH':'/tmp'}

    def environment_host(self):
        """Return the extra environment variables needed to run dgemm on the host.

        Two dictionaries are prepared: one with the MIC_*/MKL_MIC_* offload
        settings and one for self-boot platforms.  The self-boot dictionary
        is extended for Windows, for processors without MCDRAM and for
        sub-NUMA cluster mode.  The self-boot dictionary is returned when
        micp_common.is_selfboot_platform() is true, the offload one
        otherwise.
        """
        info = micp_info.Info()
        threadCount = info.num_cores() - 1
        memoryGiB = str(int((info.mic_memory_size() - 1024**3)/(1024**3)))

        offloadEnv = {}
        offloadEnv['MIC_BUFFERSIZE'] = '256M'
        offloadEnv['MKL_MIC_ENABLE'] = '1'
        offloadEnv['MKL_MIC_DISABLE_HOST_FALLBACK'] = '1'
        offloadEnv['LD_LIBRARY_PATH'] = self.ld_library_path()
        offloadEnv['MIC_LD_LIBRARY_PATH'] = self.mic_ld_library_path()
        offloadEnv['MIC_ENV_PREFIX'] = 'MIC'
        offloadEnv['MIC_OMP_NUM_THREADS'] = str(threadCount)
        offloadEnv['KMP_AFFINITY'] = 'compact,1,0'
        offloadEnv['MIC_KMP_AFFINITY'] = ('explicit,granularity=fine,proclist=[1-'
                                          + str(threadCount) + ':1]')
        offloadEnv['MIC_USE_2MB_BUFFERS'] = '16K'
        offloadEnv['MKL_MIC_MAX_MEMORY'] = memoryGiB + 'G'

        selfbootEnv = {}
        selfbootEnv['KMP_AFFINITY'] = 'compact,1,0'
        selfbootEnv['LD_LIBRARY_PATH'] = self.ld_library_path()
        selfbootEnv['USE_2MB_BUFFERS'] = '16K'

        # additional variables for Windows running on the Xeon Phi processor
        if micp_common.is_platform_windows() and micp_common.is_selfboot_platform():
            selfbootEnv['OMP_NUM_THREADS'] = str(info.num_cores())
            selfbootEnv['MKL_DYNAMIC'] = 'false'
            selfbootEnv['KMP_BLOCKTIME'] = 'infinite'
            selfbootEnv['KMP_LIBRARY'] = 'turnaround'

        # MKL_FAST_MEMORY_LIMIT forces MKL to store buffers in DDR memory
        if not micp_info.Info().is_processor_mcdram_available():
            selfbootEnv['MKL_FAST_MEMORY_LIMIT'] = '0'

        # In SNC mode restrict the hardware subset to one thread per core on
        # the number of cores reported per quadrant.
        if micp_info.Info().is_in_sub_numa_cluster_mode():
            quadrantCores = micp_info.Info().snc_max_threads_per_quadrant()
            selfbootEnv['KMP_HW_SUBSET'] = '{0}c,1t'.format(quadrantCores)

        if micp_common.is_selfboot_platform():
            return selfbootEnv
        return offloadEnv

    def independent_var(self, category):
        if category == 'scaling_core':
            return 'n_num_thread'
        return 'f_first_matrix_size'

    def device_param_name(self):
        return 'T_device'

    def get_process_modifiers(self):
        """Return the MPI launcher prefix (as a list) used to run dgemm.

        In sub-NUMA cluster mode this is ['mpirun', '-n', N] where N is the
        number of nodes with CPUs reported by micp.info; in every other
        cluster mode an empty list is returned.
        """
        if not micp_info.Info().is_in_sub_numa_cluster_mode():
            return []
        nodeCount = micp_info.Info().get_number_of_nodes_with_cpus()
        return ['mpirun', '-n', str(nodeCount)]

    def is_mpi_required(self):
        """Return whether MPI is needed to run xGEMM.

        MPI is required when the system is in the SNC2 or SNC4 mode, i.e.
        whenever micp.info reports a sub-NUMA cluster mode.
        """
        inSncMode = micp_info.Info().is_in_sub_numa_cluster_mode()
        return inSncMode

    def is_optimized_for_snc_mode(self):
        """micperf provides an optimized version for SNC modes"""
        return True
