/*
 * Copyright (c) 2017, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

/*
 * The module provides a mechanism to change the order of the lists of free
 * pages kept in the Linux kernel buddy allocator. The order is changed so as
 * to minimize possible page collisions later, when those pages are served to
 * the application.
 * The algorithm assumes that the system has up to 16GB of direct-mapped cache.
 */

#include <linux/init.h>
#include <linux/numa.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/list_sort.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/string.h>
#include <linux/dmi.h>

#define MODNAME "zonesort"

#define err(format, arg...) pr_err(MODNAME ": " format, ## arg)
#define debug(format, arg...) pr_debug(MODNAME ": " format, ## arg)
#define info(format, arg...) pr_info(MODNAME ": " format, ## arg)
#define warn(format, arg...) pr_warn(MODNAME ": " format, ## arg)

#ifndef CONFIG_SYSFS
#error "This module requires CONFIG_SYSFS"
#endif

#define KB(x)   ((x) * 1024ull)
#define MB(x)   (KB (KB (x)))
#define GB(x)   (MB (KB (x)))
#define B2MB(x) ((x) >> 20)

#define COLLISION_GRANULARITY_ORDER (MAX_ORDER - 2)
// collision granularity in bytes, expressed as a multiple of the 4KB page size
#define COLLISION_GRANULARITY (1l*(1<<COLLISION_GRANULARITY_ORDER)*KB(4))
// defines number of buckets for pages
#define MAX_BAD_IDX 4

/*
 * From SMBIOS reference specification
 *
 * Values of the 'form_factor' byte of a memory device structure (see
 * struct dmi_memory_device). This module only tests for
 * DMI_FORM_FACTOR_DIMM and DMI_FORM_FACTOR_CHIP; the remaining values are
 * listed for completeness.
 */
enum dmi_form_factor_field {
	DMI_FORM_FACTOR_OTHER = 1,
	DMI_FORM_FACTOR_UNKNOWN,
	DMI_FORM_FACTOR_SIMM,
	DMI_FORM_FACTOR_SIP,
	DMI_FORM_FACTOR_CHIP,
	DMI_FORM_FACTOR_DIP,
	DMI_FORM_FACTOR_ZIP,
	DMI_FORM_FACTOR_PROPRIETARY_CARD,
	DMI_FORM_FACTOR_DIMM,
	DMI_FORM_FACTOR_TSOP,
	DMI_FORM_FACTOR_ROW_OF_CHIPS,
	DMI_FORM_FACTOR_RIMM,
	DMI_FORM_FACTOR_SODIMM,
	DMI_FORM_FACTOR_SRIMM,
	DMI_FORM_FACTOR_FB_DIMM,
};

/*** Type 14 - Group Associations ***/

/*
 * Overlay for the formatted area of a group associations structure.
 * Only the first (item_type, item_handle) pair is described here.
 */
struct dmi_group_associations {
	struct dmi_header header;
	u8 group_name;		/* not read by this module; presumably a string index - TODO confirm */
	u8 item_type;		/* structure type of the associated item */
	u16 item_handle;	/* handle of the associated item */
} __packed;

/*** Type 17 - Memory Device ***/

/*
 * Overlay for a memory device structure. Only the fields this module reads
 * are named; everything else is padding that keeps them at the right offset.
 */
struct dmi_memory_device {
	struct dmi_header header;
	u8 unused_1[8];
	u16 size;		/* device size; summed up and logged as MB by this module */
	u8 form_factor;		/* one of enum dmi_form_factor_field */
	u8 unused_2[25];
} __packed;

/*
 * From OEM-specific SMBIOS structure for member KNL general information
 */
#define DMI_SYS_GROUP_NAME_KNL "Group: Knights Landing Information"
#define DMI_SYS_GROUP_NAME_KNM "Group: Knights Mill Information"

/*
 * Overlay for the OEM-specific structure carrying the Knights Landing /
 * Knights Mill general information. It is located through the type and
 * handle recorded from the matching group associations structure.
 */
struct dmi_knl_general_information {
	struct dmi_header header;
	u16 member_identifier;
	u8 member_name;
	u8 supported_cluster_mode;
	/* 0x01 Quadrant, 0x02 Hemisphere, 0x04 SNC4, 0x08 SNC2, 0x10 ALL2ALL */
	u8 configured_cluster_mode;
	u8 supported_memory_mode;
	/* 0x01 Cache, 0x02 Flat, 0x04 Hybrid */
	u8 configured_memory_mode;
	/* hybrid mode only: number of 25% steps of MCDRAM used as cache */
	u8 mcdram_cache_information;
} __packed;

/*
 * GLOBALS
 */

/* Workqueue on which the periodic sort work is queued */
static struct workqueue_struct *sort_queue;

static void _sort_work_fn(struct work_struct *work);
/* The single delayed work item that performs (and re-arms) periodic sorting */
static DECLARE_DELAYED_WORK(sort_work, _sort_work_fn);
/* Taken with mutex_trylock() around sorting so that only one sort runs at a time */
static DEFINE_MUTEX(sort_lock);
/* Periodic sorting interval in milliseconds; 0 means periodic sorting is off */
static unsigned long sort_interval_msecs;
/* Size in bytes of the MCDRAM cache per NUMA node, decoded from SMBIOS */
static uint64_t node_cache_size;

/*
 * State shared between the dmi_walk() callbacks used by
 * read_smbios_table().
 */
struct dmi_read_state {
	/* sum of the sizes of memory devices with CHIP form factor (logged as MB) */
	uint32_t mcdram_size;
	/* sum of the sizes of memory devices with DIMM form factor (logged as MB) */
	uint32_t sdram_size;
	/* bytes of MCDRAM acting as cache for a single NUMA node */
	uint64_t cache_per_node;
	/* type and handle of the Xeon Phi OEM structure to decode */
	struct dmi_header dh_group;
	/* 0 on success, negative errno otherwise */
	int ret;
};

/*
 * Translate a free-list node into the physical address of the page that
 * embeds it through its 'lru' member.
 */
static u64 list_to_phys(struct list_head *l)
{
	return page_to_phys(list_entry(l, struct page, lru));
}

/*
 * list_sort() comparator ordering free pages by ascending address.
 *
 * Comparing the list node pointers themselves is sufficient: the nodes are
 * embedded in 'struct page', and with vmemmap the 'struct page' objects are
 * laid out in virtual address space in physical address order.
 *
 * @priv: unused
 * Returns -1, 0 or 1 when @a is below, equal to or above @b.
 */
static int cmp_pages(void *priv, struct list_head *a, struct list_head *b)
{
	if (a == b)
		return 0;

	return (a < b) ? -1 : 1;
}

/*
 * SORTING ALGORITHM
 */

// each list is a bucket for pages
// the index of the list says how 'bad' (collision prone) its pages are:
// [no collision, 1 collision, 2 collisions, more]
static struct list_head bad_list[MAX_BAD_IDX];
static int bad_list_size[MAX_BAD_IDX];
// per-slot counters of free 4KB pages mapping to each COLLISION_GRANULARITY
// sized slot of the cache; dimensioned for the largest supported cache (16GB)
static u32 collisions[GB(16) / COLLISION_GRANULARITY];

/* Reset every bucket: empty list head and zero element count. */
static void sort_init(void)
{
	int bucket;

	for (bucket = 0; bucket < MAX_BAD_IDX; bucket++) {
		bad_list_size[bucket] = 0;
		INIT_LIST_HEAD(&bad_list[bucket]);
	}
}

/*
 * Map the current load of collision slot @idx to a bucket number:
 * 0 for an empty slot, 1 for less than one slot worth of pages,
 * 2 for less than two slots worth of pages, 3 otherwise.
 * The thresholds were determined empirically.
 */
static int compute_bad_idx(int idx)
{
	const u32 slot_pages = 1 << COLLISION_GRANULARITY_ORDER;
	const u32 load = collisions[idx];

	if (!load)
		return 0;
	if (load < slot_pages)
		return 1;

	return (load < 2 * slot_pages) ? 2 : 3;
}

static void sort_movable_order(struct zone *zone, int order)
{
	int i, idx, bad_idx;
	unsigned long flags, phys;
	struct list_head *pos, *tmp, *free_l;

	sort_init();

	spin_lock_irqsave(&zone->lock, flags);
	free_l = &zone->free_area[order].free_list[MIGRATE_MOVABLE];
	for (i=0; i < order; ++i) {
		list_for_each (pos, &zone->free_area[i].free_list[MIGRATE_MOVABLE]) {
			phys = list_to_phys(pos);
			idx = (phys % node_cache_size) / COLLISION_GRANULARITY;

			collisions[idx] += (1 << i);
		}
	}

	// walk choosen order, update collisions and distribute to less & more
	// heaviliy occupied bad lists
	list_for_each_safe (pos, tmp, free_l) {
		phys = list_to_phys(pos);
		idx = (phys % node_cache_size) / COLLISION_GRANULARITY;

		bad_idx = compute_bad_idx(idx);

		// occupy two pages
		collisions[idx + 0] += (1 << COLLISION_GRANULARITY_ORDER);
		collisions[idx + 1] += (1 << COLLISION_GRANULARITY_ORDER);

		list_del_init(pos);
		list_add(pos, &bad_list[bad_idx]);
		bad_list_size[bad_idx]++;
	}

	// at this point, free list should be empty
	debug("list_empty(free_l) == %d\n", (int)list_empty(free_l));

	// re-create free list of chosen order
	for (i=0; i< MAX_BAD_IDX; ++i) {
		list_splice_tail(&bad_list[i], free_l);
		debug("list %d size = %d\n", i, bad_list_size[i]);
	}

	spin_unlock_irqrestore(&zone->lock, flags);
}


/*
 * Sort every free list of @zone.
 *
 * The movable list of the highest order gets the collision-aware ordering;
 * every other (order, migratetype) list is sorted by page address while
 * holding zone->lock.
 */
static void sort_pagelists(struct zone *zone)
{
	unsigned int order, type;
	unsigned long flags;
	struct list_head *list;

	for_each_migratetype_order(order, type) {
		if (order == MAX_ORDER - 1 && type == MIGRATE_MOVABLE) {
			sort_movable_order(zone, order);
			continue;
		}

		list = &zone->free_area[order].free_list[type];

		spin_lock_irqsave(&zone->lock, flags);
		list_sort(NULL, list, cmp_pages);
		spin_unlock_irqrestore(&zone->lock, flags);
	}
}

/*
 * BUDDY LOG DEBUG IFACE
 */

/*
 * seq_file show callback of the "buddy_lists" debugfs file.
 *
 * For every populated zone of every online node, and for every
 * (order, migratetype) free list, prints a header line followed by the
 * physical address of each free block, one per line, in list order.
 *
 * Addresses are first copied into a temporary buffer under zone->lock and
 * printed after the lock is dropped; at most 1M entries per list are dumped.
 *
 * Returns 0 on success or -ENOMEM when the temporary buffer cannot be
 * allocated.
 */
static int buddy_list_seq_show(struct seq_file* file, void* data)
{
	const int max_size = 1024*1024;
	unsigned int order;
	unsigned int type;
	unsigned long flags;
	struct list_head *iter;
	int i, j, entries;
	struct zone *zone = NULL;
	unsigned int node_id = 0;
	u64 *buffer;

	buffer = vmalloc(max_size * sizeof(*buffer));
	if (!buffer)
		return -ENOMEM;

	for (node_id = 0; node_id < MAX_NUMNODES; node_id++) {
		if (!node_online(node_id))
			continue;

		for (i = 0; i < MAX_NR_ZONES; i++) {
			zone = &NODE_DATA(node_id)->node_zones[i];

			if (!zone_is_initialized(zone) || !populated_zone(zone))
				continue;

			for_each_migratetype_order(order, type) {
				entries = 0;

				spin_lock_irqsave(&zone->lock, flags);
				list_for_each(iter, &zone->free_area[order].free_list[type]) {
					buffer[entries++] = list_to_phys(iter);
					if (entries == max_size)
						break;
				}
				spin_unlock_irqrestore(&zone->lock, flags);

				seq_printf(file, "Node id: %d, zone: %d, order: %d, type %d, total pages: %d\n",
						(int)node_id, (int)i, (int)order, (int)type, (int)entries);
				/*
				 * These are physical addresses, not kernel pointers:
				 * print them as plain hex. "%p" would hash the value
				 * on kernels that obfuscate pointers, and on kernels
				 * that do not, this format yields the same text on
				 * 64-bit systems.
				 */
				for (j=0; j<entries; j++)
					seq_printf(file, "%016llx\n", (unsigned long long)buffer[j]);
			}
		}
	}

	vfree(buffer);
	return 0;
}

struct dentry* buddy_lists;

/* open() handler of "buddy_lists": binds the file to buddy_list_seq_show(). */
static int buddy_list_open(struct inode *inode, struct file *file)
{
	int rc = single_open(file, buddy_list_seq_show, NULL);

	return rc;
}

/* File operations of the read-only "buddy_lists" debugfs file (single-shot seq_file) */
struct file_operations buddy_lists_fops = {
	.owner		= THIS_MODULE,
	.open		= buddy_list_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release
};

/*
 * DIRECT MAPPED CACHE DEBUG IFACE
 */

/*
 * Human readable migratetype names, indexed by migratetype number.
 * NOTE(review): the order of these names must match the migratetype
 * enumeration of the kernel the module is built against - confirm when
 * porting to another kernel version.
 */
static char * const dmc_migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Reclaimable",
	"Movable",
	"Reserve",
#ifdef CONFIG_CMA
	"CMA",
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	"Isolate",
#endif
};

/* KNL: the direct-mapped cache is assumed to cover 16GB (2^34 bytes) */
#define DIRECTMAPPEDCACHE_SHIFT (34)
#define DIRECTMAPPEDCACHE_SIZE (1L << DIRECTMAPPEDCACHE_SHIFT)
#define DIRECTMAPPEDCACHE_2MB_PAGE_SHIFT (21)
#define DIRECTMAPPEDCACHE_2MB_TRACER_TAB_SHIFT (DIRECTMAPPEDCACHE_SHIFT - DIRECTMAPPEDCACHE_2MB_PAGE_SHIFT)
#define DIRECTMAPPEDCACHE_2MB_TRACER_TAB_SIZE (1L << DIRECTMAPPEDCACHE_2MB_TRACER_TAB_SHIFT)
#define DIRECTMAPPEDCACHE_MAX_LEVEL (8)

/*
 * One usage level per 2MB slot of the 16GB direct-mapped cache: how many
 * free blocks analyzed so far map onto that slot.
 */
struct directmappedcache_tracker_tab {
	unsigned int tab[DIRECTMAPPEDCACHE_2MB_TRACER_TAB_SIZE];
};

/*
 * Print a histogram of the tracker table: for every usage level below
 * DIRECTMAPPEDCACHE_MAX_LEVEL, the number of 2MB cache slots at that level.
 * Slots at or above the maximum level are listed individually and counted
 * separately.
 *
 * @file:        seq_file to print to
 * @entries:     number of 2MB units analyzed so far (printed as is)
 * @order:       order of the free list being analyzed; orders smaller than
 *               2MB are not supported
 * @type:        migratetype of the free list being analyzed
 * @tracker_tab: table to summarize
 */
static void directmappedcache_show_tracker_status(struct seq_file* file, unsigned long entries, unsigned int order, unsigned int type, struct directmappedcache_tracker_tab *tracker_tab)
{
	unsigned long histogram[DIRECTMAPPEDCACHE_MAX_LEVEL];
	unsigned long overflowed = 0;
	unsigned long slot, lvl;

	if (order < (DIRECTMAPPEDCACHE_2MB_PAGE_SHIFT - PAGE_SHIFT)) {
		seq_printf(file, "Not implemented for order: %d\n", order);
		return;
	}

	memset(histogram, 0, sizeof(histogram));

	for (slot = 0; slot < DIRECTMAPPEDCACHE_2MB_TRACER_TAB_SIZE; slot++) {
		unsigned int used = tracker_tab->tab[slot];

		if (used < DIRECTMAPPEDCACHE_MAX_LEVEL) {
			histogram[used]++;
			continue;
		}
		overflowed++;
		seq_printf(file, "      over_max idx[%4lu] = %u\n", slot, used);
	}

	seq_printf(file, "      [%3u ][%-12s] %7lu:  ", order, dmc_migratetype_names[type], entries);
	for (lvl = 0; lvl < DIRECTMAPPEDCACHE_MAX_LEVEL; lvl++)
		seq_printf(file, "%6lu ", histogram[lvl]);

	if (overflowed > 0)
		seq_printf(file, "over_max %6lu", overflowed);

	seq_printf(file, "\n");
}


/*
 * Print the direct-mapped cache occupancy of the free lists of @zone.
 *
 * For each (order, migratetype) list whose blocks are at least 2MB large,
 * the blocks are replayed in list order against a tracker table with one
 * entry per 2MB cache slot, and a histogram of slot usage levels is printed.
 * For smaller orders only the number of free blocks is counted. A summary
 * table with the number of free blocks per order and migratetype is printed
 * at the end.
 *
 * The tracker table is allocated per call; when that allocation fails an
 * error line is printed and nothing else is reported for the zone.
 */
static void directmappedcache_pagelists_show(struct seq_file* file, struct zone *zone)
{
	unsigned int order;
	unsigned int type;
	unsigned long free_count_table[MAX_ORDER][MIGRATE_TYPES] = {{0}};
	unsigned int show_tracker_title = 0;
	unsigned long flags;
	struct directmappedcache_tracker_tab *tracker_tab;

	tracker_tab = vmalloc(sizeof(*tracker_tab));
	if (!tracker_tab) {
		/* without the table the memset() below would dereference NULL */
		seq_printf(file, "  ERR: cannot allocate tracker table\n");
		return;
	}

	for_each_migratetype_order(order, type) {
		unsigned long free_count = 0;
		struct list_head *curr;

		/* for order equal or greater than 9 (2MB) recalculate collisions */
		if (order >= (DIRECTMAPPEDCACHE_2MB_PAGE_SHIFT - PAGE_SHIFT))
		{

			unsigned long pos_per_order;
			unsigned long analyzed_entries;
			int my_error_log = 0;

			/* number of 2MB slots covered by one block of this order */
			pos_per_order = 1 << (order - (DIRECTMAPPEDCACHE_2MB_PAGE_SHIFT - PAGE_SHIFT));
			analyzed_entries = 0;

			memset(tracker_tab, 0, sizeof(struct directmappedcache_tracker_tab));

			spin_lock_irqsave(&zone->lock, flags);
			list_for_each(curr, &zone->free_area[order].free_list[type]) {
				unsigned long index;
				unsigned int detect_used = 0;
				unsigned long i;
				unsigned long pfn;

				/* the title is printed once, with the first analyzed block */
				if (show_tracker_title == 0) {
					seq_printf(file, "  DIRECT MAPPED CACHE STATE:\n");
					seq_printf(file, "      order [type        ] entries:   empty      1      2  \n");
					show_tracker_title = 1;
				}

				/* 2MB slot of the cache that the block maps to */
				pfn = page_to_pfn(list_entry(curr, struct page, lru));
				index = (pfn >> (DIRECTMAPPEDCACHE_2MB_PAGE_SHIFT - PAGE_SHIFT))
					& ((1lu << DIRECTMAPPEDCACHE_2MB_TRACER_TAB_SHIFT)-1);

				/* find max used entry */
				for (i=0; i < pos_per_order; i++) {
					if (tracker_tab->tab[i+index] > detect_used) {
						detect_used = tracker_tab->tab[i+index];
					}
				}
				/* this block goes one level above what is already used */
				detect_used++;
				if ((detect_used >= DIRECTMAPPEDCACHE_MAX_LEVEL) && (my_error_log<10)) {
					/* something wrong with the page; report at most 10 of them */
					seq_printf(file, "   --> ERR: pfn %lx idx %lu detect_used %u \n", pfn, index, detect_used);
					my_error_log ++;
				}
				for (i=0; i < pos_per_order; i++) {
					tracker_tab->tab[i+index] = detect_used;
				}
				analyzed_entries += pos_per_order;
				/* intermediate report each time a full cache worth of slots was analyzed */
				if ((analyzed_entries & (DIRECTMAPPEDCACHE_2MB_TRACER_TAB_SIZE-1)) == 0) {
					directmappedcache_show_tracker_status(file, analyzed_entries, order, type, tracker_tab);
				}
				free_count++;
			}
			spin_unlock_irqrestore(&zone->lock, flags);
			if (analyzed_entries) {
				directmappedcache_show_tracker_status(file, analyzed_entries, order, type, tracker_tab);
			}

		} else {
			spin_lock_irqsave(&zone->lock, flags);
			list_for_each(curr, &zone->free_area[order].free_list[type]) {
				free_count++;
			}
			spin_unlock_irqrestore(&zone->lock, flags);
		}

		free_count_table[order][type] = free_count;
	}
	seq_printf(file, "  SUMMARY:\n");
	for (type = 0; type < MIGRATE_TYPES; type++) {
		seq_printf(file, "      %-12s:",	dmc_migratetype_names[type]);
		for (order = 0; order < MAX_ORDER; ++order) {
			seq_printf(file,"%6lu ", free_count_table[order][type]);
		}
		seq_printf(file, "\n");
	}

	vfree(tracker_tab);
}

static int directmappedcache_state_seq_show(struct seq_file* file, void* data)
{
	struct zone *zone = NULL;
	int i = 0;
	unsigned int node_id = 0;

	for (node_id = 0; node_id < MAX_NUMNODES; node_id++) {
		if (!node_online(node_id))
			continue;

		seq_printf(file, "DIRECTMAPPEDCACHE Show node %u, max zones %u\n", node_id, MAX_NR_ZONES);
		for (i = 0; i < MAX_NR_ZONES; i++) {
			zone = &NODE_DATA(node_id)->node_zones[i];
			if (!zone_is_initialized(zone)) {
				seq_printf(file, "Zone %d is not initialized\n", i);
				continue;
			}
			if (!populated_zone(zone)) {
				seq_printf(file, "Zone %d is not populated\n", i);
				continue;
			}
			seq_printf(file, "Zone %d (%s) to analyze\n", i, zone->name);
			// lock to avoid using static variables from multiple processes
			directmappedcache_pagelists_show(file, zone);
		}
	}
	return 0;
}

struct dentry* directmappedcache_state_debugfs;

/*
 * open() handler of "directmappedcache_state": binds the file to
 * directmappedcache_state_seq_show().
 */
static int directmappedcache_state_open(struct inode *inode, struct file *file)
{
	int rc = single_open(file, directmappedcache_state_seq_show, NULL);

	return rc;
}

/* File operations of the read-only "directmappedcache_state" debugfs file (single-shot seq_file) */
struct file_operations directmappedcache_state_fops = {
	.owner		= THIS_MODULE,
	.open		= directmappedcache_state_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release
};

/*
 * MODULE COMMON
 */

static void sort_node(unsigned int nodeid)
{
	int i;
	struct zone *zone = NULL;

	for (i = 0; i < MAX_NR_ZONES; i++) {
		zone = &NODE_DATA(nodeid)->node_zones[i];
		if (!zone_is_initialized(zone)) {
			info("Zone %d is not initialized\n", i);
			continue;
		}
		if (!populated_zone(zone)) {
			info("Zone %d is not populated\n", i);
			continue;
		}
		memset(collisions, 0, sizeof(collisions));
		sort_pagelists(zone);
	}
}

/*
 * Delayed work handler: sort all online nodes, then re-arm itself.
 *
 * The pass is skipped (with a warning) when another sort currently holds
 * sort_lock. The work is queued again only while a non-zero periodic
 * interval is configured.
 */
static void _sort_work_fn(struct work_struct *work)
{
	unsigned int nid;

	if (!mutex_trylock(&sort_lock)) {
		// previous sorting did not finish yet
		warn("periodic sorting skipped, consider increasing sorting interval.\n");
	} else {
		for_each_online_node(nid) {
			info("Sorting node %u\n", nid);
			sort_node(nid);
		}
		mutex_unlock(&sort_lock);
	}

	// periodic sorting turned off: do not reschedule
	if (sort_interval_msecs == 0)
		return;

	// queueing should never fail, there is only this one work item
	if (!queue_delayed_work(sort_queue, &sort_work, msecs_to_jiffies(sort_interval_msecs)))
		warn("internal error\n");
}

/* sysfs read of "sort_interval": the configured interval in whole seconds. */
static ssize_t sort_interval_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
	unsigned long secs = sort_interval_msecs / 1000;

	return sprintf(buf, "%lu sec\n", secs);
}

/*
 * sysfs write of "sort_interval": set the periodic sorting interval.
 *
 * @buf holds the interval in seconds as an unsigned decimal number.
 * Writing 0 turns periodic sorting off and waits for a running sort to
 * finish; any other value (re)arms the delayed work with the new interval.
 *
 * Returns @count on success, -EINVAL when the input is not a number or the
 * interval is too large to be expressed in milliseconds.
 */
static ssize_t sort_interval_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count)
{
	unsigned int sort_interval_sec;

	if (sscanf(buf, "%u", &sort_interval_sec) != 1)
		return -EINVAL;

	/*
	 * The interval is converted to milliseconds and handed to
	 * msecs_to_jiffies(), which takes an unsigned int and treats values
	 * with the top bit set as "infinite". Reject anything that would
	 * overflow instead of silently wrapping to a much shorter interval.
	 */
	if (sort_interval_sec > INT_MAX / 1000)
		return -EINVAL;

	if (sort_interval_sec == 0 && sort_interval_msecs != 0)
		info("periodic sort delay turned off\n");

	sort_interval_msecs = sort_interval_sec * 1000UL;

	if (sort_interval_sec == 0) {
		// return value ignored on purpose
		cancel_delayed_work_sync(&sort_work);
	} else {
		info("periodic sorting interval: %u sec\n", sort_interval_sec);
		// also queues work if it was idle
		// return value ignored on purpose
		mod_delayed_work(sort_queue, &sort_work, msecs_to_jiffies(sort_interval_msecs));
	}

	return count;
}

/* "sort_interval" sysfs attribute: periodic sorting interval in seconds (0 = off) */
static struct kobj_attribute sort_interval_attribute =
	__ATTR(sort_interval, 0664, sort_interval_show, sort_interval_store);

/* sysfs read of "nodeid": the attribute carries no value, reads always fail. */
static ssize_t nodeid_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
	const ssize_t not_readable = -EPERM;

	return not_readable;
}

/*
 * sysfs write of "nodeid": sort the free lists of one NUMA node right away.
 *
 * @buf holds the node number as an unsigned decimal number.
 *
 * Returns @count on success, -EINVAL when the input is not a number or the
 * node is not online, -EBUSY when periodic sorting is enabled or another
 * sort is in progress.
 */
static ssize_t nodeid_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count)
{
	unsigned int nodeid;
	int parsed = sscanf(buf, "%u", &nodeid);

	if (parsed != 1)
		return -EINVAL;

	if (nodeid >= MAX_NUMNODES || !node_online(nodeid)) {
		info("Node %u is not online\n", nodeid);
		return -EINVAL;
	}

	// on-demand sorting is refused while periodic sorting is active
	if (sort_interval_msecs != 0)
		return -EBUSY;

	info("Sorting node %u\n", nodeid);
	if (!mutex_trylock(&sort_lock))
		return -EBUSY;

	sort_node(nodeid);
	mutex_unlock(&sort_lock);

	return count;
}

/* "nodeid" sysfs attribute: write a node number to sort that node on demand */
static struct kobj_attribute nodeid_attribute =
	__ATTR(nodeid, 0664, nodeid_show, nodeid_store);

/* All attributes exposed by the module, NULL terminated */
static struct attribute *attrs[] = {
	&nodeid_attribute.attr,
	&sort_interval_attribute.attr,
	NULL,
};

/* Unnamed group: the attributes are created directly in the kobject directory */
static struct attribute_group attr_group = {
	.attrs = attrs,
};

/* The "zone_sort_free_pages" kobject created under kernel_kobj in m_init() */
static struct kobject *zonesort_kobject;

/*
 * cluster_mode - Decode the configured cluster mode and scale the amount of
 * MCDRAM cache down to what a single NUMA node gets.
 * @dkgi: Xeon Phi general information structure from the SMBIOS table
 * @data: decoding state; data->cache_per_node is divided by four in SNC4
 * mode and by two in SNC2 mode, and left untouched in the other modes.
 *
 * Returns false on success and true when the configured cluster mode is
 * not a known value.
 */
static bool cluster_mode(struct dmi_knl_general_information *dkgi,
		struct dmi_read_state *data)
{
	/* log2 of the number of nodes the cache is split between */
	unsigned int shift = 0;

	switch (dkgi->configured_cluster_mode) {
	case 0x01:
		info("Cluster mode: Quadrant\n");
		break;
	case 0x02:
		info("Cluster mode: Hemisphere\n");
		break;
	case 0x04:
		info("Cluster mode: SNC4\n");
		shift = 2;
		break;
	case 0x08:
		info("Cluster mode: SNC2\n");
		shift = 1;
		break;
	case 0x10:
		info("Cluster mode: ALL2ALL\n");
		break;
	default:
		warn("Cluster mode: Invalid value\n");
		return true;
	}

	data->cache_per_node >>= shift;
	return false;
}

/*
 * Calculate the amount of MCDRAM available as cache in hybrid mode.
 * @cache_info: number of 25% steps of MCDRAM used as cache; 0, 1, 2 and 4
 * are accepted
 * @data: decoding state; data->cache_per_node is set to the cache size in
 * bytes, derived from data->mcdram_size (in MB)
 *
 * Returns false on success and true when @cache_info is not valid.
 */
static bool mcdram_as_cache(uint8_t cache_info, struct dmi_read_state *data)
{
	uint32_t quarter_mb;

	if (cache_info > 4 || cache_info == 3) {
		warn("Invalid amount of mcdram as cache value\n");
		return true;
	}

	info("%d%% of MCDRAM used as Cache\n", 25 * cache_info);

	quarter_mb = data->mcdram_size >> 2;
	data->cache_per_node = MB(quarter_mb * cache_info);
	return false;
}

static bool memory_mode(struct dmi_knl_general_information *dkgi,
		struct dmi_read_state *data)
{
	switch (dkgi->configured_memory_mode) {
	case 0x01:
		info("Memory mode: Cache\n");
		data->cache_per_node = MB(data->mcdram_size);
		break;
	case 0x02:
		info("Memory mode: Flat\n");
		data->cache_per_node = 0;
		break;
	case 0x04:
		info("Memory mode: Hybrid\n");
		return mcdram_as_cache(dkgi->mcdram_cache_information, data);
	default:
		warn("Memory mode: Unavailable\n");
		return true;
	}
	return false;
}

/*
 * Check whether a group associations structure describes the Knights Landing
 * or Knights Mill information group.
 *
 * The strings that follow the formatted area (a sequence of NUL terminated
 * strings ended by an empty string) are compared against the two known group
 * names. On a match the type and handle of the associated item are stored in
 * data->dh_group and data->ret is set to 0; otherwise data->ret is set to
 * -ENODEV.
 */
static void handle_xeon_phi_group_assoc(struct dmi_group_associations *dga,
		struct dmi_read_state *data)
{
	const char *str = (const char *)dga + dga->header.length;

	/* each step skips one string together with its terminator */
	for (; *str; str += strlen(str) + 1) {
		if (strcmp(str, DMI_SYS_GROUP_NAME_KNL) != 0 &&
				strcmp(str, DMI_SYS_GROUP_NAME_KNM) != 0)
			continue;

		data->dh_group.type = dga->item_type;
		data->dh_group.handle = dga->item_handle;
		data->ret = 0;
		return;
	}

	/* No match */
	data->ret = -ENODEV;
}

/*
 * dmi_walk() callback: look for the group associations structure that
 * identifies a Xeon Phi. Once one has been found (data->ret == 0) all further
 * structures are ignored.
 */
static void find_xeon_phi_oem_smbios_structure(
		const struct dmi_header *dh, void *private_data)
{
	struct dmi_read_state *data = private_data;

	/* Xeon Phi group association already found */
	if (data->ret == 0)
		return;

	if (dh->type != DMI_ENTRY_GROUP_ASSOC)
		return;

	handle_xeon_phi_group_assoc((struct dmi_group_associations *)dh, data);
}

static void decode_memory_device(const struct dmi_header *dh,
		void *private_data)
{
	struct dmi_memory_device *dmd = (struct dmi_memory_device*)dh;
	struct dmi_read_state* data = private_data;

	if (dmd->header.type == DMI_ENTRY_MEM_DEVICE) {
		if (dmd->form_factor == DMI_FORM_FACTOR_DIMM)
			data->sdram_size += dmd->size;
		if (dmd->form_factor == DMI_FORM_FACTOR_CHIP)
			data->mcdram_size += dmd->size;
	}
}

/*
 * dmi_walk() callback: decode the memory and cluster mode from the structure
 * whose type and handle were recorded in data->dh_group.
 *
 * Sets data->ret to 0 when both modes were decoded and to -ENODEV when either
 * of them is invalid. Structures that do not match are ignored and leave
 * data->ret untouched.
 */
static void decode_uncore_mode(const struct dmi_header *dh, void *private_data)
{
	struct dmi_read_state *data = private_data;
	struct dmi_knl_general_information *dkgi =
		(struct dmi_knl_general_information *)dh;

	if (dkgi->header.type != data->dh_group.type ||
	    dkgi->header.handle != data->dh_group.handle)
		return;

	/* the cluster mode is only decoded when the memory mode was valid */
	if (memory_mode(dkgi, data) || cluster_mode(dkgi, data)) {
		data->ret = -ENODEV;
		return;
	}

	data->ret = 0;
}

static int read_smbios_table(void)
{
	struct dmi_read_state data = {
		.mcdram_size = 0,
		.sdram_size = 0,
		.ret = -ENODEV,
	};

	if (dmi_walk(find_xeon_phi_oem_smbios_structure, &data))
		goto error;

	if (data.ret) {
		warn("No Xeon Phi(TM) found\n");
		return data.ret;
	}
	info("Xeon Phi(TM) found\n");

	if (dmi_walk(decode_memory_device, &data))
		goto error;

	if (dmi_walk(decode_uncore_mode, &data))
		goto error;

	if (data.ret)
		return data.ret;

	node_cache_size = data.cache_per_node;

	info("MCDRAM size: %d MB\n", data.mcdram_size);
	info("SDRAM size: %d MB\n", data.sdram_size);
	info("Cache per numa node: %d MB\n", (uint32_t)B2MB(node_cache_size));

	return data.ret;
error:
	warn("Cannot read SMBIOS table\n");
	return -EPERM;
}

/*
 * Module entry point.
 *
 * Reads the cache configuration from SMBIOS, then creates the workqueue, the
 * two debugfs files and finally the sysfs control files. The sysfs files are
 * created last because their store handlers queue work on sort_queue, which
 * therefore has to exist before user space can reach them.
 *
 * Returns 0 on success, the error of read_smbios_table() when the platform
 * is not a properly configured Xeon Phi, -ENODEV when no MCDRAM is used as
 * cache or the cache is larger than the module supports, and -ENOMEM when
 * one of the interfaces cannot be created.
 */
static int __init m_init(void)
{
	int ret;

	info("init\n");

	ret = read_smbios_table();
	if (ret)
		return ret;

	if (!node_cache_size) {
		warn("No MCDRAM as cache detected\n");
		return -ENODEV;
	}

	/* the collision table is statically sized for the largest supported cache */
	if (node_cache_size > ARRAY_SIZE(collisions) * COLLISION_GRANULARITY) {
		warn("MCDRAM cache is larger than supported\n");
		return -ENODEV;
	}

	sort_queue = create_singlethread_workqueue(MODNAME " queue");
	if (!sort_queue) {
		warn("Failed to create workqueue\n");
		return -ENOMEM;
	}

	buddy_lists = debugfs_create_file("buddy_lists", 0444, NULL, NULL, &buddy_lists_fops);
	if (!buddy_lists)
		goto clean_queue;

	directmappedcache_state_debugfs = debugfs_create_file("directmappedcache_state",
			0444, NULL, NULL, &directmappedcache_state_fops);
	if (!directmappedcache_state_debugfs)
		goto clean_buddy_iface;

	zonesort_kobject = kobject_create_and_add("zone_sort_free_pages", kernel_kobj);
	if (!zonesort_kobject)
		goto clean_dmpc_state_iface;

	/* from here on user space can trigger sorting */
	ret = sysfs_create_group(zonesort_kobject, &attr_group);
	if (ret)
		goto clean_zonesort_free_iface;

	return 0;

clean_zonesort_free_iface:
	kobject_put(zonesort_kobject);
clean_dmpc_state_iface:
	debugfs_remove(directmappedcache_state_debugfs);
clean_buddy_iface:
	debugfs_remove(buddy_lists);
clean_queue:
	destroy_workqueue(sort_queue);
	warn("Failed to create sysfs or debugfs file\n");
	return -ENOMEM;
}

/*
 * Module exit point.
 *
 * The sysfs control files are removed first: their store handlers queue work
 * on sort_queue, so they must be gone before the workqueue is destroyed.
 * Only then is periodic sorting stopped and the remaining resources released.
 */
static void __exit m_exit(void)
{
	// no new sort request can arrive once the sysfs directory is deleted
	kobject_del(zonesort_kobject);

	// prevent the work from re-arming itself
	sort_interval_msecs = 0;
	// cancel if pending, wait if running
	cancel_delayed_work_sync(&sort_work);
	// wait to finish
	flush_workqueue(sort_queue);
	destroy_workqueue(sort_queue);

	debugfs_remove(buddy_lists);
	debugfs_remove(directmappedcache_state_debugfs);
	kobject_put(zonesort_kobject);

	info("exit\n");
}

module_init(m_init);
module_exit(m_exit);
MODULE_ALIAS("zonesort_module");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Intel Corporation");
MODULE_DESCRIPTION("Zone's free list sorter");
MODULE_VERSION(ZONESORT_VERSION);
