// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2020 Alibaba Cloud. or its affiliates. All Rights Reserved.
 *
 * This module introduces the pvdma device and allows it to be used over a
 * virtual, memory mapped platform device.
 *
 * The major downside of VFIO device passthrough is pinning all of the VM's
 * memory region for the lifetime of the VM. This is a known issue that leads
 * to slow guest startup and memory consumption which is unacceptable in many
 * scenarios.
 *
 * Actually, it's a DMA mapping problem which can be stated briefly as
 * "when should a memory page be mapped or unmapped for DMA?". In our case,
 * because we're shipping the hypervisor together with the guest kernel for
 * running microVMs, we can get the answer to that problem and leverage it to
 * solve the issue. And that's where pvDMA comes into play.
 *
 * pvDMA consists of two parts, the backend emulated device which shares a
 * mapping cache with guest, the front-end driver in the guest kernel which
 * intercepts device DMA operations and notifies the backend when a memory
 * page should be mapped or unmapped. This change implements the latter one.
 *
 * To enable this, guest kernel shall be configured with CONFIG_PVDMA set,
 * and booted with vring_force_dma_api for using DMA-APIs on virtio-mmio
 * devices.
 *
 * The guest device can be instantiated with a kernel command line parameter.
 * Syntax: [pvdma_mmio.]device=<size>@<baseaddr>:<irq>
 *
 * where:
 *         <size>     := size (can use standard suffixes like K, M or G)
 *         <baseaddr> := physical base address
 *         <irq>      := not parsed by this driver; anything that follows
 *                       <baseaddr> is ignored
 * e.g.:
 *         pvdma_mmio.device=<size>@<baseaddr>
 *
 * Based on Virtio MMIO driver.
 */
#define pr_fmt(fmt) "pvdma-mmio: " fmt

#include <linux/mm.h>
#include <linux/atomic.h>
#include <linux/io.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/mod_devicetable.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/pvdma.h>

/*
 * pvDMA memory regions: byte offsets from the start of the MMIO resource.
 *
 * The device information registers sit at offset 0. pvdma_mmio_config()
 * places the control registers at PVDMA_MMIO_DEV_INFO_REGS_OFFSET and
 * pvdma_mmio_probe() places the mapping cache at PVDMA_MMIO_CTRL_REGS_OFFSET.
 */
#define PVDMA_MMIO_DEV_INFO_REGS_OFFSET 0x1000
#define PVDMA_MMIO_CTRL_REGS_OFFSET	0x2000

/*
 * is_virtio_mmio(dev) - does @dev look like a virtio-mmio device?
 *
 * The check is done on the device name. Since x86 probes virtio-mmio through
 * cmdline, but arm through fdt, the device name is different:
 *
 * 1. cmdline sets dev_name through platform_device_add(), and the naming
 * format is: virtio-mmio.id.
 *
 * 2. fdt sets dev_name through of_device_make_bus_id(), and the naming
 * format is: address.virtio_mmio.
 *
 * The result is only meant to be used as a truth value (non-zero when the
 * name matches). The macro is not defined on other architectures.
 */
#if defined(CONFIG_X86)
#define is_virtio_mmio(dev) \
	(!strncmp("virtio-mmio", dev_name(dev), 11))
#elif defined(CONFIG_ARM64)
#define is_virtio_mmio(dev) \
	(strstr(dev_name(dev), "virtio_mmio"))
#endif
/*
 * pvDMA features.
 * PVDMA_F_PERSISTENT_MAPPING is the feature bit that pvdma_mmio_config()
 * requires from the device and acknowledges back to it.
 */
#define PVDMA_F_PERSISTENT_MAPPING BIT(0)

/*
 * pvDMA operation code, stored in the opcode field of struct pvdma_cmd_op.
 * The code visible in this file only ever issues PVDMA_OP_MAP.
 */
enum {
	PVDMA_OP_INV,	/* NOTE(review): presumably "invalid" - confirm */
	PVDMA_OP_MAP,
	PVDMA_OP_UNMAP,
	PVDMA_OP_MAX,
};

/*
 * pvDMA magic value: "VDMA" (0x56 'V', 0x44 'D', 0x4d 'M', 0x41 'A').
 * pvdma_mmio_config() compares it with the magic register of the device.
 */
const u32 k_pvdma_mmio_magic = 0x56444d41;

/*
 * pvDMA command format.
 *
 * pv_dma_map() fills in the bit-fields and then writes the combined word
 * (val) to the cmd field of struct pvdma_ctl_cmd with writel().
 *
 * NOTE(review): where the bit-fields land inside val depends on the
 * compiler/ABI and has to match what the backend decodes - confirm.
 */
struct pvdma_cmd_op {
	union {
		/* raw command word as written to the device */
		unsigned int val;
		struct {
			/* guest huge page frame number (gpa >> PMD_SHIFT) */
			unsigned int gpfn : 29;
			/* operation code */
			unsigned int opcode : 3;
		};
	};
};

/*
 * pvDMA cache entry format.
 *
 * The mapping cache is indexed by guest huge page frame number. pv_dma_map()
 * increments and decrements an entry through val with atomic operations and
 * inspects is_pinned on a snapshot of the entry.
 *
 * NOTE(review): is_pinned is presumably set by the backend once the huge page
 * has been pinned - nothing in this file sets it; confirm.
 */
struct map_cache_entry {
	union {
		/* whole entry, accessed atomically */
		atomic_t val;
		struct {
			/* number of outstanding mappings of the huge page */
			unsigned int map_count : 31;
			/* huge page is pinned */
			unsigned int is_pinned : 1;
		};
	};
};

/*
 * Geometry of the mapping cache shared with the host.
 *
 * The cache holds one struct map_cache_entry per PMD-sized huge page of guest
 * physical address space, up to PVDMA_MAX_GUEST_PHYS (512 GiB).
 *
 * PVDMA_MAX_GUEST_PHYS must be a 64-bit constant: 512 << 30 does not fit in
 * an int, so the plain int expression overflows and every size derived from
 * it would be wrong.
 */
#define PVDMA_MAX_GUEST_PHYS    (512ULL << 30)
#ifdef CONFIG_ARM64
#define PMD_PAGE_SIZE           PMD_SIZE
#endif
#define PVDMA_PAGE_LVL_SIZE     PMD_PAGE_SIZE
#define PVDMA_MAP_ENTRY_NUM     (PVDMA_MAX_GUEST_PHYS / PVDMA_PAGE_LVL_SIZE)
#define PVDMA_MAP_ENTRY_SIZE    (sizeof(struct map_cache_entry))
#define PVDMA_MAP_CACHE_SIZE    (PVDMA_MAP_ENTRY_NUM * PVDMA_MAP_ENTRY_SIZE)

#define PVDMA_MAP_CACHE_F_MAPPED BIT(31)

/**
 * struct pvdma_dev_info - pvDMA device information region, Read Only
 * @magic: magic value to identify device
 * @version: device version
 * @features: features that the emulated device supports
 * @status: device status bits
 * @cache_num: number of entries in mapping cache
 * @cache_size: size of mapping cache in bytes
 *
 * pvdma_mmio_config() maps this layout onto offset 0 of the MMIO resource.
 * Only @magic and @features are read by the code in this file.
 */
struct pvdma_dev_info {
	u32 magic;
	u32 version;
	u32 features;
	u32 status;
	u32 cache_num;
	u32 cache_size;
};

/**
 * struct pvdma_ctl_cmd - pvDMA device control region
 * @driver_features: features that this driver supports; written during the
 *                   feature negotiation in pvdma_mmio_config()
 * @cmd: control command; pv_dma_map() writes a struct pvdma_cmd_op word here
 *       to ask for a huge page to be mapped
 */
struct pvdma_ctl_cmd {
	u32 driver_features;
	struct pvdma_cmd_op cmd;
};

/**
 * struct pvdma_mmio_device - representation of a dragonball pvDMA device
 *                            using MMIO
 * @pdev: platform device
 * @base: base address of memory mapped resource
 * @dev_info: device information region (read-only)
 * @ctrl: device control region (write-only)
 * @map_cache: mapping cache is composed of set of guest pages which are
 *             pinned; pointer to an array of PVDMA_MAP_ENTRY_NUM entries
 *
 * There is no lock member: pv_dma_map() updates the mapping cache with atomic
 * operations only.
 */
struct pvdma_mmio_device {
	struct platform_device *pdev;
	void __iomem *base;
	struct pvdma_dev_info __iomem *dev_info;
	struct pvdma_ctl_cmd __iomem *ctrl;
	struct map_cache_entry (__iomem *map_cache)[PVDMA_MAP_ENTRY_NUM];
};

/*
 * DMA ops for virtio-mmio devices: points to pv_mmio_dma_ops after
 * pvdma_mmio_enable() and is NULL after pvdma_mmio_disable().
 */
const struct dma_map_ops *mmio_dma_ops;
/* set by pvdma_enable(), cleared by pvdma_disable() */
bool pvdma_is_enabled;

/* the singleton pvDMA device: set by probe, cleared by remove */
static struct pvdma_mmio_device *g_pvdma;

/**
 * pv_dma_map - take or drop DMA mapping references on a guest physical range
 * @gpa: guest physical address where the range starts
 * @size: length of the range in bytes
 * @map: true to add one reference to every huge page touched by the range
 *       (asking the device to map a page on its first reference), false to
 *       drop one reference from each of them
 *
 * The range is widened to PMD-sized huge pages and the mapping cache entry of
 * each huge page is updated atomically. Dropping a reference only decrements
 * the count; no command is written to the device on that path.
 *
 * g_pvdma is dereferenced without a NULL check, so this must only be called
 * while a pvDMA device is bound.
 */
void pv_dma_map(phys_addr_t gpa, size_t size, bool map)
{
	/* MMIO pointer: keep the __iomem annotation, it is handed to writel() */
	struct pvdma_ctl_cmd __iomem *pctl = g_pvdma->ctrl;
	struct map_cache_entry entry, *p;
	size_t i, nr_huge_pages;
	struct pvdma_cmd_op cmd = {
		.gpfn = (gpa >> PMD_SHIFT),
		.opcode = PVDMA_OP_MAP,
	};

	/*
	 * This path is deliberately lock-free.
	 *
	 * The pvdma used to acquire a spin lock and then trigger a vmexit to
	 * dragonball to build DMA mappings. When the system has plenty of
	 * vCPUs, the spinlock + vmexit causes serious performance regressions
	 * to IO intensive workloads, so the spinlock was removed.
	 *
	 * The following results come from a simple case `iperf -P 32`.
	 * Actually, in odps tpch testing, the difference is more obvious.
	 *
	 * with the spinlock
	 * [SUM]   0.00-10.00  sec  15.3 GBytes  13.1 Gbits/sec 15012  sender
	 * [SUM]   0.00-10.00  sec  15.2 GBytes  13.1 Gbits/sec        receiver
	 *
	 * without the spinlock
	 * [SUM]   0.00-10.00  sec  24.2 GBytes  20.8 Gbits/sec 55299  sender
	 * [SUM]   0.00-10.00  sec  24.2 GBytes  20.8 Gbits/sec        receiver
	 */

	/* round the range out to whole huge pages */
	size += (gpa & (PMD_PAGE_SIZE - 1)) + PMD_PAGE_SIZE - 1;
	nr_huge_pages = size >> PMD_SHIFT;

	for (i = 0; i < nr_huge_pages; i++, cmd.gpfn++) {
		p = &(*g_pvdma->map_cache)[cmd.gpfn];

		if (!map) {
			/* decrease map_count */
			atomic_dec(&p->val);
			continue;
		}

		/* increase map_count and keep a snapshot of the whole entry */
		atomic_set(&entry.val, atomic_inc_return(&p->val));
		if (atomic_read(&entry.val) == 1) {
			/*
			 * First reference and not pinned yet: write cmd to
			 * map a huge page.
			 */
			writel(cmd.val, &pctl->cmd);
		} else if (!entry.is_pinned) {
			/*
			 * Not the first reference, but the entry is not
			 * flagged as pinned yet: poll until it is.
			 * NOTE(review): presumably the backend sets the flag
			 * while serving the first mapper's command - confirm.
			 */
			do {
				cpu_relax();
				atomic_set(&entry.val, atomic_read(&p->val));
			} while (!entry.is_pinned);
		}
	}
}

/*
 * Scatterlist flavour of pv_dma_map(): walks the first @nents entries of
 * @sgl and takes (@map true) or drops (@map false) the mapping references
 * for the physical range described by each entry.
 */
void pv_dma_map_sg(struct scatterlist *sgl, int nents, bool map)
{
	struct scatterlist *cur;
	int idx;

	for_each_sg(sgl, cur, nents, idx)
		pv_dma_map(sg_phys(cur), cur->length, map);
}

/*
 * .alloc hook: allocate PAGE_ALIGN(@size) bytes of physically contiguous
 * memory, report its physical address through @dma_handle and take the pvDMA
 * mapping references for it.
 *
 * Returns the kernel virtual address of the buffer, or NULL when the page
 * allocation fails (in which case @dma_handle is left untouched).
 * Warns when @dev is not a virtio-mmio device. @attrs is unused.
 */
static void *mmio_dma_alloc(struct device *dev, size_t size,
			dma_addr_t *dma_handle, gfp_t gfp,
			unsigned long attrs)
{
	phys_addr_t pa;
	void *buf;

	WARN_ON(!is_virtio_mmio(dev));

	buf = alloc_pages_exact(PAGE_ALIGN(size), gfp);
	if (!buf)
		return NULL;

	/* the DMA address handed to the device is the guest physical one */
	pa = virt_to_phys(buf);
	*dma_handle = (dma_addr_t)pa;
	pv_dma_map(pa, size, true);

	return buf;
}

/*
 * .free hook: drop the pvDMA mapping references taken by mmio_dma_alloc()
 * and give the pages back.
 *
 * Warns when @dev is not a virtio-mmio device. @attrs is unused.
 */
static void mmio_dma_free(struct device *dev, size_t size,
			void *cpu_addr, dma_addr_t dma_handle,
			unsigned long attrs)
{
	phys_addr_t pa;

	WARN_ON(!is_virtio_mmio(dev));

	/*
	 * The unmap has to be keyed on dma_handle, not on cpu_addr:
	 * dma_handle was derived from the physical address, whereas cpu_addr
	 * is a virtual address.
	 */
	pa = (phys_addr_t)dma_handle;
	pv_dma_map(pa, size, false);

	free_pages_exact(cpu_addr, PAGE_ALIGN(size));
}

/*
 * .map_page hook: take the pvDMA mapping references for @size bytes starting
 * @offset bytes into @page.
 *
 * Returns the physical address of the start of the range, which doubles as
 * the DMA address. Warns when @dev is not a virtio-mmio device. @dir and
 * @attrs are unused.
 */
static dma_addr_t mmio_dma_map_page(struct device *dev, struct page *page,
				unsigned long offset, size_t size,
				enum dma_data_direction dir,
				unsigned long attrs)
{
	phys_addr_t start;

	WARN_ON(!is_virtio_mmio(dev));

	start = page_to_phys(page) + offset;
	pv_dma_map(start, size, true);

	return (dma_addr_t)start;
}

/*
 * .unmap_page hook: drop the pvDMA mapping references for the @size bytes at
 * DMA address @addr (which is the guest physical address).
 *
 * Warns when @dev is not a virtio-mmio device. @dir and @attrs are unused.
 */
static void mmio_dma_unmap_page(struct device *dev, dma_addr_t addr,
				size_t size, enum dma_data_direction dir,
				unsigned long attrs)
{
	WARN_ON(!is_virtio_mmio(dev));

	pv_dma_map((phys_addr_t)addr, size, false);
}

/*
 * DMA operations published through mmio_dma_ops while pvDMA is enabled.
 * Only the coherent alloc/free and the single-page map/unmap hooks are
 * provided; no scatterlist hooks are set here.
 */
static const struct dma_map_ops pv_mmio_dma_ops = {
	.alloc		= mmio_dma_alloc,
	.free		= mmio_dma_free,
	.map_page	= mmio_dma_map_page,
	.unmap_page	= mmio_dma_unmap_page,
};

/* Publish the pvDMA operations through the global mmio_dma_ops pointer. */
static inline void pvdma_mmio_enable(void)
{
	mmio_dma_ops = &pv_mmio_dma_ops;
}

/* Withdraw the pvDMA operations: mmio_dma_ops goes back to NULL. */
static inline void pvdma_mmio_disable(void)
{
	mmio_dma_ops = NULL;
}

/*
 * Switch pvDMA on for both the MMIO and the PCI side and record that in
 * pvdma_is_enabled. Does nothing when pvDMA is already enabled.
 */
static void pvdma_enable(void)
{
	if (pvdma_is_enabled)
		return;

	pvdma_mmio_enable();
	pvdma_pci_enable();
	pvdma_is_enabled = true;
	pr_info("pvdma enabled!\n");
}

/*
 * Switch pvDMA off again, in the reverse order of pvdma_enable(): PCI side
 * first, then MMIO. Does nothing when pvDMA is not enabled.
 */
static void pvdma_disable(void)
{
	if (!pvdma_is_enabled)
		return;

	pvdma_pci_disable();
	pvdma_mmio_disable();
	pvdma_is_enabled = false;
	pr_info("pvdma disabled!\n");
}

/*
 * pvdma_mmio_config - locate the register regions and negotiate features
 * @vdev: device whose ->base has already been mapped
 *
 * Sets ->dev_info (offset 0) and ->ctrl (PVDMA_MMIO_DEV_INFO_REGS_OFFSET),
 * verifies the magic value and runs the two stage feature negotiation. The
 * device has to offer PVDMA_F_PERSISTENT_MAPPING in both stages; otherwise 0
 * is written to the driver features register and the negotiation fails.
 *
 * Return: 0 on success, -ENODEV on a wrong magic value, -ENXIO when the
 * required feature is not offered or not confirmed.
 */
static int pvdma_mmio_config(struct pvdma_mmio_device *vdev)
{
	unsigned long magic, features;

	vdev->dev_info = vdev->base;
	vdev->ctrl = vdev->base + PVDMA_MMIO_DEV_INFO_REGS_OFFSET;

	/* check magic */
	magic = readl(&vdev->dev_info->magic);
	if (magic != k_pvdma_mmio_magic) {
		pr_err("wrong magic value 0x%08lx!\n", magic);
		return -ENODEV;
	}

	/* two stage negotiation, stage 1: exchange features */
	features = readl(&vdev->dev_info->features);
	if (!(features & PVDMA_F_PERSISTENT_MAPPING)) {
		/* bail out */
		writel(0, &vdev->ctrl->driver_features);
		pr_err("invalid features request: 0x%lx\n", features);
		return -ENXIO;
	}
	/* exchange our features */
	writel(PVDMA_F_PERSISTENT_MAPPING, &vdev->ctrl->driver_features);

	/* stage 2: confirm features */
	features = readl(&vdev->dev_info->features);
	if (!(features & PVDMA_F_PERSISTENT_MAPPING)) {
		/* bail out */
		writel(0, &vdev->ctrl->driver_features);
		pr_err("invalid features confirm request: 0x%lx\n", features);
		return -ENXIO;
	}
	/* confirm by echoing back what the device reported */
	writel(features, &vdev->ctrl->driver_features);

	return 0;
}

static int pvdma_mmio_probe(struct platform_device *pdev)
{
	struct pvdma_mmio_device *vdev;
	struct resource *mem;
	int rc;

	/* pvDMA device is a singleton */
	if (pvdma_is_enabled)
		return 0;

	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	if (!mem)
		return -EINVAL;

	if (!devm_request_mem_region(&pdev->dev, mem->start,
				     resource_size(mem), pdev->name))
		return -EBUSY;

	vdev = devm_kzalloc(&pdev->dev, sizeof(*vdev), GFP_KERNEL);
	if (!vdev)
		return -ENOMEM;
#ifdef CONFIG_X86_64
	vdev->base = devm_ioremap(&pdev->dev, mem->start, resource_size(mem));
	vdev->map_cache = vdev->base + PVDMA_MMIO_CTRL_REGS_OFFSET;
#elif CONFIG_ARM64
	/*
	 * There are two type of ioremap operation in arm.
	 *
	 * 1.devm_ioremap(): this function set the map_cache as a device memory.
	 *
	 * 2.devm_ioremap_wc(): this function set the map_cache as a normal memory
	 * without cache.
	 *
	 * The map_cache is ram space reference to dragonball code. So we use
	 * devm_ioremap_wc().
	 *
	 * Beside we need to do atomic operations in map_cache. If the atomic
	 * operation is lse instruction, while the memory is device memory, cpu
	 * will triger dabt exception (not support exclusive operation).
	 */
	vdev->base = devm_ioremap(&pdev->dev, mem->start, PVDMA_MMIO_CTRL_REGS_OFFSET);
	vdev->map_cache = devm_ioremap_wc(&pdev->dev, mem->start + PVDMA_MMIO_CTRL_REGS_OFFSET,
						resource_size(mem) - PVDMA_MMIO_CTRL_REGS_OFFSET);
#endif
	if (vdev->base == NULL)
		return -EFAULT;

	rc = pvdma_mmio_config(vdev);
	if (rc)
		return rc;

	g_pvdma = vdev;
	platform_set_drvdata(pdev, vdev);
	pvdma_enable();

	return 0;
}

/*
 * pvdma_mmio_remove - unbind a pvDMA MMIO device
 * @pdev: platform device being removed
 *
 * Disables pvDMA and releases the per-device state. Removing the active
 * device is not expected to happen and is logged with a warning.
 *
 * Return: always 0.
 */
static int pvdma_mmio_remove(struct platform_device *pdev)
{
	struct pvdma_mmio_device *vdev = platform_get_drvdata(pdev);

	/*
	 * pvdma_mmio_probe() returns success without setting driver data when
	 * pvDMA is already enabled. Such a device owns nothing, so removing it
	 * must neither disable the active pvDMA device nor free a NULL
	 * pointer.
	 */
	if (!vdev)
		return 0;

	pr_warn("disable pvdma! it should not happen!\n");
	pvdma_disable();

	/* drop the global reference before the memory behind it goes away */
	g_pvdma = NULL;
	platform_set_drvdata(pdev, NULL);
	devm_kfree(&pdev->dev, vdev);

	return 0;
}

/*
 * Parent device of the platform device created from the "device=" command
 * line parameter; cmdline_get() walks its children to print them.
 */
static struct device cmdline_parent = {
	.init_name = "pvdma-mmio-cmdline",
};

/* non-zero once cmdline_parent has been registered by cmdline_set() */
static int cmdline_parent_registered;

/*
 * cmdline_set - "device=" parameter setter: instantiate the pvDMA device
 * @device: parameter value, "<size>@<baseaddr>"; anything that follows the
 *          base address is ignored
 * @kp: kernel parameter descriptor (unused)
 *
 * Parses the size and base address, registers the common parent device on
 * first use and then registers a "pvdma-mmio" platform device (id 0) with a
 * single memory resource covering [base, base + size - 1].
 *
 * Return: 0 on success, -EINVAL when the value cannot be parsed or the size
 * is not positive, or the error from device/platform device registration.
 */
static int cmdline_set(const char *device, const struct kernel_param *kp)
{
	int err;
	struct resource resources[1] = {};
	char *str;
	long long base, size;
	int processed;
	struct platform_device *pdev;

	/* consume "size" part of the command line parameter */
	size = memparse(device, &str);

	/* get "@<base>" chunks */
	processed = sscanf(str, "@%lli", &base);

	if (processed < 1)
		return -EINVAL;

	/*
	 * A missing/zero size (or one so large that it wrapped negative) would
	 * yield a resource whose end lies before its start.
	 */
	if (size <= 0) {
		pr_err("Invalid device size!\n");
		return -EINVAL;
	}

	resources[0].flags = IORESOURCE_MEM;
	resources[0].start = base;
	resources[0].end = base + size - 1;

	if (!cmdline_parent_registered) {
		err = device_register(&cmdline_parent);
		if (err) {
			pr_err("Failed to register parent device!\n");
			return err;
		}
		cmdline_parent_registered = 1;
	}

	pr_info("Registering device pvdma-mmio at 0x%llx-0x%llx.\n",
		(unsigned long long)resources[0].start,
		(unsigned long long)resources[0].end);

	pdev = platform_device_register_resndata(&cmdline_parent,
						"pvdma-mmio",
						0, resources,
						ARRAY_SIZE(resources),
						NULL, 0);

	return PTR_ERR_OR_ZERO(pdev);
}

/*
 * device_for_each_child() callback used by cmdline_get(): appends one
 * "<size>@<base>:<id>" line for @dev to the string already in @data, never
 * writing past PAGE_SIZE bytes in total. Always returns 0 so that the
 * iteration continues.
 */
static int cmdline_get_device(struct device *dev, void *data)
{
	struct platform_device *pdev = to_platform_device(dev);
	char *out = data;
	unsigned int used = strlen(out);

	snprintf(out + used, PAGE_SIZE - used, "0x%llx@0x%llx:%d\n",
		 pdev->resource[0].end - pdev->resource[0].start + 1ULL,
		 (unsigned long long)pdev->resource[0].start, pdev->id);

	return 0;
}

/*
 * "device=" parameter getter: fills @buffer with one line per child of
 * cmdline_parent (see cmdline_get_device()) and returns the resulting string
 * length plus one. @kp is unused.
 */
static int cmdline_get(char *buffer, const struct kernel_param *kp)
{
	/* start from an empty string; the callback appends to it */
	*buffer = '\0';

	device_for_each_child(&cmdline_parent, buffer, cmdline_get_device);

	return strlen(buffer) + 1;
}

/* set/get handlers behind the "device" parameter */
static const struct kernel_param_ops cmdline_param_ops = {
	.set = cmdline_set,
	.get = cmdline_get,
};

/* register the "device" parameter with mode 0400 */
postcore_param_cb(device, &cmdline_param_ops, NULL, 0400);

/*
 * Device tree match table.
 *
 * Kernel 5.10 changed .compatible from "pvdma,mmio" to "pvdma" for 2 reasons:
 *
 * 1. At present, our pvdma is probed through the kernel command line, and
 * the compatible string has no effect on that process.
 *
 * 2. With "pvdma,mmio", pvdma would be interpreted as vendor information,
 * which it is not. To standardize the compatible string, it was changed to
 * "pvdma".
 */
static const struct of_device_id pvdma_mmio_match[] = {
	{ .compatible = "pvdma", },
	{},
};
MODULE_DEVICE_TABLE(of, pvdma_mmio_match);

/*
 * Platform driver definition. The driver name equals the name of the
 * platform device registered by cmdline_set() ("pvdma-mmio").
 */
static struct platform_driver pvdma_mmio_driver = {
	.probe = pvdma_mmio_probe,
	.remove = pvdma_mmio_remove,
	.driver = {
		.name = "pvdma-mmio",
		.of_match_table = pvdma_mmio_match,
	},
};

/*
 * Register the platform driver; run as a subsys initcall.
 * Returns the result of platform_driver_register().
 */
static int __init pvdma_mmio_init(void)
{
	return platform_driver_register(&pvdma_mmio_driver);
}
subsys_initcall(pvdma_mmio_init)
