searchusermenu
  • 发布文章
  • 消息中心
点赞
收藏
评论
分享
原创

linux内核vfio模块介绍

2024-10-17 09:34:44
20
0

vfio-pci驱动初始化函数为vfio_pci_init,该函数注册一个名为vfio_pci_driver的PCI驱动,当使用vfio-pci驱动与设备绑定时会调用其probe函数vfio_pci_probe

/*
 * vfio_pci_probe - probe callback run when a PCI device is bound to vfio-pci.
 * @pdev: PCI device being bound
 * @id:   matching device-id entry (not used by this function)
 *
 * Rejects devices whose header type is not PCI_HEADER_TYPE_NORMAL, takes the
 * IOMMU-layer group of the device, allocates the per-device
 * struct vfio_pci_device and registers it with the VFIO core through
 * vfio_add_group_dev().  VGA devices additionally get a VGA client
 * registration, and unless disable_idle_d3 is set the device is moved to
 * D3hot.
 *
 * Return: 0 on success, -EINVAL for an unsupported header type or a missing
 * IOMMU group, -ENOMEM if the allocation fails, or the error returned by
 * vfio_add_group_dev().
 */
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct iommu_group *iommu_grp;
	struct vfio_pci_device *vpdev;
	int err;

	if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
		return -EINVAL;

	/* Look up the IOMMU-layer group this device belongs to. */
	iommu_grp = vfio_iommu_group_get(&pdev->dev);
	if (!iommu_grp)
		return -EINVAL;

	vpdev = kzalloc(sizeof(*vpdev), GFP_KERNEL);
	if (!vpdev) {
		vfio_iommu_group_put(iommu_grp, &pdev->dev);
		return -ENOMEM;
	}

	spin_lock_init(&vpdev->irqlock);
	mutex_init(&vpdev->igate);
	vpdev->irq_type = VFIO_PCI_NUM_IRQS;
	vpdev->pdev = pdev;

	/* Hand the device to the VFIO core; vpdev becomes its private data. */
	err = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vpdev);
	if (err) {
		vfio_iommu_group_put(iommu_grp, &pdev->dev);
		kfree(vpdev);
		return err;
	}

	if (vfio_pci_is_vga(pdev)) {
		vga_client_register(pdev, vpdev, NULL, vfio_pci_set_vga_decode);
		vga_set_legacy_decoding(pdev,
					vfio_pci_set_vga_decode(vpdev, false));
	}

	if (disable_idle_d3)
		return 0;

	/*
	 * pci-core sets the device power state to an unknown value at
	 * bootup and after being removed from a driver.  The only
	 * transition it allows from this unknown state is to D0, which
	 * typically happens when a driver calls pci_enable_device().
	 * We're not ready to enable the device yet, but we do want to
	 * be able to get to D3.  Therefore first do a D0 transition
	 * before going to D3.
	 */
	pci_set_power_state(pdev, PCI_D0);
	pci_set_power_state(pdev, PCI_D3hot);

	return 0;
}

该函数会分配一个struct vfio_pci_device结构类型变量并设置相关字段,然后调用vfio_add_group_dev函数分配vfio_device结构体并为其注册操作回调接口vfio_pci_ops:

/*
 * vfio_add_group_dev - create a vfio_device for @dev and attach it to the
 * vfio group matching the device's IOMMU group.
 * @dev:         physical device being added
 * @ops:         device callbacks stored in the new vfio_device
 * @device_data: caller-private data stored in the new vfio_device
 *
 * If no vfio group exists yet for the device's IOMMU group, one is created
 * here.  A device that already has a vfio_device in the group is rejected.
 *
 * Return: 0 on success, -EINVAL if @dev has no IOMMU group, -EBUSY if the
 * device is already present in the group, or the error reported by
 * vfio_create_group() / vfio_group_create_device().
 */
int vfio_add_group_dev(struct device *dev,
		       const struct vfio_device_ops *ops, void *device_data)
{
	struct iommu_group *iommu_grp;
	struct vfio_group *vgroup;
	struct vfio_device *vdev;
	int ret;

	/* IOMMU-layer group recorded for this device. */
	iommu_grp = iommu_group_get(dev);
	if (!iommu_grp)
		return -EINVAL;

	/* Map the IOMMU-layer group to its vfio-layer group. */
	vgroup = vfio_group_get_from_iommu(iommu_grp);
	if (vgroup) {
		/*
		 * A found vfio_group already holds a reference to the
		 * iommu_group.  A created vfio_group keeps the reference.
		 */
		iommu_group_put(iommu_grp);
	} else {
		/* No vfio group yet for this IOMMU group: create one. */
		vgroup = vfio_create_group(iommu_grp);
		if (IS_ERR(vgroup)) {
			iommu_group_put(iommu_grp);
			return PTR_ERR(vgroup);
		}
	}

	/* Refuse to add a device that already has a vfio_device here. */
	vdev = vfio_group_get_device(vgroup, dev);
	if (vdev) {
		WARN(1, "Device %s already exists on group %d\n",
		     dev_name(dev), iommu_group_id(iommu_grp));
		vfio_device_put(vdev);
		vfio_group_put(vgroup);
		return -EBUSY;
	}

	/* Create the vfio-layer device object. */
	vdev = vfio_group_create_device(vgroup, dev, ops, device_data);
	ret = IS_ERR(vdev) ? PTR_ERR(vdev) : 0;

	/*
	 * Drop all but the vfio_device reference.  The vfio_device holds
	 * a reference to the vfio_group, which holds a reference to the
	 * iommu_group.  On failure this simply releases our group reference.
	 */
	vfio_group_put(vgroup);

	return ret;
}

当绑定的物理设备iommu层group没有对应的vfio group时,会调用vfio_create_group函数创建vfio group:

/*
 * Allocate and register a vfio_group wrapping @iommu_group.
 *
 * Registers an IOMMU group notifier, allocates a minor number, creates the
 * group's device via device_create() and links the group onto
 * vfio.group_list.  If a vfio_group for the same @iommu_group is already on
 * the list, the fresh allocation is handed to vfio_group_unlock_and_free()
 * and the existing group is returned with a reference taken through
 * vfio_group_get().
 *
 * Returns the vfio_group on success, or an ERR_PTR() value on failure.
 */
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
{
	struct vfio_group *group, *tmp;
	struct device *dev;
	int ret, minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL); // allocate the vfio group
	if (!group)
		return ERR_PTR(-ENOMEM);

	kref_init(&group->kref);
	INIT_LIST_HEAD(&group->device_list); // list of devices in this group
	mutex_init(&group->device_lock);
	INIT_LIST_HEAD(&group->unbound_list);
	mutex_init(&group->unbound_lock);
	atomic_set(&group->container_users, 0);
	atomic_set(&group->opened, 0);
	group->iommu_group = iommu_group;
#ifdef CONFIG_VFIO_NOIOMMU
	group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
#endif
	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	group->nb.notifier_call = vfio_iommu_group_notifier;

	/*
	 * blocking notifiers acquire a rwsem around registering and hold
	 * it around callback.  Therefore, need to register outside of
	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
	 * do anything unless it can find the group in vfio.group_list, so
	 * no harm in registering early.
	 */
	ret = iommu_group_register_notifier(iommu_group, &group->nb);
	if (ret) {
		kfree(group);
		return ERR_PTR(ret);
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
		if (tmp->iommu_group == iommu_group) {
			vfio_group_get(tmp);
			/* NOTE(review): presumably also drops vfio.group_lock - confirm */
			vfio_group_unlock_and_free(group);
			return tmp;
		}
	}

	minor = vfio_alloc_group_minor(group);
	if (minor < 0) {
		vfio_group_unlock_and_free(group);
		return ERR_PTR(minor);
	}
	// Create the /dev/vfio/$group_id device
	dev = device_create(vfio.class, NULL,
			    MKDEV(MAJOR(vfio.group_devt), minor),
			    group, "%s%d", group->noiommu ? "noiommu-" : "",
			    iommu_group_id(iommu_group));
	if (IS_ERR(dev)) {
		vfio_free_group_minor(minor);
		vfio_group_unlock_and_free(group);
		return (struct vfio_group *)dev; /* ERR_PTR */
	}

	group->minor = minor;
	group->dev = dev;

	list_add(&group->vfio_next, &vfio.group_list); // link the group onto the global vfio.group_list

	mutex_unlock(&vfio.group_lock);

	return group;
}

其调用device_create创建一个设备文件/dev/vfio/$group_id,用户态程序打开该文件后,可以对获取到的fd调用ioctl来控制这个vfio group。
然后vfio_add_group_dev会接着调用vfio_group_get_device判断该物理设备是否已经创建对应vfio device,若未创建则会调用vfio_group_create_device来创建:

/*
 * vfio_group_create_device - allocate a vfio_device and link it into @group.
 * @group:       vfio group the new device belongs to
 * @dev:         underlying physical device
 * @ops:         device callbacks (vfio_pci_ops when called for vfio-pci)
 * @device_data: caller-private data (struct vfio_pci_device for vfio-pci)
 *
 * The new vfio_device is stored as the driver data of @dev, takes a
 * reference on @group and is added to the group's device list under
 * group->device_lock.
 *
 * Return: the new vfio_device, or ERR_PTR(-ENOMEM) if allocation fails.
 */
static struct vfio_device *
vfio_group_create_device(struct vfio_group *group, struct device *dev,
			 const struct vfio_device_ops *ops, void *device_data)
{
	struct vfio_device *vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);

	if (!vdev)
		return ERR_PTR(-ENOMEM);

	vdev->group = group;
	vdev->dev = dev;
	vdev->ops = ops;
	vdev->device_data = device_data;
	kref_init(&vdev->kref);

	/* Let the physical device find its vfio_device through drvdata. */
	dev_set_drvdata(dev, vdev);

	/* No need to get group_lock, caller has group reference */
	vfio_group_get(group);

	/* Publish the device on the group's device list. */
	mutex_lock(&group->device_lock);
	list_add(&vdev->group_next, &group->device_list);
	mutex_unlock(&group->device_lock);

	return vdev;
}

加载vfio-pci模块时还会加载vfio模块,其初始化入口函数vfio_init会注册一个misc设备vfio_dev:

/*
 * Misc device describing the VFIO container node.  It is named "vfio",
 * uses the node name "vfio/vfio" and is served by vfio_fops.
 */
static struct miscdevice vfio_dev = {
	.name = "vfio",
	.nodename = "vfio/vfio",
	.minor = VFIO_MINOR,
	.mode = S_IRUGO | S_IWUGO,
	.fops = &vfio_fops, /* file operations for the vfio/vfio node */
};

/* File operations table installed as the .fops of the vfio_dev misc device. */
static const struct file_operations vfio_fops = {
	.owner = THIS_MODULE,

	/* open / close */
	.open = vfio_fops_open,
	.release = vfio_fops_release,

	/* data access */
	.read = vfio_fops_read,
	.write = vfio_fops_write,
	.mmap = vfio_fops_mmap,

	/* control */
	.unlocked_ioctl = vfio_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = vfio_fops_compat_ioctl,
#endif
};

注册misc设备vfio_dev后会生成/dev/vfio/vfio文件,其文件操作接口为vfio_fops。当打开/dev/vfio/vfio文件时会调用回调函数vfio_fops_open。vfio_fops_open会分配struct vfio_container类型变量,作为返回的文件描述符所控制的container的载体:

/*
 * vfio_fops_open - open handler of vfio_fops.
 * @inode: inode of the opened file (not used by this function)
 * @filep: file being opened
 *
 * Allocates a fresh struct vfio_container, initialises its group list,
 * group lock and reference count, and stores it in filep->private_data so
 * that later operations on the same file can retrieve it.
 *
 * Return: 0 on success, -ENOMEM if the container cannot be allocated.
 */
static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *ctr = kzalloc(sizeof(*ctr), GFP_KERNEL);

	if (!ctr)
		return -ENOMEM;

	kref_init(&ctr->kref);
	init_rwsem(&ctr->group_lock);
	INIT_LIST_HEAD(&ctr->group_list);

	/* Each open gets its own container, carried by the file's private data. */
	filep->private_data = ctr;

	return 0;
}
0条评论
0 / 1000
c****q
8文章数
0粉丝数
c****q
8 文章 | 0 粉丝
原创

linux内核vfio模块介绍

2024-10-17 09:34:44
20
0

vfio-pci驱动初始化函数为vfio_pci_init,该函数注册一个名为vfio_pci_driver的PCI驱动,当使用vfio-pci驱动与设备绑定时会调用其probe函数vfio_pci_probe

/*
 * vfio_pci_probe - probe callback run when a PCI device is bound to vfio-pci.
 * @pdev: PCI device being bound
 * @id:   matching device-id entry (not used by this function)
 *
 * Rejects devices whose header type is not PCI_HEADER_TYPE_NORMAL, takes the
 * IOMMU-layer group of the device, allocates the per-device
 * struct vfio_pci_device and registers it with the VFIO core through
 * vfio_add_group_dev().  VGA devices additionally get a VGA client
 * registration, and unless disable_idle_d3 is set the device is moved to
 * D3hot.
 *
 * Return: 0 on success, -EINVAL for an unsupported header type or a missing
 * IOMMU group, -ENOMEM if the allocation fails, or the error returned by
 * vfio_add_group_dev().
 */
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct iommu_group *iommu_grp;
	struct vfio_pci_device *vpdev;
	int err;

	if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
		return -EINVAL;

	/* Look up the IOMMU-layer group this device belongs to. */
	iommu_grp = vfio_iommu_group_get(&pdev->dev);
	if (!iommu_grp)
		return -EINVAL;

	vpdev = kzalloc(sizeof(*vpdev), GFP_KERNEL);
	if (!vpdev) {
		vfio_iommu_group_put(iommu_grp, &pdev->dev);
		return -ENOMEM;
	}

	spin_lock_init(&vpdev->irqlock);
	mutex_init(&vpdev->igate);
	vpdev->irq_type = VFIO_PCI_NUM_IRQS;
	vpdev->pdev = pdev;

	/* Hand the device to the VFIO core; vpdev becomes its private data. */
	err = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vpdev);
	if (err) {
		vfio_iommu_group_put(iommu_grp, &pdev->dev);
		kfree(vpdev);
		return err;
	}

	if (vfio_pci_is_vga(pdev)) {
		vga_client_register(pdev, vpdev, NULL, vfio_pci_set_vga_decode);
		vga_set_legacy_decoding(pdev,
					vfio_pci_set_vga_decode(vpdev, false));
	}

	if (disable_idle_d3)
		return 0;

	/*
	 * pci-core sets the device power state to an unknown value at
	 * bootup and after being removed from a driver.  The only
	 * transition it allows from this unknown state is to D0, which
	 * typically happens when a driver calls pci_enable_device().
	 * We're not ready to enable the device yet, but we do want to
	 * be able to get to D3.  Therefore first do a D0 transition
	 * before going to D3.
	 */
	pci_set_power_state(pdev, PCI_D0);
	pci_set_power_state(pdev, PCI_D3hot);

	return 0;
}

该函数会分配一个struct vfio_pci_device结构类型变量并设置相关字段,然后调用vfio_add_group_dev函数分配vfio_device结构体并为其注册操作回调接口vfio_pci_ops:

/*
 * vfio_add_group_dev - create a vfio_device for @dev and attach it to the
 * vfio group matching the device's IOMMU group.
 * @dev:         physical device being added
 * @ops:         device callbacks stored in the new vfio_device
 * @device_data: caller-private data stored in the new vfio_device
 *
 * If no vfio group exists yet for the device's IOMMU group, one is created
 * here.  A device that already has a vfio_device in the group is rejected.
 *
 * Return: 0 on success, -EINVAL if @dev has no IOMMU group, -EBUSY if the
 * device is already present in the group, or the error reported by
 * vfio_create_group() / vfio_group_create_device().
 */
int vfio_add_group_dev(struct device *dev,
		       const struct vfio_device_ops *ops, void *device_data)
{
	struct iommu_group *iommu_grp;
	struct vfio_group *vgroup;
	struct vfio_device *vdev;
	int ret;

	/* IOMMU-layer group recorded for this device. */
	iommu_grp = iommu_group_get(dev);
	if (!iommu_grp)
		return -EINVAL;

	/* Map the IOMMU-layer group to its vfio-layer group. */
	vgroup = vfio_group_get_from_iommu(iommu_grp);
	if (vgroup) {
		/*
		 * A found vfio_group already holds a reference to the
		 * iommu_group.  A created vfio_group keeps the reference.
		 */
		iommu_group_put(iommu_grp);
	} else {
		/* No vfio group yet for this IOMMU group: create one. */
		vgroup = vfio_create_group(iommu_grp);
		if (IS_ERR(vgroup)) {
			iommu_group_put(iommu_grp);
			return PTR_ERR(vgroup);
		}
	}

	/* Refuse to add a device that already has a vfio_device here. */
	vdev = vfio_group_get_device(vgroup, dev);
	if (vdev) {
		WARN(1, "Device %s already exists on group %d\n",
		     dev_name(dev), iommu_group_id(iommu_grp));
		vfio_device_put(vdev);
		vfio_group_put(vgroup);
		return -EBUSY;
	}

	/* Create the vfio-layer device object. */
	vdev = vfio_group_create_device(vgroup, dev, ops, device_data);
	ret = IS_ERR(vdev) ? PTR_ERR(vdev) : 0;

	/*
	 * Drop all but the vfio_device reference.  The vfio_device holds
	 * a reference to the vfio_group, which holds a reference to the
	 * iommu_group.  On failure this simply releases our group reference.
	 */
	vfio_group_put(vgroup);

	return ret;
}

当绑定的物理设备iommu层group没有对应的vfio group时,会调用vfio_create_group函数创建vfio group:

/*
 * Allocate and register a vfio_group wrapping @iommu_group.
 *
 * Registers an IOMMU group notifier, allocates a minor number, creates the
 * group's device via device_create() and links the group onto
 * vfio.group_list.  If a vfio_group for the same @iommu_group is already on
 * the list, the fresh allocation is handed to vfio_group_unlock_and_free()
 * and the existing group is returned with a reference taken through
 * vfio_group_get().
 *
 * Returns the vfio_group on success, or an ERR_PTR() value on failure.
 */
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
{
	struct vfio_group *group, *tmp;
	struct device *dev;
	int ret, minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL); // allocate the vfio group
	if (!group)
		return ERR_PTR(-ENOMEM);

	kref_init(&group->kref);
	INIT_LIST_HEAD(&group->device_list); // list of devices in this group
	mutex_init(&group->device_lock);
	INIT_LIST_HEAD(&group->unbound_list);
	mutex_init(&group->unbound_lock);
	atomic_set(&group->container_users, 0);
	atomic_set(&group->opened, 0);
	group->iommu_group = iommu_group;
#ifdef CONFIG_VFIO_NOIOMMU
	group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
#endif
	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	group->nb.notifier_call = vfio_iommu_group_notifier;

	/*
	 * blocking notifiers acquire a rwsem around registering and hold
	 * it around callback.  Therefore, need to register outside of
	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
	 * do anything unless it can find the group in vfio.group_list, so
	 * no harm in registering early.
	 */
	ret = iommu_group_register_notifier(iommu_group, &group->nb);
	if (ret) {
		kfree(group);
		return ERR_PTR(ret);
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
		if (tmp->iommu_group == iommu_group) {
			vfio_group_get(tmp);
			/* NOTE(review): presumably also drops vfio.group_lock - confirm */
			vfio_group_unlock_and_free(group);
			return tmp;
		}
	}

	minor = vfio_alloc_group_minor(group);
	if (minor < 0) {
		vfio_group_unlock_and_free(group);
		return ERR_PTR(minor);
	}
	// Create the /dev/vfio/$group_id device
	dev = device_create(vfio.class, NULL,
			    MKDEV(MAJOR(vfio.group_devt), minor),
			    group, "%s%d", group->noiommu ? "noiommu-" : "",
			    iommu_group_id(iommu_group));
	if (IS_ERR(dev)) {
		vfio_free_group_minor(minor);
		vfio_group_unlock_and_free(group);
		return (struct vfio_group *)dev; /* ERR_PTR */
	}

	group->minor = minor;
	group->dev = dev;

	list_add(&group->vfio_next, &vfio.group_list); // link the group onto the global vfio.group_list

	mutex_unlock(&vfio.group_lock);

	return group;
}

其调用device_create创建一个设备文件/dev/vfio/$group_id,用户态程序打开该文件后,可以对获取到的fd调用ioctl来控制这个vfio group。
然后vfio_add_group_dev会接着调用vfio_group_get_device判断该物理设备是否已经创建对应vfio device,若未创建则会调用vfio_group_create_device来创建:

/*
 * vfio_group_create_device - allocate a vfio_device and link it into @group.
 * @group:       vfio group the new device belongs to
 * @dev:         underlying physical device
 * @ops:         device callbacks (vfio_pci_ops when called for vfio-pci)
 * @device_data: caller-private data (struct vfio_pci_device for vfio-pci)
 *
 * The new vfio_device is stored as the driver data of @dev, takes a
 * reference on @group and is added to the group's device list under
 * group->device_lock.
 *
 * Return: the new vfio_device, or ERR_PTR(-ENOMEM) if allocation fails.
 */
static struct vfio_device *
vfio_group_create_device(struct vfio_group *group, struct device *dev,
			 const struct vfio_device_ops *ops, void *device_data)
{
	struct vfio_device *vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);

	if (!vdev)
		return ERR_PTR(-ENOMEM);

	vdev->group = group;
	vdev->dev = dev;
	vdev->ops = ops;
	vdev->device_data = device_data;
	kref_init(&vdev->kref);

	/* Let the physical device find its vfio_device through drvdata. */
	dev_set_drvdata(dev, vdev);

	/* No need to get group_lock, caller has group reference */
	vfio_group_get(group);

	/* Publish the device on the group's device list. */
	mutex_lock(&group->device_lock);
	list_add(&vdev->group_next, &group->device_list);
	mutex_unlock(&group->device_lock);

	return vdev;
}

加载vfio-pci模块时还会加载vfio模块,其初始化入口函数vfio_init会注册一个misc设备vfio_dev:

/*
 * Misc device describing the VFIO container node.  It is named "vfio",
 * uses the node name "vfio/vfio" and is served by vfio_fops.
 */
static struct miscdevice vfio_dev = {
	.name = "vfio",
	.nodename = "vfio/vfio",
	.minor = VFIO_MINOR,
	.mode = S_IRUGO | S_IWUGO,
	.fops = &vfio_fops, /* file operations for the vfio/vfio node */
};

/* File operations table installed as the .fops of the vfio_dev misc device. */
static const struct file_operations vfio_fops = {
	.owner = THIS_MODULE,

	/* open / close */
	.open = vfio_fops_open,
	.release = vfio_fops_release,

	/* data access */
	.read = vfio_fops_read,
	.write = vfio_fops_write,
	.mmap = vfio_fops_mmap,

	/* control */
	.unlocked_ioctl = vfio_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = vfio_fops_compat_ioctl,
#endif
};

注册misc设备vfio_dev后会生成/dev/vfio/vfio文件,其文件操作接口为vfio_fops。当打开/dev/vfio/vfio文件时会调用回调函数vfio_fops_open。vfio_fops_open会分配struct vfio_container类型变量,作为返回的文件描述符所控制的container的载体:

/*
 * vfio_fops_open - open handler of vfio_fops.
 * @inode: inode of the opened file (not used by this function)
 * @filep: file being opened
 *
 * Allocates a fresh struct vfio_container, initialises its group list,
 * group lock and reference count, and stores it in filep->private_data so
 * that later operations on the same file can retrieve it.
 *
 * Return: 0 on success, -ENOMEM if the container cannot be allocated.
 */
static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *ctr = kzalloc(sizeof(*ctr), GFP_KERNEL);

	if (!ctr)
		return -ENOMEM;

	kref_init(&ctr->kref);
	init_rwsem(&ctr->group_lock);
	INIT_LIST_HEAD(&ctr->group_list);

	/* Each open gets its own container, carried by the file's private data. */
	filep->private_data = ctr;

	return 0;
}
文章来自个人专栏
驱动
5 文章 | 1 订阅
0条评论
0 / 1000
请输入你的评论
0
0