vfio-pci驱动的初始化函数为vfio_pci_init,该函数注册一个名为vfio_pci_driver的PCI驱动。当设备与vfio-pci驱动绑定时,会调用其probe函数vfio_pci_probe:
/*
 * vfio_pci_probe - probe callback run when a PCI device is bound to vfio-pci.
 * @pdev: PCI device being bound
 * @id:   matching ID-table entry (not referenced in this function)
 *
 * Allocates the per-device struct vfio_pci_device, initialises its fields
 * and registers it with the VFIO core through vfio_add_group_dev(). VGA
 * devices additionally get a VGA arbitration client, and unless
 * disable_idle_d3 is set the device is moved to D3hot.
 *
 * Return: 0 on success; -EINVAL if the header type is not
 * PCI_HEADER_TYPE_NORMAL or no IOMMU group is available; -ENOMEM if the
 * allocation fails; otherwise the error from vfio_add_group_dev().
 */
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct vfio_pci_device *vpdev;
	struct iommu_group *iommu_grp;
	int rc;

	if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
		return -EINVAL;

	/* Take the IOMMU-layer group; it is released again on any failure. */
	iommu_grp = vfio_iommu_group_get(&pdev->dev);
	if (!iommu_grp)
		return -EINVAL;

	vpdev = kzalloc(sizeof(*vpdev), GFP_KERNEL);
	if (!vpdev) {
		rc = -ENOMEM;
		goto err_release;
	}

	vpdev->pdev = pdev;
	vpdev->irq_type = VFIO_PCI_NUM_IRQS;
	mutex_init(&vpdev->igate);
	spin_lock_init(&vpdev->irqlock);

	/* Hand the device to the VFIO core with vfio_pci_ops as callbacks. */
	rc = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vpdev);
	if (rc)
		goto err_release;

	if (vfio_pci_is_vga(pdev)) {
		vga_client_register(pdev, vpdev, NULL, vfio_pci_set_vga_decode);
		vga_set_legacy_decoding(pdev,
					vfio_pci_set_vga_decode(vpdev, false));
	}

	if (disable_idle_d3)
		return 0;

	/*
	 * pci-core sets the device power state to an unknown value at
	 * bootup and after being removed from a driver. The only
	 * transition it allows from this unknown state is to D0, which
	 * typically happens when a driver calls pci_enable_device().
	 * We're not ready to enable the device yet, but we do want to
	 * be able to get to D3. Therefore first do a D0 transition
	 * before going to D3.
	 */
	pci_set_power_state(pdev, PCI_D0);
	pci_set_power_state(pdev, PCI_D3hot);

	return 0;

err_release:
	vfio_iommu_group_put(iommu_grp, &pdev->dev);
	/* vpdev is NULL when the allocation itself failed; kfree(NULL) is a no-op. */
	kfree(vpdev);
	return rc;
}
该函数会分配一个struct vfio_pci_device结构类型变量并设置相关字段,然后调用vfio_add_group_dev函数分配vfio_device结构体并为其注册操作回调接口vfio_pci_ops:
/*
 * vfio_add_group_dev - create a vfio_device for @dev and attach it to the
 * vfio_group matching the device's IOMMU group.
 * @dev:         physical device being handed to VFIO
 * @ops:         bus-driver callbacks passed on to the new vfio_device
 * @device_data: bus-driver private data passed on to the new vfio_device
 *
 * If no vfio_group exists yet for the IOMMU group, one is created first.
 *
 * Return: 0 on success; -EINVAL if @dev has no IOMMU group; -EBUSY if a
 * vfio_device already exists for @dev; otherwise the error reported by
 * vfio_create_group() or vfio_group_create_device().
 */
int vfio_add_group_dev(struct device *dev,
		       const struct vfio_device_ops *ops, void *device_data)
{
	struct iommu_group *iommu_grp;
	struct vfio_group *vgroup;
	struct vfio_device *vdev;
	int rc;

	/*
	 * IOMMU-layer group of the device. Per the original annotation, it
	 * is assigned to each PCI device at device initialisation and kept
	 * in the iommu_group member of its struct device.
	 */
	iommu_grp = iommu_group_get(dev);
	if (!iommu_grp)
		return -EINVAL;

	/* Map the IOMMU-layer group to its VFIO-layer group, if one exists. */
	vgroup = vfio_group_get_from_iommu(iommu_grp);
	if (vgroup) {
		/*
		 * A found vfio_group already holds a reference to the
		 * iommu_group. A created vfio_group keeps the reference.
		 */
		iommu_group_put(iommu_grp);
	} else {
		/* First device of this IOMMU group: create the vfio_group. */
		vgroup = vfio_create_group(iommu_grp);
		if (IS_ERR(vgroup)) {
			iommu_group_put(iommu_grp);
			return PTR_ERR(vgroup);
		}
	}

	/* A physical device may be represented by only one vfio_device. */
	vdev = vfio_group_get_device(vgroup, dev);
	if (vdev) {
		WARN(1, "Device %s already exists on group %d\n",
		     dev_name(dev), iommu_group_id(iommu_grp));
		vfio_device_put(vdev);
		vfio_group_put(vgroup);
		return -EBUSY;
	}

	vdev = vfio_group_create_device(vgroup, dev, ops, device_data);
	rc = IS_ERR(vdev) ? PTR_ERR(vdev) : 0;

	/*
	 * Drop all but the vfio_device reference. The vfio_device holds
	 * a reference to the vfio_group, which holds a reference to the
	 * iommu_group. On failure this simply releases our group reference.
	 */
	vfio_group_put(vgroup);
	return rc;
}
当绑定的物理设备所属的iommu层group还没有对应的vfio group时,会调用vfio_create_group函数创建vfio group:
/*
 * vfio_create_group - allocate a vfio_group for @iommu_group and publish it
 * on the global vfio.group_list.
 * @iommu_group: IOMMU-layer group the new vfio_group will represent
 *
 * Return: the newly created group; or, if another caller registered a
 * group for the same @iommu_group first, that existing group with an extra
 * reference taken; or an ERR_PTR() on failure.
 */
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
{
	struct vfio_group *vgroup, *cur, *result;
	struct device *dev;
	int rc, minor;

	vgroup = kzalloc(sizeof(*vgroup), GFP_KERNEL);
	if (!vgroup)
		return ERR_PTR(-ENOMEM);

	kref_init(&vgroup->kref);
	/* List of vfio_devices belonging to this group, with its lock. */
	INIT_LIST_HEAD(&vgroup->device_list);
	mutex_init(&vgroup->device_lock);
	INIT_LIST_HEAD(&vgroup->unbound_list);
	mutex_init(&vgroup->unbound_lock);
	atomic_set(&vgroup->container_users, 0);
	atomic_set(&vgroup->opened, 0);
	vgroup->iommu_group = iommu_group;
#ifdef CONFIG_VFIO_NOIOMMU
	vgroup->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
#endif
	BLOCKING_INIT_NOTIFIER_HEAD(&vgroup->notifier);

	vgroup->nb.notifier_call = vfio_iommu_group_notifier;

	/*
	 * blocking notifiers acquire a rwsem around registering and hold
	 * it around callback. Therefore, need to register outside of
	 * vfio.group_lock to avoid A-B/B-A contention. Our callback won't
	 * do anything unless it can find the group in vfio.group_list, so
	 * no harm in registering early.
	 */
	rc = iommu_group_register_notifier(iommu_group, &vgroup->nb);
	if (rc) {
		kfree(vgroup);
		return ERR_PTR(rc);
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	list_for_each_entry(cur, &vfio.group_list, vfio_next) {
		if (cur->iommu_group != iommu_group)
			continue;

		/* Lost the race: hand back the published group instead. */
		vfio_group_get(cur);
		result = cur;
		goto discard;
	}

	minor = vfio_alloc_group_minor(vgroup);
	if (minor < 0) {
		result = ERR_PTR(minor);
		goto discard;
	}

	/*
	 * Create the /dev/vfio/$group_id device node; the name carries a
	 * "noiommu-" prefix when the group is flagged noiommu.
	 */
	dev = device_create(vfio.class, NULL,
			    MKDEV(MAJOR(vfio.group_devt), minor),
			    vgroup, "%s%d", vgroup->noiommu ? "noiommu-" : "",
			    iommu_group_id(iommu_group));
	if (IS_ERR(dev)) {
		vfio_free_group_minor(minor);
		result = (struct vfio_group *)dev; /* ERR_PTR */
		goto discard;
	}

	vgroup->minor = minor;
	vgroup->dev = dev;

	/* Publish the group on the global list vfio.group_list. */
	list_add(&vgroup->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);
	return vgroup;

discard:
	/*
	 * The unpublished group is not needed. The helper is defined
	 * elsewhere; judging by its name and these call sites it releases
	 * vfio.group_lock and frees the group.
	 */
	vfio_group_unlock_and_free(vgroup);
	return result;
}
其调用device_create创建一个设备文件/dev/vfio/$group_id,用户态程序打开该文件得到fd后,可以通过对该fd调用ioctl来控制这个vfio group。
然后vfio_add_group_dev会接着调用vfio_group_get_device判断该物理设备是否已经创建对应的vfio device,若未创建则会调用vfio_group_create_device来创建:
/*
 * vfio_group_create_device - allocate a vfio_device for @dev and link it
 * into @group.
 * @group:       vfio_group the new device will belong to
 * @dev:         physical device being represented
 * @ops:         bus-driver callbacks (vfio_pci_ops when called for vfio-pci)
 * @device_data: bus-driver private data (struct vfio_pci_device for vfio-pci)
 *
 * The new vfio_device is also stored as the driver data of @dev and takes
 * its own reference on @group.
 *
 * Return: the new vfio_device, or ERR_PTR(-ENOMEM) if allocation fails.
 */
static
struct vfio_device *vfio_group_create_device(struct vfio_group *group,
					     struct device *dev,
					     const struct vfio_device_ops *ops,
					     void *device_data)
{
	struct vfio_device *vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);

	if (!vdev)
		return ERR_PTR(-ENOMEM);

	kref_init(&vdev->kref);
	vdev->dev = dev;
	vdev->group = group;
	vdev->ops = ops;
	vdev->device_data = device_data;
	dev_set_drvdata(dev, vdev);

	/* No need to get group_lock, caller has group reference */
	vfio_group_get(group);

	/* Add the device to the group's list of member devices. */
	mutex_lock(&group->device_lock);
	list_add(&vdev->group_next, &group->device_list);
	mutex_unlock(&group->device_lock);

	return vdev;
}
加载vfio-pci模块时还会加载vfio模块,其初始化入口函数vfio_init会注册一个misc设备vfio_dev:
/*
 * Misc device descriptor for the VFIO container node. With the nodename
 * below, the device file appears as /dev/vfio/vfio, and file operations on
 * it are dispatched through vfio_fops.
 */
static struct miscdevice vfio_dev = {
	.name		= "vfio",
	.nodename	= "vfio/vfio",
	.minor		= VFIO_MINOR,
	/* Read and write permission bits for user, group and others. */
	.mode		= S_IRUGO | S_IWUGO,
	.fops		= &vfio_fops,
};
/*
 * File operations for the VFIO container device (referenced from
 * vfio_dev.fops). Each handler is defined elsewhere; the compat ioctl entry
 * is present only when CONFIG_COMPAT is enabled.
 */
static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,

	/* Lifetime of an open file. */
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,

	/* Data path. */
	.read		= vfio_fops_read,
	.write		= vfio_fops_write,
	.mmap		= vfio_fops_mmap,

	/* Control path. */
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_fops_compat_ioctl,
#endif
};
注册misc设备vfio_dev后会生成/dev/vfio/vfio文件,其文件操作接口为vfio_fops。当打开/dev/vfio/vfio文件时会调用回调函数vfio_fops_open,vfio_fops_open会分配一个struct vfio_container类型的变量,作为返回的文件描述符所控制的container的载体:
/*
 * vfio_fops_open - open handler for the VFIO container device.
 * @inode: inode of the device file (not referenced here)
 * @filep: file being opened; receives the new container as private data
 *
 * Every open allocates a fresh, empty vfio_container and attaches it to
 * the opened file.
 *
 * Return: 0 on success, -ENOMEM if the container cannot be allocated.
 */
static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *ctr = kzalloc(sizeof(*ctr), GFP_KERNEL);

	if (!ctr)
		return -ENOMEM;

	kref_init(&ctr->kref);
	init_rwsem(&ctr->group_lock);
	INIT_LIST_HEAD(&ctr->group_list);

	/*
	 * Stash the container in the file's private data. Per the original
	 * annotation, it serves as the carrier for the VFIO devices of the
	 * process that opened /dev/vfio/vfio.
	 */
	filep->private_data = ctr;

	return 0;
}