searchusermenu
  • 发布文章
  • 消息中心
点赞
收藏
评论
分享
原创

virtio前端驱动收包流程分析

2024-10-14 09:40:27
18
0

virtio net在probe过程中会为每个queue申请硬件中断,其调用流程如下:

virtnet_probe
  init_vqs
    virtnet_find_vqs
      vi->vdev->config->find_vqs(回调vp_modern_find_vqs)
        vp_find_vqs
          vp_find_vqs_msix
            // 为每个queue调用request_irq注册中断处理函数 vring_interrupt
            request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec), vring_interrupt, 0, vp_dev->msix_names[msix_vec], vqs[i])

qemu向guest os注入中断后,中断处理函数vring_interrupt会被执行:

/*
 * Interrupt handler for a virtqueue. _vq is the opaque cookie handed to the
 * handler and is converted to the vring_virtqueue with to_vvq().
 *
 * Returns IRQ_NONE when more_used() reports nothing to process; otherwise
 * IRQ_HANDLED. The queue callback is skipped when the queue is flagged
 * broken or has no callback installed.
 */
irqreturn_t vring_interrupt(int irq, void *_vq)
{
	struct vring_virtqueue *vring = to_vvq(_vq);

	/* No newly used descriptors: nothing to do for this interrupt. */
	if (!more_used(vring)) {
		pr_debug("virtqueue interrupt with no work for %p\n", vring);
		return IRQ_NONE;
	}

	if (!unlikely(vring->broken)) {
		pr_debug("virtqueue callback for %p (%p)\n", vring, vring->vq.callback);
		/* For virtio-net this is skb_recv_done or skb_xmit_done. */
		if (vring->vq.callback)
			vring->vq.callback(&vring->vq);
	}

	return IRQ_HANDLED;
}

中断处理函数中再调用到skb_recv_done:

/*
 * Receive-virtqueue callback: looks up the receive_queue that owns rvq and
 * schedules that queue's NAPI context.
 */
static void skb_recv_done(struct virtqueue *rvq)
{
	struct virtnet_info *info = rvq->vdev->priv;
	/* vq2rxq() maps the virtqueue to its index in the rq array. */
	struct receive_queue *rxq = info->rq + vq2rxq(rvq);

	virtqueue_napi_schedule(&rxq->napi, rvq);
}

virtqueue_napi_schedule将本队列对应的napi结构加入到当前cpu的softnet_data结构的轮询链表中:

virtqueue_napi_schedule
  __napi_schedule
    ____napi_schedule
      list_add_tail(&napi->poll_list, &sd->poll_list); // napi结构加入轮询链表
      __raise_softirq_irqoff(NET_RX_SOFTIRQ); // 触发软中断

软中断通常在硬中断退出时直接执行;当软中断处理不完时,剩余的工作将被推迟到一个专门的内核线程——ksoftirqd来处理:

/*
 * Presumably the per-CPU ksoftirqd thread function (TODO confirm against the
 * thread registration, which is not shown here). With local interrupts
 * disabled it checks for pending softirqs and, if any are pending, runs
 * __do_softirq() before re-enabling interrupts.
 */
static void run_ksoftirqd(unsigned int cpu)
{
	/*
	 * Disable local interrupts before inspecting the pending mask.
	 * NOTE(review): the original note here claimed hard IRQs stay masked
	 * for the whole softirq run; however __do_softirq() as shown in this
	 * file calls local_irq_enable() before invoking the handlers, so only
	 * the pending check itself is covered - confirm.
	 */
	local_irq_disable();

	/*
	 * Check the softirq pending bits. If several bits are set,
	 * __do_softirq() processes them one after another.
	 */
	if (local_softirq_pending()) {
		/*
		 * We can safely run softirq on inline stack, as we are not deep
		 * in the task stack here.
		 */
		__do_softirq();
		local_irq_enable(); // re-enable local interrupts
		cond_resched_rcu_qs();
		return;
	}
	local_irq_enable();
}

__do_softirq做具体的中断处理:

/*
 * Run the softirq handlers that are pending on the current CPU.
 *
 * The pending mask is snapshotted and cleared, then each set bit's handler
 * from softirq_vec is invoked with local interrupts enabled. If new softirqs
 * became pending meanwhile, the loop restarts while the time limit
 * (MAX_SOFTIRQ_TIME), the restart counter (MAX_SOFTIRQ_RESTART) and
 * need_resched() allow it; otherwise wakeup_softirqd() is called to defer the
 * remaining work.
 */
asmlinkage __visible void __softirq_entry __do_softirq(void)
{
	unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
	unsigned long old_flags = current->flags;
	int max_restart = MAX_SOFTIRQ_RESTART;
	struct softirq_action *h;
	bool in_hardirq;
	__u32 pending;
	int softirq_bit;

	/*
	 * Mask out PF_MEMALLOC as current task context is borrowed for the
	 * softirq. A softirq handled such as network RX might set PF_MEMALLOC
	 * again if the socket is related to swap
	 */
	current->flags &= ~PF_MEMALLOC;

	pending = local_softirq_pending(); // snapshot the pending softirq bits
	account_irq_enter_time(current);

	/*
	 * Disable bottom halves for the duration of the run.
	 * NOTE(review): the original note says this is mainly to avoid racing
	 * with the softirq daemon thread - confirm.
	 */
	__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
	in_hardirq = lockdep_softirq_start();

restart:
	/* Reset the pending bitmask before enabling irqs */
	set_softirq_pending(0);

	local_irq_enable(); // hard interrupts are enabled while handlers run

	h = softirq_vec; // start from the first entry of the handler table

	/*
	 * ffs() (find first set) yields the 1-based position of the lowest
	 * pending bit. After each handler runs, the processed bits are shifted
	 * out of the mask, until no pending bit is left.
	 */
	while ((softirq_bit = ffs(pending))) {
		unsigned int vec_nr;
		int prev_count;

		/* Advance h to the table entry that matches the found bit. */
		h += softirq_bit - 1;

		vec_nr = h - softirq_vec;
		prev_count = preempt_count();

		kstat_incr_softirqs_this_cpu(vec_nr);

		trace_softirq_entry(vec_nr);
		h->action(h); // handler callback; net_rx_action for the network RX softirq
		trace_softirq_exit(vec_nr);
		/* A handler must leave preempt_count as it found it; repair and report otherwise. */
		if (unlikely(prev_count != preempt_count())) {
			pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
			       vec_nr, softirq_to_name[vec_nr], h->action,
			       prev_count, preempt_count());
			preempt_count_set(prev_count);
		}
		h++;
		pending >>= softirq_bit;
	}

	rcu_bh_qs();
	local_irq_disable(); // disable interrupts again before re-reading the mask

	/*
	 * If softirqs were raised in the meantime, restart the loop as long as
	 * the time limit has not passed, no reschedule is needed and fewer
	 * than MAX_SOFTIRQ_RESTART rounds have run. Otherwise leave the rest
	 * to the ksoftirqd thread.
	 */
	pending = local_softirq_pending();
	if (pending) {
		if (time_before(jiffies, end) && !need_resched() &&
		    --max_restart)
			goto restart;

		/*
		 * Limits reached but softirqs are still pending: wake
		 * ksoftirqd to process them.
		 */
		wakeup_softirqd();
	}

	lockdep_softirq_end(in_hardirq);
	account_irq_exit_time(current);
	__local_bh_enable(SOFTIRQ_OFFSET);
	WARN_ON_ONCE(in_interrupt());
	/* Restore the PF_MEMALLOC state saved in old_flags at entry. */
	current_restore_flags(old_flags, PF_MEMALLOC);
}

在__do_softirq函数中最终调用net_rx_action函数完成收包动作:

/*
 * Softirq handler for network receive: polls the NAPI instances queued on
 * this CPU's softnet_data until the local list is empty, the packet budget
 * (netdev_budget) is used up, or the time limit (netdev_budget_usecs) passes.
 * Unfinished entries are put back on sd->poll_list and the softirq is raised
 * again.
 */
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	unsigned long time_limit = jiffies +
		usecs_to_jiffies(netdev_budget_usecs);
	int budget = netdev_budget;
	LIST_HEAD(list); // local list head, initially empty
	LIST_HEAD(repoll);

	local_irq_disable();
	/*
	 * Move every node of sd->poll_list onto the local list (each node is a
	 * napi_struct, presumably one per device queue).
	 */
	list_splice_init(&sd->poll_list, &list);
	local_irq_enable();

	for (;;) {
		struct napi_struct *n; // the NAPI instance polled in this iteration

		if (list_empty(&list)) { // nothing left on the local list
			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
				goto out;
			break;
		}
		/*
		 * Take the first entry of the list; poll_list is the
		 * struct list_head member embedded in struct napi_struct.
		 */
		n = list_first_entry(&list, struct napi_struct, poll_list);
		budget -= napi_poll(n, &repoll); // run this instance's poll handler and charge its work to the budget

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 ||
			     time_after_eq(jiffies, time_limit))) {
			sd->time_squeeze++;
			break;
		}
	}

	local_irq_disable();

	/*
	 * Append entries queued on sd->poll_list in the meantime and the
	 * repoll entries behind the unprocessed ones, then move the combined
	 * list back to sd->poll_list. Re-raise the softirq if work remains.
	 */
	list_splice_tail_init(&sd->poll_list, &list);
	list_splice_tail(&repoll, &list);
	list_splice(&list, &sd->poll_list);
	if (!list_empty(&sd->poll_list))
		__raise_softirq_irqoff(NET_RX_SOFTIRQ);

	net_rps_action_and_irq_enable(sd);
out:
	__kfree_skb_flush();
}

napi_poll实现如下:

/*
 * Poll a single NAPI instance (abridged excerpt: part of the function body
 * is elided below).
 *
 * Removes n from the list it is queued on and, if NAPI_STATE_SCHED is set,
 * calls n->poll() with n->weight as the budget.
 *
 * Returns the work count reported by n->poll(), or 0 when the poll callback
 * was not invoked.
 */
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
	void *have;
	int work, weight;

	list_del_init(&n->poll_list); // unlink this napi_struct from the poll list

	have = netpoll_poll_lock(n);

	weight = n->weight;

	/* This NAPI_STATE_SCHED test is for avoiding a race
	 * with netpoll's poll_napi().  Only the entity which
	 * obtains the lock and sees NAPI_STATE_SCHED set will
	 * actually make the ->poll() call.  Therefore we avoid
	 * accidentally calling ->poll() when NAPI is not scheduled.
	 */
	work = 0;
	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
		work = n->poll(n, weight); // driver poll callback (virtnet_poll for virtio-net)
		trace_napi_poll(n, work, weight);
	}
	/* ... remainder of the function is elided in this excerpt ... */

	return work;
}

napi结构会调用virtio net驱动在probe阶段注册的virtnet_poll函数完成描述符表的回收处理工作:

/*
 * NAPI poll callback of the virtio-net driver.
 *
 * Cleans up completed transmit buffers via virtnet_poll_cleantx(), receives
 * up to budget packets via virtnet_receive(), and completes NAPI when fewer
 * than budget packets were received.
 *
 * Returns the number of packets received.
 */
static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rxq;
	unsigned int done;

	rxq = container_of(napi, struct receive_queue, napi);

	/* Hand used descriptor chains of the send queue back to the descriptor table. */
	virtnet_poll_cleantx(rxq);

	/* Receive packets, limited by budget. */
	done = virtnet_receive(rxq, budget);

	/* Budget not exhausted: the queue is drained, so finish this poll round. */
	if (budget > done)
		virtqueue_napi_complete(napi, rxq->vq, done);

	return done;
}
0条评论
0 / 1000
c****q
8文章数
0粉丝数
c****q
8 文章 | 0 粉丝
原创

virtio前端驱动收包流程分析

2024-10-14 09:40:27
18
0

virtio net在probe过程中会为每个queue申请硬件中断,其调用流程如下:

virtnet_probe
  init_vqs
    virtnet_find_vqs
      vi->vdev->config->find_vqs(回调vp_modern_find_vqs)
        vp_find_vqs
          vp_find_vqs_msix
            // 为每个queue调用request_irq注册中断处理函数 vring_interrupt
            request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec), vring_interrupt, 0, vp_dev->msix_names[msix_vec], vqs[i])

qemu向guest os注入中断后,中断处理函数vring_interrupt会被执行:

/*
 * Interrupt handler for a virtqueue. _vq is the opaque cookie handed to the
 * handler and is converted to the vring_virtqueue with to_vvq().
 *
 * Returns IRQ_NONE when more_used() reports nothing to process; otherwise
 * IRQ_HANDLED. The queue callback is skipped when the queue is flagged
 * broken or has no callback installed.
 */
irqreturn_t vring_interrupt(int irq, void *_vq)
{
	struct vring_virtqueue *vring = to_vvq(_vq);

	/* No newly used descriptors: nothing to do for this interrupt. */
	if (!more_used(vring)) {
		pr_debug("virtqueue interrupt with no work for %p\n", vring);
		return IRQ_NONE;
	}

	if (!unlikely(vring->broken)) {
		pr_debug("virtqueue callback for %p (%p)\n", vring, vring->vq.callback);
		/* For virtio-net this is skb_recv_done or skb_xmit_done. */
		if (vring->vq.callback)
			vring->vq.callback(&vring->vq);
	}

	return IRQ_HANDLED;
}

中断处理函数中再调用到skb_recv_done:

/*
 * Receive-virtqueue callback: looks up the receive_queue that owns rvq and
 * schedules that queue's NAPI context.
 */
static void skb_recv_done(struct virtqueue *rvq)
{
	struct virtnet_info *info = rvq->vdev->priv;
	/* vq2rxq() maps the virtqueue to its index in the rq array. */
	struct receive_queue *rxq = info->rq + vq2rxq(rvq);

	virtqueue_napi_schedule(&rxq->napi, rvq);
}

virtqueue_napi_schedule将本队列对应的napi结构加入到当前cpu的softnet_data结构的轮询链表中:

virtqueue_napi_schedule
  __napi_schedule
    ____napi_schedule
      list_add_tail(&napi->poll_list, &sd->poll_list); // napi结构加入轮询链表
      __raise_softirq_irqoff(NET_RX_SOFTIRQ); // 触发软中断

软中断通常在硬中断退出时直接执行;当软中断处理不完时,剩余的工作将被推迟到一个专门的内核线程——ksoftirqd来处理:

/*
 * Presumably the per-CPU ksoftirqd thread function (TODO confirm against the
 * thread registration, which is not shown here). With local interrupts
 * disabled it checks for pending softirqs and, if any are pending, runs
 * __do_softirq() before re-enabling interrupts.
 */
static void run_ksoftirqd(unsigned int cpu)
{
	/*
	 * Disable local interrupts before inspecting the pending mask.
	 * NOTE(review): the original note here claimed hard IRQs stay masked
	 * for the whole softirq run; however __do_softirq() as shown in this
	 * file calls local_irq_enable() before invoking the handlers, so only
	 * the pending check itself is covered - confirm.
	 */
	local_irq_disable();

	/*
	 * Check the softirq pending bits. If several bits are set,
	 * __do_softirq() processes them one after another.
	 */
	if (local_softirq_pending()) {
		/*
		 * We can safely run softirq on inline stack, as we are not deep
		 * in the task stack here.
		 */
		__do_softirq();
		local_irq_enable(); // re-enable local interrupts
		cond_resched_rcu_qs();
		return;
	}
	local_irq_enable();
}

__do_softirq做具体的中断处理:

/*
 * Run the softirq handlers that are pending on the current CPU.
 *
 * The pending mask is snapshotted and cleared, then each set bit's handler
 * from softirq_vec is invoked with local interrupts enabled. If new softirqs
 * became pending meanwhile, the loop restarts while the time limit
 * (MAX_SOFTIRQ_TIME), the restart counter (MAX_SOFTIRQ_RESTART) and
 * need_resched() allow it; otherwise wakeup_softirqd() is called to defer the
 * remaining work.
 */
asmlinkage __visible void __softirq_entry __do_softirq(void)
{
	unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
	unsigned long old_flags = current->flags;
	int max_restart = MAX_SOFTIRQ_RESTART;
	struct softirq_action *h;
	bool in_hardirq;
	__u32 pending;
	int softirq_bit;

	/*
	 * Mask out PF_MEMALLOC as current task context is borrowed for the
	 * softirq. A softirq handled such as network RX might set PF_MEMALLOC
	 * again if the socket is related to swap
	 */
	current->flags &= ~PF_MEMALLOC;

	pending = local_softirq_pending(); // snapshot the pending softirq bits
	account_irq_enter_time(current);

	/*
	 * Disable bottom halves for the duration of the run.
	 * NOTE(review): the original note says this is mainly to avoid racing
	 * with the softirq daemon thread - confirm.
	 */
	__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
	in_hardirq = lockdep_softirq_start();

restart:
	/* Reset the pending bitmask before enabling irqs */
	set_softirq_pending(0);

	local_irq_enable(); // hard interrupts are enabled while handlers run

	h = softirq_vec; // start from the first entry of the handler table

	/*
	 * ffs() (find first set) yields the 1-based position of the lowest
	 * pending bit. After each handler runs, the processed bits are shifted
	 * out of the mask, until no pending bit is left.
	 */
	while ((softirq_bit = ffs(pending))) {
		unsigned int vec_nr;
		int prev_count;

		/* Advance h to the table entry that matches the found bit. */
		h += softirq_bit - 1;

		vec_nr = h - softirq_vec;
		prev_count = preempt_count();

		kstat_incr_softirqs_this_cpu(vec_nr);

		trace_softirq_entry(vec_nr);
		h->action(h); // handler callback; net_rx_action for the network RX softirq
		trace_softirq_exit(vec_nr);
		/* A handler must leave preempt_count as it found it; repair and report otherwise. */
		if (unlikely(prev_count != preempt_count())) {
			pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
			       vec_nr, softirq_to_name[vec_nr], h->action,
			       prev_count, preempt_count());
			preempt_count_set(prev_count);
		}
		h++;
		pending >>= softirq_bit;
	}

	rcu_bh_qs();
	local_irq_disable(); // disable interrupts again before re-reading the mask

	/*
	 * If softirqs were raised in the meantime, restart the loop as long as
	 * the time limit has not passed, no reschedule is needed and fewer
	 * than MAX_SOFTIRQ_RESTART rounds have run. Otherwise leave the rest
	 * to the ksoftirqd thread.
	 */
	pending = local_softirq_pending();
	if (pending) {
		if (time_before(jiffies, end) && !need_resched() &&
		    --max_restart)
			goto restart;

		/*
		 * Limits reached but softirqs are still pending: wake
		 * ksoftirqd to process them.
		 */
		wakeup_softirqd();
	}

	lockdep_softirq_end(in_hardirq);
	account_irq_exit_time(current);
	__local_bh_enable(SOFTIRQ_OFFSET);
	WARN_ON_ONCE(in_interrupt());
	/* Restore the PF_MEMALLOC state saved in old_flags at entry. */
	current_restore_flags(old_flags, PF_MEMALLOC);
}

在__do_softirq函数中最终调用net_rx_action函数完成收包动作:

/*
 * Softirq handler for network receive: polls the NAPI instances queued on
 * this CPU's softnet_data until the local list is empty, the packet budget
 * (netdev_budget) is used up, or the time limit (netdev_budget_usecs) passes.
 * Unfinished entries are put back on sd->poll_list and the softirq is raised
 * again.
 */
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	unsigned long time_limit = jiffies +
		usecs_to_jiffies(netdev_budget_usecs);
	int budget = netdev_budget;
	LIST_HEAD(list); // local list head, initially empty
	LIST_HEAD(repoll);

	local_irq_disable();
	/*
	 * Move every node of sd->poll_list onto the local list (each node is a
	 * napi_struct, presumably one per device queue).
	 */
	list_splice_init(&sd->poll_list, &list);
	local_irq_enable();

	for (;;) {
		struct napi_struct *n; // the NAPI instance polled in this iteration

		if (list_empty(&list)) { // nothing left on the local list
			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
				goto out;
			break;
		}
		/*
		 * Take the first entry of the list; poll_list is the
		 * struct list_head member embedded in struct napi_struct.
		 */
		n = list_first_entry(&list, struct napi_struct, poll_list);
		budget -= napi_poll(n, &repoll); // run this instance's poll handler and charge its work to the budget

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 ||
			     time_after_eq(jiffies, time_limit))) {
			sd->time_squeeze++;
			break;
		}
	}

	local_irq_disable();

	/*
	 * Append entries queued on sd->poll_list in the meantime and the
	 * repoll entries behind the unprocessed ones, then move the combined
	 * list back to sd->poll_list. Re-raise the softirq if work remains.
	 */
	list_splice_tail_init(&sd->poll_list, &list);
	list_splice_tail(&repoll, &list);
	list_splice(&list, &sd->poll_list);
	if (!list_empty(&sd->poll_list))
		__raise_softirq_irqoff(NET_RX_SOFTIRQ);

	net_rps_action_and_irq_enable(sd);
out:
	__kfree_skb_flush();
}

napi_poll实现如下:

/*
 * Poll a single NAPI instance (abridged excerpt: part of the function body
 * is elided below).
 *
 * Removes n from the list it is queued on and, if NAPI_STATE_SCHED is set,
 * calls n->poll() with n->weight as the budget.
 *
 * Returns the work count reported by n->poll(), or 0 when the poll callback
 * was not invoked.
 */
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
	void *have;
	int work, weight;

	list_del_init(&n->poll_list); // unlink this napi_struct from the poll list

	have = netpoll_poll_lock(n);

	weight = n->weight;

	/* This NAPI_STATE_SCHED test is for avoiding a race
	 * with netpoll's poll_napi().  Only the entity which
	 * obtains the lock and sees NAPI_STATE_SCHED set will
	 * actually make the ->poll() call.  Therefore we avoid
	 * accidentally calling ->poll() when NAPI is not scheduled.
	 */
	work = 0;
	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
		work = n->poll(n, weight); // driver poll callback (virtnet_poll for virtio-net)
		trace_napi_poll(n, work, weight);
	}
	/* ... remainder of the function is elided in this excerpt ... */

	return work;
}

napi结构会调用virtio net驱动在probe阶段注册的virtnet_poll函数完成描述符表的回收处理工作:

/*
 * NAPI poll callback of the virtio-net driver.
 *
 * Cleans up completed transmit buffers via virtnet_poll_cleantx(), receives
 * up to budget packets via virtnet_receive(), and completes NAPI when fewer
 * than budget packets were received.
 *
 * Returns the number of packets received.
 */
static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rxq;
	unsigned int done;

	rxq = container_of(napi, struct receive_queue, napi);

	/* Hand used descriptor chains of the send queue back to the descriptor table. */
	virtnet_poll_cleantx(rxq);

	/* Receive packets, limited by budget. */
	done = virtnet_receive(rxq, budget);

	/* Budget not exhausted: the queue is drained, so finish this poll round. */
	if (budget > done)
		virtqueue_napi_complete(napi, rxq->vq, done);

	return done;
}
文章来自个人专栏
linux虚拟化
3 文章 | 1 订阅
0条评论
0 / 1000
请输入你的评论
0
0