Analysis of the virtio Front-End Driver Packet Receive Flow

2024-10-14 09:40:27

During probe, the virtio-net driver requests a hardware interrupt for each queue. The call flow is as follows:

virtnet_probe
  init_vqs
    virtnet_find_vqs
      vi->vdev->config->find_vqs (callback: vp_modern_find_vqs)
        vp_find_vqs
          vp_find_vqs_msix
            // for each queue, request_irq registers the interrupt handler vring_interrupt
            request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec), vring_interrupt, 0, vp_dev->msix_names[msix_vec], vqs[i])
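
The per-queue IRQ setup happens inside vp_find_vqs_msix. A simplified sketch of its per-vq loop (based on drivers/virtio/virtio_pci_common.c of roughly the same kernel generation; error handling and the shared-vector cases are omitted) looks like this:

/* simplified sketch, not the verbatim kernel code */
for (i = 0; i < nvqs; ++i) {
	/* create the virtqueue and bind it to an MSI-X vector */
	vqs[i] = vp_setup_vq(vdev, queue_idx++, callbacks[i], names[i],
			     ctx ? ctx[i] : false, msix_vec);

	/* name the vector "<device>-<vq name>" and register vring_interrupt,
	 * passing the virtqueue as the dev_id cookie */
	snprintf(vp_dev->msix_names[msix_vec],
		 sizeof(*vp_dev->msix_names), "%s-%s",
		 dev_name(&vp_dev->vdev.dev), names[i]);
	err = request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec),
			  vring_interrupt, 0,
			  vp_dev->msix_names[msix_vec], vqs[i]);
}

Because each vq is passed as the dev_id argument, vring_interrupt later receives that virtqueue pointer as its second parameter.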

After QEMU injects the interrupt into the guest OS, the vring_interrupt handler runs:

irqreturn_t vring_interrupt(int irq, void *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);

	if (!more_used(vq)) { /* no new used descriptors, nothing to do, return immediately */
		pr_debug("virtqueue interrupt with no work for %p\n", vq);
		return IRQ_NONE;
	}

	if (unlikely(vq->broken))
		return IRQ_HANDLED;

	pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback);
	if (vq->vq.callback)
		vq->vq.callback(&vq->vq); // skb_recv_done & skb_xmit_done

	return IRQ_HANDLED;
}
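
The callback invoked here was installed during probe: virtnet_find_vqs fills the callbacks[] array before calling find_vqs, pairing every RX queue with skb_recv_done and every TX queue with skb_xmit_done. An excerpt (slightly simplified, drivers/net/virtio_net.c):

	/* Allocate/initialize parameters for send/receive virtqueues */
	for (i = 0; i < vi->max_queue_pairs; i++) {
		callbacks[rxq2vq(i)] = skb_recv_done;
		callbacks[txq2vq(i)] = skb_xmit_done;
		sprintf(vi->rq[i].name, "input.%d", i);
		sprintf(vi->sq[i].name, "output.%d", i);
		names[rxq2vq(i)] = vi->rq[i].name;
		names[txq2vq(i)] = vi->sq[i].name;
	}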

For a receive queue, the callback invoked by the interrupt handler is skb_recv_done:

static void skb_recv_done(struct virtqueue *rvq)
{
	struct virtnet_info *vi = rvq->vdev->priv;
	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];

	virtqueue_napi_schedule(&rq->napi, rvq);
}

virtqueue_napi_schedule adds this queue's napi structure to the poll list of the current CPU's softnet_data and raises the NET_RX softirq (a simplified sketch of the functions involved follows the call chain below):

virtqueue_napi_schedule
  __napi_schedule
    ____napi_schedule
      list_add_tail(&napi->poll_list, &sd->poll_list); // add the napi structure to the poll list
      __raise_softirq_irqoff(NET_RX_SOFTIRQ); // raise the NET_RX softirq
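
For reference, a simplified version of the two key functions (from drivers/net/virtio_net.c and net/core/dev.c of roughly the same era) is:

static void virtqueue_napi_schedule(struct napi_struct *napi,
				    struct virtqueue *vq)
{
	if (napi_schedule_prep(napi)) {
		/* suppress further interrupts from this virtqueue while polling */
		virtqueue_disable_cb(vq);
		__napi_schedule(napi);
	}
}

/* net/core/dev.c */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}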

The raised softirq is normally processed on the hard-interrupt exit path; when softirq load gets too high, or when the softirq is raised from process context, the work is deferred to the per-CPU kernel thread ksoftirqd:

static void run_ksoftirqd(unsigned int cpu)
{
	 /* Interrupts are disabled here while we check the pending mask and enter
	  * __do_softirq (which re-enables them before running the handlers), so
	  * softirq handlers are expected to finish quickly. */
	local_irq_disable();
	 
    /* Check the softirq pending mask; if several bits are set,
       __do_softirq handles them in priority order */
	if (local_softirq_pending()) {
		/*
		 * We can safely run softirq on inline stack, as we are not deep
		 * in the task stack here.
		 */
		__do_softirq();
		local_irq_enable(); // re-enable local interrupts
		cond_resched_rcu_qs();
		return;
	}
	local_irq_enable();
}
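
run_ksoftirqd is the thread function of the per-CPU ksoftirqd/%u threads, registered at boot in kernel/softirq.c (simplified excerpt, CPU-hotplug setup omitted):

static struct smp_hotplug_thread softirq_threads = {
	.store			= &ksoftirqd,
	.thread_should_run	= ksoftirqd_should_run,
	.thread_fn		= run_ksoftirqd,
	.thread_comm		= "ksoftirqd/%u",
};

static __init int spawn_ksoftirqd(void)
{
	/* one ksoftirqd thread per CPU */
	BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
	return 0;
}
early_initcall(spawn_ksoftirqd);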

__do_softirq performs the actual softirq processing:

asmlinkage __visible void __softirq_entry __do_softirq(void)
{
	unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
	unsigned long old_flags = current->flags;
	int max_restart = MAX_SOFTIRQ_RESTART;
	struct softirq_action *h;
	bool in_hardirq;
	__u32 pending;
	int softirq_bit;

	/*
	 * Mask out PF_MEMALLOC as the current task context is borrowed for the
	 * softirq. A softirq handled, such as network RX, might set PF_MEMALLOC
	 * again if the socket is related to swap.
	 */
	current->flags &= ~PF_MEMALLOC;

	pending = local_softirq_pending(); // snapshot the pending softirqs
	account_irq_enter_time(current);

	__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); // disable bottom halves, mainly to avoid racing with ksoftirqd
	in_hardirq = lockdep_softirq_start();

restart:
	/* Reset the pending bitmask before enabling irqs */
	set_softirq_pending(0);

	local_irq_enable(); // re-enable hard interrupts

	h = softirq_vec; // walk the pending softirqs and run their handlers

	// ffs (find first set) locates the lowest pending softirq; after each one
	// is handled its bit is shifted out of the local copy, until no bits remain
	while ((softirq_bit = ffs(pending))) {
		// ...handle each pending softirq...
		unsigned int vec_nr;
		int prev_count;

		h += softirq_bit - 1;

		vec_nr = h - softirq_vec;
		prev_count = preempt_count();

		kstat_incr_softirqs_this_cpu(vec_nr);

		trace_softirq_entry(vec_nr);
		h->action(h); // for NET_RX_SOFTIRQ this is net_rx_action
		trace_softirq_exit(vec_nr);
		if (unlikely(prev_count != preempt_count())) {
			pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
			       vec_nr, softirq_to_name[vec_nr], h->action,
			       prev_count, preempt_count());
			preempt_count_set(prev_count);
		}
		h++;
		pending >>= softirq_bit;
	}

	rcu_bh_qs();
	local_irq_disable(); // disable interrupts again

	// If softirqs are still pending, and we are within the time limit and the
	// restart budget, loop again (at most MAX_SOFTIRQ_RESTART times);
	// otherwise hand the remainder over to the ksoftirqd thread
	pending = local_softirq_pending();
	if (pending) {
		if (time_before(jiffies, end) && !need_resched() &&
		    --max_restart)
			goto restart;

		// Once the MAX_SOFTIRQ_RESTART limit (or the time budget) is hit,
		// __do_softirq must stop; any still-pending softirqs are handed to ksoftirqd
		wakeup_softirqd();
	}

	lockdep_softirq_end(in_hardirq);
	account_irq_exit_time(current);
	__local_bh_enable(SOFTIRQ_OFFSET);
	WARN_ON_ONCE(in_interrupt());
	current_restore_flags(old_flags, PF_MEMALLOC);
}
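
h->action for NET_RX_SOFTIRQ points at net_rx_action because the networking core registers it at init time, and open_softirq simply stores the handler in the softirq_vec table (excerpts from net/core/dev.c and kernel/softirq.c):

	/* net_dev_init() */
	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

void open_softirq(int nr, void (*action)(struct softirq_action *))
{
	softirq_vec[nr].action = action;
}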

From __do_softirq, the NET_RX_SOFTIRQ handler net_rx_action finally does the receive work:

static __latent_entropy void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	unsigned long time_limit = jiffies +
		usecs_to_jiffies(netdev_budget_usecs);
	int budget = netdev_budget;
	LIST_HEAD(list); // initialize a local list head
	LIST_HEAD(repoll);

	local_irq_disable();
	// splice all nodes (each node is a scheduled napi instance) from sd->poll_list onto the local list
	list_splice_init(&sd->poll_list, &list);
	local_irq_enable();

	for (;;) {
		struct napi_struct *n; // one napi instance per device (multi-queue devices have one per queue)

		if (list_empty(&list)) { // nothing left to poll?
			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
				goto out;
			break;
		}
		// take the first element of the list; struct napi_struct is the node type
		// and poll_list is its embedded struct list_head member
		n = list_first_entry(&list, struct napi_struct, poll_list);
		budget -= napi_poll(n, &repoll); // run the poll function of this device (queue)

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 ||
			     time_after_eq(jiffies, time_limit))) {
			sd->time_squeeze++;
			break;
		}
	}

	local_irq_disable();

	list_splice_tail_init(&sd->poll_list, &list);
	list_splice_tail(&repoll, &list);
	list_splice(&list, &sd->poll_list);
	if (!list_empty(&sd->poll_list))
		__raise_softirq_irqoff(NET_RX_SOFTIRQ);

	net_rps_action_and_irq_enable(sd);
out:
	__kfree_skb_flush();
}

napi_poll is implemented as follows:

static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
	void *have;
	int work, weight;

	list_del_init(&n->poll_list); // remove this napi structure from the poll list

	have = netpoll_poll_lock(n);

	weight = n->weight;

	/* This NAPI_STATE_SCHED test is for avoiding a race
	 * with netpoll's poll_napi().  Only the entity which
	 * obtains the lock and sees NAPI_STATE_SCHED set will
	 * actually make the ->poll() call.  Therefore we avoid
	 * accidentally calling ->poll() when NAPI is not scheduled.
	 */
	work = 0;
	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
		work = n->poll(n, weight); // call the poll function bound to this napi (virtnet_poll for a virtio-net RX queue)
		trace_napi_poll(n, work, weight);
	}
        // ...

	return work;
}
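
The elided tail of napi_poll decides whether the queue gets polled again: if the driver consumed its entire weight, the napi instance is appended to the repoll list so net_rx_action keeps it scheduled. A simplified reconstruction (based on net/core/dev.c of the same era; the napi_disable and GRO-flush corner cases are omitted):

	if (likely(work < weight))
		goto out_unlock;

	/* The driver used its full weight, so NAPI_STATE_SCHED stays set and
	 * this code still owns the instance: put it back on the repoll list
	 * so that net_rx_action polls it again. */
	list_add_tail(&n->poll_list, repoll);

out_unlock:
	netpoll_poll_unlock(have);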

The napi poll callback is virtnet_poll, registered by the virtio-net driver at probe time; it reclaims used descriptors and receives the packets:

static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
	unsigned int received;

	// reclaim completed TX buffers: return the send queue's used chain descriptors to the descriptor table
	virtnet_poll_cleantx(rq);

	// receive packets from the RX virtqueue
	received = virtnet_receive(rq, budget);

	/* Out of packets? */
	if (received < budget)
		virtqueue_napi_complete(napi, rq->vq, received);

	return received;
}
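
virtnet_receive pops used buffers off the RX virtqueue and pushes them up the network stack, refilling the ring when it runs low. A simplified sketch based on an older version of drivers/net/virtio_net.c (newer kernels pass additional context/XDP arguments to receive_buf):

static int virtnet_receive(struct receive_queue *rq, int budget)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	unsigned int len, received = 0;
	void *buf;

	/* pop up to 'budget' used buffers and hand them to the stack */
	while (received < budget &&
	       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
		receive_buf(vi, rq, buf, len);
		received++;
	}

	/* refill the RX ring if it is running low on posted buffers */
	if (rq->vq->num_free > virtqueue_get_vring_size(rq->vq) / 2) {
		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
			schedule_delayed_work(&vi->refill, 0);
	}

	return received;
}

When fewer packets than the budget were received, virtqueue_napi_complete finishes the NAPI poll and re-enables the virtqueue callback, re-scheduling NAPI if new used buffers raced in while the interrupt was being re-armed (simplified; the exact form varies between kernel versions):

static void virtqueue_napi_complete(struct napi_struct *napi,
				    struct virtqueue *vq, int processed)
{
	int opaque;

	opaque = virtqueue_enable_cb_prepare(vq);
	if (napi_complete_done(napi, processed)) {
		/* re-check: if buffers arrived during re-arming, poll again */
		if (unlikely(virtqueue_poll(vq, opaque)))
			virtqueue_napi_schedule(napi, vq);
	} else {
		virtqueue_disable_cb(vq);
	}
}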