During probe, virtio-net requests a hardware interrupt for each queue. The call flow is:
virtnet_probe
  init_vqs
    virtnet_find_vqs
      vi->vdev->config->find_vqs  (callback: vp_modern_find_vqs)
        vp_find_vqs
          vp_find_vqs_msix
            // request_irq is called for each queue to register the interrupt handler vring_interrupt
            request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec), vring_interrupt, 0, vp_dev->msix_names[msix_vec], vqs[i])
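The per-queue callbacks that these interrupts eventually invoke (skb_recv_done for rx, skb_xmit_done for tx, referenced in the vring_interrupt snippet below) are wired up in virtnet_find_vqs. A simplified sketch of that setup loop, with details trimmed, based on drivers/net/virtio_net.c:

/* Simplified sketch: virtnet_find_vqs() fills the callback/name arrays
 * that find_vqs() later attaches to each virtqueue. */
for (i = 0; i < vi->max_queue_pairs; i++) {
    callbacks[rxq2vq(i)] = skb_recv_done;   /* receive queue callback */
    callbacks[txq2vq(i)] = skb_xmit_done;   /* transmit queue callback */
    sprintf(vi->rq[i].name, "input.%d", i);
    sprintf(vi->sq[i].name, "output.%d", i);
    names[rxq2vq(i)] = vi->rq[i].name;
    names[txq2vq(i)] = vi->sq[i].name;
}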
After QEMU injects an interrupt into the guest OS, the vring_interrupt handler runs:
irqreturn_t vring_interrupt(int irq, void *_vq)
{
    struct vring_virtqueue *vq = to_vvq(_vq);

    if (!more_used(vq)) { /* no new used descriptors, nothing to do */
        pr_debug("virtqueue interrupt with no work for %p\n", vq);
        return IRQ_NONE;
    }

    if (unlikely(vq->broken))
        return IRQ_HANDLED;

    pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback);
    if (vq->vq.callback)
        vq->vq.callback(&vq->vq); // skb_recv_done & skb_xmit_done
    return IRQ_HANDLED;
}
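"No new used descriptors" boils down to comparing the driver's consumption index with the device's used ring index. A sketch of more_used() for the split-ring layout, roughly matching the kernel generation of the snippets above:

/* Simplified: has the device advanced the used ring index past what
 * the driver has already consumed? */
static inline bool more_used(const struct vring_virtqueue *vq)
{
    return vq->last_used_idx !=
           virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
}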
For a receive queue, the callback invoked by the interrupt handler is skb_recv_done:
static void skb_recv_done(struct virtqueue *rvq)
{
struct virtnet_info *vi = rvq->vdev->priv;
struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
virtqueue_napi_schedule(&rq->napi, rvq);
}
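The vq2rxq() call above relies on virtio-net's fixed virtqueue layout (rx0, tx0, rx1, tx1, ...). The index helpers used here and in the probe sketch earlier are simply:

/* Virtqueues are laid out as rx0, tx0, rx1, tx1, ...:
 * even indexes are receive queues, odd indexes are transmit queues. */
static int vq2rxq(struct virtqueue *vq) { return vq->index / 2; }
static int rxq2vq(int rxq)              { return rxq * 2; }
static int vq2txq(struct virtqueue *vq) { return (vq->index - 1) / 2; }
static int txq2vq(int txq)              { return txq * 2 + 1; }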
virtqueue_napi_schedule adds this queue's napi structure to the poll list of the current CPU's softnet_data:
virtqueue_napi_schedule
  __napi_schedule
    ____napi_schedule
      list_add_tail(&napi->poll_list, &sd->poll_list); // add the napi structure to the poll list
      __raise_softirq_irqoff(NET_RX_SOFTIRQ);          // raise the NET_RX softirq
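Before handing the napi instance to the networking core, the driver also suppresses further virtqueue callbacks, so the device does not keep interrupting while polling is pending. A simplified sketch of virtqueue_napi_schedule from virtio_net.c:

static void virtqueue_napi_schedule(struct napi_struct *napi,
                                    struct virtqueue *vq)
{
    if (napi_schedule_prep(napi)) {   /* claims NAPI_STATE_SCHED if not already set */
        virtqueue_disable_cb(vq);     /* suppress callbacks until polling completes */
        __napi_schedule(napi);
    }
}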
When the softirq cannot be handled immediately on interrupt exit (or the CPU is overloaded with softirq work), it is deferred to a dedicated per-CPU kernel thread, ksoftirqd:
static void run_ksoftirqd(unsigned int cpu)
{
    /* Local interrupts are disabled while the pending mask is checked;
     * __do_softirq() re-enables them once processing starts, but softirq
     * handlers are still expected to finish quickly. */
    local_irq_disable();
    /* Check the pending bits; if several softirqs are pending,
     * __do_softirq() handles them in priority (bit) order. */
    if (local_softirq_pending()) {
        /*
         * We can safely run softirq on inline stack, as we are not deep
         * in the task stack here.
         */
        __do_softirq();
        local_irq_enable();          // re-enable local interrupts
        cond_resched_rcu_qs();
        return;
    }
    local_irq_enable();
}
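run_ksoftirqd is the thread function of the per-CPU ksoftirqd/N threads, registered through the smpboot infrastructure in kernel/softirq.c (slightly trimmed):

static struct smp_hotplug_thread softirq_threads = {
    .store              = &ksoftirqd,
    .thread_should_run  = ksoftirqd_should_run,  /* run when softirqs are pending */
    .thread_fn          = run_ksoftirqd,
    .thread_comm        = "ksoftirqd/%u",
};

static __init int spawn_ksoftirqd(void)
{
    /* one ksoftirqd thread per online CPU */
    BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
    return 0;
}
early_initcall(spawn_ksoftirqd);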
__do_softirq does the actual softirq processing:
asmlinkage __visible void __softirq_entry __do_softirq(void)
{
    unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
    unsigned long old_flags = current->flags;
    int max_restart = MAX_SOFTIRQ_RESTART;
    struct softirq_action *h;
    bool in_hardirq;
    __u32 pending;
    int softirq_bit;

    /*
     * Mask out PF_MEMALLOC as the current task context is borrowed for the
     * softirq. A softirq handled, such as network RX, might set PF_MEMALLOC
     * again if the socket is related to swap.
     */
    current->flags &= ~PF_MEMALLOC;

    pending = local_softirq_pending(); // snapshot the pending softirqs
    account_irq_enter_time(current);

    // mark softirq context (disable bottom halves), mainly to avoid racing with ksoftirqd
    __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
    in_hardirq = lockdep_softirq_start();

restart:
    /* Reset the pending bitmask before enabling irqs */
    set_softirq_pending(0);

    local_irq_enable(); // re-enable hard interrupts

    h = softirq_vec; // walk the pending softirqs and run their handlers

    // ffs (find first set) locates the lowest-numbered pending softirq; after each one
    // is handled its bit is shifted out of 'pending', until no pending bits remain
    while ((softirq_bit = ffs(pending))) {
        // handle one pending softirq
        unsigned int vec_nr;
        int prev_count;

        h += softirq_bit - 1;

        vec_nr = h - softirq_vec;
        prev_count = preempt_count();

        kstat_incr_softirqs_this_cpu(vec_nr);

        trace_softirq_entry(vec_nr);
        h->action(h); // for NET_RX_SOFTIRQ this is net_rx_action
        trace_softirq_exit(vec_nr);
        if (unlikely(prev_count != preempt_count())) {
            pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
                   vec_nr, softirq_to_name[vec_nr], h->action,
                   prev_count, preempt_count());
            preempt_count_set(prev_count);
        }
        h++;
        pending >>= softirq_bit;
    }

    rcu_bh_qs();
    local_irq_disable(); // disable interrupts again

    // If softirqs are still pending and we are within the time budget, not asked to
    // reschedule, and below MAX_SOFTIRQ_RESTART rounds, loop back to restart;
    // otherwise hand the remaining work to the ksoftirqd thread.
    pending = local_softirq_pending();
    if (pending) {
        if (time_before(jiffies, end) && !need_resched() &&
            --max_restart)
            goto restart;

        // once the limits above are hit, __do_softirq must return; any softirqs
        // still pending are handled by waking ksoftirqd
        wakeup_softirqd();
    }

    lockdep_softirq_end(in_hardirq);
    account_irq_exit_time(current);
    __local_bh_enable(SOFTIRQ_OFFSET);
    WARN_ON_ONCE(in_interrupt());
    current_restore_flags(old_flags, PF_MEMALLOC);
}
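The binding between h->action and net_rx_action is established at boot, when the networking core registers its softirq handlers:

/* kernel/softirq.c: a softirq handler is just an entry in softirq_vec[] */
void open_softirq(int nr, void (*action)(struct softirq_action *))
{
    softirq_vec[nr].action = action;
}

/* net/core/dev.c, net_dev_init(): register the networking softirqs */
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);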
For NET_RX_SOFTIRQ, the handler __do_softirq ends up calling is net_rx_action, which drives the actual receive work:
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
    struct softnet_data *sd = this_cpu_ptr(&softnet_data);
    unsigned long time_limit = jiffies +
        usecs_to_jiffies(netdev_budget_usecs);
    int budget = netdev_budget;
    LIST_HEAD(list);   // initialize the local list heads
    LIST_HEAD(repoll);

    local_irq_disable();
    // move every entry on sd->poll_list (one per scheduled NAPI instance) onto the local list
    list_splice_init(&sd->poll_list, &list);
    local_irq_enable();

    for (;;) {
        struct napi_struct *n; // one napi_struct per device, or per queue on multi-queue devices

        if (list_empty(&list)) { // nothing left to poll?
            if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
                goto out;
            break;
        }

        // take the first entry: struct napi_struct is the node type and
        // poll_list is the name of its embedded struct list_head member
        n = list_first_entry(&list, struct napi_struct, poll_list);
        budget -= napi_poll(n, &repoll); // run the poll routine for this device (queue)

        /* If softirq window is exhausted then punt.
         * Allow this to run for 2 jiffies since which will allow
         * an average latency of 1.5/HZ.
         */
        if (unlikely(budget <= 0 ||
                     time_after_eq(jiffies, time_limit))) {
            sd->time_squeeze++;
            break;
        }
    }

    local_irq_disable();

    list_splice_tail_init(&sd->poll_list, &list);
    list_splice_tail(&repoll, &list);
    list_splice(&list, &sd->poll_list);
    if (!list_empty(&sd->poll_list))
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);

    net_rps_action_and_irq_enable(sd);
out:
    __kfree_skb_flush();
}
napi_poll is implemented as follows:
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
    void *have;
    int work, weight;

    list_del_init(&n->poll_list); // remove this napi structure from the poll list

    have = netpoll_poll_lock(n);

    weight = n->weight;

    /* This NAPI_STATE_SCHED test is for avoiding a race
     * with netpoll's poll_napi(). Only the entity which
     * obtains the lock and sees NAPI_STATE_SCHED set will
     * actually make the ->poll() call. Therefore we avoid
     * accidentally calling ->poll() when NAPI is not scheduled.
     */
    work = 0;
    if (test_bit(NAPI_STATE_SCHED, &n->state)) {
        work = n->poll(n, weight); // the poll callback registered for this napi (virtnet_poll for virtio-net rx)
        trace_napi_poll(n, work, weight);
    }
    // ...
    return work;
}
For a virtio-net receive queue, ->poll is virtnet_poll, registered by the driver at probe time; it reclaims used descriptors and pulls in received packets:
static int virtnet_poll(struct napi_struct *napi, int budget)
{
    struct receive_queue *rq =
        container_of(napi, struct receive_queue, napi);
    unsigned int received;

    // return chain descriptors from the send queue's used ring to the descriptor table
    virtnet_poll_cleantx(rq);

    // receive packets
    received = virtnet_receive(rq, budget);

    /* Out of packets? */
    if (received < budget)
        virtqueue_napi_complete(napi, rq->vq, received);

    return received;
}
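When fewer than budget packets were received, the queue leaves polling mode and virtqueue callbacks (interrupts) are re-armed. A simplified sketch of virtqueue_napi_complete, from roughly the same driver generation as the snippets above:

/* Simplified sketch: finish NAPI polling and re-arm virtqueue callbacks. */
static void virtqueue_napi_complete(struct napi_struct *napi,
                                    struct virtqueue *vq, int processed)
{
    int opaque;

    opaque = virtqueue_enable_cb_prepare(vq); /* re-enable callbacks, remember ring position */
    if (napi_complete_done(napi, processed)) {
        /* work arrived between the last poll and re-enabling callbacks:
         * reschedule NAPI rather than waiting for the next interrupt */
        if (unlikely(virtqueue_poll(vq, opaque)))
            virtqueue_napi_schedule(napi, vq);
    } else {
        virtqueue_disable_cb(vq);
    }
}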