【驱动】ifconfig up后内核网络驱动做了什么.md

背景

最近在排查一个网络问题,ifconfig eth0 up 后,网卡link up比较慢。因此,分析了下从ifconfig up 到网络驱动的调用流程。这里顺便作个记录。

ifconfig eth0 up 调用的是 busybox 的命令,因此从 busybox 源码入手,逐步分析调用流程。下面引用的代码位于:networking/ifenslave.c

ifconfig eth0 up

ifconfig eth0 up 和 ifconfig eth0 down 分别对应 busybox 的 set_if_up() 和 set_if_down()。

/*
 * Bring interface @ifname down by writing back @flags with IFF_UP cleared.
 *
 * Returns the result of set_if_flags(); a non-zero result is additionally
 * reported through bb_perror_msg().
 */
static int set_if_down(char *ifname, int flags)
{
	int rc;

	rc = set_if_flags(ifname, flags & ~IFF_UP);
	if (rc == 0)
		return 0;

	bb_perror_msg("%s: can't down", ifname);
	return rc;
}
/*
 * Bring interface @ifname up by writing back @flags with IFF_UP set.
 *
 * Returns the result of set_if_flags(); a non-zero result is additionally
 * reported through bb_perror_msg().
 */
static int set_if_up(char *ifname, int flags)
{
	int rc;

	rc = set_if_flags(ifname, flags | IFF_UP);
	if (rc == 0)
		return 0;

	bb_perror_msg("%s: can't up", ifname);
	return rc;
}

比如,当我们敲ifconfig eth0 down时,实则就是调用:

set_if_down("eth0", master_flags.ifr_flags);

set_if_flags()会将网卡名,up / down 标志位flags通过ioctl命令SIOCSIFFLAGS 传递给内核网卡驱动。

static int set_if_flags(char *ifname, int flags)
{
	struct ifreq ifr;

	ifr.ifr_flags = flags;
	return set_ifrname_and_do_ioctl(SIOCSIFFLAGS, &ifr, ifname);
}

dev_ifsioc

接着深入到内核代码中,看下SIOCSIFFLAGS命令在哪里实现。代码位于kernel\net\core\dev_ioctl.c

/*
 * Dispatch an interface ioctl to the device named in @ifr.
 *
 * @net: network namespace in which the device name is looked up
 * @ifr: request structure; ifr_name selects the device
 * @cmd: ioctl command number (SIOCSIFFLAGS, SIOCSIFMETRIC, ...)
 *
 * Returns -ENODEV when no device of that name is found, otherwise the
 * result of the handler selected by @cmd.
 *
 * NOTE: this excerpt elides most of the switch statement, so the remaining
 * cases and the place where `err` is assigned are not visible here.
 */
static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
{
	int err;
	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
	const struct net_device_ops *ops;

	if (!dev)
		return -ENODEV;

	ops = dev->netdev_ops;

	switch (cmd) {
	case SIOCSIFFLAGS:	/* Set interface flags */
		return dev_change_flags(dev, ifr->ifr_flags);

	case SIOCSIFMETRIC:	/* Set the metric on the interface
				   (currently unused) */
		return -EOPNOTSUPP;

...................

	}
	return err;
}

dev_ifsioc()会调用__dev_get_by_name()根据 网卡名遍历 net链表,如果匹配到则返回net_device结构体指针。接着,SIOCSIFFLAGS会调用到dev_change_flags(),最后调用到__dev_change_flags()

dev_change_flags

/*
 * Change the flags of @dev to @flags and notify about what changed.
 *
 * The flag words are snapshotted before __dev_change_flags() runs so that
 * the set of changed bits can be computed afterwards and handed to
 * __dev_notify_flags(). A negative result from __dev_change_flags() is
 * returned as-is without sending the notification.
 */
int dev_change_flags(struct net_device *dev, unsigned int flags)
{
	const unsigned int prev_flags = dev->flags;
	const unsigned int prev_gflags = dev->gflags;
	unsigned int delta;
	int rc;

	rc = __dev_change_flags(dev, flags);
	if (rc >= 0) {
		delta = (prev_flags ^ dev->flags) | (prev_gflags ^ dev->gflags);
		__dev_notify_flags(dev, prev_flags, delta);
	}

	return rc;
}
/*
 * Apply the requested flag word @flags to @dev.
 *
 * Only the bits listed in the first mask below are copied from @flags;
 * IFF_UP, IFF_VOLATILE, IFF_PROMISC and IFF_ALLMULTI are carried over from
 * the current dev->flags and handled separately further down.
 *
 * Returns 0 when the IFF_UP bit is unchanged, otherwise the result of
 * __dev_close() or __dev_open(). Must be called with RTNL held
 * (ASSERT_RTNL()).
 */
int __dev_change_flags(struct net_device *dev, unsigned int flags)
{
	unsigned int old_flags = dev->flags;
	int ret;

	ASSERT_RTNL();

	/*
	 *	Set the flags on our device.
	 */

	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
			       IFF_AUTOMEDIA)) |
		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
				    IFF_ALLMULTI));

	/*
	 *	Load in the correct multicast list now the flags have changed.
	 */

	if ((old_flags ^ flags) & IFF_MULTICAST)
		dev_change_rx_flags(dev, IFF_MULTICAST);

	dev_set_rx_mode(dev);

	/*
	 *	Have we downed the interface. We handle IFF_UP ourselves
	 *	according to user attempts to set it, rather than blindly
	 *	setting it.
	 */

	ret = 0;
	/* The IFF_UP bit differs between the old and the requested flags. */
	if ((old_flags ^ flags) & IFF_UP)
		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); /* was up -> __dev_close, was down -> __dev_open */

	if ((flags ^ dev->gflags) & IFF_PROMISC) {
		int inc = (flags & IFF_PROMISC) ? 1 : -1;
		/* Shadows the outer old_flags: snapshot taken after the updates above. */
		unsigned int old_flags = dev->flags;

		dev->gflags ^= IFF_PROMISC;

		if (__dev_set_promiscuity(dev, inc, false) >= 0)
			if (dev->flags != old_flags)
				dev_set_rx_mode(dev);
	}

	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
	   is important. Some (broken) drivers set IFF_PROMISC, when
	   IFF_ALLMULTI is requested not asking us and not reporting.
	 */
	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;

		dev->gflags ^= IFF_ALLMULTI;
		__dev_set_allmulti(dev, inc, false);
	}

	return ret;
}

__dev_change_flags(dev, flags)函数中,通过比较旧标志old_flags与新标志flags的IFF_UP位是否不同(即 (old_flags ^ flags) & IFF_UP),来判断是否需要开关eth0:若设备原本处于UP状态则调用__dev_close(),否则调用__dev_open()。

__dev_close

__dev_close中会将当前的net_device加入到等待设备关闭列表中。

static int __dev_close(struct net_device *dev)
{
	int retval;
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	retval = __dev_close_many(&single);
	list_del(&single);

	return retval;
}

__dev_close_many

__dev_close_many通知设备正在关闭,等待未发送完的数据发送完,最后清除开启标记。

/*
 * Close every device linked on @head through its close_list member.
 *
 * First pass: disable netpoll, send NETDEV_GOING_DOWN and clear
 * __LINK_STATE_START for each device. Then dev_deactivate_many() is run on
 * the whole list, and a second pass calls the driver's ndo_stop (if any),
 * clears IFF_UP and re-enables netpoll.
 *
 * Always returns 0. Must be called with RTNL held and may sleep
 * (ASSERT_RTNL(), might_sleep()).
 */
static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);
		/* Tell notifier listeners that the device is going down. */
		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
		/* Clear the "started" state bit. */
		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of it's
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}
	/* Deactivate all devices on the list in one go. */
	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		/* Driver-specific stop hook; its return value is ignored. */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);
		/* Mark the device as down. */
		dev->flags &= ~IFF_UP;
		/* Re-enable netpoll (disabled in the first loop). */
		netpoll_poll_enable(dev);
	}

	return 0;
}

ndo_stop

ndo_stop为关闭网卡时,不同网卡驱动注册的不同的关闭函数,我们以海思的网卡驱动为例,分析下ndo_stop函数的实现。代码位于kernel\drivers\net\ethernet\hisilicon\hns\hns_enet.c

hns_nic_net_stop

/*
 * Stop handler for the hns NIC (per the surrounding article, this is the
 * function registered as ndo_stop): delegates to hns_nic_net_down().
 *
 * Always returns 0.
 */
static int hns_nic_net_stop(struct net_device *ndev)
{
	hns_nic_net_down(ndev);

	return 0;
}

hns_nic_net_down

/*
 * Take the hns NIC down.
 *
 * Sets NIC_STATE_DOWN (returning immediately if it was already set), stops
 * the service timer and the tx queues, reports carrier off, stops the PHY
 * and the AE handle when present, and finally closes the rings and clears
 * the tx buffers for every queue index, from the highest down to 0.
 */
static void hns_nic_net_down(struct net_device *ndev)
{
	struct hns_nic_priv *priv = netdev_priv(ndev);
	struct hnae_ae_ops *ae_ops;
	int q;

	/* Atomically mark the NIC as down; bail out if it already was. */
	if (test_and_set_bit(NIC_STATE_DOWN, &priv->state))
		return;

	(void)del_timer_sync(&priv->service_timer);
	netif_tx_stop_all_queues(ndev);
	netif_carrier_off(ndev);
	netif_tx_disable(ndev);
	priv->link = 0;

	if (priv->phy)
		phy_stop(priv->phy);

	/* The AE-level stop callback is optional. */
	ae_ops = priv->ae_handle->dev->ops;
	if (ae_ops->stop)
		ae_ops->stop(priv->ae_handle);

	netif_tx_stop_all_queues(ndev);

	q = priv->ae_handle->q_num;
	while (--q >= 0) {
		/* Close ring q and its counterpart at index q + q_num. */
		hns_nic_ring_close(ndev, q);
		hns_nic_ring_close(ndev, q + priv->ae_handle->q_num);

		/* clean tx buffers */
		hns_nic_tx_clr_all_bufs(priv->ring_data + q);
	}
}

hns_nic_net_down()中会调用netif_carrier_off()通知内核子系统网络断开。下面我们详细分析下netif_carrier_off()的实现。

netif_carrier_off()

/*
 * Mark @dev as having no carrier.
 *
 * Nothing more happens if __LINK_STATE_NOCARRIER was already set or if the
 * device is still NETREG_UNINITIALIZED. Otherwise the carrier change
 * counter is incremented and a linkwatch event is fired for the device.
 */
void netif_carrier_off(struct net_device *dev)
{
	/* Bit was already set: carrier state does not change. */
	if (test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
		return;

	if (dev->reg_state == NETREG_UNINITIALIZED)
		return;

	/* Count the carrier transition ... */
	atomic_inc(&dev->carrier_changes);
	/* ... and queue the device for linkwatch processing. */
	linkwatch_fire_event(dev);
}

linkwatch_fire_event()

linkwatch_fire_event()函数将设备加入到事件队列,并且进行事件调度,调度中会根据是否为紧急事件做不同处理。

/*
 * Queue a link-state event for @dev and schedule the linkwatch work.
 *
 * The device is added to the event list only if its
 * __LINK_STATE_LINKWATCH_PENDING bit was not set before. If it was already
 * pending and the event is not urgent, nothing further is done.
 */
void linkwatch_fire_event(struct net_device *dev)
{
	/* Classify the event first, then claim the pending bit. */
	const bool is_urgent = linkwatch_urgent_event(dev);
	const bool was_pending =
		test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);

	if (!was_pending)
		/* First event for this device: put it on the event list. */
		linkwatch_add_event(dev);
	else if (!is_urgent)
		/* Already pending and not urgent: nothing to do. */
		return;

	/* Schedule the work, passing on the urgency. */
	linkwatch_schedule_work(is_urgent);
}

linkwatch_urgent_event()

linkwatch_urgent_event()判断事件是否需要紧急处理。

/*
 * Decide whether a link event on @dev must be handled urgently.
 *
 * Returns false when the device is not running. Returns true when the
 * device's ifindex differs from dev_get_iflink() or when it is a team port
 * (IFF_TEAM_PORT). Otherwise the event is urgent only if the carrier is ok
 * and qdisc_tx_changing() reports true.
 */
static bool linkwatch_urgent_event(struct net_device *dev)
{
	/* A device that is not running never needs urgent handling. */
	if (!netif_running(dev))
		return false;

	/* ifindex/iflink mismatch or team port membership: always urgent. */
	if (dev->ifindex != dev_get_iflink(dev) ||
	    (dev->priv_flags & IFF_TEAM_PORT))
		return true;

	/* Carrier present and the tx qdisc in the middle of a change. */
	return netif_carrier_ok(dev) && qdisc_tx_changing(dev);
}

linkwatch_add_event()

linkwatch_add_event()将设备加入到事件处理链表。

/*
 * Append @dev to the global linkwatch event list (lweventlist) unless it
 * is already linked somewhere, taking lweventlist_lock with interrupts
 * saved and disabled for the duration.
 */
static void linkwatch_add_event(struct net_device *dev)
{
	unsigned long irq_state;

	spin_lock_irqsave(&lweventlist_lock, irq_state);

	/* An empty link_watch_list node means the device is not queued yet. */
	if (list_empty(&dev->link_watch_list)) {
		list_add_tail(&dev->link_watch_list, &lweventlist);
		/* Presumably paired with the dev_put() in linkwatch_do_dev(). */
		dev_hold(dev);
	}

	spin_unlock_irqrestore(&lweventlist_lock, irq_state);
}

linkwatch_schedule_work()

linkwatch_schedule_work()对事件处理进行调度,紧急事件立即执行,非紧急事件延后执行。

/*
 * Schedule the linkwatch delayed work.
 *
 * @urgent: non-zero to request immediate processing
 *
 * Does nothing when an urgent run is already flagged (LW_URGENT). For an
 * urgent request the flag is set and the work is (re)scheduled with zero
 * delay; otherwise the work is scheduled to run at linkwatch_nextevent.
 */
static void linkwatch_schedule_work(int urgent)
{
	unsigned long delay = linkwatch_nextevent - jiffies;
	/* An urgent run is already flagged: nothing more to do. */
	if (test_bit(LW_URGENT, &linkwatch_flags))
		return;

	/* Urgent scheduling requested. */
	if (urgent) {
		/* Someone else set the flag in the meantime: leave it to them. */
		if (test_and_set_bit(LW_URGENT, &linkwatch_flags))
			return;
		/* We set the flag ourselves: run without delay. */
		delay = 0;
	}

	/*
	 * Never wait longer than HZ jiffies. Because delay is unsigned, a
	 * linkwatch_nextevent that already lies in the past shows up as a
	 * very large value and is caught by this check as well.
	 */
	if (delay > HZ)
		delay = 0;

	/* Urgent flag set: (re)schedule the work to run immediately. */
	if (test_bit(LW_URGENT, &linkwatch_flags))
		mod_delayed_work(system_wq, &linkwatch_work, 0);
	else
		/* Not urgent: schedule with the computed delay. */
		schedule_delayed_work(&linkwatch_work, delay);
}

__linkwatch_run_queue()

__linkwatch_run_queue()完成对事件调度队列中设备的处理。

/*
 * Process the devices queued on lweventlist.
 *
 * @urgent_only: when non-zero, only devices for which
 *               linkwatch_urgent_event() is true are processed; the rest
 *               are put back on lweventlist for a later run.
 *
 * The global list is spliced onto a local list under lweventlist_lock; the
 * lock is dropped around each linkwatch_do_dev() call. If anything remains
 * queued afterwards, a non-urgent run is scheduled.
 */
static void __linkwatch_run_queue(int urgent_only)
{
	struct net_device *dev;
	LIST_HEAD(wrk);

	/*
	 * Limit the number of linkwatch events to one
	 * per second so that a runaway driver does not
	 * cause a storm of messages on the netlink
	 * socket.  This limit does not apply to up events
	 * while the device qdisc is down.
	 */
	/* Full run: the next regular run is due one second (HZ) from now. */
	if (!urgent_only)
		linkwatch_nextevent = jiffies + HZ;
	/* Limit wrap-around effect on delay. */
	/*
	 * Urgent-only run with the next event more than HZ in the future:
	 * pull the next event time back to "now".
	 */
	else if (time_after(linkwatch_nextevent, jiffies + HZ))
		linkwatch_nextevent = jiffies;
	/* Clear the urgent flag. */
	clear_bit(LW_URGENT, &linkwatch_flags);

	spin_lock_irq(&lweventlist_lock);
	list_splice_init(&lweventlist, &wrk);
	/* Drain the local list. */
	while (!list_empty(&wrk)) {
		/* Take the first queued device ... */
		dev = list_first_entry(&wrk, struct net_device, link_watch_list);
		/* ... and unlink it from the local list. */
		list_del_init(&dev->link_watch_list);
		/* Urgent-only run and this device is not urgent. */
		if (urgent_only && !linkwatch_urgent_event(dev)) {
			/* Put it back at the tail of the global list. */
			list_add_tail(&dev->link_watch_list, &lweventlist);
			/* Move on to the next device. */
			continue;
		}
		spin_unlock_irq(&lweventlist_lock);
		/* Handle the device with the lock dropped. */
		linkwatch_do_dev(dev);
		spin_lock_irq(&lweventlist_lock);
	}
	/* Events are still queued: schedule a non-urgent run for them. */
	if (!list_empty(&lweventlist))
		linkwatch_schedule_work(0);
	spin_unlock_irq(&lweventlist_lock);
}

linkwatch_do_dev()

linkwatch_do_dev()完成对某个设备的状态改变处理。

/*
 * Handle a pending link-state change for one device.
 *
 * Clears __LINK_STATE_LINKWATCH_PENDING, applies rfc2863_policy(), and, if
 * the device is up (IFF_UP), activates or deactivates it depending on the
 * carrier state and calls netdev_state_change(). Finally drops a device
 * reference with dev_put().
 */
static void linkwatch_do_dev(struct net_device *dev)
{
	/*
	 * Make sure the above read is complete since it can be
	 * rewritten as soon as we clear the bit below.
	 */
	smp_mb__before_atomic();

	/* We are about to handle this device,
	 * so new events can be accepted
	 */
	/* Clear the pending bit. */
	clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);

	rfc2863_policy(dev);
	/* Only act on devices that are administratively up. */
	if (dev->flags & IFF_UP) {
		/* Carrier present: */
		if (netif_carrier_ok(dev))
			/* activate the device */
			dev_activate(dev);
		else
			/* otherwise deactivate it */
			dev_deactivate(dev);
		/* Report the state change via netdev_state_change(). */
		netdev_state_change(dev);
	}
	dev_put(dev);
}

phy_stop()

最后,hns_nic_net_down()中会调用phy_stop()将网卡link down。

/*
 * Put the PHY into the PHY_HALTED state.
 *
 * Under phydev->lock: if the PHY is not halted already, its interrupts are
 * disabled and any pending one is cleared (only when
 * phy_interrupt_is_valid() says the PHY uses interrupts), and the state is
 * set to PHY_HALTED.
 */
void phy_stop(struct phy_device *phydev)
{
	mutex_lock(&phydev->lock);

	if (phydev->state != PHY_HALTED) {
		if (phy_interrupt_is_valid(phydev)) {
			/* Disable PHY Interrupts */
			phy_config_interrupt(phydev, PHY_INTERRUPT_DISABLED);

			/* Clear any pending interrupts */
			phy_clear_interrupt(phydev);
		}

		phydev->state = PHY_HALTED;
	}

	mutex_unlock(&phydev->lock);

	/* Cannot call flush_scheduled_work() here as desired because
	 * of rtnl_lock(), but PHY_HALTED shall guarantee phy_change()
	 * will not reenable interrupts.
	 */
}

phy_stop()将phydev->state设置为PHY_HALTED,将网卡关闭。

__dev_open

__dev_open为设备启用核心函数,该函数打开eth0,设置启用标记,并且设置接收模式,排队规则等。

/*
 * Core of bringing a network device up.
 *
 * Returns -ENODEV if the device is not present, the errno derived from the
 * NETDEV_PRE_UP notifier chain if a listener vetoes, or the result of the
 * driver's ndo_validate_addr / ndo_open hooks. On success (0) the device
 * has IFF_UP set, its rx mode programmed and dev_activate() called.
 *
 * Must be called with RTNL held (ASSERT_RTNL()).
 */
static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();
	/* The device is not present. */
	if (!netif_device_present(dev))
		return -ENODEV;

	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this there is a chance ndo_poll_controller
	 * or ndo_poll may be running while we open the device
	 */
	netpoll_poll_disable(dev);
	/* Notify listeners before the device is brought up. */
	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	/*
	 * NOTE(review): this early return has no matching
	 * netpoll_poll_enable() -- confirm that this is intended.
	 */
	if (ret)
		return ret;
	/*
	 * Mark the device as started. IFF_UP itself is only set further
	 * down, once the open has succeeded.
	 */
	set_bit(__LINK_STATE_START, &dev->state);
	/* Let the driver validate the device address, if it has a hook. */
	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);
	/* Run the driver's open hook unless validation failed. */
	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);
	/* Re-enable netpoll. */
	netpoll_poll_enable(dev);
	/* Failure: take the "started" bit back. */
	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	/* Success: finish bringing the device up. */
	else {
		/* Flag the device as up. */
		dev->flags |= IFF_UP;
		/* Program the receive mode. */
		dev_set_rx_mode(dev);
		/* Activate the device (dev_activate()). */
		dev_activate(dev);
		/* Pass the device address to add_device_randomness(). */
		add_device_randomness(dev->dev_addr, dev->addr_len);
	}

	return ret;
}

hns_nic_net_open()

我们以海思的网卡驱动为例,分析下ndo_open()函数的实现。代码位于kernel\drivers\net\ethernet\hisilicon\hns\hns_enet.c

/*
 * Open handler for the hns NIC.
 *
 * Returns -EBUSY while NIC_STATE_TESTING is set. Otherwise resets the link
 * state, reports carrier off, sets the real number of tx and rx queues to
 * the handle's q_num and brings the NIC up via hns_nic_net_up().
 *
 * Returns 0 on success or the error of the first step that failed (each
 * failure is logged with netdev_err()).
 */
static int hns_nic_net_open(struct net_device *ndev)
{
	struct hns_nic_priv *priv = netdev_priv(ndev);
	struct hnae_handle *handle = priv->ae_handle;
	int rc;

	if (test_bit(NIC_STATE_TESTING, &priv->state))
		return -EBUSY;

	priv->link = 0;
	netif_carrier_off(ndev);

	/* Set the number of tx queues actually in use. */
	rc = netif_set_real_num_tx_queues(ndev, handle->q_num);
	if (rc < 0) {
		netdev_err(ndev, "netif_set_real_num_tx_queues fail, ret=%d!\n",
			   rc);
		return rc;
	}

	/* Set the number of rx queues actually in use. */
	rc = netif_set_real_num_rx_queues(ndev, handle->q_num);
	if (rc < 0) {
		netdev_err(ndev,
			   "netif_set_real_num_rx_queues fail, ret=%d!\n", rc);
		return rc;
	}

	/* Bring the NIC up; rc is 0 on success and is returned either way. */
	rc = hns_nic_net_up(ndev);
	if (rc)
		netdev_err(ndev,
			   "hns net up fail, ret=%d!\n", rc);

	return rc;
}

hns_nic_net_up()

/*
 * Bring the hns NIC up.
 *
 * Sets up interrupts, opens all 2 * q_num rings, switches the queues on,
 * programs the MAC address, calls the optional AE start callback, starts
 * the PHY (if any), clears NIC_STATE_DOWN and arms the service timer.
 *
 * Returns 0 on success. On failure the steps already done are unwound
 * through the labels at the bottom, NIC_STATE_DOWN is set and the error is
 * returned. A failure of hns_nic_init_irq() returns directly.
 */
static int hns_nic_net_up(struct net_device *ndev)
{
	struct hns_nic_priv *priv = netdev_priv(ndev);
	struct hnae_handle *h = priv->ae_handle;
	int i, j, k;
	int ret;
	/*
	 * Initialise the interrupts. (Per the original note: the handler is
	 * hns_irq_handle and every rx/tx queue has its own interrupt -- not
	 * visible in this excerpt.)
	 */
	ret = hns_nic_init_irq(priv);
	if (ret != 0) {
		netdev_err(ndev, "hns init irq failed! ret=%d\n", ret);
		return ret;
	}

	for (i = 0; i < h->q_num * 2; i++) {
		/* Open ring i (per the original note: enables its IRQ and NAPI). */
		ret = hns_nic_ring_open(ndev, i);
		if (ret)
			goto out_has_some_queues;
	}

	for (k = 0; k < h->q_num; k++)
		h->dev->ops->toggle_queue_status(h->qs[k], 1);
	/* Program the MAC address. */
	ret = h->dev->ops->set_mac_addr(h, ndev->dev_addr);
	if (ret)
		goto out_set_mac_addr_err;
	/* The start callback is optional; a missing one counts as success. */
	ret = h->dev->ops->start ? h->dev->ops->start(h) : 0;
	if (ret)
		goto out_start_err;

	if (priv->phy)
		/* Start the PHY. */
		phy_start(priv->phy);

	clear_bit(NIC_STATE_DOWN, &priv->state);
	/* (Re)arm the service timer SERVICE_TIMER_HZ jiffies from now. */
	(void)mod_timer(&priv->service_timer, jiffies + SERVICE_TIMER_HZ);

	return 0;

out_start_err:
	netif_stop_queue(ndev);
out_set_mac_addr_err:
	for (k = 0; k < h->q_num; k++)
		h->dev->ops->toggle_queue_status(h->qs[k], 0);
out_has_some_queues:
	/* Close the rings opened so far: i is the count of opened rings. */
	for (j = i - 1; j >= 0; j--)
		hns_nic_ring_close(ndev, j);

	set_bit(NIC_STATE_DOWN, &priv->state);

	return ret;
}

phy_start()

最后会调用到phy_start()启动网卡。

/*
 * Start the PHY state machine from its current state.
 *
 * Under phydev->lock the state is advanced:
 *   PHY_STARTING -> PHY_PENDING
 *   PHY_READY    -> PHY_UP
 *   PHY_HALTED   -> PHY_RESUMING (after re-enabling interrupts)
 * Any other state is left untouched. When leaving PHY_HALTED, phy_resume()
 * is called after the lock has been released.
 *
 * The function returns void, so a failure of phy_enable_interrupts() is
 * not reported to the caller; it only leaves the state at PHY_HALTED and
 * skips the resume.
 */
void phy_start(struct phy_device *phydev)
{
	bool do_resume = false;
	int err = 0;

	mutex_lock(&phydev->lock);

	switch (phydev->state) {
	case PHY_STARTING:
		phydev->state = PHY_PENDING;
		break;
	case PHY_READY:
		phydev->state = PHY_UP;
		break;
	case PHY_HALTED:
		/* make sure interrupts are re-enabled for the PHY */
		err = phy_enable_interrupts(phydev);
		if (err < 0)
			break;

		phydev->state = PHY_RESUMING;
		do_resume = true;
		break;
	default:
		break;
	}
	mutex_unlock(&phydev->lock);

	/* if phy was suspended, bring the physical link up again */
	if (do_resume)
		phy_resume(phydev);
}

参考

https://blog.csdn.net/qq_29044159/article/details/118030335

https://www.likecs.com/show-308571259.html

https://blog.csdn.net/Longyu_wlz/article/details/108026902

http://bbs.chinaunix.net/thread-2020457-1-1.html

https://blog.csdn.net/tiantao2012/article/details/75283527

https://blog.csdn.net/sinat_20184565/article/details/104353185

posted @ 2023-11-28 22:57  学习,积累,成长  阅读(703)  评论(0编辑  收藏  举报