【驱动】ifconfig up后内核网络驱动做了什么.md
背景
最近在排查一个网络问题,ifconfig eth0 up
后,网卡link up比较慢。因此,分析了下从ifconfig up
到网络驱动的调用流程。这里顺便作个记录。
ifconfig eth0 up
调用的是busybox 的命令,因此从busybox 源码入手,逐步分析调用流程。下面介绍的代码位于:networking/ifenslave.c
ifconfig eth0 up
ifconfig eth0 up
和 ifconfig eth0 down
分别对应busybox 的set_if_up()
和set_if_down()
。
/*
 * Bring an interface down.
 *
 * Clears IFF_UP from @flags and hands the result to set_if_flags().
 * On failure a diagnostic naming @ifname is printed.
 *
 * Returns the value returned by set_if_flags() (0 means success).
 */
static int set_if_down(char *ifname, int flags)
{
	const int down_flags = flags & ~IFF_UP;
	int rc = set_if_flags(ifname, down_flags);

	if (rc == 0)
		return 0;

	bb_perror_msg("%s: can't down", ifname);
	return rc;
}
/*
 * Bring an interface up.
 *
 * Sets IFF_UP in @flags and hands the result to set_if_flags().
 * On failure a diagnostic naming @ifname is printed.
 *
 * Returns the value returned by set_if_flags() (0 means success).
 */
static int set_if_up(char *ifname, int flags)
{
	const int up_flags = flags | IFF_UP;
	int rc = set_if_flags(ifname, up_flags);

	if (rc == 0)
		return 0;

	bb_perror_msg("%s: can't up", ifname);
	return rc;
}
比如,当我们敲ifconfig eth0 down
时,实则就是调用:
set_if_down("eth0", master_flags.ifr_flags);
set_if_flags()
会将网卡名和up / down
标志位flags
通过ioctl命令SIOCSIFFLAGS
传递给内核,再由内核网络核心层调用到网卡驱动。
static int set_if_flags(char *ifname, int flags)
{
struct ifreq ifr;
ifr.ifr_flags = flags;
return set_ifrname_and_do_ioctl(SIOCSIFFLAGS, &ifr, ifname);
}
dev_ifsioc
接着深入到内核代码中,看下SIOCSIFFLAGS
命令在哪里实现。代码位于kernel\net\core\dev_ioctl.c
。
/*
 * Handle a per-device ioctl.
 *
 * Looks up the device named by ifr->ifr_name inside @net and dispatches
 * on @cmd. Returns -ENODEV when no such device is found; otherwise the
 * result of the matching case.
 *
 * NOTE: the remaining cases of the switch are elided in this excerpt.
 */
static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
{
int err;
/* Resolve the interface name to its net_device (NULL if not found) */
struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
const struct net_device_ops *ops;
if (!dev)
return -ENODEV;
ops = dev->netdev_ops;
switch (cmd) {
case SIOCSIFFLAGS: /* Set interface flags */
/* This is the path taken by "ifconfig <if> up/down" */
return dev_change_flags(dev, ifr->ifr_flags);
case SIOCSIFMETRIC: /* Set the metric on the interface
(currently unused) */
return -EOPNOTSUPP;
...................
}
return err;
}
dev_ifsioc()
会调用__dev_get_by_name()
根据网卡名在 net 中查找设备,如果匹配到则返回net_device
结构体指针。接着,SIOCSIFFLAGS会调用到dev_change_flags()
,最后调用到__dev_change_flags()
。
dev_change_flags
/*
 * Change the flags of @dev to @flags and notify about what changed.
 *
 * The current dev->flags and dev->gflags are captured first, then
 * __dev_change_flags() applies the request. If that fails (negative
 * return) the error is returned and no notification is sent. Otherwise
 * the set of changed bits is computed from both flag words and passed to
 * __dev_notify_flags() together with the previous dev->flags.
 *
 * Returns the value returned by __dev_change_flags().
 */
int dev_change_flags(struct net_device *dev, unsigned int flags)
{
	unsigned int prev_flags = dev->flags;
	unsigned int prev_gflags = dev->gflags;
	unsigned int changes;
	int ret;

	ret = __dev_change_flags(dev, flags);
	if (ret < 0)
		return ret;

	changes = prev_flags ^ dev->flags;
	changes |= prev_gflags ^ dev->gflags;
	__dev_notify_flags(dev, prev_flags, changes);

	return ret;
}
/*
 * Apply the requested @flags to @dev.
 *
 * Steps, in order:
 *  1. Copy the directly settable bits from @flags into dev->flags while
 *     keeping the current IFF_UP, IFF_VOLATILE, IFF_PROMISC and
 *     IFF_ALLMULTI bits.
 *  2. Refresh the rx mode (and rx flags if IFF_MULTICAST changed).
 *  3. If the IFF_UP bit differs between the old flags and @flags, call
 *     __dev_close() (device was up) or __dev_open() (device was down).
 *  4. Synchronise IFF_PROMISC and IFF_ALLMULTI with dev->gflags.
 *
 * Returns the result of __dev_open()/__dev_close(), or 0 when IFF_UP did
 * not change. ASSERT_RTNL() is checked on entry.
 */
int __dev_change_flags(struct net_device *dev, unsigned int flags)
{
unsigned int old_flags = dev->flags;
int ret;
ASSERT_RTNL();
/*
* Set the flags on our device.
*/
dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
IFF_AUTOMEDIA)) |
(dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
IFF_ALLMULTI));
/*
* Load in the correct multicast list now the flags have changed.
*/
if ((old_flags ^ flags) & IFF_MULTICAST)
dev_change_rx_flags(dev, IFF_MULTICAST);
dev_set_rx_mode(dev);
/*
* Have we downed the interface. We handle IFF_UP ourselves
* according to user attempts to set it, rather than blindly
* setting it.
*/
ret = 0;
/* Act only when the IFF_UP bit differs between the old and the requested flags */
if ((old_flags ^ flags) & IFF_UP)
ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); // was up -> __dev_close, was down -> __dev_open
/* Requested IFF_PROMISC differs from what gflags records: toggle it */
if ((flags ^ dev->gflags) & IFF_PROMISC) {
int inc = (flags & IFF_PROMISC) ? 1 : -1;
unsigned int old_flags = dev->flags;
dev->gflags ^= IFF_PROMISC;
/* Re-sync the rx mode only if dev->flags actually changed */
if (__dev_set_promiscuity(dev, inc, false) >= 0)
if (dev->flags != old_flags)
dev_set_rx_mode(dev);
}
/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
is important. Some (broken) drivers set IFF_PROMISC, when
IFF_ALLMULTI is requested not asking us and not reporting.
*/
if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
dev->gflags ^= IFF_ALLMULTI;
__dev_set_allmulti(dev, inc, false);
}
return ret;
}
在__dev_change_flags(dev, flags)
函数中,通过判断新旧flags的IFF_UP
位是否发生变化,来决定是调用__dev_close()
还是__dev_open()
来开关eth0。
__dev_close
__dev_close
中会将当前的net_device
加入到等待设备关闭列表中。
static int __dev_close(struct net_device *dev)
{
int retval;
LIST_HEAD(single);
list_add(&dev->close_list, &single);
retval = __dev_close_many(&single);
list_del(&single);
return retval;
}
__dev_close_many
__dev_close_many
通知设备正在关闭,等待未发送完的数据发送完,最后清除开启标记。
/*
 * Close every device linked on @head through close_list.
 *
 * First pass: disable netpoll, send NETDEV_GOING_DOWN and clear
 * __LINK_STATE_START for each device. Then dev_deactivate_many() is
 * called on the whole list. Second pass: call the driver's ndo_stop
 * (if set), clear IFF_UP and re-enable netpoll.
 *
 * Always returns 0. ASSERT_RTNL() is checked and the function may sleep.
 */
static int __dev_close_many(struct list_head *head)
{
struct net_device *dev;
ASSERT_RTNL();
might_sleep();
list_for_each_entry(dev, head, close_list) {
/* Temporarily disable netpoll until the interface is down */
/* Disable netpoll */
netpoll_poll_disable(dev);
/* Notify listeners that the device is going down */
call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
/* Clear the START bit */
clear_bit(__LINK_STATE_START, &dev->state);
/* Synchronize to scheduled poll. We cannot touch poll list, it
* can be even on different cpu. So just clear netif_running().
*
* dev->stop() will invoke napi_disable() on all of it's
* napi_struct instances on this device.
*/
smp_mb__after_atomic(); /* Commit netif_running(). */
}
/* Deactivate all listed devices.
 * NOTE(review): the original note says this lets unsent data finish
 * transmitting - confirm against dev_deactivate_many().
 */
dev_deactivate_many(head);
list_for_each_entry(dev, head, close_list) {
const struct net_device_ops *ops = dev->netdev_ops;
/*
* Call the device specific close. This cannot fail.
* Only if device is UP
*
* We allow it to be called even after a DETACH hot-plug
* event.
*/
/* Invoke the driver's stop callback, if one is registered */
if (ops->ndo_stop)
ops->ndo_stop(dev);
/* Mark the device as down */
dev->flags &= ~IFF_UP;
/* Re-enable netpoll */
netpoll_poll_enable(dev);
}
return 0;
}
ndo_stop
ndo_stop
是关闭网卡时调用的回调,不同的网卡驱动会注册各自的关闭函数。我们以海思的网卡驱动为例,分析下ndo_stop函数的实现。代码位于kernel\drivers\net\ethernet\hisilicon\hns\hns_enet.c
。
hns_nic_net_stop
/*
 * Stop callback of the hns driver: delegates all the work to
 * hns_nic_net_down() and always reports success (returns 0).
 */
static int hns_nic_net_stop(struct net_device *ndev)
{
hns_nic_net_down(ndev);
return 0;
}
hns_nic_net_down
/*
 * Take the hns NIC down.
 *
 * Does nothing if NIC_STATE_DOWN was already set. Otherwise: stops the
 * service timer, stops the tx queues, reports carrier off, stops the PHY
 * (if present), calls the optional ae ops->stop hook, and finally closes
 * every ring pair and clears the tx buffers, iterating from the highest
 * queue index down to 0.
 */
static void hns_nic_net_down(struct net_device *ndev)
{
int i;
struct hnae_ae_ops *ops;
struct hns_nic_priv *priv = netdev_priv(ndev);
/* Already marked down: nothing to do */
if (test_and_set_bit(NIC_STATE_DOWN, &priv->state))
return;
(void)del_timer_sync(&priv->service_timer);
netif_tx_stop_all_queues(ndev);
/* Report loss of carrier to the network stack */
netif_carrier_off(ndev);
netif_tx_disable(ndev);
priv->link = 0;
/* Stop the PHY when one is attached */
if (priv->phy)
phy_stop(priv->phy);
ops = priv->ae_handle->dev->ops;
/* The stop hook is optional */
if (ops->stop)
ops->stop(priv->ae_handle);
netif_tx_stop_all_queues(ndev);
/* Ring i and ring i + q_num are closed as a pair
 * (presumably the tx and rx ring of queue i - confirm)
 */
for (i = priv->ae_handle->q_num - 1; i >= 0; i--) {
hns_nic_ring_close(ndev, i);
hns_nic_ring_close(ndev, i + priv->ae_handle->q_num);
/* clean tx buffers*/
hns_nic_tx_clr_all_bufs(priv->ring_data + i);
}
}
hns_nic_net_down()
中会调用netif_carrier_off()
通知内核子系统网络断开。下面我们详细分析下netif_carrier_off()的实现。
netif_carrier_off()
/*
 * Mark @dev as having no carrier.
 *
 * Sets __LINK_STATE_NOCARRIER in dev->state. Only when the bit was not
 * already set, and the device's reg_state is not NETREG_UNINITIALIZED,
 * the carrier_changes counter is incremented and a linkwatch event is
 * fired for the device.
 */
void netif_carrier_off(struct net_device *dev)
{
	/* Bit was already set: carrier is already reported off */
	if (test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
		return;

	if (dev->reg_state == NETREG_UNINITIALIZED)
		return;

	/* Count this carrier transition */
	atomic_inc(&dev->carrier_changes);

	/* Queue the device for linkwatch processing */
	linkwatch_fire_event(dev);
}
linkwatch_fire_event()
linkwatch_fire_event()函数将设备加入到事件队列,并且进行事件调度,调度中会根据是否为紧急事件做不同处理。
/*
 * Queue a link-state event for @dev and schedule the linkwatch work.
 *
 * The urgency of the event is evaluated first. Then the PENDING bit is
 * set: if it was clear, the device is added to the event list; if it was
 * already set and the event is not urgent, nothing more is done. In all
 * other cases linkwatch_schedule_work() is called with the urgency.
 */
void linkwatch_fire_event(struct net_device *dev)
{
	bool urgent = linkwatch_urgent_event(dev);
	bool was_pending = test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING,
					    &dev->state);

	/* Already queued and not urgent: the pending work will handle it */
	if (was_pending && !urgent)
		return;

	/* First event for this device: put it on the event list */
	if (!was_pending)
		linkwatch_add_event(dev);

	linkwatch_schedule_work(urgent);
}
linkwatch_urgent_event()
linkwatch_urgent_event()
判断事件是否需要紧急处理。
/*
 * Decide whether a link event for @dev must be handled urgently.
 *
 * Returns false when the device is not running. Returns true when the
 * device's ifindex differs from dev_get_iflink(), or when it carries the
 * IFF_TEAM_PORT private flag. Otherwise the event is urgent only if the
 * carrier is ok and qdisc_tx_changing() reports true.
 */
static bool linkwatch_urgent_event(struct net_device *dev)
{
	if (!netif_running(dev))
		return false;

	if (dev->ifindex != dev_get_iflink(dev) ||
	    (dev->priv_flags & IFF_TEAM_PORT))
		return true;

	return netif_carrier_ok(dev) && qdisc_tx_changing(dev);
}
linkwatch_add_event()
linkwatch_add_event()
将设备加入到事件处理链表。
/*
 * Append @dev to the global linkwatch event list (lweventlist).
 *
 * Done under lweventlist_lock with interrupts saved/restored. The device
 * is added, and a reference taken with dev_hold(), only when its
 * link_watch_list node is currently empty (i.e. not already queued).
 */
static void linkwatch_add_event(struct net_device *dev)
{
	unsigned long irq_flags;

	spin_lock_irqsave(&lweventlist_lock, irq_flags);

	if (!list_empty(&dev->link_watch_list))
		goto unlock;

	list_add_tail(&dev->link_watch_list, &lweventlist);
	dev_hold(dev);

unlock:
	spin_unlock_irqrestore(&lweventlist_lock, irq_flags);
}
linkwatch_schedule_work()
linkwatch_schedule_work()
对事件处理进行调度,紧急事件立即执行,非紧急事件延后执行。
/*
 * Schedule the linkwatch work item.
 *
 * @urgent: non-zero to request immediate processing.
 *
 * Nothing is done when LW_URGENT is already set. For an urgent request
 * LW_URGENT is set and the work is (re)scheduled with zero delay via
 * mod_delayed_work(). Otherwise the work is scheduled after the time
 * remaining until linkwatch_nextevent, or immediately when that distance
 * exceeds HZ.
 */
static void linkwatch_schedule_work(int urgent)
{
unsigned long delay = linkwatch_nextevent - jiffies;
/* An urgent run is already scheduled: nothing to do */
if (test_bit(LW_URGENT, &linkwatch_flags))
return;
/* Urgent scheduling requested */
if (urgent) {
/* Someone else set LW_URGENT in the meantime: nothing to do */
if (test_and_set_bit(LW_URGENT, &linkwatch_flags))
return;
/* We just set LW_URGENT: run without delay */
delay = 0;
}
/* A delay larger than HZ is replaced by an immediate run */
if (delay > HZ)
delay = 0;
/* LW_URGENT set: force the work to run now, overriding any pending timer */
if (test_bit(LW_URGENT, &linkwatch_flags))
mod_delayed_work(system_wq, &linkwatch_work, 0);
else
/* LW_URGENT not set: schedule after the computed delay */
schedule_delayed_work(&linkwatch_work, delay);
}
__linkwatch_run_queue()
__linkwatch_run_queue()
完成对事件调度队列中设备的处理。
/*
 * Process the devices queued on the global linkwatch event list.
 *
 * @urgent_only: non-zero to handle only devices for which
 *               linkwatch_urgent_event() is true; the others are put
 *               back on the global list.
 *
 * The global list is spliced onto a private list under
 * lweventlist_lock; the lock is dropped around each call to
 * linkwatch_do_dev(). If devices remain on the global list afterwards,
 * a non-urgent run is scheduled.
 */
static void __linkwatch_run_queue(int urgent_only)
{
struct net_device *dev;
LIST_HEAD(wrk);
/*
* Limit the number of linkwatch events to one
* per second so that a runaway driver does not
* cause a storm of messages on the netlink
* socket. This limit does not apply to up events
* while the device qdisc is down.
*/
/* Full (non urgent-only) run: the next event time becomes now + HZ */
if (!urgent_only)
linkwatch_nextevent = jiffies + HZ;
/* Limit wrap-around effect on delay. */
/*
Urgent-only run: if the next event time lies more than HZ after
the current time, reset it to the current time
*/
else if (time_after(linkwatch_nextevent, jiffies + HZ))
linkwatch_nextevent = jiffies;
/* Clear the urgent flag */
clear_bit(LW_URGENT, &linkwatch_flags);
spin_lock_irq(&lweventlist_lock);
/* Move all queued devices onto the private list; the global list becomes empty */
list_splice_init(&lweventlist, &wrk);
/* Walk the private list */
while (!list_empty(&wrk)) {
/* Take the first device */
dev = list_first_entry(&wrk, struct net_device, link_watch_list);
/* Unlink it from the list */
list_del_init(&dev->link_watch_list);
/* Urgent-only run and this device does not need urgent handling */
if (urgent_only && !linkwatch_urgent_event(dev)) {
/* Put it back at the tail of the global list */
list_add_tail(&dev->link_watch_list, &lweventlist);
/* Move on to the next device */
continue;
}
/* Drop the lock while the device is being handled */
spin_unlock_irq(&lweventlist_lock);
/* Handle the device */
linkwatch_do_dev(dev);
spin_lock_irq(&lweventlist_lock);
}
/* Unhandled events remain on the global list: schedule a non-urgent run */
if (!list_empty(&lweventlist))
linkwatch_schedule_work(0);
spin_unlock_irq(&lweventlist_lock);
}
linkwatch_do_dev()
linkwatch_do_dev()
完成对某个设备的状态改变处理。
/*
 * Handle a pending link-state change for one device.
 *
 * Clears the LINKWATCH_PENDING bit (after a memory barrier), applies
 * rfc2863_policy(), and - only if the device has IFF_UP set - activates
 * or deactivates the device depending on carrier state and then calls
 * netdev_state_change(). The device reference is dropped with dev_put()
 * on every path.
 */
static void linkwatch_do_dev(struct net_device *dev)
{
	/*
	 * Make sure the above read is complete since it can be
	 * rewritten as soon as we clear the bit below.
	 */
	smp_mb__before_atomic();

	/* From here on new events for this device can be accepted */
	clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);

	rfc2863_policy(dev);

	/* Device is not up: only the reference needs to be released */
	if (!(dev->flags & IFF_UP))
		goto out;

	/* Follow the carrier state: activate when ok, deactivate otherwise */
	if (netif_carrier_ok(dev))
		dev_activate(dev);
	else
		dev_deactivate(dev);

	/* Announce the state change */
	netdev_state_change(dev);

out:
	dev_put(dev);
}
phy_stop()
最后,hns_nic_net_down()
中会调用phy_stop()
将网卡link down。
/*
 * Stop a PHY device.
 *
 * Under phydev->lock: if the PHY is not already PHY_HALTED, disable and
 * clear its interrupts (only when phy_interrupt_is_valid() is true) and
 * set the state to PHY_HALTED.
 */
void phy_stop(struct phy_device *phydev)
{
mutex_lock(&phydev->lock);
/* Already halted: just release the lock */
if (PHY_HALTED == phydev->state)
goto out_unlock;
if (phy_interrupt_is_valid(phydev)) {
/* Disable PHY Interrupts */
phy_config_interrupt(phydev, PHY_INTERRUPT_DISABLED);
/* Clear any pending interrupts */
phy_clear_interrupt(phydev);
}
phydev->state = PHY_HALTED;
out_unlock:
mutex_unlock(&phydev->lock);
/* Cannot call flush_scheduled_work() here as desired because
* of rtnl_lock(), but PHY_HALTED shall guarantee phy_change()
* will not reenable interrupts.
*/
}
phy_stop()
将phydev->state设置为PHY_HALTED,将网卡关闭。
__dev_open
__dev_open
是启用设备的核心函数:该函数打开eth0,设置启用标记,并设置接收模式、排队规则等。
/*
 * Open (bring up) a network device.
 *
 * Returns -ENODEV when the device is not present, the translated
 * notifier error when a NETDEV_PRE_UP listener vetoes the open, or the
 * result of ndo_validate_addr()/ndo_open(). On success IFF_UP is set,
 * the rx mode is programmed, the device is activated and its address is
 * mixed into the entropy pool. ASSERT_RTNL() is checked on entry.
 */
static int __dev_open(struct net_device *dev)
{
const struct net_device_ops *ops = dev->netdev_ops;
int ret;
ASSERT_RTNL();
/* Device is not present */
if (!netif_device_present(dev))
return -ENODEV;
/* Block netpoll from trying to do any rx path servicing.
* If we don't do this there is a chance ndo_poll_controller
* or ndo_poll may be running while we open the device
*/
/* Disable netpoll */
netpoll_poll_disable(dev);
/* Notify listeners before the device is brought up */
ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
ret = notifier_to_errno(ret);
/* NOTE(review): this return path does not call netpoll_poll_enable()
 * after the netpoll_poll_disable() above - confirm this is intended.
 */
if (ret)
return ret;
/* Set the START bit in dev->state (IFF_UP itself is set further below on success) */
set_bit(__LINK_STATE_START, &dev->state);
/* Validate the address, if the driver provides a hook */
if (ops->ndo_validate_addr)
ret = ops->ndo_validate_addr(dev);
/* Call the driver's open callback */
if (!ret && ops->ndo_open)
ret = ops->ndo_open(dev);
/* Re-enable netpoll */
netpoll_poll_enable(dev);
/* Failure: clear the START bit again */
if (ret)
clear_bit(__LINK_STATE_START, &dev->state);
/* Success path */
else {
/* Mark the device as up */
dev->flags |= IFF_UP;
/* Program the receive mode */
dev_set_rx_mode(dev);
/* Activate the device (NOTE(review): original note says this initialises the queueing discipline - confirm in dev_activate()) */
dev_activate(dev);
/* Feed the device address into the entropy pool */
add_device_randomness(dev->dev_addr, dev->addr_len);
}
return ret;
}
hns_nic_net_open()
我们以海思的网卡驱动为例,分析下ndo_open()
函数的实现。代码位于kernel\drivers\net\ethernet\hisilicon\hns\hns_enet.c
。
/*
 * Open callback of the hns driver.
 *
 * Refuses with -EBUSY while NIC_STATE_TESTING is set. Otherwise resets
 * the cached link state, reports carrier off, sets the real number of
 * tx and rx queues to the handle's q_num, and brings the NIC up through
 * hns_nic_net_up().
 *
 * Returns 0 on success or the error of the first step that failed
 * (each failure is logged with netdev_err()).
 */
static int hns_nic_net_open(struct net_device *ndev)
{
	struct hns_nic_priv *nic = netdev_priv(ndev);
	struct hnae_handle *handle = nic->ae_handle;
	int err;

	if (test_bit(NIC_STATE_TESTING, &nic->state))
		return -EBUSY;

	/* Start from "no link" until a link is actually detected */
	nic->link = 0;
	netif_carrier_off(ndev);

	/* Number of tx queues actually in use */
	err = netif_set_real_num_tx_queues(ndev, handle->q_num);
	if (err < 0) {
		netdev_err(ndev, "netif_set_real_num_tx_queues fail, ret=%d!\n",
			   err);
		return err;
	}

	/* Number of rx queues actually in use */
	err = netif_set_real_num_rx_queues(ndev, handle->q_num);
	if (err < 0) {
		netdev_err(ndev,
			   "netif_set_real_num_rx_queues fail, ret=%d!\n", err);
		return err;
	}

	/* Bring the NIC up */
	err = hns_nic_net_up(ndev);
	if (err)
		netdev_err(ndev,
			   "hns net up fail, ret=%d!\n", err);

	return err;
}
hns_nic_net_up()
/*
 * Bring the hns NIC up.
 *
 * Initialises interrupts, opens all 2 * q_num rings, switches every
 * queue on, sets the MAC address, calls the optional ae start hook,
 * starts the PHY (if present), clears NIC_STATE_DOWN and re-arms the
 * service timer.
 *
 * Returns 0 on success. On failure the steps already done are undone in
 * reverse order through the out_* labels, NIC_STATE_DOWN is set and the
 * error is returned.
 */
static int hns_nic_net_up(struct net_device *ndev)
{
struct hns_nic_priv *priv = netdev_priv(ndev);
struct hnae_handle *h = priv->ae_handle;
int i, j, k;
int ret;
/* Initialise interrupts.
 * NOTE(review): the original note says the handler is hns_irq_handle and
 * every rx/tx queue has its own interrupt - not visible here, confirm.
 */
ret = hns_nic_init_irq(priv);
if (ret != 0) {
netdev_err(ndev, "hns init irq failed! ret=%d\n", ret);
return ret;
}
for (i = 0; i < h->q_num * 2; i++) {
/* Open ring i (original note: enables its interrupt and NAPI - confirm in hns_nic_ring_open()) */
ret = hns_nic_ring_open(ndev, i);
if (ret)
goto out_has_some_queues;
}
/* Switch every queue on */
for (k = 0; k < h->q_num; k++)
h->dev->ops->toggle_queue_status(h->qs[k], 1);
/* Set the MAC address */
ret = h->dev->ops->set_mac_addr(h, ndev->dev_addr);
if (ret)
goto out_set_mac_addr_err;
/* The start hook is optional; a missing hook counts as success
 * (original note: hns leaves start NULL - confirm)
 */
ret = h->dev->ops->start ? h->dev->ops->start(h) : 0;
if (ret)
goto out_start_err;
if (priv->phy)
/* Start the PHY */
phy_start(priv->phy);
clear_bit(NIC_STATE_DOWN, &priv->state);
/* Re-arm the service timer to expire SERVICE_TIMER_HZ jiffies from now */
(void)mod_timer(&priv->service_timer, jiffies + SERVICE_TIMER_HZ);
return 0;
out_start_err:
netif_stop_queue(ndev);
out_set_mac_addr_err:
/* Switch every queue off again */
for (k = 0; k < h->q_num; k++)
h->dev->ops->toggle_queue_status(h->qs[k], 0);
out_has_some_queues:
/* Close only the rings that were opened successfully (indices below i) */
for (j = i - 1; j >= 0; j--)
hns_nic_ring_close(ndev, j);
set_bit(NIC_STATE_DOWN, &priv->state);
return ret;
}
phy_start()
最后会调用到phy_start()
启动网卡。
/*
 * Start a PHY device.
 *
 * Under phydev->lock the state is advanced:
 *   PHY_STARTING -> PHY_PENDING
 *   PHY_READY    -> PHY_UP
 *   PHY_HALTED   -> PHY_RESUMING (after re-enabling interrupts)
 * Any other state is left unchanged. When the PHY was halted and its
 * interrupts were re-enabled successfully, phy_resume() is called after
 * the lock has been released.
 *
 * The function returns void; a failure of phy_enable_interrupts() only
 * leaves the state unchanged and is not reported to the caller.
 */
void phy_start(struct phy_device *phydev)
{
bool do_resume = false;
int err = 0;
mutex_lock(&phydev->lock);
switch (phydev->state) {
case PHY_STARTING:
phydev->state = PHY_PENDING;
break;
case PHY_READY:
phydev->state = PHY_UP;
break;
case PHY_HALTED:
/* make sure interrupts are re-enabled for the PHY */
err = phy_enable_interrupts(phydev);
/* On error, stay in PHY_HALTED and skip the resume */
if (err < 0)
break;
phydev->state = PHY_RESUMING;
do_resume = true;
break;
default:
break;
}
mutex_unlock(&phydev->lock);
/* if phy was suspended, bring the physical link up again */
if (do_resume)
phy_resume(phydev);
}
参考
https://blog.csdn.net/qq_29044159/article/details/118030335
https://www.likecs.com/show-308571259.html
https://blog.csdn.net/Longyu_wlz/article/details/108026902
http://bbs.chinaunix.net/thread-2020457-1-1.html
https://blog.csdn.net/tiantao2012/article/details/75283527
https://blog.csdn.net/sinat_20184565/article/details/104353185