转载： Linux内核中netlink协议族的实现

本文档的Copyleft归yfydz所有，使用GPL发布，可以自由拷贝，转载，转载时请保持文档的完整性，严禁用于任何商业用途。
msn: yfydz_no1@hotmail.com
来源：http://yfydz.cublog.cn

1. 前言

netlink协议族是Linux内核网络部分的一个固定部分, 一旦在内核配置中选了网络支持就自动带了而不能单独去掉。
netlink的实现源码在net/netlink目录下，主要是net/netlink/af_netlink.c文件。

以下内核代码版本为2.6.19.2, 如无特别说明代码取自net/netlink/af_netlink.c。

2. 数据结构

netlink套接口结构:
/* net/netlink/af_netlink.c */
struct netlink_sock {
/* struct sock has to be the first member of netlink_sock */
struct sock sk;
u32   pid; // 自己的pid, 通常是0
u32   dst_pid; // 对方的pid
u32   dst_group; // 对方的组
u32   flags;
u32   subscriptions;
u32   ngroups; // 多播组数量
unsigned long *groups; // 多播组号
unsigned long state;
wait_queue_head_t wait; // 等待队列,用于处理接收发送包时的top half
struct netlink_callback *cb; // 回调结构,包含回调函数
spinlock_t cb_lock;
void   (*data_ready)(struct sock *sk, int bytes); // 数据到达时
                                //的操作, netlink可有不同类型, 如ROUTE, FIREWALL, ARPD等,                                  //每种类型都自己定义的data_ready处理
struct module *module;
};
这个结构先是包含一个标准的struct sock结构,后面又跟和netlink相关的特有相关数据,内核中其他协议的sock也是类似定义的, 注意sock结构必须放在第一位,这是为了可以直接将sock的指针转为netlink_sock的指针。

netlink sock的表:
struct netlink_table {
struct nl_pid_hash hash; // 根据pid进行HASH的netlink sock链表, 相当于客户端链表
struct hlist_head mc_list; // 多播的sock链表
unsigned long *listeners; // 监听者标志
unsigned int nl_nonroot;
unsigned int groups; // 每个netlink的协议类型可以定义多个组, 8的倍数,最小是32
struct module *module;
int registered;
};
最大可有MAX_LINKS(32)个表，处理不同协议类型的netlink套接口, 注意由于是自身的通信, 本机同时作为服务器和客户端, 服务端需要一个套接口对应, 每个客户端也要有一个套接口对应, 多个客户端的套接口形成一个链表.
struct nl_pid_hash {
struct hlist_head *table; // 链表节点
unsigned long rehash_time; // 重新计算HASH的时间间隔
unsigned int mask;
unsigned int shift;
unsigned int entries; // 链表节点数
unsigned int max_shift; // 最大幂值
u32 rnd; // 随机数
};
其他和netlink数据相关的数据结构在include/linux/netlink.h中定义, 不过这些结构更多用在各具体的netlink对象的实现中, 在基本netlink套接口中到是用得不多。

3. af_netlink协议初始化

static int __init netlink_proto_init(void)
{
struct sk_buff *dummy_skb;
int i;
unsigned long max;
unsigned int order;
// 登记netlink_proto结构, 该结构定义如下:
// static struct proto netlink_proto = {
// .name   = "NETLINK",
// .owner   = THIS_MODULE,
// .obj_size = sizeof(struct netlink_sock),
// };
// 最后一个参数为0, 表示不进行slab的分配, 只是简单的将netlink_proto结构
// 挂接到系统的网络协议链表中,这个结构最主要是告知了netlink sock结构的大小
int err = proto_register(&netlink_proto, 0);
if (err != 0)
goto out;
BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof(dummy_skb->cb));
// 分配MAX_LINKS个netlink表结构
nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL);
if (!nl_table)
goto panic;
// 以下根据系统内存大小计算最大链表元素个数
// PAGE_SHIFT是每页大小的2的幂,对i386是12,即每页是4K,2^12
// 对于128M内存的机器,max计算值是(128*1024) >> (21-12) = 256
// 对于64M内存的机器,max计算值是(64*1024) >> (23-12) = 32
if (num_physpages >= (128 * 1024))
max = num_physpages >> (21 - PAGE_SHIFT);
else
max = num_physpages >> (23 - PAGE_SHIFT);
// 根据max再和PAGE_SHIFT计算总内存空间相应的幂值order
order = get_bitmask_order(max) - 1 + PAGE_SHIFT;
// max是最大节点数
max = (1UL << order) / sizeof(struct hlist_head);
// order是max对于2的幂数
order = get_bitmask_order(max > UINT_MAX ? UINT_MAX : max) - 1;
for (i = 0; i < MAX_LINKS; i++) {
struct nl_pid_hash *hash = &nl_table[i].hash;
// 为netlink的每个协议类型分配HASH表链表头
hash->table = nl_pid_hash_alloc(1 * sizeof(*hash->table));
if (!hash->table) {
   while (i-- > 0)
    nl_pid_hash_free(nl_table[i].hash.table,
       1 * sizeof(*hash->table));
   kfree(nl_table);
   goto panic;
}
// 初始化HASH表参数
memset(hash->table, 0, 1 * sizeof(*hash->table));
// 最大幂数
hash->max_shift = order;
hash->shift = 0;
hash->mask = 0;
hash->rehash_time = jiffies;
}
// 登记netlink协议族的的操作结构
sock_register(&netlink_family_ops);
#ifdef CONFIG_PROC_FS
proc_net_fops_create("netlink", 0, &netlink_seq_fops);
#endif
/* The netlink device handler may be needed early. */
// 初始化路由netlink
rtnetlink_init();
out:
return err;
panic:
panic("netlink_init: Cannot allocate nl_table\n");
}
core_initcall(netlink_proto_init);

4. 建立netlink套接口

4.1 建立对应客户端的套接口
// netlink协议族操作, 在用户程序使用socket打开netlink类型的socket时调用,
// 相应的create函数在__sock_create(net/socket.c)函数中调用:
static struct net_proto_family netlink_family_ops = {
.family = PF_NETLINK,
.create = netlink_create,
.owner = THIS_MODULE, /* for consistency */
};
// 在用户空间每次打开netlink socket时都会调用此函数:
static int netlink_create(struct socket *sock, int protocol)
{
struct module *module = NULL;
struct netlink_sock *nlk;
unsigned int groups;
int err = 0;
// sock状态初始化
sock->state = SS_UNCONNECTED;
// 对netlink sock的类型和协议(实际是netlink_family类型)限制检查
if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
return -ESOCKTNOSUPPORT;
if (protocol<0 || protocol >= MAX_LINKS)
return -EPROTONOSUPPORT;
netlink_lock_table();
#ifdef CONFIG_KMOD
// 如果相应的netlink协议是模块又没有加载的话先加载该模块
if (!nl_table[protocol].registered) {
netlink_unlock_table();
request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
netlink_lock_table();
}
#endif
if (nl_table[protocol].registered &&
     try_module_get(nl_table[protocol].module))
module = nl_table[protocol].module;
// groups这个值在函数后面也没见用上, 这句没意义
groups = nl_table[protocol].groups;
netlink_unlock_table();
// 真正的建立netlink sock的函数
if ((err = __netlink_create(sock, protocol)) < 0)
goto out_module;
nlk = nlk_sk(sock->sk);
nlk->module = module;
out:
return err;
out_module:
module_put(module);
goto out;
}

// 基本函数
static int __netlink_create(struct socket *sock, int protocol)
{
struct sock *sk;
struct netlink_sock *nlk;
// netlink sock的基本操作
sock->ops = &netlink_ops;
// 分配sock结构, 通过netlink_proto中的obj_size指出了netlink sock的大小
sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1);
if (!sk)
return -ENOMEM;
// 初始化sock基本数据, 将sock和socket关联起来
sock_init_data(sock, sk);
// 将普通sock转为netlink sock,实际只是重新定义的一下指针类型,指针本身值不变
nlk = nlk_sk(sk);
// 初始化sock的锁
spin_lock_init(&nlk->cb_lock);
// 初始化等待队列
init_waitqueue_head(&nlk->wait);
// sock的析构函数,释放接收队列中的skb数据包
sk->sk_destruct = netlink_sock_destruct;
sk->sk_protocol = protocol;
// 注意这里没有重新定义sk的sk_data_ready函数
// 在sock_init_data()函数中将sk_data_ready定义为sock_def_readable()函数
return 0;
}

用户空间使用socket(2)系统调用打开netlink类型的套接口时, 在内核中会调用sys_sock()函数, 然后是调用__sock_create()函数, 在其中调用netlink协议族的create()函数, 即netlink_create()函数.

4.2 建立服务器端的套接口

以前也介绍过另一个建立netlink sock的函数netlink_kernel_create, 一般是在netlink的各种协议类型模块初始化时调用的, 而不是socket系统调用时调用的, 每个netlink协议初始化是只调用一次, 建立一个内核中的netlink接口, 相当于服务器, 其中也调用了__netlink_create()函数:
/*
* We export these functions to other modules. They provide a
* complete set of kernel non-blocking support for message
* queueing.
*/
struct sock *
netlink_kernel_create(int unit, unsigned int groups,
                      void (*input)(struct sock *sk, int len),
                      struct module *module)
{
struct socket *sock;
struct sock *sk;
struct netlink_sock *nlk;
unsigned long *listeners = NULL;
BUG_ON(!nl_table);
if (unit<0 || unit>=MAX_LINKS)
return NULL;
// 这里的lite表示只是简单分配一个socket,没有真正初始化
if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
return NULL;
// 用这个lite sock再建立netlink sock
if (__netlink_create(sock, unit) < 0)
goto out_sock_release;
if (groups < 32)
groups = 32;
// listerns是个位图对应groups中每个元素
listeners = kzalloc(NLGRPSZ(groups), GFP_KERNEL);
if (!listeners)
goto out_sock_release;
sk = sock->sk;
// 重新定义了sk_data_ready函数
sk->sk_data_ready = netlink_data_ready;
// 这个是相应的各netlink协议数据处理函数
if (input)
nlk_sk(sk)->data_ready = input;
if (netlink_insert(sk, 0))
goto out_sock_release;
nlk = nlk_sk(sk);
nlk->flags |= NETLINK_KERNEL_SOCKET;
netlink_table_grab();
// 注册到相应unit的netlink协议表中
nl_table[unit].groups = groups;
nl_table[unit].listeners = listeners;
nl_table[unit].module = module;
// 该标志表示该项被登记
nl_table[unit].registered = 1;
netlink_table_ungrab();
return sk;
out_sock_release:
kfree(listeners);
sock_release(sock);
return NULL;
}

5. netlink套接口的操作

在__netlink_create函数中定义了netlink套接口的操作结构为netlink_ops:
sock->ops = &netlink_ops;
该结构定义如下:
static const struct proto_ops netlink_ops = {
.family = PF_NETLINK,
.owner = THIS_MODULE,
.release = netlink_release,
.bind = netlink_bind,
.connect = netlink_connect,
.socketpair = sock_no_socketpair, // 无定义
.accept = sock_no_accept, // 无定义
.getname = netlink_getname,
.poll = datagram_poll,
.ioctl = sock_no_ioctl, // 无定义
.listen = sock_no_listen, // 无定义
.shutdown = sock_no_shutdown, // 无定义
.setsockopt = netlink_setsockopt,
.getsockopt = netlink_getsockopt,
.sendmsg = netlink_sendmsg,
.recvmsg = netlink_recvmsg,
.mmap = sock_no_mmap, // 无定义
.sendpage = sock_no_sendpage, // 无定义
};

5.1 释放
在close(2)时调用
static int netlink_release(struct socket *sock)
{
struct sock *sk = sock->sk;
struct netlink_sock *nlk;
if (!sk)
return 0;
// 将套接口sk从系统sk链表和绑定链表中断开
netlink_remove(sk);
nlk = nlk_sk(sk);
spin_lock(&nlk->cb_lock);
if (nlk->cb) {
// 释放netlink控制块处理
if (nlk->cb->done)
   nlk->cb->done(nlk->cb);
netlink_destroy_callback(nlk->cb);
nlk->cb = NULL;
}
spin_unlock(&nlk->cb_lock);
/* OK. Socket is unlinked, and, therefore,
    no new packets will arrive */
// 设置sk状态为SOCK_DEAD, 断开sock和sk的互指
sock_orphan(sk);
sock->sk = NULL;
// 唤醒所有等待队列
wake_up_interruptible_all(&nlk->wait);
// 清空写队列
skb_queue_purge(&sk->sk_write_queue);
if (nlk->pid && !nlk->subscriptions) {
// 发送释放通知
struct netlink_notify n = {
      .protocol = sk->sk_protocol,
      .pid = nlk->pid,
       };
atomic_notifier_call_chain(&netlink_chain,
    NETLINK_URELEASE, &n);
}
// 减少模块计数
if (nlk->module)
module_put(nlk->module);
// 相当于加锁
netlink_table_grab();
if (nlk->flags & NETLINK_KERNEL_SOCKET) {
// 释放内核中的netlink服务器端
kfree(nl_table[sk->sk_protocol].listeners);
nl_table[sk->sk_protocol].module = NULL;
nl_table[sk->sk_protocol].registered = 0;
} else if (nlk->subscriptions)
netlink_update_listeners(sk);
// 相当于解锁
netlink_table_ungrab();
// 释放该netlink sock的多播组
kfree(nlk->groups);
nlk->groups = NULL;
// 释放sock
sock_put(sk);
return 0;
}

5.2 绑定bind
绑定通常是针对服务端
static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
int err;
// 检查一下地址的协议族是否为AF_NETLINK
if (nladdr->nl_family != AF_NETLINK)
return -EINVAL;
/* Only superuser is allowed to listen multicasts */
if (nladdr->nl_groups) {
// 指定了多播组, 这是需要root权限
if (!netlink_capable(sock, NL_NONROOT_RECV))
   return -EPERM;
if (nlk->groups == NULL) {
// 分配多播组空间
   err = netlink_alloc_groups(sk);
   if (err)
    return err;
}
}
if (nlk->pid) {
// 如果sock的pid非0, 检查是否匹配在nladdr地址结构中指定的pid
if (nladdr->nl_pid != nlk->pid)
   return -EINVAL;
} else {
// sock的pid为0, 根据nladdr是否指定pid来执行插入或
err = nladdr->nl_pid ?
   netlink_insert(sk, nladdr->nl_pid) :
   netlink_autobind(sock);
if (err)
   return err;
}
// 非多播情况时就可以返回成功了
if (!nladdr->nl_groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
return 0;
netlink_table_grab();
// 多播情况下更新sock参数
netlink_update_subscriptions(sk, nlk->subscriptions +
                                  hweight32(nladdr->nl_groups) -
                                  hweight32(nlk->groups[0]));
nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups;
netlink_update_listeners(sk);
netlink_table_ungrab();
return 0;
}

// 根据pid插入
static int netlink_insert(struct sock *sk, u32 pid)
{
// netlink相应协议的HASH结构
struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash;
struct hlist_head *head;
// 缺省错误为地址已经被使用
int err = -EADDRINUSE;
struct sock *osk;
struct hlist_node *node;
int len;
netlink_table_grab();
// 根据pid查找相应HASH链表头
head = nl_pid_hashfn(hash, pid);
len = 0;
// 检查pid是否已经在链表中, 有则失败
sk_for_each(osk, node, head) {
if (nlk_sk(osk)->pid == pid)
   break;
len++;
}
if (node)
goto err;
// 缺省错误改为系统忙
err = -EBUSY;
// 如果sock的pid不为0, 错误, 只有pid为0的sock才能执行该函数
// sock的pid不为0时不会再进行insert操作了
if (nlk_sk(sk)->pid)
goto err;

// 缺省错误改为无内存空间
err = -ENOMEM;
if (BITS_PER_LONG > 32 && unlikely(hash->entries >= UINT_MAX))
goto err;
// 如果链表不为空而且链表长度数量过长,会调整HASH表,重新获取HASH链表头
// 不过这种情况很少发生
if (len && nl_pid_hash_dilute(hash, len))
head = nl_pid_hashfn(hash, pid);
hash->entries++;
// 将pid赋值给sock的pid参数
nlk_sk(sk)->pid = pid;
// 将sock节点添加进HASH链表
sk_add_node(sk, head);
err = 0;
err:
netlink_table_ungrab();
return err;
}

// 未指定pid时的自动绑定
// 实际是选一个没用过的pid后再进行插入操作
static int netlink_autobind(struct socket *sock)
{
// 从socket找到sock
struct sock *sk = sock->sk;
// netlink相应协议的HASH结构
struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash;
struct hlist_head *head;
struct sock *osk;
struct hlist_node *node;
// pid取为当前进程的组ID
s32 pid = current->tgid;
int err;
// 有符号32位数
static s32 rover = -4097;
retry:
cond_resched();
netlink_table_grab();
// 找合适的HASH链表头
head = nl_pid_hashfn(hash, pid);
sk_for_each(osk, node, head) {
// 查找链表中是否已经有该pid
if (nlk_sk(osk)->pid == pid) {
// 存在, 则更新pid, 重新检查, 注意这时的pid是个负数
   /* Bind collision, search negative pid values. */
   pid = rover--;
   if (rover > -4097)
    rover = -4097;
   netlink_table_ungrab();
   goto retry;
}
}
netlink_table_ungrab();
// 此时的pid是一个负数转换为无符号32位数, 将是一个非常大的数
// 执行正常的pid插入
err = netlink_insert(sk, pid);
if (err == -EADDRINUSE)
goto retry;
/* If 2 threads race to autobind, that is fine. */
if (err == -EBUSY)
err = 0;
return err;
}
// 更新subscriotions
static void
netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
{
struct netlink_sock *nlk = nlk_sk(sk);
if (nlk->subscriptions && !subscriptions)
__sk_del_bind_node(sk);
else if (!nlk->subscriptions && subscriptions)
sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
nlk->subscriptions = subscriptions;
}
// 更新listeners
static void
netlink_update_listeners(struct sock *sk)
{
struct netlink_table *tbl = &nl_table[sk->sk_protocol];
struct hlist_node *node;
unsigned long mask;
unsigned int i;
for (i = 0; i < NLGRPSZ(tbl->groups)/sizeof(unsigned long); i++) {
mask = 0;
// 遍历多播链表生成多播组的掩码
sk_for_each_bound(sk, node, &tbl->mc_list)
   mask |= nlk_sk(sk)->groups[i];
tbl->listeners[i] = mask;
}
/* this function is only called with the netlink table "grabbed", which
* makes sure updates are visible before bind or setsockopt return. */
}

5.3 连接

连接通常是针对客户端连接服务器
static int netlink_connect(struct socket *sock, struct sockaddr *addr,
      int alen, int flags)
{
int err = 0;
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
struct sockaddr_nl *nladdr=(struct sockaddr_nl*)addr;
if (addr->sa_family == AF_UNSPEC) {
// 目的地址协议族为AF_UNSPEC(未指定), 简单返回成功
sk->sk_state = NETLINK_UNCONNECTED;
nlk->dst_pid = 0;
nlk->dst_group = 0;
return 0;
}
// 限制目的地址协议族类型为AF_NETLINK
if (addr->sa_family != AF_NETLINK)
return -EINVAL;
/* Only superuser is allowed to send multicasts */
// 只有ROOT权限才能多播
if (nladdr->nl_groups && !netlink_capable(sock, NL_NONROOT_SEND))
return -EPERM;
// 没指定pid的话自动绑定一个pid
if (!nlk->pid)
err = netlink_autobind(sock);
if (err == 0) {
// 已经指定了pid或者自动绑定成功时设置sock的对方参数, 状态为连接成功
sk->sk_state = NETLINK_CONNECTED;
nlk->dst_pid = nladdr->nl_pid;
nlk->dst_group = ffs(nladdr->nl_groups);
}
return err;
}

5.4 获取sock名称

// 填充sockaddr_nl结构中的数据
static int netlink_getname(struct socket *sock, struct sockaddr *addr, int *addr_len, int
peer)
{
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
struct sockaddr_nl *nladdr=(struct sockaddr_nl *)addr;
// 协议族
nladdr->nl_family = AF_NETLINK;
nladdr->nl_pad = 0;
*addr_len = sizeof(*nladdr);
if (peer) {
// 对方sock的pid和groups
nladdr->nl_pid = nlk->dst_pid;
nladdr->nl_groups = netlink_group_mask(nlk->dst_group);
} else {
// 自己sock的pid和groups
nladdr->nl_pid = nlk->pid;
nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0;
}
return 0;
}

5.5 poll

poll是用poll(2)或select(2)系统调用选择套接口数据是否准备好时的处理函数，netlink用的是通用
的数据报的poll处理函数dategram_poll(), 说明略。

5.6 setsockopt

设置netlink sock的各种控制参数:
static int netlink_setsockopt(struct socket *sock, int level, int optname,
                              char __user *optval, int optlen)
{
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
int val = 0, err;
// sock层次要为SOL_NETLINK
if (level != SOL_NETLINK)
return -ENOPROTOOPT;
// 读取用户空间的设置信息
if (optlen >= sizeof(int) &&
     get_user(val, (int __user *)optval))
return -EFAULT;
switch (optname) {
case NETLINK_PKTINFO:
// 处理NETLINK_RECV_PKTINFO标志, 非0设置, 0为清除
if (val)
   nlk->flags |= NETLINK_RECV_PKTINFO;
else
   nlk->flags &= ~NETLINK_RECV_PKTINFO;
err = 0;
break;
case NETLINK_ADD_MEMBERSHIP:
case NETLINK_DROP_MEMBERSHIP: {
// 加入或退出多播组
unsigned int subscriptions;
int old, new = optname == NETLINK_ADD_MEMBERSHIP ? 1 : 0;
// 检查权限
if (!netlink_capable(sock, NL_NONROOT_RECV))
   return -EPERM;
// 如果当前sock的多播组为空是分配空间
if (nlk->groups == NULL) {
   err = netlink_alloc_groups(sk);
   if (err)
    return err;
}
// 检查数据范围
if (!val || val - 1 >= nlk->ngroups)
   return -EINVAL;
netlink_table_grab();
// 原来的状态标志
old = test_bit(val - 1, nlk->groups);
// 如果old=1, new=0, subscriptions-1
// 如果old=0, new=1, subscriptions+1
subscriptions = nlk->subscriptions - old + new;
// 设置或清除相应状态标志
if (new)
   __set_bit(val - 1, nlk->groups);
else
   __clear_bit(val - 1, nlk->groups);
// 更新sock参数
netlink_update_subscriptions(sk, subscriptions);
netlink_update_listeners(sk);
netlink_table_ungrab();
err = 0;
break;
}
default:
err = -ENOPROTOOPT;
}
return err;
}

// 分配netlink sock的多播组空间
static int netlink_alloc_groups(struct sock *sk)
{
struct netlink_sock *nlk = nlk_sk(sk);
unsigned int groups;
int err = 0;
netlink_lock_table();
// 组的数量是内核初始化时固定的, 最小值32, 尽量是8的倍数
groups = nl_table[sk->sk_protocol].groups;
if (!nl_table[sk->sk_protocol].registered)
err = -ENOENT;
netlink_unlock_table();
if (err)
return err;
// NLGRPSZ(groups)进行8字节对齐
nlk->groups = kzalloc(NLGRPSZ(groups), GFP_KERNEL);
if (nlk->groups == NULL)
return -ENOMEM;
nlk->ngroups = groups;
return 0;
}

5.7 getsockopt

获取netlink sock的各种控制参数:

static int netlink_getsockopt(struct socket *sock, int level, int optname,
                              char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
int len, val, err;
// sock层次要为SOL_NETLINK
if (level != SOL_NETLINK)
return -ENOPROTOOPT;
// 读取用户空间的查询信息
if (get_user(len, optlen))
return -EFAULT;
if (len < 0)
return -EINVAL;
switch (optname) {
case NETLINK_PKTINFO:
// 只提供一种选项信息PKTINFO
if (len < sizeof(int))
   return -EINVAL;
len = sizeof(int);
// 看sock标志是否有NETLINK_RECV_PKTINFO返回1或0
val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0;
if (put_user(len, optlen) ||
      put_user(val, optval))
   return -EFAULT;
err = 0;
break;
default:
err = -ENOPROTOOPT;
}
return err;
}

5.8 发送消息

从用户层发送数据到内核, 内核的sock是接收方
static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
      struct msghdr *msg, size_t len)
{
// sock的IO控制块
struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
// socket -> sock
struct sock *sk = sock->sk;
// sock -> netlink sock
struct netlink_sock *nlk = nlk_sk(sk);
struct sockaddr_nl *addr=msg->msg_name;
u32 dst_pid;
u32 dst_group;
struct sk_buff *skb;
int err;
// scm: Socket level control messages processing
struct scm_cookie scm;
// 设置了OOB(out of band)标志, 在TCP中支持,netlink不支持
if (msg->msg_flags&MSG_OOB)
return -EOPNOTSUPP;
if (NULL == siocb->scm)
siocb->scm = &scm;
// scm这些处理是干什么的以后再看
err = scm_send(sock, msg, siocb->scm);
if (err < 0)
return err;
// 确定目的pid和组
if (msg->msg_namelen) {
if (addr->nl_family != AF_NETLINK)
   return -EINVAL;
dst_pid = addr->nl_pid;
dst_group = ffs(addr->nl_groups);
if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND))
   return -EPERM;
} else {
dst_pid = nlk->dst_pid;
dst_group = nlk->dst_group;
}
// 如果sock的pid为0, 自动绑定一个pid
if (!nlk->pid) {
err = netlink_autobind(sock);
if (err)
   goto out;
}
err = -EMSGSIZE;
// 消息长度太大
if (len > sk->sk_sndbuf - 32)
goto out;
err = -ENOBUFS;
// 新生成一个skb数据包
skb = nlmsg_new(len, GFP_KERNEL);
if (skb==NULL)
goto out;
// 设置该skb的netlink控制块参数
NETLINK_CB(skb).pid = nlk->pid;
NETLINK_CB(skb).dst_pid = dst_pid;
NETLINK_CB(skb).dst_group = dst_group;
NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context);
selinux_get_task_sid(current, &(NETLINK_CB(skb).sid));
memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
/* What can I do? Netlink is asynchronous, so that
    we will have to save current capabilities to
    check them, when this message will be delivered
    to corresponding kernel module.   --ANK (980802)
*/
err = -EFAULT;
// 将发送的信息拷贝到skb的存储区
if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len)) {
kfree_skb(skb);
goto out;
}
/* @netlink_send:
* Save security information for a netlink message so that permission
* checking can be performed when the message is processed. The security
* information can be saved using the eff_cap field of the
*      netlink_skb_parms structure. Also may be used to provide fine
* grained control over message transmission.
* @sk associated sock of task sending the message.,
* @skb contains the sk_buff structure for the netlink message.
* Return 0 if the information was successfully saved and message
* is allowed to be transmitted.
*/
err = security_netlink_send(sk, skb);
if (err) {
kfree_skb(skb);
goto out;
}
// 如果是多播的,先进行广播发送
if (dst_group) {
// 增加使用者计数, 使skb不会真正释放
atomic_inc(&skb->users);
netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL);
}
// 单播发送
err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT);
out:
return err;
}

// netlink广播, 发送到组内的全部sock
int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
        u32 group, gfp_t allocation)
{
// netlink广播数据结构信息
struct netlink_broadcast_data info;
struct hlist_node *node;
struct sock *sk;
// 调整skb空间
skb = netlink_trim(skb, allocation);
// 填充info结构基本参数
info.exclude_sk = ssk;
info.pid = pid;
info.group = group;
info.failure = 0;
info.congested = 0;
info.delivered = 0;
info.allocation = allocation;
info.skb = skb;
info.skb2 = NULL;
/* While we sleep in clone, do not allow to change socket list */
netlink_lock_table();
// 遍历多播链表, 分别对每个sock进行单播
sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list)
do_one_broadcast(sk, &info);
// 释放skb, 其实没有立即释放, 要先减少使用者数
kfree_skb(skb);
netlink_unlock_table();
// 如果分配了skb2,释放之
if (info.skb2)
kfree_skb(info.skb2);
if (info.delivered) {
if (info.congested && (allocation & __GFP_WAIT))
   yield();
return 0;
}
if (info.failure)
return -ENOBUFS;
return -ESRCH;
}
// 单一广播
static inline int do_one_broadcast(struct sock *sk,
       struct netlink_broadcast_data *p)
{
struct netlink_sock *nlk = nlk_sk(sk);
int val;
if (p->exclude_sk == sk)
goto out;
// 检查pid和组是否合法
if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups ||
     !test_bit(p->group - 1, nlk->groups))
goto out;
if (p->failure) {
netlink_overrun(sk);
goto out;
}
sock_hold(sk);
if (p->skb2 == NULL) {
if (skb_shared(p->skb)) {
// 克隆skb
   p->skb2 = skb_clone(p->skb, p->allocation);
} else {
// 此时skb2不会为NULL的
   p->skb2 = skb_get(p->skb);
   /*
    * skb ownership may have been set when
    * delivered to a previous socket.
    */
   skb_orphan(p->skb2);
}
}
if (p->skb2 == NULL) {
// 如果还是为NULL必然是克隆失败
netlink_overrun(sk);
/* Clone failed. Notify ALL listeners. */
p->failure = 1;
// 否则发送skb2
} else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) {
netlink_overrun(sk);
} else {
// 数据正常发送
p->congested |= val;
p->delivered = 1;
p->skb2 = NULL;
}
sock_put(sk);
out:
return 0;
}

static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
{
struct netlink_sock *nlk = nlk_sk(sk);
// 发送缓冲中要有足够空间
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
     !test_bit(0, &nlk->state)) {
skb_set_owner_r(skb, sk);
// 添加到接收队列尾, 由于是本机内部通信, 可以自己找到要发送的目的方,
// 所以直接将数据扔给目的方, 所以是接收队列
skb_queue_tail(&sk->sk_receive_queue, skb);
// 调用netlink sock的sk_data_ready函数处理, 由此进入内核中netlink各协议
// 的回调处理
sk->sk_data_ready(sk, skb->len);
return atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf;
}
return -1;
}

// netlink单播
int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock)
{
struct sock *sk;
int err;
long timeo;
// 调整skb大小
skb = netlink_trim(skb, gfp_any());
// 获取超时时间
timeo = sock_sndtimeo(ssk, nonblock);
retry:
// ssk是服务器端的sock, 然后根据pid找到客户端的sock
sk = netlink_getsockbypid(ssk, pid);
if (IS_ERR(sk)) {
kfree_skb(skb);
return PTR_ERR(sk);
}
// 将数据包附着在客户端sock上
err = netlink_attachskb(sk, skb, nonblock, timeo, ssk);
if (err == 1)
goto retry;
if (err)
return err;
// 发送netlink数据包
return netlink_sendskb(sk, skb, ssk->sk_protocol);
}
/*
* Attach a skb to a netlink socket.
* The caller must hold a reference to the destination socket. On error, the
* reference is dropped. The skb is not send to the destination, just all
* all error checks are performed and memory in the queue is reserved.
* Return values:
* < 0: error. skb freed, reference to sock dropped.
* 0: continue
* 1: repeat lookup - reference dropped while waiting for socket memory.
*/
// 注意这个是内核全局函数, 非static
int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock,
long timeo, struct sock *ssk)
{
struct netlink_sock *nlk;
nlk = nlk_sk(sk);
// 检查接收缓存大小是否足够, 不够的话阻塞等待直到出错或条件满足
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
     test_bit(0, &nlk->state)) {
// 声明当前进程的等待队列
DECLARE_WAITQUEUE(wait, current);
if (!timeo) {
   if (!ssk || nlk_sk(ssk)->pid == 0)
    netlink_overrun(sk);
   sock_put(sk);
   kfree_skb(skb);
   return -EAGAIN;
}
// 设置当前进程状态为可中断的
__set_current_state(TASK_INTERRUPTIBLE);
// 将sock挂接到等待队列
add_wait_queue(&nlk->wait, &wait);
// 空间不够的话阻塞, timeo为阻塞超时
if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
       test_bit(0, &nlk->state)) &&
      !sock_flag(sk, SOCK_DEAD))
   timeo = schedule_timeout(timeo);
// 进程状态运行
__set_current_state(TASK_RUNNING);
// 删除等待队列
remove_wait_queue(&nlk->wait, &wait);
sock_put(sk);
if (signal_pending(current)) {
// 阻塞是通过超时解开的,而不是空间条件符合解开, 属于错误状态
   kfree_skb(skb);
   return sock_intr_errno(timeo);
}
// 返回1, 重新选sock
return 1;
}
// 条件满足, 直接将skb的所有者设为该netlink sock
skb_set_owner_r(skb, sk);
return 0;
}
// 注意这个是内核全局函数, 非static
int netlink_sendskb(struct sock *sk, struct sk_buff *skb, int protocol)
{
int len = skb->len;
// 将skb添加到接收队列末尾
skb_queue_tail(&sk->sk_receive_queue, skb);
// 调用netlink sock的sk_data_ready函数处理
sk->sk_data_ready(sk, len);
sock_put(sk);
return len;
}

5.9 接收消息

数据是内核传向用户空间的
static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
      struct msghdr *msg, size_t len,
      int flags)
{
// sock的IO控制块
struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
// scm
struct scm_cookie scm;
// socket -> sock
struct sock *sk = sock->sk;
// sock -> netlink sock
struct netlink_sock *nlk = nlk_sk(sk);
// 是否是非阻塞的
int noblock = flags&MSG_DONTWAIT;
size_t copied;
struct sk_buff *skb;
int err;
// 不能带OOB标志
if (flags&MSG_OOB)
return -EOPNOTSUPP;
copied = 0;
// 接收一个数据包
skb = skb_recv_datagram(sk,flags,noblock,&err);
if (skb==NULL)
goto out;
msg->msg_namelen = 0;
// 收到的实际数据长度
copied = skb->len;
// 接收缓冲小于数据长度, 设置数据裁剪标志
if (len < copied) {
msg->msg_flags |= MSG_TRUNC;
copied = len;
}
skb->h.raw = skb->data;
// 将skb的数据拷贝到接收缓冲区
err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
if (msg->msg_name) {
// sock有效, 填写nl sock的数据
struct sockaddr_nl *addr = (struct sockaddr_nl*)msg->msg_name;
addr->nl_family = AF_NETLINK;
addr->nl_pad    = 0;
addr->nl_pid = NETLINK_CB(skb).pid;
addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group);
msg->msg_namelen = sizeof(*addr);
}
// 接收数据包信息标志, 将消息头拷贝到用户空间
if (nlk->flags & NETLINK_RECV_PKTINFO)
netlink_cmsg_recv_pktinfo(msg, skb);
if (NULL == siocb->scm) {
memset(&scm, 0, sizeof(scm));
siocb->scm = &scm;
}
siocb->scm->creds = *NETLINK_CREDS(skb);
skb_free_datagram(sk, skb);
if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2)
netlink_dump(sk);
scm_recv(sock, msg, siocb->scm, flags);
out:
// 接收唤醒
netlink_rcv_wake(sk);
return err ? : copied;
}

6. 结论

netlink处理代码不是很好懂, 毕竟和其他协议不同之处是内核中同时存在服务器和客户端的sock, 因
此接收发送数据要注意数据的流向。不过在实际使用中感觉不是很稳定, 流量大时会发生各种奇异的死机现象。

posted on 2014-02-25 15:08 Thomson-Blog 阅读(1335) 评论(0) 收藏举报