DPDK flow_filtering 源码阅读
代码部分
main.c
/*-
* BSD LICENSE
*
* Copyright 2017 Mellanox.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Mellanox. nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <inttypes.h>
#include <sys/types.h>
#include <sys/queue.h>
#include <netinet/in.h>
#include <setjmp.h>
#include <stdarg.h>
#include <ctype.h>
#include <errno.h>
#include <getopt.h>
#include <signal.h>
#include <stdbool.h>
#include <rte_eal.h>
#include <rte_common.h>
#include <rte_malloc.h>
#include <rte_ether.h>
#include <rte_ethdev.h>
#include <rte_mempool.h>
#include <rte_mbuf.h>
#include <rte_net.h>
#include <rte_flow.h>
#include <rte_cycles.h>
/*
 * Exit flag: written by signal_handler(), polled by main_loop().
 * volatile forces a fresh read from memory on every access, so the
 * compiler can neither drop the accesses nor reuse a register copy.
 */
static volatile bool force_quit;

/* Rx queue index handed to generate_ipv4_flow() as the rule's target. */
static uint8_t selected_queue = 1;
/* Queue count passed to rte_eth_dev_configure() for both Rx and Tx: 5 each. */
static uint16_t nr_queues = 5;
/* Port the application works on; main() assigns it. */
static uint16_t port_id;

/* Handle of the flow rule created in main(). */
struct rte_flow *flow;
/* mbuf pool created in main() and given to every Rx queue in init_port(). */
struct rte_mempool *mbuf_pool;
/*
 * IPv4 match values and masks, written as one 32-bit quantity in host byte
 * order (first octet in the most significant byte).
 *
 * The octets are built from unsigned constants: 192 << 24 is not
 * representable in a signed int, so shifting the plain int literal is
 * undefined behaviour and, in practice, produces a negative value.
 */
#define SRC_IP ((0U<<24) + (0U<<16) + (0U<<8) + 0U) /* src ip = 0.0.0.0 */
#define DEST_IP ((192U<<24) + (168U<<16) + (1U<<8) + 1U) /* dest ip = 192.168.1.1 */
#define FULL_MASK 0xffffffff /* full mask */
#define EMPTY_MASK 0x0 /* empty mask */
#include "flow_blocks.c" // generate_ipv4_flow
/*
 * Print the label `what` immediately followed by the 48-bit Ethernet
 * address formatted as xx:xx:xx:xx:xx:xx. No newline is written.
 */
static inline void
print_ether_addr(const char *what, struct ether_addr *eth_addr)
{
	char text[ETHER_ADDR_FMT_SIZE];

	ether_format_addr(text, sizeof(text), eth_addr);
	printf("%s%s", what, text);
}
static void
main_loop(void)
{
struct rte_mbuf *mbufs[32];
struct ether_hdr *eth_hdr;
struct rte_flow_error error;
uint16_t nb_rx;
uint16_t i;
uint16_t j;
while (!force_quit) {
for (i = 0; i < nr_queues; i++) {
nb_rx = rte_eth_rx_burst(port_id,
i, mbufs, 32); // 收包
if (nb_rx) { // 该应用程序的主要工作是从所有队列读取数据包并打印目标队列的每个数据包
for (j = 0; j < nb_rx; j++) {
struct rte_mbuf *m = mbufs[j];
eth_hdr = rte_pktmbuf_mtod(m,
struct ether_hdr *);
/*rte_pktmbuf_mtod(m,t) 是一个宏,m 是 mbuf 指针
返回一个被强制转换成 t * 的指针。指向 mbuf 里的数据的开始处。
*/
print_ether_addr("src=",
ð_hdr->s_addr); // 见文件 rte_ether.h
print_ether_addr(" - dst=",
ð_hdr->d_addr);
printf(" - queue=0x%x",
(unsigned int)i);
printf("\n");
rte_pktmbuf_free(m); // 收的包就直接free掉
}
}
}
}
/* closing and releasing resources */
rte_flow_flush(port_id, &error); // Destroy all flow rules associated with a port.
rte_eth_dev_stop(port_id); // Stop an Ethernet device. 是函数 rte_eth_dev_start()的反义词
rte_eth_dev_close(port_id); // Close a stopped Ethernet device. The device cannot be restarted
}
/*
 * Link-status polling parameters used by assert_link_status().
 * CHECK_INTERVAL is passed to rte_delay_ms(), so it is in milliseconds;
 * it was 1000, which contradicted both comments and gave a 90 s wait.
 */
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_REPEAT_TIMES 90 /* 9s (90 * 100ms) in total */
/*
 * Wait for the link of port_id to come up.
 *
 * The link state is queried up to MAX_REPEAT_TIMES times, with a
 * CHECK_INTERVAL millisecond delay after every unsuccessful query. If the
 * link is still down afterwards the program ends through rte_exit().
 */
static void
assert_link_status(void)
{
	/* Receives the link-level information of the port. */
	struct rte_eth_link link;
	uint8_t attempts_left = MAX_REPEAT_TIMES;

	memset(&link, 0, sizeof(link));
	for (;;) {
		/*
		 * rte_eth_link_get(port id, struct rte_eth_link *) reports
		 * whether the link is up or down, its speed in Mbps and its
		 * duplex mode. Per the original author's note, the call itself
		 * may block for up to 9 seconds.
		 */
		rte_eth_link_get(port_id, &link);
		if (link.link_status == ETH_LINK_UP)
			break;
		/* Link not up yet: wait at least CHECK_INTERVAL ms (rte_cycles.h). */
		rte_delay_ms(CHECK_INTERVAL);
		if (--attempts_left == 0)
			break;
	}

	if (link.link_status == ETH_LINK_DOWN)
		rte_exit(EXIT_FAILURE, ":: error: link is still down\n");
}
/*
 * Configure and start the port identified by port_id.
 *
 * Steps, in order: configure the device with nr_queues Rx and nr_queues Tx
 * queues; set up every Rx queue (512 descriptors, buffers from mbuf_pool);
 * set up every Tx queue (512 descriptors); enable promiscuous mode; start
 * the device; wait for the link through assert_link_status().
 * Any failing step ends the program through rte_exit().
 */
static void
init_port(void)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_rxconf rxq_conf;
	struct rte_eth_txconf txq_conf;
	uint16_t q;
	int rc;
	/* Port-level configuration handed to rte_eth_dev_configure(). */
	struct rte_eth_conf port_conf = {
		/* Rx side of the port. */
		.rxmode = {
			.split_hdr_size = 0,
			/*
			 * NOTE(review): presumably selects the .offloads flags
			 * over the legacy per-feature bit-fields; confirm
			 * against the DPDK release in use.
			 */
			.ignore_offload_bitfield = 1,
			/* Request the CRC-strip Rx offload. */
			.offloads = DEV_RX_OFFLOAD_CRC_STRIP,
		},
		/* Tx side of the port. */
		.txmode = {
			.offloads =
				DEV_TX_OFFLOAD_VLAN_INSERT |
				DEV_TX_OFFLOAD_IPV4_CKSUM |
				DEV_TX_OFFLOAD_UDP_CKSUM |
				DEV_TX_OFFLOAD_TCP_CKSUM |
				DEV_TX_OFFLOAD_SCTP_CKSUM |
				DEV_TX_OFFLOAD_TCP_TSO,
		},
	};

	printf(":: initializing port: %d\n", port_id);

	/* Same number of Rx and Tx queues. */
	rc = rte_eth_dev_configure(port_id, nr_queues, nr_queues, &port_conf);
	if (rc < 0)
		rte_exit(EXIT_FAILURE,
			":: cannot configure device: err=%d, port=%u\n",
			rc, port_id);

	/* Start from the driver's default Rx queue settings. */
	rte_eth_dev_info_get(port_id, &dev_info);
	rxq_conf = dev_info.default_rxconf;
	rxq_conf.offloads = port_conf.rxmode.offloads;

	/* only set Rx queues: something we care only so far */
	q = 0;
	while (q < nr_queues) {
		rc = rte_eth_rx_queue_setup(port_id, q, 512,
				rte_eth_dev_socket_id(port_id),
				&rxq_conf,
				mbuf_pool);
		if (rc < 0)
			rte_exit(EXIT_FAILURE,
				":: Rx queue setup failed: err=%d, port=%u\n",
				rc, port_id);
		q++;
	}

	/* Tx queues: driver defaults plus the port-level Tx offloads. */
	txq_conf = dev_info.default_txconf;
	txq_conf.offloads = port_conf.txmode.offloads;
	q = 0;
	while (q < nr_queues) {
		rc = rte_eth_tx_queue_setup(port_id, q, 512,
				rte_eth_dev_socket_id(port_id),
				&txq_conf);
		if (rc < 0)
			rte_exit(EXIT_FAILURE,
				":: Tx queue setup failed: err=%d, port=%u\n",
				rc, port_id);
		q++;
	}

	rte_eth_promiscuous_enable(port_id);

	rc = rte_eth_dev_start(port_id);
	if (rc < 0)
		rte_exit(EXIT_FAILURE,
			"rte_eth_dev_start:err=%d, port=%u\n",
			rc, port_id);

	assert_link_status();
	printf(":: initializing port: %d done\n", port_id);
}
/*
 * Handler installed by main() for SIGINT and SIGTERM: announce the signal
 * and set force_quit, which makes main_loop() leave its polling loop.
 * Any other signal number is ignored.
 */
static void
signal_handler(int signum)
{
	switch (signum) {
	case SIGINT:
	case SIGTERM:
		printf("\n\nSignal %d received, preparing to exit...\n",
			signum);
		force_quit = true;
		break;
	default:
		break;
	}
}
/*
 * Program entry point.
 *
 * Initialises the EAL, installs the SIGINT/SIGTERM handler, selects port 0,
 * creates the mbuf pool, initialises the port, creates one flow rule that
 * steers IPv4 packets with destination DEST_IP into Rx queue selected_queue,
 * and then runs main_loop() until a signal requests termination.
 *
 * Returns 0 after main_loop() returns; every set-up failure ends the
 * program through rte_exit().
 */
int
main(int argc, char **argv)
{
	int ret;
	/* 16-bit like port_id, so the port count is not truncated. */
	uint16_t nr_ports;
	struct rte_flow_error error;

	ret = rte_eal_init(argc, argv); /* initialise the EAL */
	if (ret < 0)
		rte_exit(EXIT_FAILURE, ":: invalid EAL arguments\n");

	/*
	 * signal(sig, handler) sets the action taken for a signal. `sig` may
	 * be any signal except SIGKILL and SIGSTOP; `handler` is either the
	 * address of a void function, SIG_IGN (ignore) or SIG_DFL (restore
	 * the default action).
	 */
	force_quit = false;
	signal(SIGINT, signal_handler);  /* interrupt */
	signal(SIGTERM, signal_handler); /* termination request */

	nr_ports = rte_eth_dev_count();
	if (nr_ports == 0)
		rte_exit(EXIT_FAILURE, ":: no Ethernet ports found\n");
	port_id = 0;
	if (nr_ports != 1) { /* this program uses a single Ethernet device */
		printf(":: warn: %d ports detected, but we use only one: port %u\n",
			nr_ports, port_id);
	}

	mbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", 4096, 128, 0,
					    RTE_MBUF_DEFAULT_BUF_SIZE,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n");

	init_port();

	/*
	 * Zero the error report first: it is read below on failure, and
	 * nothing here guarantees that every failure path fills it in.
	 */
	memset(&error, 0, sizeof(error));

	/*
	 * Create the flow rule. Arguments: port id, target queue index,
	 * source address and mask, destination address and mask, error
	 * report. Five queues are configured, but packets whose destination
	 * is 192.168.1.1 are all placed on queue 1. generate_ipv4_flow() is a
	 * wrapper around rte_flow_create().
	 */
	flow = generate_ipv4_flow(port_id, selected_queue,
				SRC_IP, EMPTY_MASK,
				DEST_IP, FULL_MASK, &error);
	if (!flow) {
		printf("Flow can't be created %d message: %s\n",
			error.type,
			error.message ? error.message : "(no stated reason)");
		rte_exit(EXIT_FAILURE, "error in creating flow\n");
	}

	main_loop();

	return 0;
}
flow_blocks.c
:
/*-
* BSD LICENSE
*
* Copyright 2017 Mellanox.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Mellanox nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Capacity of the pattern[] and action[] arrays in generate_ipv4_flow(). */
#define MAX_PATTERN_NUM 4

/* Forward declaration of the rule builder defined further down. */
struct rte_flow *generate_ipv4_flow(uint16_t port_id,
		uint16_t rx_q,
		uint32_t src_ip,
		uint32_t src_mask,
		uint32_t dest_ip,
		uint32_t dest_mask,
		struct rte_flow_error *error);
/**
* create a flow rule that sends packets with matching src and dest ip
* to selected queue.
*
* @param port_id
* The selected port.
* @param rx_q
* The selected target queue.
* @param src_ip
* The src ip value to match the input packet.
* @param src_mask
* The mask to apply to the src ip.
* @param dest_ip
* The dest ip value to match the input packet.
* @param dest_mask
* The mask to apply to the dest ip.
* @param[out] error
* Perform verbose error reporting if not NULL.
*
* @return
* A flow if the rule could be created else return NULL.
*/
struct rte_flow *
generate_ipv4_flow(uint16_t port_id, uint16_t rx_q,
uint32_t src_ip, uint32_t src_mask,
uint32_t dest_ip, uint32_t dest_mask,
struct rte_flow_error *error)
{
struct rte_flow_attr attr; // 流的 attr
struct rte_flow_item pattern[MAX_PATTERN_NUM]; // 流的 pattern。关于 item,见:http://doc.dpdk.org/api/structrte__flow__item.html
struct rte_flow_action action[MAX_PATTERN_NUM]; // 流的 action , 这三个是创建一个流的关键。
struct rte_flow *flow = NULL;
struct rte_flow_action_queue queue = { .index = rx_q };
struct rte_flow_item_eth eth_spec; // spec 和
struct rte_flow_item_eth eth_mask; // mask 是 item 的另外两个字段。void * ,但需要设置成和你选定的特定 type 一样。
struct rte_flow_item_vlan vlan_spec;
struct rte_flow_item_vlan vlan_mask;
struct rte_flow_item_ipv4 ip_spec;
struct rte_flow_item_ipv4 ip_mask;
int res;
memset(pattern, 0, sizeof(pattern));
memset(action, 0, sizeof(action));
/*
* set the rule attribute.
* in this case only ingress packets will be checked.
*/
memset(&attr, 0, sizeof(struct rte_flow_attr));
attr.ingress = 1; // 意思是只对入口流量生效的属性
/*
* create the action sequence.
* one action only, move packet to queue
*/
action[0].type = RTE_FLOW_ACTION_TYPE_QUEUE; // 动作是:Assigns packets to a given queue index.
action[0].conf = &queue;
action[1].type = RTE_FLOW_ACTION_TYPE_END; // 动作数组必须用 RTE_FLOW_ACTION_TYPE_END 作为最后一个元素来结尾
/*
* set the first level of the pattern (eth).
* since in this example we just want to get the
* ipv4 we set this level to allow all.
*/
// 第一个 item 用的是以太网地址
// rte_flow_item_eth 的字段是 目的MAC地址、源MAC地址、Type
memset(ð_spec, 0, sizeof(struct rte_flow_item_eth));
memset(ð_mask, 0, sizeof(struct rte_flow_item_eth));
eth_spec.type = 0;
eth_mask.type = 0;
pattern[0].type = RTE_FLOW_ITEM_TYPE_ETH; // item type 是 以太网的 item
pattern[0].spec = ð_spec; // 指向 eth_item 的指针(因为第一个type指定了rte_flow_item_eth)
pattern[0].mask = ð_mask; // 掩码设置成全 0,意味着所有以太网的header都是被允许的。
/* spec、mask、last 在 pattern item 中的用途:
For example, if for an IPv4 address field, spec provides 10.1.2.3, last provides 10.3.4.5
and mask provides 255.255.0.0, the effective range becomes 10.1.0.0 to 10.3.255.255.
*/
/*
* setting the second level of the pattern (vlan).
* since in this example we just want to get the
* ipv4 we also set this level to allow all.
*/
// 第二个 item 用于匹配 vlan 标签
// 也设置成了所有皆可匹配
memset(&vlan_spec, 0, sizeof(struct rte_flow_item_vlan));
memset(&vlan_mask, 0, sizeof(struct rte_flow_item_vlan));
pattern[1].type = RTE_FLOW_ITEM_TYPE_VLAN;
pattern[1].spec = &vlan_spec;
pattern[1].mask = &vlan_mask;
/*
* setting the third level of the pattern (ip).
* in this example this is the level we care about
* so we set it according to the parameters.
*/
// 第三个 item 匹配特定的IP地址,根据本函数的参数。
memset(&ip_spec, 0, sizeof(struct rte_flow_item_ipv4));
memset(&ip_mask, 0, sizeof(struct rte_flow_item_ipv4));
// ipv4 item 的结构体里面有一个字段,是 ipv4_hdr
ip_spec.hdr.dst_addr = htonl(dest_ip); // 将主机数转换成无符号长整型的网络字节顺序
ip_mask.hdr.dst_addr = dest_mask;
ip_spec.hdr.src_addr = htonl(src_ip);
ip_mask.hdr.src_addr = src_mask;
pattern[2].type = RTE_FLOW_ITEM_TYPE_IPV4;
pattern[2].spec = &ip_spec;
pattern[2].mask = &ip_mask;
/* the final level must be always type end */
// pattern 数组的最后必须用 END 宏结尾。
pattern[3].type = RTE_FLOW_ITEM_TYPE_END;
// 验证这条流的有效性
res = rte_flow_validate(port_id, &attr, pattern, action, error);
if (!res)
flow = rte_flow_create(port_id, &attr, pattern, action, error);
/* rte_flow_create() 在一个给定的端口上创建一条流规则(flow rule)
参数五个
1. port id
2. attr 数组
3. pattern 数组
4. actions 数组
5. rte_flow_error 指针,当有错误发生时,PMD会在此设置内容。
返回值类型是 struct rte_flow*
*/
return flow;
}
- classify(分类) 与 filtering(过滤)的区别:
flow_classify 的思路是创建 classifier 对象,在 classifier 里配置一些 rule,在收到一批包后,主动地调用 query classifier 的 API,查看是否有 packet 符合了 classifier 里面的 rule,并执行对应的 action。
flow_filtering 的思路则是在 port 上设定 rule。对于满足 rule 的包,会采取特定的 actions。与 classify 的区别在于,filtering 是在端口上面设定 rule,只要经过这个端口的 ingress/egress 流量,都要进行 rule 的匹配。
可以看到,classify 和 filtering 共同的地方就是 rule。差别在于使用 rule 的方式。DPDK 中,classify 的库叫做 rte_flow_classify.h,filtering 的库叫做 rte_flow.h,所以说它们之间还是有比较明显的区别。
- rule(规则)的代码实现:体现为结构体 rte_flow_xxx,xxx 有 attr、item 和 action(即 struct rte_flow_attr、struct rte_flow_item、struct rte_flow_action):
- attr 是这条 rule 的属性,主要用到的有 ingress 和 egress,指明这条 rule 是针对入口还是出口流量生效。
- item 就是规则的本体,用来匹配包里的特定的 header 的,代码中是通过堆叠 item 的方式来实现特定的匹配模式(pattern)。
- action 就是匹配成功后的动作,通过设定宏来定义操作。宏是库文件自带的。常见的例如计数、丢包、转发到特定队列等。
这个 sample flow_filtering 里实现了这样的行为:设立5条收/发队列,收包时将目的IP地址等于 192.168.1.1 的数据包送入队列号为1的接收队列。所以本 sample 实现的其实是一种类似 QoS 的功能(特定IP地址进入特定的队列处理)。如果真的想实现 filtering(过滤),就把 actions 定义为 Drop 吧。然后,逻辑核上的主线程运行 main_loop,主要工作是收包后打印每个包所在的接收队列号。由于 rule 的作用,应该可以看到目的地 ip 等于 192.168.1.1 的数据包的接收队列号一定为 1。
执行情况
开始执行后,收到的包都到了0号队列(因为目的IP地址不是192.168.1.1)。然后使用scapy发目的IP地址设定为192.168.1.1的包到0号以太网口,发了十个包,可以看到这十个包都进入了1号接收队列。
发包脚本如下:
from scapy.all import *

# Test frame: Ethernet / IPv4 with dst=192.168.1.1, the destination address
# the flow rule matches on.
p_join = Ether(dst='a0:36:9f:20:43:92', src='a0:36:9f:20:43:90') / IP(src='172.17.173.64', dst='192.168.1.1', tos=0xc0)
p_join.show()

# Send the frame 10 times out of enp1s0f0.
# NOTE(review): presumably enp1s0f0 is wired to enp1s0f1, and enp1s0f1 is the
# port bound to DPDK; confirm against the test bed.
for _ in range(10):
    sendp(p_join, iface='enp1s0f0')
注意事项:
- 本程序只需要绑定一个网口到DPDK即可。
- 网卡必须支持多队列,且队列数目要5条以上。不然会报错,信息类似于:cannot configure device
reference
- API doc 中查看 mbuf、rte flow 和 flow classify
- sample guide:flow_filtering
- linux查看网卡型号、驱动版本、队列数