l3fwd负责三层转发,比l2fwd要复杂点。
1 /*- 2 * BSD LICENSE 3 * 4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <stdio.h> 35 #include <stdlib.h> 36 #include <stdint.h> 37 #include <inttypes.h> 38 #include <sys/types.h> 39 #include <string.h> 40 #include <sys/queue.h> 41 #include <stdarg.h> 42 #include <errno.h> 43 #include <getopt.h> 44 45 #include <rte_common.h> 46 #include <rte_vect.h> 47 #include <rte_byteorder.h> 48 #include <rte_log.h> 49 #include <rte_memory.h> 50 #include <rte_memcpy.h> 51 #include <rte_memzone.h> 52 #include <rte_eal.h> 53 #include <rte_per_lcore.h> 54 #include <rte_launch.h> 55 #include <rte_atomic.h> 56 #include <rte_cycles.h> 57 #include <rte_prefetch.h> 58 #include <rte_lcore.h> 59 #include <rte_per_lcore.h> 60 #include <rte_branch_prediction.h> 61 #include <rte_interrupts.h> 62 #include <rte_pci.h> 63 #include <rte_random.h> 64 #include <rte_debug.h> 65 #include <rte_ether.h> 66 #include <rte_ethdev.h> 67 #include <rte_ring.h> 68 #include <rte_mempool.h> 69 #include <rte_mbuf.h> 70 #include <rte_ip.h> 71 #include <rte_tcp.h> 72 #include <rte_udp.h> 73 #include <rte_string_fns.h> 74 75 #define APP_LOOKUP_EXACT_MATCH 0 76 #define APP_LOOKUP_LPM 1 77 #define DO_RFC_1812_CHECKS 78 79 #ifndef APP_LOOKUP_METHOD //默认使用LPM来路由 80 #define APP_LOOKUP_METHOD APP_LOOKUP_LPM 81 #endif 82 83 /* 84 * 0表示未优化 When set to zero, simple forwaring path is eanbled. 85 * 1表示优化 When set to one, optimized forwarding path is enabled. 86 * LPM会用到SSE4.1特性 Note that LPM optimisation path uses SSE4.1 instructions. 87 * 注意: 发现深圳测试机的CPU支持的是SSE 4.2特性,不知道会不会有影响呢??? 88 */ 89 #if ((APP_LOOKUP_METHOD == APP_LOOKUP_LPM) && !defined(__SSE4_1__)) 90 #define ENABLE_MULTI_BUFFER_OPTIMIZE 0 91 #else 92 #define ENABLE_MULTI_BUFFER_OPTIMIZE 1 93 #endif 94 95 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 96 #include <rte_hash.h> 97 #elif (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 98 #include <rte_lpm.h> 99 #include <rte_lpm6.h> 100 #else 101 #error "APP_LOOKUP_METHOD set to incorrect value" 102 #endif 103 104 #ifndef IPv6_BYTES 105 #define IPv6_BYTES_FMT "%02x%02x:%02x%02x:%02x%02x:%02x%02x:"\ 106 "%02x%02x:%02x%02x:%02x%02x:%02x%02x" 107 #define IPv6_BYTES(addr) \ 108 addr[0], addr[1], addr[2], addr[3], \ 109 addr[4], addr[5], addr[6], addr[7], \ 110 addr[8], addr[9], addr[10], addr[11],\ 111 addr[12], addr[13],addr[14], addr[15] 112 #endif 113 114 115 #define RTE_LOGTYPE_L3FWD RTE_LOGTYPE_USER1 116 117 #define MAX_JUMBO_PKT_LEN 9600 118 119 #define IPV6_ADDR_LEN 16 120 121 #define MEMPOOL_CACHE_SIZE 256 122 123 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM) 124 125 /* 126 * This expression is used to calculate the number of mbufs needed depending on user input, taking 127 * into account memory for rx and tx hardware rings, cache per lcore and mtable per port per lcore. 128 * RTE_MAX is used to ensure that NB_MBUF never goes below a minimum value of 8192 129 */ 130 131 #define NB_MBUF RTE_MAX ( \ 132 (nb_ports*nb_rx_queue*RTE_TEST_RX_DESC_DEFAULT + \ 133 nb_ports*nb_lcores*MAX_PKT_BURST + \ 134 nb_ports*n_tx_queue*RTE_TEST_TX_DESC_DEFAULT + \ 135 nb_lcores*MEMPOOL_CACHE_SIZE), \ 136 (unsigned)8192) 137 138 #define MAX_PKT_BURST 32 139 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 140 141 /* 142 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send. 143 */ 144 #define MAX_TX_BURST (MAX_PKT_BURST / 2) 145 146 #define NB_SOCKETS 8 147 148 /* Configure how many packets ahead to prefetch, when reading packets */ 149 #define PREFETCH_OFFSET 3 150 151 /* Used to mark destination port as 'invalid'. */ 152 #define BAD_PORT ((uint16_t)-1) 153 154 #define FWDSTEP 4 155 156 /* 157 * Configurable number of RX/TX ring descriptors 158 */ 159 #define RTE_TEST_RX_DESC_DEFAULT 128 160 #define RTE_TEST_TX_DESC_DEFAULT 512 161 static uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT; 162 static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT; 163 164 /* ethernet addresses of ports */ 165 static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS]; 166 167 static __m128i val_eth[RTE_MAX_ETHPORTS]; 168 169 /* replace first 12B of the ethernet header. */ 170 #define MASK_ETH 0x3f 171 172 /* mask of enabled ports */ 173 static uint32_t enabled_port_mask = 0; 174 static int promiscuous_on = 0; /**< Ports set in promiscuous mode off by default. */ 175 static int numa_on = 1; /**< NUMA is enabled by default. */ 176 177 178 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 179 static int ipv6 = 0; /**< ipv6 is false by default. */ 180 #endif 181 182 struct mbuf_table { 183 uint16_t len; //实际个数??? 184 struct rte_mbuf *m_table[MAX_PKT_BURST]; 185 }; 186 187 struct lcore_rx_queue { 188 uint8_t port_id; //物理端口的编号 189 uint8_t queue_id;//网卡队列的编号 190 } __rte_cache_aligned; 191 192 #define MAX_RX_QUEUE_PER_LCORE 16 //每个lcore上最多有16个接收队列 193 #define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS //每个物理端口上最多32个发送队列 194 #define MAX_RX_QUEUE_PER_PORT 128 //每个物理端口上最多128个接收队列 195 196 #define MAX_LCORE_PARAMS 1024 197 struct lcore_params { 198 uint8_t port_id; //物理端口的编号 199 uint8_t queue_id; //网卡队列的编号 200 uint8_t lcore_id; //lcore的编号 201 } __rte_cache_aligned; 202 203 static struct lcore_params lcore_params_array[MAX_LCORE_PARAMS];//最大1024 204 205 //此处可以修改lcore的默认配置 206 static struct lcore_params lcore_params_array_default[] = { 207 {0, 0, 2},//物理端口的编号,网卡队列的编号,lcore的编号 208 {0, 1, 2}, 209 {0, 2, 2}, 210 {1, 0, 2}, 211 {1, 1, 2}, 212 {1, 2, 2}, 213 {2, 0, 2}, 214 {3, 0, 3}, 215 {3, 1, 3}, 216 }; 217 218 static struct lcore_params * lcore_params = lcore_params_array_default; 219 static uint16_t nb_lcore_params = sizeof(lcore_params_array_default) / 220 sizeof(lcore_params_array_default[0]);//默认值为9 221 222 static struct rte_eth_conf port_conf = { 223 .rxmode = { 224 .mq_mode = ETH_MQ_RX_RSS, //看起来l3fwd支持RSS哟 225 .max_rx_pkt_len = ETHER_MAX_LEN, 226 .split_hdr_size = 0, 227 .header_split = 0, /**< Header Split disabled */ 228 .hw_ip_checksum = 1, /**< IP checksum offload enabled */ 229 .hw_vlan_filter = 0, /**< VLAN filtering disabled */ 230 .jumbo_frame = 0, /**< Jumbo Frame Support disabled */ 231 .hw_strip_crc = 0, /**< CRC stripped by hardware */ 232 }, 233 .rx_adv_conf = { 234 .rss_conf = { 235 .rss_key = NULL, 236 .rss_hf = ETH_RSS_IP, 237 }, 238 }, 239 .txmode = { 240 .mq_mode = ETH_MQ_TX_NONE, 241 }, 242 }; 243 244 static struct rte_mempool * pktmbuf_pool[NB_SOCKETS]; 245 246 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 247 #ifdef RTE_MACHINE_CPUFLAG_SSE4_2 248 #include <rte_hash_crc.h> 249 #define DEFAULT_HASH_FUNC rte_hash_crc 250 #else 251 #include <rte_jhash.h> 252 #define DEFAULT_HASH_FUNC rte_jhash 253 #endif 254 struct ipv4_5tuple { //五元组 255 uint32_t ip_dst; //目的ip地址 256 uint32_t ip_src; //源ip地址 257 uint16_t port_dst; //目的端口号 258 uint16_t port_src; //源端口号 259 uint8_t proto; //传输层协议类型 260 } __attribute__((__packed__)); 261 union ipv4_5tuple_host { 262 struct { 263 uint8_t pad0; 264 uint8_t proto; 265 uint16_t pad1; 266 uint32_t ip_src; 267 uint32_t ip_dst; 268 uint16_t port_src; 269 uint16_t port_dst; 270 }; 271 __m128i xmm; 272 }; 273 274 #define XMM_NUM_IN_IPV6_5TUPLE 3 275 struct ipv6_5tuple { 276 uint8_t ip_dst[IPV6_ADDR_LEN]; 277 uint8_t ip_src[IPV6_ADDR_LEN]; 278 uint16_t port_dst; 279 uint16_t port_src; 280 uint8_t proto; 281 } __attribute__((__packed__)); 282 union ipv6_5tuple_host { 283 struct { 284 uint16_t pad0; 285 uint8_t proto; 286 uint8_t pad1; 287 uint8_t ip_src[IPV6_ADDR_LEN]; 288 uint8_t ip_dst[IPV6_ADDR_LEN]; 289 uint16_t port_src; 290 uint16_t port_dst; 291 uint64_t reserve; 292 }; 293 __m128i xmm[XMM_NUM_IN_IPV6_5TUPLE]; 294 }; 295 struct ipv4_l3fwd_route { 296 struct ipv4_5tuple key; 297 uint8_t if_out; 298 }; 299 struct ipv6_l3fwd_route { 300 struct ipv6_5tuple key; u 301 int8_t if_out; 302 }; 303 //这里设置默认的静态的三层转发路由规则,实际使用的时候需要修改这个地方 304 static struct ipv4_l3fwd_route ipv4_l3fwd_route_array[] = { 305 {{IPv4(101,0,0,0), IPv4(100,10,0,1), 101, 11, IPPROTO_TCP}, 0}, 306 {{IPv4(201,0,0,0), IPv4(200,20,0,1), 102, 12, IPPROTO_TCP}, 1}, 307 {{IPv4(111,0,0,0), IPv4(100,30,0,1), 101, 11, IPPROTO_TCP}, 2}, 308 {{IPv4(211,0,0,0), IPv4(200,40,0,1), 102, 12, IPPROTO_TCP}, 3}, 309 }; 310 static struct ipv6_l3fwd_route ipv6_l3fwd_route_array[] = { 311 {{ {0xfe, 0x80, 0, 0, 0, 0, 0, 0, 0x02, 0x1e, 0x67, 0xff, 0xfe, 0, 0, 0}, 312 {0xfe, 0x80, 0, 0, 0, 0, 0, 0, 0x02, 0x1b, 0x21, 0xff, 0xfe, 0x91, 0x38, 0x05}, 313 101, 11, IPPROTO_TCP}, 0}, 314 {{ {0xfe, 0x90, 0, 0, 0, 0, 0, 0, 0x02, 0x1e, 0x67, 0xff, 0xfe, 0, 0, 0}, 315 {0xfe, 0x90, 0, 0, 0, 0, 0, 0, 0x02, 0x1b, 0x21, 0xff, 0xfe, 0x91, 0x38, 0x05}, 316 102, 12, IPPROTO_TCP}, 1}, 317 {{ {0xfe, 0xa0, 0, 0, 0, 0, 0, 0, 0x02, 0x1e, 0x67, 0xff, 0xfe, 0, 0, 0}, 318 {0xfe, 0xa0, 0, 0, 0, 0, 0, 0, 0x02, 0x1b, 0x21, 0xff, 0xfe, 0x91, 0x38, 0x05}, 319 101, 11, IPPROTO_TCP}, 2}, 320 {{ {0xfe, 0xb0, 0, 0, 0, 0, 0, 0, 0x02, 0x1e, 0x67, 0xff, 0xfe, 0, 0, 0}, 321 {0xfe, 0xb0, 0, 0, 0, 0, 0, 0, 0x02, 0x1b, 0x21, 0xff, 0xfe, 0x91, 0x38, 0x05}, 322 102, 12, IPPROTO_TCP}, 3}, 323 }; 324 typedef struct rte_hash lookup_struct_t; 325 static lookup_struct_t *ipv4_l3fwd_lookup_struct[NB_SOCKETS]; 326 static lookup_struct_t *ipv6_l3fwd_lookup_struct[NB_SOCKETS]; 327 #ifdef RTE_ARCH_X86_64 328 /* default to 4 million hash entries (approx) */ 329 #define L3FWD_HASH_ENTRIES 1024*1024*4 330 #else 331 /* 32-bit has less address-space for hugepage memory, limit to 1M entries */ 332 #define L3FWD_HASH_ENTRIES 1024*1024*1 333 #endif 334 #define HASH_ENTRY_NUMBER_DEFAULT 4 335 static uint32_t hash_entry_number = HASH_ENTRY_NUMBER_DEFAULT; 336 static inline uint32_tipv4_hash_crc(const void *data, 337 __rte_unused uint32_t data_len, uint32_t init_val){ 338 const union ipv4_5tuple_host *k; 339 uint32_t t; const uint32_t *p; 340 k = data; 341 t = k->proto; 342 p = (const uint32_t *)&k->port_src; 343 #ifdef RTE_MACHINE_CPUFLAG_SSE4_2 344 init_val = rte_hash_crc_4byte(t, init_val); 345 init_val = rte_hash_crc_4byte(k->ip_src, init_val); 346 init_val = rte_hash_crc_4byte(k->ip_dst, init_val); 347 init_val = rte_hash_crc_4byte(*p, init_val); 348 #else /* RTE_MACHINE_CPUFLAG_SSE4_2 */ 349 init_val = rte_jhash_1word(t, init_val); 350 init_val = rte_jhash_1word(k->ip_src, init_val); 351 init_val = rte_jhash_1word(k->ip_dst, init_val); 352 init_val = rte_jhash_1word(*p, init_val); 353 #endif /* RTE_MACHINE_CPUFLAG_SSE4_2 */ 354 return (init_val); 355 } 356 static inline uint32_tipv6_hash_crc(const void *data, 357 __rte_unused uint32_t data_len, uint32_t init_val){ 358 const union ipv6_5tuple_host *k; 359 uint32_t t; 360 const uint32_t *p; 361 #ifdef RTE_MACHINE_CPUFLAG_SSE4_2 362 const uint32_t *ip_src0, *ip_src1, *ip_src2, *ip_src3; 363 const uint32_t *ip_dst0, *ip_dst1, *ip_dst2, *ip_dst3; 364 #endif /* RTE_MACHINE_CPUFLAG_SSE4_2 */ 365 k = data; 366 t = k->proto; 367 p = (const uint32_t *)&k->port_src; 368 #ifdef RTE_MACHINE_CPUFLAG_SSE4_2 369 ip_src0 = (const uint32_t *) k->ip_src; 370 ip_src1 = (const uint32_t *)(k->ip_src+4); 371 ip_src2 = (const uint32_t *)(k->ip_src+8); 372 ip_src3 = (const uint32_t *)(k->ip_src+12); 373 ip_dst0 = (const uint32_t *) k->ip_dst; 374 ip_dst1 = (const uint32_t *)(k->ip_dst+4); 375 ip_dst2 = (const uint32_t *)(k->ip_dst+8); 376 ip_dst3 = (const uint32_t *)(k->ip_dst+12); 377 init_val = rte_hash_crc_4byte(t, init_val); 378 init_val = rte_hash_crc_4byte(*ip_src0, init_val); 379 init_val = rte_hash_crc_4byte(*ip_src1, init_val); 380 init_val = rte_hash_crc_4byte(*ip_src2, init_val); 381 init_val = rte_hash_crc_4byte(*ip_src3, init_val); 382 init_val = rte_hash_crc_4byte(*ip_dst0, init_val); 383 init_val = rte_hash_crc_4byte(*ip_dst1, init_val); 384 init_val = rte_hash_crc_4byte(*ip_dst2, init_val); 385 init_val = rte_hash_crc_4byte(*ip_dst3, init_val); 386 init_val = rte_hash_crc_4byte(*p, init_val); 387 #else /* RTE_MACHINE_CPUFLAG_SSE4_2 */ 388 init_val = rte_jhash_1word(t, init_val); 389 init_val = rte_jhash(k->ip_src, sizeof(uint8_t) * IPV6_ADDR_LEN, init_val); 390 init_val = rte_jhash(k->ip_dst, sizeof(uint8_t) * IPV6_ADDR_LEN, init_val); 391 init_val = rte_jhash_1word(*p, init_val); 392 #endif /* RTE_MACHINE_CPUFLAG_SSE4_2 */ 393 return (init_val); 394 } 395 #define IPV4_L3FWD_NUM_ROUTES \ 396 (sizeof(ipv4_l3fwd_route_array) / sizeof(ipv4_l3fwd_route_array[0])) 397 #define IPV6_L3FWD_NUM_ROUTES \ 398 (sizeof(ipv6_l3fwd_route_array) / sizeof(ipv6_l3fwd_route_array[0])) 399 static uint8_t ipv4_l3fwd_out_if[L3FWD_HASH_ENTRIES] __rte_cache_aligned; 400 static uint8_t ipv6_l3fwd_out_if[L3FWD_HASH_ENTRIES] __rte_cache_aligned; 401 #endif 402 403 404 405 406 407 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 408 struct ipv4_l3fwd_route { 409 uint32_t ip; //看起来l3fwd支持RSS哟 410 uint8_t depth; //深度 411 uint8_t if_out; //数据转发的出口 412 }; 413 414 struct ipv6_l3fwd_route { 415 uint8_t ip[16]; 416 uint8_t depth; 417 uint8_t if_out; 418 }; 419 420 421 //这里设置默认的静态的三层转发路由规则,实际使用的时候需要修改这个地方 422 static struct ipv4_l3fwd_route ipv4_l3fwd_route_array[] = { //只有8个元素??? 423 {IPv4(1,1,1,0), 24, 0}, //{IPv4(192,168,10,0), 24, 0}, 424 {IPv4(2,1,1,0), 24, 1}, 425 {IPv4(3,1,1,0), 24, 2}, 426 {IPv4(4,1,1,0), 24, 3}, 427 {IPv4(5,1,1,0), 24, 4}, 428 {IPv4(6,1,1,0), 24, 5}, 429 {IPv4(7,1,1,0), 24, 6}, 430 {IPv4(8,1,1,0), 24, 7}, 431 }; 432 433 static struct ipv6_l3fwd_route ipv6_l3fwd_route_array[] = { 434 {{1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, 48, 0}, 435 {{2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, 48, 1}, 436 {{3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, 48, 2}, 437 {{4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, 48, 3}, 438 {{5,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, 48, 4}, 439 {{6,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, 48, 5}, 440 {{7,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, 48, 6}, 441 {{8,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, 48, 7}, 442 }; 443 444 445 static struct ipv4_l3fwd_route ipv4_l3fwd_route_array[] = { 446 {{IPv4(101,0,0,0), IPv4(100,10,0,1), 101, 11, IPPROTO_TCP}, 0}, 447 {{IPv4(201,0,0,0), IPv4(200,20,0,1), 102, 12, IPPROTO_TCP}, 1}, 448 {{IPv4(111,0,0,0), IPv4(100,30,0,1), 101, 11, IPPROTO_TCP}, 2}, 449 {{IPv4(211,0,0,0), IPv4(200,40,0,1), 102, 12, IPPROTO_TCP}, 3}, 450 }; 451 452 static struct ipv6_l3fwd_route ipv6_l3fwd_route_array[] = { 453 {{ 454 {0xfe, 0x80, 0, 0, 0, 0, 0, 0, 0x02, 0x1e, 0x67, 0xff, 0xfe, 0, 0, 0}, 455 {0xfe, 0x80, 0, 0, 0, 0, 0, 0, 0x02, 0x1b, 0x21, 0xff, 0xfe, 0x91, 0x38, 0x05}, 456 101, 11, IPPROTO_TCP}, 0}, 457 458 {{ 459 {0xfe, 0x90, 0, 0, 0, 0, 0, 0, 0x02, 0x1e, 0x67, 0xff, 0xfe, 0, 0, 0}, 460 {0xfe, 0x90, 0, 0, 0, 0, 0, 0, 0x02, 0x1b, 0x21, 0xff, 0xfe, 0x91, 0x38, 0x05}, 461 102, 12, IPPROTO_TCP}, 1}, 462 463 {{ 464 {0xfe, 0xa0, 0, 0, 0, 0, 0, 0, 0x02, 0x1e, 0x67, 0xff, 0xfe, 0, 0, 0}, 465 {0xfe, 0xa0, 0, 0, 0, 0, 0, 0, 0x02, 0x1b, 0x21, 0xff, 0xfe, 0x91, 0x38, 0x05}, 466 101, 11, IPPROTO_TCP}, 2}, 467 468 {{ 469 {0xfe, 0xb0, 0, 0, 0, 0, 0, 0, 0x02, 0x1e, 0x67, 0xff, 0xfe, 0, 0, 0}, 470 {0xfe, 0xb0, 0, 0, 0, 0, 0, 0, 0x02, 0x1b, 0x21, 0xff, 0xfe, 0x91, 0x38, 0x05}, 471 102, 12, IPPROTO_TCP}, 3}, 472 }; 473 474 475 476 #define IPV4_L3FWD_NUM_ROUTES \ 477 (sizeof(ipv4_l3fwd_route_array) / sizeof(ipv4_l3fwd_route_array[0])) 478 #define IPV6_L3FWD_NUM_ROUTES \ 479 (sizeof(ipv6_l3fwd_route_array) / sizeof(ipv6_l3fwd_route_array[0])) 480 481 #define IPV4_L3FWD_LPM_MAX_RULES 1024 482 #define IPV6_L3FWD_LPM_MAX_RULES 1024 483 #define IPV6_L3FWD_LPM_NUMBER_TBL8S (1 << 16) 484 485 typedef struct rte_lpm lookup_struct_t; 486 typedef struct rte_lpm6 lookup6_struct_t; 487 static lookup_struct_t *ipv4_l3fwd_lookup_struct[NB_SOCKETS];//8个元素 488 static lookup6_struct_t *ipv6_l3fwd_lookup_struct[NB_SOCKETS]; 489 #endif 490 491 struct lcore_conf {//保存lcore的配置信息 492 uint16_t n_rx_queue; //接收队列的总数量 493 struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];//物理端口和网卡队列编号组成的数组 494 uint16_t tx_queue_id[RTE_MAX_ETHPORTS]; //发送队列的编号组成的数组 495 struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];//mbuf表 496 lookup_struct_t * ipv4_lookup_struct; //实际上就是struct rte_lpm * 497 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 498 lookup6_struct_t * ipv6_lookup_struct; 499 #else 500 lookup_struct_t * ipv6_lookup_struct; 501 #endif 502 } __rte_cache_aligned; 503 504 static struct lcore_conf lcore_conf[RTE_MAX_LCORE]; 505 506 /* Send burst of packets on an output interface */ 507 static inline int //在输出接口port上把数据包burst发送出去 508 send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port) 509 { 510 struct rte_mbuf **m_table; 511 int ret; 512 uint16_t queueid; 513 514 queueid = qconf->tx_queue_id[port]; 515 m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table; 516 517 ret = rte_eth_tx_burst(port, queueid, m_table, n); 518 if (unlikely(ret < n)) { 519 do { 520 rte_pktmbuf_free(m_table[ret]); 521 } while (++ret < n); 522 } 523 524 return 0; 525 } 526 527 /* Enqueue a single packet, and send burst if queue is filled */ 528 static inline int //发送一个mbuf 529 send_single_packet(struct rte_mbuf *m, uint8_t port) 530 { 531 uint32_t lcore_id; 532 uint16_t len; 533 struct lcore_conf *qconf; 534 535 lcore_id = rte_lcore_id(); 536 537 qconf = &lcore_conf[lcore_id]; 538 len = qconf->tx_mbufs[port].len; 539 qconf->tx_mbufs[port].m_table[len] = m; 540 len++; 541 542 /* enough pkts to be sent */ 543 if (unlikely(len == MAX_PKT_BURST)) { //如果累计到32个数据包 544 send_burst(qconf, MAX_PKT_BURST, port); //把32个数据包发送出去 545 len = 0; 546 } 547 548 qconf->tx_mbufs[port].len = len; 549 return 0; 550 } 551 552 static inline __attribute__ void 553 send_packetsx4(struct lcore_conf *qconf, uint8_t port, 554 struct rte_mbuf *m[], uint32_t num) 555 { 556 uint32_t len, j, n; 557 558 len = qconf->tx_mbufs[port].len; 559 560 /* 如果某个队列的发送缓冲区为空,而且已有足够数量数据包待发送,那么立即发送 561 * If TX buffer for that queue is empty, and we have enough packets, 562 * then send them straightway. 563 */ 564 if (num >= MAX_TX_BURST && len == 0) { 565 n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);//burst发送num个mbufs 566 if (unlikely(n < num)) { //如果实际发送数据包的个数小于num 567 do { 568 rte_pktmbuf_free(m[n]); //把剩下的num-n个mbufs返回mempool 569 } while (++n < num); 570 } 571 return; 572 } 573 574 /* 575 * Put packets into TX buffer for that queue. 576 */ 577 //把那些数据包放到网卡队列的发送缓冲区中 578 n = len + num; 579 n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num; 580 581 j = 0; 582 switch (n % FWDSTEP) { 583 while (j < n) { 584 case 0: 585 qconf->tx_mbufs[port].m_table[len + j] = m[j]; 586 j++; 587 case 3: 588 qconf->tx_mbufs[port].m_table[len + j] = m[j]; 589 j++; 590 case 2: 591 qconf->tx_mbufs[port].m_table[len + j] = m[j]; 592 j++; 593 case 1: 594 qconf->tx_mbufs[port].m_table[len + j] = m[j]; 595 j++; 596 } 597 } 598 599 len += n; 600 601 /*待发送的包数量达到32个 enough pkts to be sent */ 602 if (unlikely(len == MAX_PKT_BURST)) { 603 604 send_burst(qconf, MAX_PKT_BURST, port); 605 606 /* copy rest of the packets into the TX buffer. */ 607 len = num - n; 608 j = 0; 609 switch (len % FWDSTEP) { 610 while (j < len) { 611 case 0: 612 qconf->tx_mbufs[port].m_table[j] = m[n + j]; 613 j++; 614 case 3: 615 qconf->tx_mbufs[port].m_table[j] = m[n + j]; 616 j++; 617 case 2: 618 qconf->tx_mbufs[port].m_table[j] = m[n + j]; 619 j++; 620 case 1: 621 qconf->tx_mbufs[port].m_table[j] = m[n + j]; 622 j++; 623 } 624 } 625 } 626 627 qconf->tx_mbufs[port].len = len; 628 } 629 630 #ifdef DO_RFC_1812_CHECKS 631 static inline int 632 is_valid_ipv4_pkt(struct ipv4_hdr *pkt, uint32_t link_len) 633 { 634 /* From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2 */ 635 /* 636 * 1. The packet length reported by the Link Layer must be large 637 * enough to hold the minimum length legal IP datagram (20 bytes). 638 */ 639 if (link_len < sizeof(struct ipv4_hdr)) 640 return -1; 641 642 /* 2. The IP checksum must be correct. */ 643 /* this is checked in H/W */ 644 645 /* 646 * 3. The IP version number must be 4. If the version number is not 4 647 * then the packet may be another version of IP, such as IPng or 648 * ST-II. 649 */ 650 if (((pkt->version_ihl) >> 4) != 4) 651 return -3; 652 /* 653 * 4. The IP header length field must be large enough to hold the 654 * minimum length legal IP datagram (20 bytes = 5 words). 655 */ 656 if ((pkt->version_ihl & 0xf) < 5) 657 return -4; 658 659 /* 660 * 5. The IP total length field must be large enough to hold the IP 661 * datagram header, whose length is specified in the IP header length 662 * field. 663 */ 664 if (rte_cpu_to_be_16(pkt->total_length) < sizeof(struct ipv4_hdr)) 665 return -5; 666 667 return 0; 668 } 669 #endif 670 671 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 672 673 static __m128i mask0; 674 static __m128i mask1; 675 static __m128i mask2; 676 static inline uint8_t //哈希情形下获取转发出口 677 get_ipv4_dst_port(void *ipv4_hdr, uint8_t portid, lookup_struct_t * ipv4_l3fwd_lookup_struct) 678 { 679 int ret = 0; 680 union ipv4_5tuple_host key; 681 682 ipv4_hdr = (uint8_t *)ipv4_hdr + offsetof(struct ipv4_hdr, time_to_live); 683 __m128i data = _mm_loadu_si128((__m128i*)(ipv4_hdr)); 684 /* Get 5 tuple: dst port, src port, dst IP address, src IP address and protocol */ 685 key.xmm = _mm_and_si128(data, mask0); 686 /* Find destination port */ 687 ret = rte_hash_lookup(ipv4_l3fwd_lookup_struct, (const void *)&key); 688 return (uint8_t)((ret < 0)? portid : ipv4_l3fwd_out_if[ret]); 689 } 690 691 static inline uint8_t 692 get_ipv6_dst_port(void *ipv6_hdr, uint8_t portid, lookup_struct_t * ipv6_l3fwd_lookup_struct) 693 { 694 int ret = 0; 695 union ipv6_5tuple_host key; 696 697 ipv6_hdr = (uint8_t *)ipv6_hdr + offsetof(struct ipv6_hdr, payload_len); 698 __m128i data0 = _mm_loadu_si128((__m128i*)(ipv6_hdr)); 699 __m128i data1 = _mm_loadu_si128((__m128i*)(((uint8_t*)ipv6_hdr)+sizeof(__m128i))); 700 __m128i data2 = _mm_loadu_si128((__m128i*)(((uint8_t*)ipv6_hdr)+sizeof(__m128i)+sizeof(__m128i))); 701 /* Get part of 5 tuple: src IP address lower 96 bits and protocol */ 702 key.xmm[0] = _mm_and_si128(data0, mask1); 703 /* Get part of 5 tuple: dst IP address lower 96 bits and src IP address higher 32 bits */ 704 key.xmm[1] = data1; 705 /* Get part of 5 tuple: dst port and src port and dst IP address higher 32 bits */ 706 key.xmm[2] = _mm_and_si128(data2, mask2); 707 708 /* Find destination port */ 709 ret = rte_hash_lookup(ipv6_l3fwd_lookup_struct, (const void *)&key); 710 return (uint8_t)((ret < 0)? portid : ipv6_l3fwd_out_if[ret]); 711 } 712 #endif 713 714 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 715 716 static inline uint8_t //LPM情形下获取ipv4数据包的目的端口 717 get_ipv4_dst_port(void *ipv4_hdr, uint8_t portid, lookup_struct_t * ipv4_l3fwd_lookup_struct) 718 { 719 uint8_t next_hop; 720 721 return (uint8_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct, 722 rte_be_to_cpu_32(((struct ipv4_hdr *)ipv4_hdr)->dst_addr), 723 &next_hop) == 0) ? next_hop : portid); 724 } 725 726 static inline uint8_t 727 get_ipv6_dst_port(void *ipv6_hdr, uint8_t portid, lookup6_struct_t * ipv6_l3fwd_lookup_struct) 728 { 729 uint8_t next_hop; 730 return (uint8_t) ((rte_lpm6_lookup(ipv6_l3fwd_lookup_struct, 731 ((struct ipv6_hdr*)ipv6_hdr)->dst_addr, &next_hop) == 0)? 732 next_hop : portid); 733 } 734 #endif 735 736 static inline void l3fwd_simple_forward(struct rte_mbuf *m, uint8_t portid, 737 struct lcore_conf *qconf) __attribute__((unused)); 738 739 #if ((APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) && \ 740 (ENABLE_MULTI_BUFFER_OPTIMIZE == 1)) 741 742 static inline void get_ipv6_5tuple(struct rte_mbuf* m0, __m128i mask0, __m128i mask1, 743 union ipv6_5tuple_host * key) 744 { 745 __m128i tmpdata0 = _mm_loadu_si128((__m128i*)(rte_pktmbuf_mtod(m0, unsigned char *) 746 + sizeof(struct ether_hdr) + offsetof(struct ipv6_hdr, payload_len))); 747 __m128i tmpdata1 = _mm_loadu_si128((__m128i*)(rte_pktmbuf_mtod(m0, unsigned char *) 748 + sizeof(struct ether_hdr) + offsetof(struct ipv6_hdr, payload_len) 749 + sizeof(__m128i))); 750 __m128i tmpdata2 = _mm_loadu_si128((__m128i*)(rte_pktmbuf_mtod(m0, unsigned char *) 751 + sizeof(struct ether_hdr) + offsetof(struct ipv6_hdr, payload_len) 752 + sizeof(__m128i) + sizeof(__m128i))); 753 key->xmm[0] = _mm_and_si128(tmpdata0, mask0); 754 key->xmm[1] = tmpdata1; 755 key->xmm[2] = _mm_and_si128(tmpdata2, mask1); 756 return; 757 } 758 759 760 static inline void 761 simple_ipv4_fwd_4pkts(struct rte_mbuf* m[4], uint8_t portid, struct lcore_conf *qconf) 762 { 763 struct ether_hdr *eth_hdr[4]; 764 struct ipv4_hdr *ipv4_hdr[4]; 765 void *d_addr_bytes[4]; 766 uint8_t dst_port[4]; 767 int32_t ret[4]; 768 union ipv4_5tuple_host key[4]; 769 __m128i data[4]; 770 771 eth_hdr[0] = rte_pktmbuf_mtod(m[0], struct ether_hdr *); 772 eth_hdr[1] = rte_pktmbuf_mtod(m[1], struct ether_hdr *); 773 eth_hdr[2] = rte_pktmbuf_mtod(m[2], struct ether_hdr *); 774 eth_hdr[3] = rte_pktmbuf_mtod(m[3], struct ether_hdr *); 775 776 /* Handle IPv4 headers.*/ 777 ipv4_hdr[0] = (struct ipv4_hdr *)(rte_pktmbuf_mtod(m[0], unsigned char *) + 778 sizeof(struct ether_hdr)); 779 ipv4_hdr[1] = (struct ipv4_hdr *)(rte_pktmbuf_mtod(m[1], unsigned char *) + 780 sizeof(struct ether_hdr)); 781 ipv4_hdr[2] = (struct ipv4_hdr *)(rte_pktmbuf_mtod(m[2], unsigned char *) + 782 sizeof(struct ether_hdr)); 783 ipv4_hdr[3] = (struct ipv4_hdr *)(rte_pktmbuf_mtod(m[3], unsigned char *) + 784 sizeof(struct ether_hdr)); 785 786 #ifdef DO_RFC_1812_CHECKS 787 /* Check to make sure the packet is valid (RFC1812) */ 788 uint8_t valid_mask = MASK_ALL_PKTS; 789 if (is_valid_ipv4_pkt(ipv4_hdr[0], m[0]->pkt_len) < 0) { 790 rte_pktmbuf_free(m[0]); 791 valid_mask &= EXECLUDE_1ST_PKT; 792 } 793 if (is_valid_ipv4_pkt(ipv4_hdr[1], m[1]->pkt_len) < 0) { 794 rte_pktmbuf_free(m[1]); 795 valid_mask &= EXECLUDE_2ND_PKT; 796 } 797 if (is_valid_ipv4_pkt(ipv4_hdr[2], m[2]->pkt_len) < 0) { 798 rte_pktmbuf_free(m[2]); 799 valid_mask &= EXECLUDE_3RD_PKT; 800 } 801 if (is_valid_ipv4_pkt(ipv4_hdr[3], m[3]->pkt_len) < 0) { 802 rte_pktmbuf_free(m[3]); 803 valid_mask &= EXECLUDE_4TH_PKT; 804 } 805 if (unlikely(valid_mask != MASK_ALL_PKTS)) { 806 if (valid_mask == 0){ 807 return; 808 } else { 809 uint8_t i = 0; 810 for (i = 0; i < 4; i++) { 811 if ((0x1 << i) & valid_mask) { 812 l3fwd_simple_forward(m[i], portid, qconf); 813 } 814 } 815 return; 816 } 817 } 818 #endif // End of #ifdef DO_RFC_1812_CHECKS 819 820 data[0] = _mm_loadu_si128((__m128i*)(rte_pktmbuf_mtod(m[0], unsigned char *) + 821 sizeof(struct ether_hdr) + offsetof(struct ipv4_hdr, time_to_live))); 822 data[1] = _mm_loadu_si128((__m128i*)(rte_pktmbuf_mtod(m[1], unsigned char *) + 823 sizeof(struct ether_hdr) + offsetof(struct ipv4_hdr, time_to_live))); 824 data[2] = _mm_loadu_si128((__m128i*)(rte_pktmbuf_mtod(m[2], unsigned char *) + 825 sizeof(struct ether_hdr) + offsetof(struct ipv4_hdr, time_to_live))); 826 data[3] = _mm_loadu_si128((__m128i*)(rte_pktmbuf_mtod(m[3], unsigned char *) + 827 sizeof(struct ether_hdr) + offsetof(struct ipv4_hdr, time_to_live))); 828 829 key[0].xmm = _mm_and_si128(data[0], mask0); 830 key[1].xmm = _mm_and_si128(data[1], mask0); 831 key[2].xmm = _mm_and_si128(data[2], mask0); 832 key[3].xmm = _mm_and_si128(data[3], mask0); 833 834 const void *key_array[4] = {&key[0], &key[1], &key[2],&key[3]}; 835 rte_hash_lookup_multi(qconf->ipv4_lookup_struct, &key_array[0], 4, ret); 836 dst_port[0] = (uint8_t) ((ret[0] < 0) ? portid : ipv4_l3fwd_out_if[ret[0]]); 837 dst_port[1] = (uint8_t) ((ret[1] < 0) ? portid : ipv4_l3fwd_out_if[ret[1]]); 838 dst_port[2] = (uint8_t) ((ret[2] < 0) ? portid : ipv4_l3fwd_out_if[ret[2]]); 839 dst_port[3] = (uint8_t) ((ret[3] < 0) ? portid : ipv4_l3fwd_out_if[ret[3]]); 840 841 if (dst_port[0] >= RTE_MAX_ETHPORTS || (enabled_port_mask & 1 << dst_port[0]) == 0) 842 dst_port[0] = portid; 843 if (dst_port[1] >= RTE_MAX_ETHPORTS || (enabled_port_mask & 1 << dst_port[1]) == 0) 844 dst_port[1] = portid; 845 if (dst_port[2] >= RTE_MAX_ETHPORTS || (enabled_port_mask & 1 << dst_port[2]) == 0) 846 dst_port[2] = portid; 847 if (dst_port[3] >= RTE_MAX_ETHPORTS || (enabled_port_mask & 1 << dst_port[3]) == 0) 848 dst_port[3] = portid; 849 850 /* 02:00:00:00:00:xx */ 851 d_addr_bytes[0] = ð_hdr[0]->d_addr.addr_bytes[0]; 852 d_addr_bytes[1] = ð_hdr[1]->d_addr.addr_bytes[0]; 853 d_addr_bytes[2] = ð_hdr[2]->d_addr.addr_bytes[0]; 854 d_addr_bytes[3] = ð_hdr[3]->d_addr.addr_bytes[0]; 855 *((uint64_t *)d_addr_bytes[0]) = 0x000000000002 + ((uint64_t)dst_port[0] << 40); 856 *((uint64_t *)d_addr_bytes[1]) = 0x000000000002 + ((uint64_t)dst_port[1] << 40); 857 *((uint64_t *)d_addr_bytes[2]) = 0x000000000002 + ((uint64_t)dst_port[2] << 40); 858 *((uint64_t *)d_addr_bytes[3]) = 0x000000000002 + ((uint64_t)dst_port[3] << 40); 859 860 #ifdef DO_RFC_1812_CHECKS 861 /* Update time to live and header checksum */ 862 --(ipv4_hdr[0]->time_to_live); 863 --(ipv4_hdr[1]->time_to_live); 864 --(ipv4_hdr[2]->time_to_live); 865 --(ipv4_hdr[3]->time_to_live); 866 ++(ipv4_hdr[0]->hdr_checksum); 867 ++(ipv4_hdr[1]->hdr_checksum); 868 ++(ipv4_hdr[2]->hdr_checksum); 869 ++(ipv4_hdr[3]->hdr_checksum); 870 #endif 871 872 /* src addr */ 873 ether_addr_copy(&ports_eth_addr[dst_port[0]], ð_hdr[0]->s_addr); 874 ether_addr_copy(&ports_eth_addr[dst_port[1]], ð_hdr[1]->s_addr); 875 ether_addr_copy(&ports_eth_addr[dst_port[2]], ð_hdr[2]->s_addr); 876 ether_addr_copy(&ports_eth_addr[dst_port[3]], ð_hdr[3]->s_addr); 877 878 send_single_packet(m[0], (uint8_t)dst_port[0]); 879 send_single_packet(m[1], (uint8_t)dst_port[1]); 880 send_single_packet(m[2], (uint8_t)dst_port[2]); 881 send_single_packet(m[3], (uint8_t)dst_port[3]); 882 883 } 884 885 886 887 #define MASK_ALL_PKTS 0xf 888 #define EXECLUDE_1ST_PKT 0xe 889 #define EXECLUDE_2ND_PKT 0xd 890 #define EXECLUDE_3RD_PKT 0xb 891 #define EXECLUDE_4TH_PKT 0x7 892 893 894 895 896 static inline void 897 simple_ipv6_fwd_4pkts(struct rte_mbuf* m[4], uint8_t portid, struct lcore_conf *qconf) 898 { 899 struct ether_hdr *eth_hdr[4]; 900 __attribute__((unused)) struct ipv6_hdr *ipv6_hdr[4]; 901 void *d_addr_bytes[4]; 902 uint8_t dst_port[4]; 903 int32_t ret[4]; 904 union ipv6_5tuple_host key[4]; 905 906 eth_hdr[0] = rte_pktmbuf_mtod(m[0], struct ether_hdr *); 907 eth_hdr[1] = rte_pktmbuf_mtod(m[1], struct ether_hdr *); 908 eth_hdr[2] = rte_pktmbuf_mtod(m[2], struct ether_hdr *); 909 eth_hdr[3] = rte_pktmbuf_mtod(m[3], struct ether_hdr *); 910 911 /* Handle IPv6 headers.*/ 912 ipv6_hdr[0] = (struct ipv6_hdr *)(rte_pktmbuf_mtod(m[0], unsigned char *) + 913 sizeof(struct ether_hdr)); 914 ipv6_hdr[1] = (struct ipv6_hdr *)(rte_pktmbuf_mtod(m[1], unsigned char *) + 915 sizeof(struct ether_hdr)); 916 ipv6_hdr[2] = (struct ipv6_hdr *)(rte_pktmbuf_mtod(m[2], unsigned char *) + 917 sizeof(struct ether_hdr)); 918 ipv6_hdr[3] = (struct ipv6_hdr *)(rte_pktmbuf_mtod(m[3], unsigned char *) + 919 sizeof(struct ether_hdr)); 920 921 get_ipv6_5tuple(m[0], mask1, mask2, &key[0]); 922 get_ipv6_5tuple(m[1], mask1, mask2, &key[1]); 923 get_ipv6_5tuple(m[2], mask1, mask2, &key[2]); 924 get_ipv6_5tuple(m[3], mask1, mask2, &key[3]); 925 926 const void *key_array[4] = {&key[0], &key[1], &key[2],&key[3]}; 927 rte_hash_lookup_multi(qconf->ipv6_lookup_struct, &key_array[0], 4, ret); 928 dst_port[0] = (uint8_t) ((ret[0] < 0)? portid:ipv6_l3fwd_out_if[ret[0]]); 929 dst_port[1] = (uint8_t) ((ret[1] < 0)? portid:ipv6_l3fwd_out_if[ret[1]]); 930 dst_port[2] = (uint8_t) ((ret[2] < 0)? portid:ipv6_l3fwd_out_if[ret[2]]); 931 dst_port[3] = (uint8_t) ((ret[3] < 0)? portid:ipv6_l3fwd_out_if[ret[3]]); 932 933 if (dst_port[0] >= RTE_MAX_ETHPORTS || (enabled_port_mask & 1 << dst_port[0]) == 0) 934 dst_port[0] = portid; 935 if (dst_port[1] >= RTE_MAX_ETHPORTS || (enabled_port_mask & 1 << dst_port[1]) == 0) 936 dst_port[1] = portid; 937 if (dst_port[2] >= RTE_MAX_ETHPORTS || (enabled_port_mask & 1 << dst_port[2]) == 0) 938 dst_port[2] = portid; 939 if (dst_port[3] >= RTE_MAX_ETHPORTS || (enabled_port_mask & 1 << dst_port[3]) == 0) 940 dst_port[3] = portid; 941 942 /* 02:00:00:00:00:xx */ 943 d_addr_bytes[0] = ð_hdr[0]->d_addr.addr_bytes[0]; 944 d_addr_bytes[1] = ð_hdr[1]->d_addr.addr_bytes[0]; 945 d_addr_bytes[2] = ð_hdr[2]->d_addr.addr_bytes[0]; 946 d_addr_bytes[3] = ð_hdr[3]->d_addr.addr_bytes[0]; 947 *((uint64_t *)d_addr_bytes[0]) = 0x000000000002 + ((uint64_t)dst_port[0] << 40); 948 *((uint64_t *)d_addr_bytes[1]) = 0x000000000002 + ((uint64_t)dst_port[1] << 40); 949 *((uint64_t *)d_addr_bytes[2]) = 0x000000000002 + ((uint64_t)dst_port[2] << 40); 950 *((uint64_t *)d_addr_bytes[3]) = 0x000000000002 + ((uint64_t)dst_port[3] << 40); 951 952 /* src addr */ 953 ether_addr_copy(&ports_eth_addr[dst_port[0]], ð_hdr[0]->s_addr); 954 ether_addr_copy(&ports_eth_addr[dst_port[1]], ð_hdr[1]->s_addr); 955 ether_addr_copy(&ports_eth_addr[dst_port[2]], ð_hdr[2]->s_addr); 956 ether_addr_copy(&ports_eth_addr[dst_port[3]], ð_hdr[3]->s_addr); 957 958 send_single_packet(m[0], (uint8_t)dst_port[0]); 959 send_single_packet(m[1], (uint8_t)dst_port[1]); 960 send_single_packet(m[2], (uint8_t)dst_port[2]); 961 send_single_packet(m[3], (uint8_t)dst_port[3]); 962 963 } 964 #endif /* APP_LOOKUP_METHOD */ 965 966 static inline __attribute__ void //简单三层转发,没有使用SSE4.1优化 967 l3fwd_simple_forward(struct rte_mbuf *m, uint8_t portid, struct lcore_conf *qconf) 968 { 969 struct ether_hdr *eth_hdr; 970 struct ipv4_hdr *ipv4_hdr; 971 void *d_addr_bytes; 972 uint8_t dst_port; 973 974 eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); //得到eth_hdr指针 975 976 if (m->ol_flags & PKT_RX_IPV4_HDR) { //如果是ipv4包 977 /* Handle IPv4 headers.*/ 978 ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(m, unsigned char *) + 979 sizeof(struct ether_hdr)); 980 981 #ifdef DO_RFC_1812_CHECKS 982 /* Check to make sure the packet is valid (RFC1812) */ 983 if (is_valid_ipv4_pkt(ipv4_hdr, m->pkt_len) < 0) { 984 rte_pktmbuf_free(m); 985 return; 986 } 987 #endif 988 //想要满足文生提出的需求,主要在这里修改ip层和tcp层的数据内容。 989 dst_port = get_ipv4_dst_port(ipv4_hdr, portid, //获取转发出口 990 qconf->ipv4_lookup_struct); 991 if (dst_port >= RTE_MAX_ETHPORTS || 992 (enabled_port_mask & 1 << dst_port) == 0) 993 dst_port = portid; //出错则直接把入口作为转发出口 994 995 /* 02:00:00:00:00:xx 这里是修改目的mac地址吗??? */ 996 d_addr_bytes = ð_hdr->d_addr.addr_bytes[0]; 997 *((uint64_t *)d_addr_bytes) = ETHER_LOCAL_ADMIN_ADDR + 998 ((uint64_t)dst_port << 40); 999 1000 #ifdef DO_RFC_1812_CHECKS 1001 /* Update time to live and header checksum */ 1002 --(ipv4_hdr->time_to_live); 1003 ++(ipv4_hdr->hdr_checksum); 1004 #endif 1005 1006 /* //把进入包的目的mac地址作为转发包的源地址 src addr */ 1007 ether_addr_copy(&ports_eth_addr[dst_port], ð_hdr->s_addr); 1008 1009 send_single_packet(m, dst_port); //经过dst_port把转发包发送出去 1010 1011 } else { //如果是ipv6包 1012 /* Handle IPv6 headers.*/ 1013 struct ipv6_hdr *ipv6_hdr; 1014 1015 ipv6_hdr = (struct ipv6_hdr *)(rte_pktmbuf_mtod(m, unsigned char *) + 1016 sizeof(struct ether_hdr)); 1017 1018 dst_port = get_ipv6_dst_port(ipv6_hdr, portid, qconf->ipv6_lookup_struct); 1019 1020 if (dst_port >= RTE_MAX_ETHPORTS || (enabled_port_mask & 1 << dst_port) == 0) 1021 dst_port = portid; 1022 1023 /* 02:00:00:00:00:xx */ 1024 d_addr_bytes = ð_hdr->d_addr.addr_bytes[0]; 1025 *((uint64_t *)d_addr_bytes) = ETHER_LOCAL_ADMIN_ADDR + 1026 ((uint64_t)dst_port << 40); 1027 1028 /* src addr */ 1029 ether_addr_copy(&ports_eth_addr[dst_port], ð_hdr->s_addr); 1030 1031 send_single_packet(m, dst_port); 1032 } 1033 1034 } 1035 1036 #ifdef DO_RFC_1812_CHECKS 1037 1038 #define IPV4_MIN_VER_IHL 0x45 1039 #define IPV4_MAX_VER_IHL 0x4f 1040 #define IPV4_MAX_VER_IHL_DIFF (IPV4_MAX_VER_IHL - IPV4_MIN_VER_IHL) 1041 1042 /* Minimum value of IPV4 total length (20B) in network byte order. */ 1043 #define IPV4_MIN_LEN_BE (sizeof(struct ipv4_hdr) << 8) 1044 1045 /* 1046 * From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2: 1047 * - The IP version number must be 4. 1048 * - The IP header length field must be large enough to hold the 1049 * minimum length legal IP datagram (20 bytes = 5 words). 1050 * - The IP total length field must be large enough to hold the IP 1051 * datagram header, whose length is specified in the IP header length 1052 * field. 1053 * If we encounter invalid IPV4 packet, then set destination port for it 1054 * to BAD_PORT value. 1055 */ 1056 static inline __attribute__ void //ipv4错误检查 1057 rfc1812_process(struct ipv4_hdr *ipv4_hdr, uint16_t *dp, uint32_t flags) 1058 { 1059 uint8_t ihl; 1060 1061 if ((flags & PKT_RX_IPV4_HDR) != 0) {//如果是ipv4 1062 1063 ihl = ipv4_hdr->version_ihl - IPV4_MIN_VER_IHL; 1064 1065 ipv4_hdr->time_to_live--; 1066 ipv4_hdr->hdr_checksum++; 1067 1068 if (ihl > IPV4_MAX_VER_IHL_DIFF || 1069 ((uint8_t)ipv4_hdr->total_length == 0 && 1070 ipv4_hdr->total_length < IPV4_MIN_LEN_BE)) { 1071 dp[0] = BAD_PORT; //应该是出错了 1072 } 1073 } 1074 } 1075 1076 #else 1077 #define rfc1812_process(mb, dp) do { } while (0) 1078 #endif /* DO_RFC_1812_CHECKS */ 1079 1080 1081 #if ((APP_LOOKUP_METHOD == APP_LOOKUP_LPM) && \ 1082 (ENABLE_MULTI_BUFFER_OPTIMIZE == 1)) 1083 1084 static inline __attribute__ uint16_t //得到目的ip地址对应的转发出口 1085 get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt, 1086 uint32_t dst_ipv4, uint8_t portid) 1087 { 1088 uint8_t next_hop; 1089 struct ipv6_hdr *ipv6_hdr; 1090 struct ether_hdr *eth_hdr; 1091 1092 if (pkt->ol_flags & PKT_RX_IPV4_HDR) { //如果都是ipv4 1093 if (rte_lpm_lookup(qconf->ipv4_lookup_struct, dst_ipv4, 1094 &next_hop) != 0) //返回0则查找到,next_hop中已经得到下一跳 1095 next_hop = portid; //此时没找到,则直接把portid设定为下一跳 1096 } else if (pkt->ol_flags & PKT_RX_IPV6_HDR) { //如果都是ipv6 1097 eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *); 1098 ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1); 1099 if (rte_lpm6_lookup(qconf->ipv6_lookup_struct, 1100 ipv6_hdr->dst_addr, &next_hop) != 0) 1101 next_hop = portid; 1102 } else { //如果有其他种类的数据包 1103 next_hop = portid;//设定下一跳 1104 } 1105 1106 return next_hop;//返回下一跳 1107 } 1108 1109 static inline void //处理一个数据包 1110 process_packet(struct lcore_conf *qconf, struct rte_mbuf *pkt, 1111 uint16_t *dst_port, uint8_t portid) 1112 { 1113 struct ether_hdr *eth_hdr; 1114 struct ipv4_hdr *ipv4_hdr; 1115 uint32_t dst_ipv4; 1116 uint16_t dp; 1117 __m128i te, ve; 1118 1119 eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);//获取eth首部 1120 ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);//获取ipv4首部 1121 1122 dst_ipv4 = ipv4_hdr->dst_addr; //得到大端的ipv4目的地址 1123 dst_ipv4 = rte_be_to_cpu_32(dst_ipv4);//转换成小端 1124 dp = get_dst_port(qconf, pkt, dst_ipv4, portid); //获取转发出口/下一跳 1125 1126 te = _mm_load_si128((__m128i *)eth_hdr); 1127 ve = val_eth[dp]; 1128 1129 dst_port[0] = dp; 1130 rfc1812_process(ipv4_hdr, dst_port, pkt->ol_flags); 1131 1132 te = _mm_blend_epi16(te, ve, MASK_ETH); 1133 _mm_store_si128((__m128i *)eth_hdr, te); 1134 } 1135 1136 /* 从4个mbufs中读取目的IP地址和ol_flags 1137 * Read ol_flags and destination IPV4 addresses from 4 mbufs. 1138 */ 1139 static inline void 1140 processx4_step1(struct rte_mbuf *pkt[FWDSTEP], __m128i *dip, uint32_t *flag) 1141 { 1142 struct ipv4_hdr *ipv4_hdr; 1143 struct ether_hdr *eth_hdr; 1144 uint32_t x0, x1, x2, x3; 1145 //第一个mbuf 1146 eth_hdr = rte_pktmbuf_mtod(pkt[0], struct ether_hdr *);//得到eth_hdr 1147 ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);//得到ipv4_hdr 1148 x0 = ipv4_hdr->dst_addr;//得到dst_addr 1149 flag[0] = pkt[0]->ol_flags & PKT_RX_IPV4_HDR; 1150 //第二个mbuf 1151 eth_hdr = rte_pktmbuf_mtod(pkt[1], struct ether_hdr *); 1152 ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1); 1153 x1 = ipv4_hdr->dst_addr; 1154 flag[0] &= pkt[1]->ol_flags; //与前一个mbuf标志做&运算 1155 //第三个mbuf 1156 eth_hdr = rte_pktmbuf_mtod(pkt[2], struct ether_hdr *); 1157 ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1); 1158 x2 = ipv4_hdr->dst_addr; 1159 flag[0] &= pkt[2]->ol_flags; //与前一个mbuf标志做&运算 1160 //第四个mbuf 1161 eth_hdr = rte_pktmbuf_mtod(pkt[3], struct ether_hdr *); 1162 ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1); 1163 x3 = ipv4_hdr->dst_addr; 1164 flag[0] &= pkt[3]->ol_flags; //与前一个mbuf标志做&运算 1165 1166 dip[0] = _mm_set_epi32(x3, x2, x1, x0);//把4个dst_addr合并为128位的寄存器 1167 } 1168 1169 /* 1170 * Lookup into LPM for destination port. 1171 * If lookup fails, use incoming port (portid) as destination port. 1172 */ //在LPM中查找转发出口/下一跳,如果没有找到则把入口作为转发出口 1173 static inline void 1174 processx4_step2(const struct lcore_conf *qconf, __m128i dip, uint32_t flag, 1175 uint8_t portid, struct rte_mbuf *pkt[FWDSTEP], uint16_t dprt[FWDSTEP]) 1176 { 1177 rte_xmm_t dst; 1178 const __m128i bswap_mask = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 1179 4, 5, 6, 7, 0, 1, 2, 3); //表示重新排列的顺序 1180 1181 /* Byte swap 4 IPV4 addresses. 按照字节交换ipv4地址 */ 1182 dip = _mm_shuffle_epi8(dip, bswap_mask); 1183 1184 /* 如果4个分组都是ipv4的 if all 4 packets are IPV4. */ 1185 if (likely(flag != 0)) { 1186 rte_lpm_lookupx4(qconf->ipv4_lookup_struct, dip, dprt, portid); 1187 } else { 1188 dst.x = dip; //获取4个目的ip地址 1189 dprt[0] = get_dst_port(qconf, pkt[0], dst.u32[0], portid);//得到下一跳/转发出口 1190 dprt[1] = get_dst_port(qconf, pkt[1], dst.u32[1], portid); 1191 dprt[2] = get_dst_port(qconf, pkt[2], dst.u32[2], portid); 1192 dprt[3] = get_dst_port(qconf, pkt[3], dst.u32[3], portid); 1193 } 1194 } 1195 1196 /* 1197 * Update source and destination MAC addresses in the ethernet header. 1198 * Perform RFC1812 checks and updates for IPV4 packets. 1199 */ //更新目的mac和源mac地址 1200 static inline void 1201 processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP]) 1202 { 1203 __m128i te[FWDSTEP]; 1204 __m128i ve[FWDSTEP]; 1205 __m128i *p[FWDSTEP]; 1206 1207 p[0] = (rte_pktmbuf_mtod(pkt[0], __m128i *));//指向第一个数据包的内容 1208 p[1] = (rte_pktmbuf_mtod(pkt[1], __m128i *)); 1209 p[2] = (rte_pktmbuf_mtod(pkt[2], __m128i *)); 1210 p[3] = (rte_pktmbuf_mtod(pkt[3], __m128i *)); 1211 1212 ve[0] = val_eth[dst_port[0]]; 1213 te[0] = _mm_load_si128(p[0]);//将p[0]指向的内容加载到128位寄存器中 1214 1215 ve[1] = val_eth[dst_port[1]]; 1216 te[1] = _mm_load_si128(p[1]); 1217 1218 ve[2] = val_eth[dst_port[2]]; 1219 te[2] = _mm_load_si128(p[2]); 1220 1221 ve[3] = val_eth[dst_port[3]]; 1222 te[3] = _mm_load_si128(p[3]); 1223 1224 /*替换更新前12个字节,保留剩余 Update first 12 bytes, keep rest bytes intact. */ 1225 te[0] = _mm_blend_epi16(te[0], ve[0], MASK_ETH); 1226 te[1] = _mm_blend_epi16(te[1], ve[1], MASK_ETH); 1227 te[2] = _mm_blend_epi16(te[2], ve[2], MASK_ETH); 1228 te[3] = _mm_blend_epi16(te[3], ve[3], MASK_ETH); 1229 1230 _mm_store_si128(p[0], te[0]); 1231 _mm_store_si128(p[1], te[1]); 1232 _mm_store_si128(p[2], te[2]); 1233 _mm_store_si128(p[3], te[3]); 1234 1235 rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[0] + 1), 1236 &dst_port[0], pkt[0]->ol_flags); 1237 rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[1] + 1), 1238 &dst_port[1], pkt[1]->ol_flags); 1239 rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[2] + 1), 1240 &dst_port[2], pkt[2]->ol_flags); 1241 rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[3] + 1), 1242 &dst_port[3], pkt[3]->ol_flags); 1243 } 1244 1245 /* //把转发出口相同的连续数据包做一次burst发送 1246 为了避免额外的延迟,与其他的包处理一起完成,但在对转发出口做了决策之后。 1247 1248 * We group consecutive packets with the same destionation port into one burst. 1249 * To avoid extra latency this is done together with some other packet 1250 * processing, but after we made a final decision about packet's destination. 1251 * To do this we maintain: 1252 * pnum - array of number of consecutive packets with the same dest port for 1253 * each packet in the input burst. ***pnum是保存转发出口相同的连续数据包的数组 1254 * lp - pointer to the last updated element in the pnum. ***lp指向pnum中最后一次更新的元素 1255 * dlp - dest port value lp corresponds to. ***dlp为lp对应的转发出口编号 1256 */ 1257 1258 #define GRPSZ (1 << FWDSTEP) //16 1259 #define GRPMSK (GRPSZ - 1) //15 1260 1261 #define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx) do { \ 1262 if (likely((dlp) == (dcp)[(idx)])) { \ 1263 (lp)[0]++; \ 1264 } else { \ 1265 (dlp) = (dcp)[idx]; \ 1266 (lp) = (pn) + (idx); \ 1267 (lp)[0] = 1; \ 1268 } \ 1269 } while (0) 1270 1271 /* 1272 * Group consecutive packets with the same destination port in bursts of 4. 1273 * Suppose we have array of destionation ports: 1274 * dst_port[] = {a, b, c, d,, e, ... } 1275 * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>. 1276 * We doing 4 comparisions at once and the result is 4 bit mask. 1277 * This mask is used as an index into prebuild array of pnum values. 1278 */ 1279 static inline uint16_t * //把出口相同的4个数据包构成一组 1280 port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, __m128i dp1, __m128i dp2) 1281 { 1282 static const struct { 1283 uint64_t pnum; /*为pnum预设的4个值 prebuild 4 values for pnum[]. */ 1284 int32_t idx; /*最后一次更新的元素的索引 index for new last updated elemnet. */ 1285 uint16_t lpv; /*把值加到最后一次更新的元素 add value to the last updated element. */ 1286 } gptbl[GRPSZ] = { 1287 { 1288 /* 0: a != b, b != c, c != d, d != e */ 1289 .pnum = UINT64_C(0x0001000100010001), 1290 .idx = 4, 1291 .lpv = 0, 1292 }, 1293 { 1294 /* 1: a == b, b != c, c != d, d != e */ 1295 .pnum = UINT64_C(0x0001000100010002), 1296 .idx = 4, 1297 .lpv = 1, 1298 }, 1299 { 1300 /* 2: a != b, b == c, c != d, d != e */ 1301 .pnum = UINT64_C(0x0001000100020001), 1302 .idx = 4, 1303 .lpv = 0, 1304 }, 1305 { 1306 /* 3: a == b, b == c, c != d, d != e */ 1307 .pnum = UINT64_C(0x0001000100020003), 1308 .idx = 4, 1309 .lpv = 2, 1310 }, 1311 { 1312 /* 4: a != b, b != c, c == d, d != e */ 1313 .pnum = UINT64_C(0x0001000200010001), 1314 .idx = 4, 1315 .lpv = 0, 1316 }, 1317 { 1318 /* 5: a == b, b != c, c == d, d != e */ 1319 .pnum = UINT64_C(0x0001000200010002), 1320 .idx = 4, 1321 .lpv = 1, 1322 }, 1323 { 1324 /* 6: a != b, b == c, c == d, d != e */ 1325 .pnum = UINT64_C(0x0001000200030001), 1326 .idx = 4, 1327 .lpv = 0, 1328 }, 1329 { 1330 /* 7: a == b, b == c, c == d, d != e */ 1331 .pnum = UINT64_C(0x0001000200030004), 1332 .idx = 4, 1333 .lpv = 3, 1334 }, 1335 { 1336 /* 8: a != b, b != c, c != d, d == e */ 1337 .pnum = UINT64_C(0x0002000100010001), 1338 .idx = 3, 1339 .lpv = 0, 1340 }, 1341 { 1342 /* 9: a == b, b != c, c != d, d == e */ 1343 .pnum = UINT64_C(0x0002000100010002), 1344 .idx = 3, 1345 .lpv = 1, 1346 }, 1347 { 1348 /* 0xa: a != b, b == c, c != d, d == e */ 1349 .pnum = UINT64_C(0x0002000100020001), 1350 .idx = 3, 1351 .lpv = 0, 1352 }, 1353 { 1354 /* 0xb: a == b, b == c, c != d, d == e */ 1355 .pnum = UINT64_C(0x0002000100020003), 1356 .idx = 3, 1357 .lpv = 2, 1358 }, 1359 { 1360 /* 0xc: a != b, b != c, c == d, d == e */ 1361 .pnum = UINT64_C(0x0002000300010001), 1362 .idx = 2, 1363 .lpv = 0, 1364 }, 1365 { 1366 /* 0xd: a == b, b != c, c == d, d == e */ 1367 .pnum = UINT64_C(0x0002000300010002), 1368 .idx = 2, 1369 .lpv = 1, 1370 }, 1371 { 1372 /* 0xe: a != b, b == c, c == d, d == e */ 1373 .pnum = UINT64_C(0x0002000300040001), 1374 .idx = 1, 1375 .lpv = 0, 1376 }, 1377 { 1378 /* 0xf: a == b, b == c, c == d, d == e */ 1379 .pnum = UINT64_C(0x0002000300040005), 1380 .idx = 0, 1381 .lpv = 4, 1382 }, 1383 }; 1384 1385 union { 1386 uint16_t u16[FWDSTEP + 1]; 1387 uint64_t u64; 1388 } *pnum = (void *)pn; 1389 1390 int32_t v; 1391 1392 dp1 = _mm_cmpeq_epi16(dp1, dp2); //按照16位一个单元来比较dp1和dp2 1393 dp1 = _mm_unpacklo_epi16(dp1, dp1); //按照16位一个单元将dp1与dp1来结合 1394 v = _mm_movemask_ps((__m128)dp1); //根据dp1的4个值形成4个位的掩码 1395 1396 /*更新最后一次端口计数 update last port counter. */ 1397 lp[0] += gptbl[v].lpv; 1398 1399 /*如果转发出口的值已经改变 if dest port value has changed. */ 1400 if (v != GRPMSK) { 1401 lp = pnum->u16 + gptbl[v].idx; 1402 lp[0] = 1; 1403 pnum->u64 = gptbl[v].pnum; 1404 } 1405 1406 return lp; 1407 } 1408 1409 #endif /* APP_LOOKUP_METHOD */ 1410 1411 /* 线程执行函数 main processing loop */ 1412 static int 1413 main_loop(__attribute__((unused)) void *dummy) 1414 { 1415 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; //32个指针构成的数组 1416 unsigned lcore_id; 1417 uint64_t prev_tsc, diff_tsc, cur_tsc; 1418 int i, j, nb_rx; 1419 uint8_t portid, queueid; 1420 struct lcore_conf *qconf; 1421 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / 1422 US_PER_S * BURST_TX_DRAIN_US; 1423 1424 #if ((APP_LOOKUP_METHOD == APP_LOOKUP_LPM) && \ 1425 (ENABLE_MULTI_BUFFER_OPTIMIZE == 1)) 1426 int32_t k; 1427 uint16_t dlp; //dlp为lp对应的转发出口编号 1428 uint16_t *lp; //lp指向pkts_burst中最后一次更新的元素 1429 uint16_t dst_port[MAX_PKT_BURST]; //dst_port是32个数据包的转发出口构成的数组 1430 __m128i dip[MAX_PKT_BURST / FWDSTEP]; //数据包的目的IP地址构成的数组 1431 uint32_t flag[MAX_PKT_BURST / FWDSTEP]; 1432 uint16_t pnum[MAX_PKT_BURST + 1]; //转发出口相同的数据包的编号 1433 #endif 1434 1435 prev_tsc = 0; 1436 1437 lcore_id = rte_lcore_id(); //获取lcore_id 1438 qconf = &lcore_conf[lcore_id];//获取lcore_id的配置信息 1439 1440 if (qconf->n_rx_queue == 0) { //如果lcore上没有接收队列 1441 RTE_LOG(INFO, L3FWD, "lcore %u has nothing to do\n", lcore_id); 1442 return 0; 1443 } 1444 1445 RTE_LOG(INFO, L3FWD, "entering main loop on lcore %u\n", lcore_id); 1446 1447 for (i = 0; i < qconf->n_rx_queue; i++) { //遍历所有的接收队列 1448 1449 portid = qconf->rx_queue_list[i].port_id; //得到物理端口的编号 1450 queueid = qconf->rx_queue_list[i].queue_id; //得到网卡队列的编号 1451 RTE_LOG(INFO, L3FWD, " -- lcoreid=%u portid=%hhu rxqueueid=%hhu\n", lcore_id, 1452 portid, queueid); 1453 } 1454 1455 while (1) { //死循环,体现PMD思想 1456 1457 cur_tsc = rte_rdtsc(); 1458 1459 /* 1460 * TX burst queue drain 1461 */ 1462 diff_tsc = cur_tsc - prev_tsc; //计算时间差 1463 if (unlikely(diff_tsc > drain_tsc)) { //如果两次时间差大于定值 1464 1465 /* 1466 * This could be optimized (use queueid instead of 1467 * portid), but it is not called so often 1468 */ 1469 for (portid = 0; portid < RTE_MAX_ETHPORTS; portid++) {//遍历所有的物理端口 1470 if (qconf->tx_mbufs[portid].len == 0) 1471 continue; 1472 send_burst(qconf, 1473 qconf->tx_mbufs[portid].len, 1474 portid); 1475 qconf->tx_mbufs[portid].len = 0; 1476 } 1477 1478 prev_tsc = cur_tsc; //记下前一时间 1479 } 1480 1481 /* 从接收队列中读取数据包 1482 * Read packet from RX queues 1483 */ 1484 for (i = 0; i < qconf->n_rx_queue; ++i) { //遍历所有的接收队列 1485 portid = qconf->rx_queue_list[i].port_id;//得到物理端口的编号 1486 queueid = qconf->rx_queue_list[i].queue_id; //得到网卡队列的编号 1487 nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst, 1488 MAX_PKT_BURST); //在每个队列上尽量接收32个数据包,用nb_rx记录实际个数 1489 if (nb_rx == 0) //如果一个包也没有收到 1490 continue; 1491 1492 #if (ENABLE_MULTI_BUFFER_OPTIMIZE == 1) //如果支持Intel SSE4.1特性 1493 if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) //如果使用lpm 1494 1495 k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP); //整除4 1496 for (j = 0; j != k; j += FWDSTEP) { //每次处理4个mbufs 1497 processx4_step1(&pkts_burst[j], //从4个mbufs中读取目的ip地址和ol_flags 1498 &dip[j / FWDSTEP], 1499 &flag[j / FWDSTEP]); 1500 } 1501 1502 k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP); 1503 for (j = 0; j != k; j += FWDSTEP) {//每次处理4个mbufs 1504 processx4_step2(qconf, dip[j / FWDSTEP], //在LPM中查找转发出口,如果失败则把进入的端口作为转发出口 1505 flag[j / FWDSTEP], portid, 1506 &pkts_burst[j], &dst_port[j]); 1507 } 1508 1509 /* 完成包处理,并根据相同的转发出口来分组连续的数据包 1510 * Finish packet processing and group consecutive 1511 * packets with the same destination port. 1512 */ 1513 k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);//处理成4的幂 1514 if (k != 0) { 1515 __m128i dp1, dp2; 1516 1517 lp = pnum; 1518 lp[0] = 1; 1519 1520 processx4_step3(pkts_burst, dst_port); //更新目的mac和源mac地址 1521 1522 /* dp1: <d[0], d[1], d[2], d[3], ... > */ 1523 dp1 = _mm_loadu_si128((__m128i *)dst_port); //把目的端口加载到寄存器dp1中 1524 1525 for (j = FWDSTEP; j != k; j += FWDSTEP) { //每次处理4个mbufs 1526 processx4_step3(&pkts_burst[j], //更新目的mac和源mac地址 1527 &dst_port[j]); 1528 1529 /* 1530 * dp2: 1531 * <d[j-3], d[j-2], d[j-1], d[j], ... > 1532 */ 1533 dp2 = _mm_loadu_si128((__m128i *) //返回一个__m128i的寄存器 1534 &dst_port[j - FWDSTEP + 1]); 1535 lp = port_groupx4(&pnum[j - FWDSTEP], //把出口相同的4个数据包构成一组 1536 lp, dp1, dp2); 1537 1538 /* 1539 * dp1: 1540 * <d[j], d[j+1], d[j+2], d[j+3], ... > 1541 */ 1542 dp1 = _mm_srli_si128(dp2, //逻辑左移3*16位,返回一个__m128i的寄存器 1543 (FWDSTEP - 1) * 1544 sizeof(dst_port[0])); 1545 } 1546 1547 /* 1548 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... > 1549 */ 1550 dp2 = _mm_shufflelo_epi16(dp1, 0xf9); //重新排序,返回一个__m128i的寄存器 1551 lp = port_groupx4(&pnum[j - FWDSTEP], lp, //把4个连续分组按照目的端口分组 1552 dp1, dp2); 1553 1554 /* 1555 * remove values added by the last repeated 1556 * dst port. 1557 */ 1558 lp[0]--; 1559 dlp = dst_port[j - 1]; 1560 } else { 1561 /* set dlp and lp to the never used values. */ 1562 dlp = BAD_PORT - 1; 1563 lp = pnum + MAX_PKT_BURST; 1564 } 1565 1566 /*处理最后的三个分组 Process up to last 3 packets one by one. */ 1567 switch (nb_rx % FWDSTEP) { 1568 case 3: //第三个mbuf 1569 process_packet(qconf, pkts_burst[j], 1570 dst_port + j, portid); 1571 GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j); 1572 j++; 1573 case 2://第二个mbuf 1574 process_packet(qconf, pkts_burst[j], 1575 dst_port + j, portid); 1576 GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j); 1577 j++; 1578 case 1://第一个mbuf 1579 process_packet(qconf, pkts_burst[j], 1580 dst_port + j, portid); 1581 GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j); 1582 j++; 1583 } 1584 1585 /*通过目的端口把数据包都发出去,这些数据包之前已经组合好了的 1586 * Send packets out, through destination port. 1587 * Consecuteve pacekts with the same destination port 1588 * are already grouped together. 1589 * If destination port for the packet equals BAD_PORT, 1590 * then free the packet without sending it out. 1591 */ 1592 for (j = 0; j < nb_rx; j += k) { //遍历接收到的数据包 1593 1594 int32_t m; 1595 uint16_t pn; 1596 1597 pn = dst_port[j]; 1598 k = pnum[j]; 1599 1600 if (likely(pn != BAD_PORT)) { 1601 send_packetsx4(qconf, pn, //把待发送的数据包放到发送缓冲区中,累积到32个再发出去 1602 pkts_burst + j, k); 1603 } else { 1604 for (m = j; m != j + k; m++) 1605 rte_pktmbuf_free(pkts_burst[m]); 1606 } 1607 } 1608 1609 #endif /* APP_LOOKUP_METHOD */ 1610 #else /*如果不支持Intel SSE4.1特性 ENABLE_MULTI_BUFFER_OPTIMIZE == 0 */ 1611 1612 /*预取接收队列上的第一个数据包 Prefetch first packets */ 1613 for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) { 1614 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j], void *)); 1615 } 1616 1617 /*预取和转发已经预取的数据包 Prefetch and forward already prefetched packets */ 1618 for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) { 1619 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[ 1620 j + PREFETCH_OFFSET], void *)); 1621 l3fwd_simple_forward(pkts_burst[j], portid, qconf);//简单转发4倍数的数据包 1622 1623 } 1624 1625 /*转发正在预取的数据包 Forward remaining prefetched packets */ 1626 for (; j < nb_rx; j++) { 1627 l3fwd_simple_forward(pkts_burst[j], portid, qconf);//简单转发剩余几个数据包 1628 1629 } 1630 #endif /* ENABLE_MULTI_BUFFER_OPTIMIZE */ 1631 1632 } //for (i = 0; i < qconf->n_rx_queue; ++i) 1633 } //while (1) 1634 }//end of main_loop 1635 1636 static int //检查lcore的参数 1637 check_lcore_params(void) 1638 { 1639 uint8_t queue, lcore; 1640 uint16_t i; 1641 int socketid; 1642 1643 for (i = 0; i < nb_lcore_params; ++i) { //遍历lcores的参数表 1644 queue = lcore_params[i].queue_id; 1645 if (queue >= MAX_RX_QUEUE_PER_PORT) { //如果队列编号大于128 1646 printf("invalid queue number: %hhu\n", queue); 1647 return -1; 1648 } 1649 lcore = lcore_params[i].lcore_id; 1650 if (!rte_lcore_is_enabled(lcore)) { //如果lcore没有启用 1651 printf("error: lcore %hhu is not enabled in lcore mask\n", lcore); 1652 return -1; 1653 } 1654 if ((socketid = rte_lcore_to_socket_id(lcore) != 0) && 1655 (numa_on == 0)) { //如果numa关闭 1656 printf("warning: lcore %hhu is on socket %d with numa off \n", 1657 lcore, socketid); 1658 } 1659 } 1660 return 0; 1661 } 1662 1663 static int //检查物理端口的配置 1664 check_port_config(const unsigned nb_ports) 1665 { 1666 unsigned portid; 1667 uint16_t i; 1668 1669 for (i = 0; i < nb_lcore_params; ++i) { //遍历lcores的参数表 1670 portid = lcore_params[i].port_id; 1671 if ((enabled_port_mask & (1 << portid)) == 0) { 1672 printf("port %u is not enabled in port mask\n", portid); 1673 return -1; 1674 } 1675 if (portid >= nb_ports) { 1676 printf("port %u is not present on the board\n", portid); 1677 return -1; 1678 } 1679 } 1680 return 0; 1681 } 1682 1683 static uint8_t //获取物理端口上的接收队列数量 1684 get_port_n_rx_queues(const uint8_t port) //其实就是取queue_id最大值加1 1685 { 1686 int queue = -1; 1687 uint16_t i; 1688 1689 for (i = 0; i < nb_lcore_params; ++i) { //遍历lcores的参数表 1690 if (lcore_params[i].port_id == port && lcore_params[i].queue_id > queue) 1691 queue = lcore_params[i].queue_id;//获取queue_id值 1692 } 1693 return (uint8_t)(++queue); //因为queue_id从0开始 1694 } 1695 1696 static int //初始化lcore上的接收队列 1697 init_lcore_rx_queues(void) 1698 { 1699 uint16_t i, nb_rx_queue; 1700 uint8_t lcore; 1701 1702 for (i = 0; i < nb_lcore_params; ++i) {//遍历lcores的参数表 1703 lcore = lcore_params[i].lcore_id; 1704 nb_rx_queue = lcore_conf[lcore].n_rx_queue; 1705 if (nb_rx_queue >= MAX_RX_QUEUE_PER_LCORE) {//如果接收队列总数大于128 1706 printf("error: too many queues (%u) for lcore: %u\n", 1707 (unsigned)nb_rx_queue + 1, (unsigned)lcore); 1708 return -1; 1709 } else { 1710 lcore_conf[lcore].rx_queue_list[nb_rx_queue].port_id = 1711 lcore_params[i].port_id; //记录port_id 1712 lcore_conf[lcore].rx_queue_list[nb_rx_queue].queue_id = 1713 lcore_params[i].queue_id; //记录queue_id 1714 lcore_conf[lcore].n_rx_queue++;//lcore上接收队列的数量加1 1715 } 1716 } 1717 return 0; 1718 } 1719 1720 /* display usage */ 1721 static void //打印使用说明 1722 print_usage(const char *prgname) 1723 { 1724 printf ("%s [EAL options] -- -p PORTMASK -P" 1725 " [--config (port,queue,lcore)[,(port,queue,lcore]]" 1726 " [--enable-jumbo [--max-pkt-len PKTLEN]]\n" 1727 " -p PORTMASK: hexadecimal bitmask of ports to configure\n" 1728 " -P : enable promiscuous mode\n" 1729 " --config (port,queue,lcore): rx queues configuration\n" 1730 " --no-numa: optional, disable numa awareness\n" 1731 " --ipv6: optional, specify it if running ipv6 packets\n" 1732 " --enable-jumbo: enable jumbo frame" 1733 " which max packet len is PKTLEN in decimal (64-9600)\n" 1734 " --hash-entry-num: specify the hash entry number in hexadecimal to be setup\n", 1735 prgname); 1736 } 1737 1738 static int //分析数据包的长度 1739 parse_max_pkt_len(const char *pktlen) 1740 { 1741 char *end = NULL; 1742 unsigned long len; 1743 1744 /* parse decimal string */ 1745 len = strtoul(pktlen, &end, 10); //把字符串转换成十进制数字 1746 if ((pktlen[0] == '\0') || (end == NULL) || (*end != '\0')) 1747 return -1; 1748 1749 if (len == 0) 1750 return -1; 1751 1752 return len; 1753 } 1754 1755 static int //分析物理端口的掩码 1756 parse_portmask(const char *portmask) 1757 { 1758 char *end = NULL; 1759 unsigned long pm; 1760 1761 /* parse hexadecimal string */ 1762 pm = strtoul(portmask, &end, 16);//字符串转换为十六进制的数字 1763 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0')) 1764 return -1; 1765 1766 if (pm == 0) 1767 return -1; 1768 1769 return pm; 1770 } 1771 1772 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 1773 static int 1774 parse_hash_entry_number(const char *hash_entry_num) 1775 { 1776 char *end = NULL; 1777 unsigned long hash_en; 1778 /* parse hexadecimal string */ 1779 hash_en = strtoul(hash_entry_num, &end, 16); 1780 if ((hash_entry_num[0] == '\0') || (end == NULL) || (*end != '\0')) 1781 return -1; 1782 1783 if (hash_en == 0) 1784 return -1; 1785 1786 return hash_en; 1787 } 1788 #endif 1789 1790 static int //分析参数中的配置 1791 parse_config(const char *q_arg) 1792 { 1793 char s[256]; 1794 const char *p, *p0 = q_arg; 1795 char *end; 1796 enum fieldnames { 1797 FLD_PORT = 0, 1798 FLD_QUEUE, 1799 FLD_LCORE, 1800 _NUM_FLD 1801 }; 1802 unsigned long int_fld[_NUM_FLD]; 1803 char *str_fld[_NUM_FLD]; 1804 int i; 1805 unsigned size; 1806 1807 nb_lcore_params = 0; //数组的元素个数初始化为0 1808 //举例: --config="(0,0,1),(0,1,2),(1,0,1),(1,1,3)" 1809 while ((p = strchr(p0,'(')) != NULL) { //找到左括号的位置,并赋值给p,除非找不到左括号才结束while循环 1810 ++p; 1811 if((p0 = strchr(p,')')) == NULL) //找到有括号的位置,并赋值给p0 1812 return -1; 1813 1814 size = p0 - p; //计算括号内的字符串长度 1815 if(size >= sizeof(s)) 1816 return -1; 1817 1818 snprintf(s, sizeof(s), "%.*s", size, p); //按照size宽度拼接字符串s 1819 if (rte_strsplit(s, sizeof(s), str_fld, _NUM_FLD, ',') != _NUM_FLD)//分割字符串s到str_fld中 1820 return -1; 1821 for (i = 0; i < _NUM_FLD; i++){//遍历各个成员 1822 errno = 0; 1823 int_fld[i] = strtoul(str_fld[i], &end, 0);//获取port_id、queue_id、lcore_id成员的值 1824 if (errno != 0 || end == str_fld[i] || int_fld[i] > 255) 1825 return -1; 1826 } 1827 if (nb_lcore_params >= MAX_LCORE_PARAMS) { 1828 printf("exceeded max number of lcore params: %hu\n", 1829 nb_lcore_params); 1830 return -1; 1831 } 1832 lcore_params_array[nb_lcore_params].port_id = (uint8_t)int_fld[FLD_PORT];//赋值port_id 1833 lcore_params_array[nb_lcore_params].queue_id = (uint8_t)int_fld[FLD_QUEUE];//赋值queue_id 1834 lcore_params_array[nb_lcore_params].lcore_id = (uint8_t)int_fld[FLD_LCORE];//赋值lcore_id 1835 ++nb_lcore_params; //数组的元素个数自增 1836 } 1837 lcore_params = lcore_params_array;//使用新配置,抛弃默认配置 1838 return 0; 1839 } 1840 1841 #define CMD_LINE_OPT_CONFIG "config" 1842 #define CMD_LINE_OPT_NO_NUMA "no-numa" 1843 #define CMD_LINE_OPT_IPV6 "ipv6" 1844 #define CMD_LINE_OPT_ENABLE_JUMBO "enable-jumbo" 1845 #define CMD_LINE_OPT_HASH_ENTRY_NUM "hash-entry-num" 1846 1847 /* Parse the argument given in the command line of the application */ 1848 static int //分析l3fwd相关的参数 1849 parse_args(int argc, char **argv) 1850 { 1851 int opt, ret; 1852 char **argvopt; 1853 int option_index; 1854 char *prgname = argv[0]; 1855 static struct option lgopts[] = { 1856 {CMD_LINE_OPT_CONFIG, 1, 0, 0}, //config参数对应于case 0 1857 {CMD_LINE_OPT_NO_NUMA, 0, 0, 0}, 1858 {CMD_LINE_OPT_IPV6, 0, 0, 0}, 1859 {CMD_LINE_OPT_ENABLE_JUMBO, 0, 0, 0}, 1860 {CMD_LINE_OPT_HASH_ENTRY_NUM, 1, 0, 0}, 1861 {NULL, 0, 0, 0}//应该可以在这个地方加上kni_config命令字 1862 1863 }; 1864 1865 argvopt = argv; 1866 1867 while ((opt = getopt_long(argc, argvopt, "p:P", 1868 lgopts, &option_index)) != EOF) { 1869 1870 switch (opt) { 1871 /* portmask 物理端口的掩码*/ 1872 case 'p': 1873 enabled_port_mask = parse_portmask(optarg);//optarg为指向当前选项参数的指针 1874 if (enabled_port_mask == 0) { 1875 printf("invalid portmask\n"); 1876 print_usage(prgname); 1877 return -1; 1878 } 1879 break; 1880 case 'P': //混杂模式 1881 printf("Promiscuous mode selected\n"); 1882 promiscuous_on = 1; 1883 break; 1884 1885 /* long options 解析长选项 */ 1886 case 0: 1887 if (!strncmp(lgopts[option_index].name, CMD_LINE_OPT_CONFIG, 1888 sizeof (CMD_LINE_OPT_CONFIG))) { //参数config 1889 ret = parse_config(optarg);//解析()中的参数 1890 if (ret) { 1891 printf("invalid config\n"); 1892 print_usage(prgname); 1893 return -1; 1894 } 1895 } 1896 1897 if (!strncmp(lgopts[option_index].name, CMD_LINE_OPT_NO_NUMA, 1898 sizeof(CMD_LINE_OPT_NO_NUMA))) { //参数no-numa 1899 printf("numa is disabled \n"); 1900 numa_on = 0; 1901 } 1902 1903 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 1904 if (!strncmp(lgopts[option_index].name, CMD_LINE_OPT_IPV6, 1905 sizeof(CMD_LINE_OPT_IPV6))) { //参数ipv6 1906 printf("ipv6 is specified \n"); 1907 ipv6 = 1; 1908 } 1909 #endif 1910 1911 if (!strncmp(lgopts[option_index].name, CMD_LINE_OPT_ENABLE_JUMBO, 1912 sizeof (CMD_LINE_OPT_ENABLE_JUMBO))) {//参数enable-jumbo 1913 struct option lenopts = {"max-pkt-len", required_argument, 0, 0}; 1914 1915 printf("jumbo frame is enabled - disabling simple TX path\n"); 1916 port_conf.rxmode.jumbo_frame = 1; 1917 1918 /* if no max-pkt-len set, use the default value ETHER_MAX_LEN */ 1919 if (0 == getopt_long(argc, argvopt, "", &lenopts, &option_index)) { 1920 ret = parse_max_pkt_len(optarg); //分析数据包的长度 1921 if ((ret < 64) || (ret > MAX_JUMBO_PKT_LEN)){ 1922 printf("invalid packet length\n"); 1923 print_usage(prgname); 1924 return -1; 1925 } 1926 port_conf.rxmode.max_rx_pkt_len = ret; 1927 } 1928 printf("set jumbo frame max packet length to %u\n", 1929 (unsigned int)port_conf.rxmode.max_rx_pkt_len); 1930 } 1931 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 1932 if (!strncmp(lgopts[option_index].name, CMD_LINE_OPT_HASH_ENTRY_NUM, 1933 sizeof(CMD_LINE_OPT_HASH_ENTRY_NUM))) {//参数hash-entry-num 1934 ret = parse_hash_entry_number(optarg); 1935 if ((ret > 0) && (ret <= L3FWD_HASH_ENTRIES)) { 1936 hash_entry_number = ret; 1937 } else { 1938 printf("invalid hash entry number\n"); 1939 print_usage(prgname); 1940 return -1; 1941 } 1942 } 1943 #endif 1944 break; 1945 1946 default: 1947 print_usage(prgname); 1948 return -1; 1949 } 1950 } 1951 1952 if (optind >= 0) 1953 argv[optind-1] = prgname; 1954 1955 ret = optind-1; 1956 optind = 0; /* optind是下一个选项的索引 reset getopt lib */ 1957 return ret; 1958 } 1959 1960 static void //打印mac地址 1961 print_ethaddr(const char *name, const struct ether_addr *eth_addr) 1962 { 1963 char buf[ETHER_ADDR_FMT_SIZE]; 1964 ether_format_addr(buf, ETHER_ADDR_FMT_SIZE, eth_addr); 1965 printf("%s%s", name, buf); 1966 } 1967 1968 1969 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 1970 static void //创建LPM 1971 setup_lpm(int socketid) 1972 { 1973 struct rte_lpm6_config config; 1974 unsigned i; 1975 int ret; 1976 char s[64]; 1977 1978 /* 创建LPM ipv4表 create the LPM table */ 1979 snprintf(s, sizeof(s), "IPV4_L3FWD_LPM_%d", socketid); 1980 ipv4_l3fwd_lookup_struct[socketid] = rte_lpm_create(s, socketid, 1981 IPV4_L3FWD_LPM_MAX_RULES, 0); 1982 if (ipv4_l3fwd_lookup_struct[socketid] == NULL) 1983 rte_exit(EXIT_FAILURE, "Unable to create the l3fwd LPM table" 1984 " on socket %d\n", socketid); 1985 1986 /* 填充ipv4 LPM表 populate the LPM table */ 1987 1988 for (i = 0; i < IPV4_L3FWD_NUM_ROUTES; i++) {//遍历已经配置的所有的规则 1989 1990 /* skip unused ports 跳过未使用的物理端口*/ 1991 if ((1 << ipv4_l3fwd_route_array[i].if_out & 1992 enabled_port_mask) == 0) 1993 continue; 1994 1995 1996 //添加一条路由,即把规则转换为tbl24或者tbl8 1997 ret = rte_lpm_add(ipv4_l3fwd_lookup_struct[socketid], 1998 ipv4_l3fwd_route_array[i].ip, 1999 ipv4_l3fwd_route_array[i].depth, 2000 ipv4_l3fwd_route_array[i].if_out); 2001 2002 if (ret < 0) { //如果添加路由失败 2003 rte_exit(EXIT_FAILURE, "Unable to add entry %u to the " 2004 "l3fwd LPM table on socket %d\n", 2005 i, socketid); 2006 } 2007 2008 printf("LPM: Adding route 0x%08x / %d (%d)\n", 2009 (unsigned)ipv4_l3fwd_route_array[i].ip, 2010 ipv4_l3fwd_route_array[i].depth, 2011 ipv4_l3fwd_route_array[i].if_out); 2012 } 2013 2014 /* 创建lpm ipv6表 create the LPM6 table */ 2015 snprintf(s, sizeof(s), "IPV6_L3FWD_LPM_%d", socketid); 2016 2017 config.max_rules = IPV6_L3FWD_LPM_MAX_RULES; 2018 config.number_tbl8s = IPV6_L3FWD_LPM_NUMBER_TBL8S; 2019 config.flags = 0; 2020 ipv6_l3fwd_lookup_struct[socketid] = rte_lpm6_create(s, socketid, 2021 &config); 2022 if (ipv6_l3fwd_lookup_struct[socketid] == NULL) 2023 rte_exit(EXIT_FAILURE, "Unable to create the l3fwd LPM table" 2024 " on socket %d\n", socketid); 2025 2026 /* 填充LPM ipv6表 populate the LPM table */ 2027 for (i = 0; i < IPV6_L3FWD_NUM_ROUTES; i++) { 2028 2029 /* skip unused ports */ 2030 if ((1 << ipv6_l3fwd_route_array[i].if_out & 2031 enabled_port_mask) == 0) 2032 continue; 2033 2034 ret = rte_lpm6_add(ipv6_l3fwd_lookup_struct[socketid], 2035 ipv6_l3fwd_route_array[i].ip, 2036 ipv6_l3fwd_route_array[i].depth, 2037 ipv6_l3fwd_route_array[i].if_out); 2038 2039 if (ret < 0) { 2040 rte_exit(EXIT_FAILURE, "Unable to add entry %u to the " 2041 "l3fwd LPM table on socket %d\n", 2042 i, socketid); 2043 } 2044 2045 printf("LPM: Adding route %s / %d (%d)\n", 2046 "IPV6", 2047 ipv6_l3fwd_route_array[i].depth, 2048 ipv6_l3fwd_route_array[i].if_out); 2049 } 2050 } 2051 #endif 2052 2053 static int //初始化内存 2054 init_mem(unsigned nb_mbuf) 2055 { 2056 struct lcore_conf *qconf; 2057 int socketid; 2058 unsigned lcore_id; 2059 char s[64]; 2060 2061 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {//遍历所有lcores 2062 if (rte_lcore_is_enabled(lcore_id) == 0) 2063 continue; 2064 2065 if (numa_on) //一般开启了numa 2066 socketid = rte_lcore_to_socket_id(lcore_id);//得到lcore所在的socketid 2067 else 2068 socketid = 0; //默认socketid为0 2069 2070 if (socketid >= NB_SOCKETS) { 2071 rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n", 2072 socketid, lcore_id, NB_SOCKETS); 2073 } 2074 if (pktmbuf_pool[socketid] == NULL) { 2075 snprintf(s, sizeof(s), "mbuf_pool_%d", socketid); 2076 pktmbuf_pool[socketid] = //为每一个socket创建mempool用来动态分配mbufs 2077 rte_mempool_create(s, nb_mbuf, MBUF_SIZE, MEMPOOL_CACHE_SIZE, 2078 sizeof(struct rte_pktmbuf_pool_private), 2079 rte_pktmbuf_pool_init, NULL, 2080 rte_pktmbuf_init, NULL, 2081 socketid, 0); 2082 if (pktmbuf_pool[socketid] == NULL) 2083 rte_exit(EXIT_FAILURE, 2084 "Cannot init mbuf pool on socket %d\n", socketid); 2085 else 2086 printf("Allocated mbuf pool on socket %d\n", socketid); 2087 2088 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 2089 setup_lpm(socketid); //创建LPM表,只需给每个socket cpu创建一个LPM表,而同一个CPU上的lcores共享LPM 2090 #else 2091 setup_hash(socketid); //创建Hash表 2092 #endif 2093 } 2094 qconf = &lcore_conf[lcore_id]; 2095 qconf->ipv4_lookup_struct = ipv4_l3fwd_lookup_struct[socketid]; 2096 qconf->ipv6_lookup_struct = ipv6_l3fwd_lookup_struct[socketid]; 2097 } 2098 return 0; 2099 } 2100 2101 /* Check the link status of all ports in up to 9s, and print them finally */ 2102 static void //检查物理端口的连接状态 2103 check_all_ports_link_status(uint8_t port_num, uint32_t port_mask) 2104 { 2105 #define CHECK_INTERVAL 100 /* 100ms */ 2106 #define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */ 2107 uint8_t portid, count, all_ports_up, print_flag = 0; 2108 struct rte_eth_link link; 2109 2110 printf("\nChecking link status"); 2111 fflush(stdout); 2112 for (count = 0; count <= MAX_CHECK_TIME; count++) {//最多执行9000次 2113 all_ports_up = 1; 2114 for (portid = 0; portid < port_num; portid++) {//遍历物理端口 2115 if ((port_mask & (1 << portid)) == 0) 2116 continue; 2117 memset(&link, 0, sizeof(link)); 2118 rte_eth_link_get_nowait(portid, &link); 2119 /* print link status if flag set */ 2120 if (print_flag == 1) { 2121 if (link.link_status) 2122 printf("Port %d Link Up - speed %u " 2123 "Mbps - %s\n", (uint8_t)portid, 2124 (unsigned)link.link_speed, 2125 (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? 2126 ("full-duplex") : ("half-duplex\n")); 2127 else 2128 printf("Port %d Link Down\n", 2129 (uint8_t)portid); 2130 continue; 2131 } 2132 /* clear all_ports_up flag if any link down */ 2133 if (link.link_status == 0) { 2134 all_ports_up = 0; 2135 break; 2136 } 2137 } 2138 /* after finally printing all link status, get out */ 2139 if (print_flag == 1) 2140 break; 2141 2142 if (all_ports_up == 0) { 2143 printf("."); 2144 fflush(stdout); 2145 rte_delay_ms(CHECK_INTERVAL); 2146 } 2147 2148 /* set the print_flag if all ports up or timeout */ 2149 if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) { 2150 print_flag = 1; 2151 printf("done\n"); 2152 } 2153 } 2154 } 2155 2156 int //主函数 2157 main(int argc, char **argv) 2158 { 2159 struct lcore_conf *qconf; 2160 struct rte_eth_dev_info dev_info; 2161 struct rte_eth_txconf *txconf; 2162 int ret; 2163 unsigned nb_ports; 2164 uint16_t queueid; 2165 unsigned lcore_id; 2166 uint32_t n_tx_queue, nb_lcores; 2167 uint8_t portid, nb_rx_queue, queue, socketid; 2168 2169 /* init EAL */ 2170 ret = rte_eal_init(argc, argv); //初始化软件抽象层,并解析EAL有关参数 2171 if (ret < 0) 2172 rte_exit(EXIT_FAILURE, "Invalid EAL parameters\n"); 2173 argc -= ret; //减少参数个数 2174 argv += ret; //移动参数位置 2175 2176 /* parse application arguments (after the EAL ones) */ 2177 ret = parse_args(argc, argv); //解析l3fwd有关参数: -p -P --config 2178 if (ret < 0) 2179 rte_exit(EXIT_FAILURE, "Invalid L3FWD parameters\n"); 2180 2181 if (check_lcore_params() < 0) //检查lcore参数 2182 rte_exit(EXIT_FAILURE, "check_lcore_params failed\n"); 2183 2184 ret = init_lcore_rx_queues(); //初始化每个lcore上的rx queue数量 2185 if (ret < 0) 2186 rte_exit(EXIT_FAILURE, "init_lcore_rx_queues failed\n"); 2187 2188 nb_ports = rte_eth_dev_count(); //获取物理端口的个数 2189 if (nb_ports > RTE_MAX_ETHPORTS) //如果超过32个 2190 nb_ports = RTE_MAX_ETHPORTS; 2191 2192 if (check_port_config(nb_ports) < 0) //检查物理端口的配置 2193 rte_exit(EXIT_FAILURE, "check_port_config failed\n"); 2194 2195 nb_lcores = rte_lcore_count(); //获取启用的lcores的总个数 2196 2197 2198 /* initialize all ports 初始化所有的物理端口 */ 2199 for (portid = 0; portid < nb_ports; portid++) { //遍历所有的物理端口 2200 /* skip ports that are not enabled 跳过没有启用的物理端口 */ 2201 if ((enabled_port_mask & (1 << portid)) == 0) { 2202 printf("\nSkipping disabled port %d\n", portid); 2203 continue; 2204 } 2205 2206 /* init port 初始化物理端口*/ 2207 printf("Initializing port %d ... ", portid ); 2208 fflush(stdout); //清空标准输出(屏幕)的缓冲区,这样就能立即在屏幕上看到打印信息 2209 2210 nb_rx_queue = get_port_n_rx_queues(portid); //获取portid上的接收队列的个数 2211 n_tx_queue = nb_lcores; //设定portid上的发送队列的个数为启用的lcores的个数 2212 if (n_tx_queue > MAX_TX_QUEUE_PER_PORT) //如果发送队列的数量超过16个 2213 n_tx_queue = MAX_TX_QUEUE_PER_PORT; 2214 printf("Creating queues: nb_rxq=%d nb_txq=%u... ", 2215 nb_rx_queue, (unsigned)n_tx_queue ); //这里是不是有点粗暴啊????? 2216 ret = rte_eth_dev_configure(portid, nb_rx_queue, //第一步,配置网络设备 2217 (uint16_t)n_tx_queue, &port_conf); 2218 if (ret < 0) //如果配置设备失败 2219 rte_exit(EXIT_FAILURE, "Cannot configure device: err=%d, port=%d\n", 2220 ret, portid); 2221 2222 rte_eth_macaddr_get(portid, &ports_eth_addr[portid]); //记录mac地址到ports_eth_addr[portid] 2223 print_ethaddr(" Address:", &ports_eth_addr[portid]); 2224 printf(", "); 2225 2226 /* 为每一个物理端口准备着源mac地址和目的mac地址 2227 * prepare dst and src MACs for each port. 2228 */ 2229 *(uint64_t *)(val_eth + portid) = 2230 ETHER_LOCAL_ADMIN_ADDR + ((uint64_t)portid << 40); 2231 ether_addr_copy(&ports_eth_addr[portid], //前一个参数为from,后一个为to 2232 (struct ether_addr *)(val_eth + portid) + 1); 2233 /* init memory 分配内存并创建LPM或者hash */ 2234 ret = init_mem(NB_MBUF); //mempool包含8192个元素 2235 if (ret < 0) 2236 rte_exit(EXIT_FAILURE, "init_mem failed\n"); 2237 2238 /*初始化一个发送队列成一对(lcore, port) init one TX queue per couple (lcore,port) */ 2239 queueid = 0; 2240 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { //遍历一个物理接口上的所有的lcores 2241 if (rte_lcore_is_enabled(lcore_id) == 0) //忽略未启用的lcore 2242 continue; 2243 2244 if (numa_on)//如果启用numa 2245 socketid = (uint8_t)rte_lcore_to_socket_id(lcore_id); //获取lcore_id所在的socketid 2246 else 2247 socketid = 0;//默认socketid为0 2248 2249 printf("txq=%u,%d,%d ", lcore_id, queueid, socketid); 2250 fflush(stdout);//清空标准输出(屏幕)的缓冲区 2251 2252 rte_eth_dev_info_get(portid, &dev_info);//获取设备信息 2253 txconf = &dev_info.default_txconf;//得到发送的配置结构体指针 2254 if (port_conf.rxmode.jumbo_frame) 2255 txconf->txq_flags = 0; 2256 ret = rte_eth_tx_queue_setup(portid, queueid, nb_txd, //第二步,建立发送队列 2257 socketid, txconf); //一个port上可能有多个queue,每个queue用一个lcore来绑定 2258 if (ret < 0) 2259 rte_exit(EXIT_FAILURE, "rte_eth_tx_queue_setup: err=%d, " 2260 "port=%d\n", ret, portid); 2261 2262 qconf = &lcore_conf[lcore_id]; //得到lcore_id的配置结构体指针 2263 qconf->tx_queue_id[portid] = queueid; //记录发送队列的编号到lcore_conf中 2264 queueid++; //发送队列的编号自增 2265 } //end of for(lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) 2266 printf("\n"); 2267 } //end of for(portid = 0; portid < nb_ports; portid++) 2268 2269 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { //遍历所有的lcores 2270 if (rte_lcore_is_enabled(lcore_id) == 0) 2271 continue; //忽略未启用的lcore 2272 qconf = &lcore_conf[lcore_id]; 2273 printf("\nInitializing rx queues on lcore %u ... ", lcore_id ); 2274 fflush(stdout); 2275 /* init RX queues 初始化接收队列 */ 2276 for(queue = 0; queue < qconf->n_rx_queue; ++queue) { //遍历所有的接收队列 2277 portid = qconf->rx_queue_list[queue].port_id; //物理端口的编号 2278 queueid = qconf->rx_queue_list[queue].queue_id;//接收队列的编号 2279 2280 if (numa_on)//一般启用numa 2281 socketid = (uint8_t)rte_lcore_to_socket_id(lcore_id);//获取lcore_id所在的socketid 2282 else 2283 socketid = 0;//默认socketid为0 2284 2285 printf("rxq=%d,%d,%d ", portid, queueid, socketid); 2286 fflush(stdout);//清空标准输出(屏幕)的缓冲区 2287 2288 ret = rte_eth_rx_queue_setup(portid, queueid, nb_rxd, //第三步,建立接收队列 2289 socketid, //一个port上可能有多个queue,每个queue用一个lcore来绑定 2290 NULL, 2291 pktmbuf_pool[socketid]); 2292 if (ret < 0) 2293 rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup: err=%d," 2294 "port=%d\n", ret, portid); 2295 } //for(queue = 0; queue < qconf->n_rx_queue; ++queue) 2296 }//for(lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) 2297 2298 printf("\n"); 2299 2300 /* start ports 启动物理端口 */ 2301 for (portid = 0; portid < nb_ports; portid++) { //遍历所有的物理端口 2302 if ((enabled_port_mask & (1 << portid)) == 0) { 2303 continue; //忽略未启用的物理端口 2304 } 2305 /* Start device 启动设备 */ 2306 ret = rte_eth_dev_start(portid); //第四步,启动物理端口 2307 if (ret < 0) 2308 rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, port=%d\n", 2309 ret, portid); 2310 2311 /* 2312 * If enabled, put device in promiscuous mode. 2313 * This allows IO forwarding mode to forward packets 2314 * to itself through 2 cross-connected ports of the 2315 * target machine. 2316 */ 2317 if (promiscuous_on) //如果开始混杂模式 2318 rte_eth_promiscuous_enable(portid); //启动混杂模式 2319 }//end of for (portid = 0; portid < nb_ports; portid++) 2320 2321 check_all_ports_link_status((uint8_t)nb_ports, enabled_port_mask); 2322 2323 /* launch per-lcore init on every lcore 在每一个lcore上至多启动一个线程 */ 2324 rte_eal_mp_remote_launch(main_loop, NULL, CALL_MASTER);//CALL_MASTER表示在master也会启动线程 2325 RTE_LCORE_FOREACH_SLAVE(lcore_id) { //遍历每个slave lcore 2326 if (rte_eal_wait_lcore(lcore_id) < 0) //等待线程结束 2327 return -1; 2328 } 2329 2330 return 0; 2331 }