diff --git a/cps/kni.c b/cps/kni.c
index 2d808d6a..4e5a25d3 100644
--- a/cps/kni.c
+++ b/cps/kni.c
@@ -44,10 +44,15 @@ static int
 setup_dpdk_interface(struct cps_kni *kni, const struct gatekeeper_if *iface,
 	struct rte_mempool *mp, uint16_t queue_size)
 {
+	/*
+	 * DO NOT add RTE_ETH_RX_OFFLOAD_SCATTER to port_conf.rxmode.offloads,
+	 * so that all mbufs are guaranteed to have a single segment.
+	 *
+	 * If more dataroom in mbufs is needed, review create_pktmbuf_pool().
+	 */
 	struct rte_eth_conf port_conf = {
 		.rxmode = {
 			.mtu = iface->mtu,
-			.offloads = RTE_ETH_RX_OFFLOAD_SCATTER,
 		},
 	};
 
diff --git a/cps/main.c b/cps/main.c
index 4a0ca788..8c61a566 100644
--- a/cps/main.c
+++ b/cps/main.c
@@ -808,7 +808,7 @@ assign_cps_queue_ids(struct cps_config *cps_conf)
 	num_mbuf = calculate_mempool_config_para("cps", cps_conf->net,
 		total_pkt_burst);
 	cps_conf->mp = create_pktmbuf_pool("cps",
-		cps_conf->lcore_id, num_mbuf);
+		cps_conf->lcore_id, num_mbuf, cps_conf->net);
 	if (cps_conf->mp == NULL) {
 		ret = -1;
 		goto fail;
diff --git a/ggu/main.c b/ggu/main.c
index de2dbd02..efde0ed0 100644
--- a/ggu/main.c
+++ b/ggu/main.c
@@ -633,7 +633,7 @@ ggu_stage1(void *arg)
 		ggu_conf->net, ggu_conf->total_pkt_burst);
 	ggu_conf->mp = create_pktmbuf_pool("ggu",
-		ggu_conf->lcore_id, num_mbuf);
+		ggu_conf->lcore_id, num_mbuf, ggu_conf->net);
 	if (ggu_conf->mp == NULL)
 		return -1;
diff --git a/gk/main.c b/gk/main.c
index b576781e..5ccfccaa 100644
--- a/gk/main.c
+++ b/gk/main.c
@@ -2550,7 +2550,8 @@ gk_stage1(void *arg)
 		unsigned int lcore = gk_conf->lcores[i];
 		struct gk_instance *inst_ptr = &gk_conf->instances[i];
 
-		inst_ptr->mp = create_pktmbuf_pool("gk", lcore, num_mbuf);
+		inst_ptr->mp = create_pktmbuf_pool("gk", lcore, num_mbuf,
+			gk_conf->net);
 		if (inst_ptr->mp == NULL)
 			goto cleanup;
diff --git a/gt/main.c b/gt/main.c
index ad320f61..5d914ef6 100644
--- a/gt/main.c
+++ b/gt/main.c
@@ -2075,7 +2075,8 @@ init_gt_instances(struct gt_config *gt_conf)
 		unsigned int lcore = gt_conf->lcores[i];
 		inst_ptr = &gt_conf->instances[i];
 
-		inst_ptr->mp = create_pktmbuf_pool("gt", lcore, num_mbuf);
+		inst_ptr->mp = create_pktmbuf_pool("gt", lcore, num_mbuf,
+			gt_conf->net);
 		if (inst_ptr->mp == NULL) {
 			ret = -1;
 			goto free_gt_instance;
diff --git a/include/gatekeeper_net.h b/include/gatekeeper_net.h
index 48f25a86..aeaa2b41 100644
--- a/include/gatekeeper_net.h
+++ b/include/gatekeeper_net.h
@@ -241,6 +241,16 @@ struct gatekeeper_if {
 	/* Whether IPv4 (L3) checksums should be enabled in hardware. */
 	bool ipv4_hw_cksum;
 
+	/*
+	 * XXX #730 Due to a bug in some Poll Mode Drivers (PMDs),
+	 * the MTU parameter passed during initialization must include
+	 * the length of the link-layer header(s).
+	 *
+	 * The ICE driver is known to require this workaround.
+	 * See check_if_mtu() for more information.
+	 */
+	bool pmd_mtu_workaround;
+
 	/*
 	 * This field decides if the flag GRND_RANDOM is passed to getrandom(2)
 	 * while initializing field @rss_key.
@@ -609,7 +619,8 @@ bool ipv6_configured(struct net_config *net_conf);
 unsigned int calculate_mempool_config_para(const char *block_name,
 	struct net_config *net_conf, unsigned int total_pkt_burst);
 struct rte_mempool *create_pktmbuf_pool(const char *block_name,
-	unsigned int lcore, unsigned int num_mbuf);
+	unsigned int lcore, unsigned int num_mbuf,
+	const struct net_config *net);
 
 /*
  * No cleanup for this step, since DPDK
diff --git a/lib/net.c b/lib/net.c
index c8dd5c14..a8ac290d 100644
--- a/lib/net.c
+++ b/lib/net.c
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include	/* For sizeof(struct virtio_net_hdr_mrg_rxbuf) */
 
 #include
 #include
@@ -1362,12 +1363,21 @@ check_if_mtu(struct gatekeeper_if *iface,
 	/*
 	 * Set up device MTU.
 	 *
-	 * If greater than the size of the mbufs, then add the
-	 * multi-segment buffer flag.
+	 * DO NOT add RTE_ETH_TX_OFFLOAD_MULTI_SEGS to
+	 * port_conf->txmode.offloads because all mbufs in Gatekeeper are
+	 * single segment and RTE_ETH_TX_OFFLOAD_MULTI_SEGS may disable
+	 * vectorized transmission; see ice_set_tx_function() in the ICE driver
+	 * for an example.
+	 *
+	 * If more dataroom in mbufs is needed, review create_pktmbuf_pool().
 	 */
 	port_conf->rxmode.mtu = iface->mtu;
-	if (iface->mtu > RTE_MBUF_DEFAULT_BUF_SIZE)
-		port_conf->txmode.offloads |= RTE_ETH_TX_OFFLOAD_MULTI_SEGS;
+
+	if (iface->pmd_mtu_workaround) {
+		port_conf->rxmode.mtu += RTE_ETHER_HDR_LEN;	/* Ethernet */
+		if (iface->vlan_insert)
+			port_conf->rxmode.mtu += RTE_VLAN_HLEN;	/* VLAN */
+	}
 
 	if (unlikely(dev_info->min_mtu > port_conf->rxmode.mtu)) {
 		G_LOG(ERR, "%s(%s): the minimum MTU %u is larger than the configured MTU %"PRIu32"\n",
@@ -1383,15 +1393,6 @@
 		return -EINVAL;
 	}
 
-	if (unlikely((port_conf->txmode.offloads &
-			RTE_ETH_TX_OFFLOAD_MULTI_SEGS) &&
-			!(dev_info->tx_offload_capa &
-			RTE_ETH_TX_OFFLOAD_MULTI_SEGS))) {
-		G_LOG(NOTICE, "%s(%s): interface does not support offloading multi-segment TX buffers\n",
-			__func__, iface->name);
-		port_conf->txmode.offloads &= ~RTE_ETH_TX_OFFLOAD_MULTI_SEGS;
-	}
-
 	return 0;
 }
@@ -2378,9 +2379,49 @@ calculate_mempool_config_para(const char *block_name,
 	return num_mbuf;
 }
 
+#define LINK_LAYER_HEADROOM (32)
+
+/*
+ * Computes the dataroom size needed for the mbufs used at the front,
+ * back, and KNI interfaces.
+ */
+static uint16_t
+dataroom_from_net(const struct net_config *net)
+{
+	uint16_t max_mtu = net->front.mtu;
+	if (net->back_iface_enabled && net->back.mtu > max_mtu)
+		max_mtu = net->back.mtu;
+
+	/*
+	 * Even Ethernet NICs do not agree on the exact headroom for
+	 * the link layer, so the test below favors Virtio (i.e., KNI
+	 * interfaces) because it is the most demanding case.
+	 * See virtio_ethdev.c:virtio_mtu_set() for details.
+	 */
+	RTE_BUILD_BUG_ON(LINK_LAYER_HEADROOM < (
+		RTE_ETHER_HDR_LEN +				/* Ethernet */
+		RTE_VLAN_HLEN +					/* VLAN */
+		sizeof(struct virtio_net_hdr_mrg_rxbuf)		/* Virtio */
+		)
+	);
+
+	/*
+	 * Having RTE_MBUF_DEFAULT_BUF_SIZE in the RTE_MAX() below
+	 * guarantees the minimum size that some NICs require to avoid
+	 * splitting frames into multiple segments.
+	 *
+	 * In the second expression:
+	 * @RTE_PKTMBUF_HEADROOM is required by the DPDK libraries.
+	 * @LINK_LAYER_HEADROOM accounts for link-layer headers.
+	 * @max_mtu makes the expression valid for front and back interfaces.
+	 */
+	return RTE_MAX(RTE_MBUF_DEFAULT_BUF_SIZE,
+		RTE_PKTMBUF_HEADROOM + LINK_LAYER_HEADROOM + max_mtu);
+}
+
 struct rte_mempool *
 create_pktmbuf_pool(const char *block_name, unsigned int lcore,
-	unsigned int num_mbuf)
+	unsigned int num_mbuf, const struct net_config *net)
 {
 	struct rte_mempool *mp;
 	char pool_name[64];
@@ -2388,7 +2429,7 @@
 		block_name, lcore);
 	RTE_VERIFY(ret > 0 && ret < (int)sizeof(pool_name));
 	mp = rte_pktmbuf_pool_create_by_ops(pool_name, num_mbuf, 0,
-		sizeof(struct sol_mbuf_priv), RTE_MBUF_DEFAULT_BUF_SIZE,
+		sizeof(struct sol_mbuf_priv), dataroom_from_net(net),
 		rte_lcore_to_socket_id(lcore), "ring_mp_sc");
 	if (mp == NULL) {
 		G_LOG(ERR,
diff --git a/lls/main.c b/lls/main.c
index 4e737f43..ebf6ae8b 100644
--- a/lls/main.c
+++ b/lls/main.c
@@ -836,7 +836,7 @@ assign_lls_queue_ids(struct lls_config *lls_conf)
 	num_mbuf = calculate_mempool_config_para("lls", lls_conf->net,
 		total_pkt_burst);
 	lls_conf->mp = create_pktmbuf_pool("lls",
-		lls_conf->lcore_id, num_mbuf);
+		lls_conf->lcore_id, num_mbuf, lls_conf->net);
 	if (lls_conf->mp == NULL) {
 		ret = -1;
 		goto fail;
diff --git a/lua/gatekeeper/staticlib.lua b/lua/gatekeeper/staticlib.lua
index 73dd3101..ec661fb4 100644
--- a/lua/gatekeeper/staticlib.lua
+++ b/lua/gatekeeper/staticlib.lua
@@ -283,6 +283,7 @@ struct gatekeeper_if {
 	bool ipv4_hw_udp_cksum;
 	bool ipv6_hw_udp_cksum;
 	bool ipv4_hw_cksum;
+	bool pmd_mtu_workaround;
 	bool guarantee_random_entropy;
 	bool alternative_rss_hash;
 	/* This struct has hidden fields. */
diff --git a/lua/net.lua b/lua/net.lua
index 3153d38d..1402153b 100644
--- a/lua/net.lua
+++ b/lua/net.lua
@@ -46,6 +46,8 @@ return function (gatekeeper_server)
 	local back_ipv6_hw_udp_cksum = true
 	local front_ipv4_hw_cksum = true
 	local back_ipv4_hw_cksum = true
+	local front_pmd_mtu_workaround = false
+	local back_pmd_mtu_workaround = false
 	local front_alternative_rss_hash = false
 	local back_alternative_rss_hash = false
 
@@ -75,6 +77,7 @@ return function (gatekeeper_server)
 	front_iface.ipv4_hw_udp_cksum = front_ipv4_hw_udp_cksum
 	front_iface.ipv6_hw_udp_cksum = front_ipv6_hw_udp_cksum
 	front_iface.ipv4_hw_cksum = front_ipv4_hw_cksum
+	front_iface.pmd_mtu_workaround = front_pmd_mtu_workaround
 	front_iface.guarantee_random_entropy = guarantee_random_entropy
 	front_iface.alternative_rss_hash = front_alternative_rss_hash
 	local ret = staticlib.init_iface(front_iface, "front",
@@ -99,6 +102,7 @@ return function (gatekeeper_server)
 	back_iface.ipv4_hw_udp_cksum = back_ipv4_hw_udp_cksum
 	back_iface.ipv6_hw_udp_cksum = back_ipv6_hw_udp_cksum
 	back_iface.ipv4_hw_cksum = back_ipv4_hw_cksum
+	back_iface.pmd_mtu_workaround = back_pmd_mtu_workaround
 	back_iface.guarantee_random_entropy = guarantee_random_entropy
 	back_iface.alternative_rss_hash = back_alternative_rss_hash
 	ret = staticlib.init_iface(back_iface, "back",
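
For reference, the arithmetic performed by dataroom_from_net() can be checked in isolation. The sketch below is NOT part of the patch: it hard-codes the usual DPDK defaults (RTE_PKTMBUF_HEADROOM = 128 and RTE_MBUF_DEFAULT_BUF_SIZE = 2048 + 128) as local macros and re-derives the mbuf dataroom for a standard and a jumbo MTU; the function and macro names are illustrative only.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Typical DPDK defaults; the real values come from the DPDK build. */
#define PKTMBUF_HEADROOM	128			/* RTE_PKTMBUF_HEADROOM */
#define DEFAULT_BUF_SIZE	(2048 + PKTMBUF_HEADROOM) /* RTE_MBUF_DEFAULT_BUF_SIZE */
#define LINK_LAYER_HEADROOM	32			/* Same constant as the patch. */

#define MAX(a, b) ((a) > (b) ? (a) : (b))

/*
 * Same logic as dataroom_from_net(): take the largest configured MTU,
 * make room for the DPDK headroom and the link-layer headers, and never
 * return less than the DPDK default buffer size.
 */
static uint16_t
dataroom(uint16_t front_mtu, bool back_enabled, uint16_t back_mtu)
{
	uint16_t max_mtu = front_mtu;

	if (back_enabled && back_mtu > max_mtu)
		max_mtu = back_mtu;
	return MAX(DEFAULT_BUF_SIZE,
		PKTMBUF_HEADROOM + LINK_LAYER_HEADROOM + max_mtu);
}

int
main(void)
{
	/* 128 + 32 + 1500 = 1660 < 2176, so the default size is kept. */
	printf("MTU 1500 -> dataroom %u\n",
		(unsigned)dataroom(1500, true, 1500));
	/* 128 + 32 + 9000 = 9160 > 2176, so mbufs grow to keep one segment. */
	printf("MTU 9000 -> dataroom %u\n",
		(unsigned)dataroom(1500, true, 9000));
	return 0;
}

Under these default values, a 1500-byte MTU still yields RTE_MBUF_DEFAULT_BUF_SIZE (2176 >= 128 + 32 + 1500), so such deployments keep the same pool layout as before this change; only configurations whose MTU pushes past that floor (e.g., jumbo frames) get larger, still single-segment mbufs.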