Discussion:
[dpdk-dev] [PATCH 00/24] Refactor mlx5 to improve performance
Nelio Laranjeiro
2016-06-08 09:47:47 UTC
Enhance mlx5 with a data path that bypasses Verbs.

The first half of this patchset removes support for functionality that is
completely rewritten in the second half (scatter/gather, inline send),
where the data path is refactored to bypass Verbs.

The PMD remains usable during the transition.

This patchset must be applied after "Miscellaneous fixes for mlx4 and mlx5".

Adrien Mazarguil (8):
mlx5: replace countdown with threshold for TX completions
mlx5: add debugging information about TX queues capabilities
mlx5: check remaining space while processing TX burst
mlx5: resurrect TX gather support
mlx5: work around spurious compilation errors
mlx5: remove redundant RX queue initialization code
mlx5: make RX queue reinitialization safer
mlx5: resurrect RX scatter support

Nelio Laranjeiro (15):
mlx5: split memory registration function for better performance
mlx5: remove TX gather support
mlx5: remove RX scatter support
mlx5: remove configuration variable for maximum number of segments
mlx5: remove inline TX support
mlx5: split TX queue structure
mlx5: split RX queue structure
mlx5: update prerequisites for upcoming enhancements
mlx5: add definitions for data path without Verbs
mlx5: add support for configuration through kvargs
mlx5: add TX/RX burst function selection wrapper
mlx5: refactor RX data path
mlx5: refactor TX data path
mlx5: handle RX CQE compression
mlx5: add support for multi-packet send

Yaacov Hazan (1):
mlx5: add support for inline send

config/common_base | 2 -
doc/guides/nics/mlx5.rst | 94 +-
drivers/net/mlx5/Makefile | 49 +-
drivers/net/mlx5/mlx5.c | 158 ++-
drivers/net/mlx5/mlx5.h | 10 +
drivers/net/mlx5/mlx5_defs.h | 26 +-
drivers/net/mlx5/mlx5_ethdev.c | 188 +++-
drivers/net/mlx5/mlx5_fdir.c | 20 +-
drivers/net/mlx5/mlx5_mr.c | 280 +++++
drivers/net/mlx5/mlx5_prm.h | 155 +++
drivers/net/mlx5/mlx5_rxmode.c | 8 -
drivers/net/mlx5/mlx5_rxq.c | 757 +++++---------
drivers/net/mlx5/mlx5_rxtx.c | 2206 +++++++++++++++++++++++-----------------
drivers/net/mlx5/mlx5_rxtx.h | 176 ++--
drivers/net/mlx5/mlx5_txq.c | 362 ++++---
drivers/net/mlx5/mlx5_vlan.c | 6 +-
16 files changed, 2578 insertions(+), 1919 deletions(-)
create mode 100644 drivers/net/mlx5/mlx5_mr.c
create mode 100644 drivers/net/mlx5/mlx5_prm.h
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:47:48 UTC
Except for the first access, when memory registration occurs, the lkey is
always found in the cache. Since memory registration is slow and performs
system calls, performance can be improved by moving that code to its own
function outside of the data path, so that only the lookup code remains in
the original inlined function.
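
For illustration, here is a minimal, self-contained sketch of that
fast-path/slow-path split. It is not the driver code: names such as
mr_cache_lookup() and mr_cache_register() are invented for the example,
and the registration body is a placeholder; in the patch the real
functions are txq_mp2mr() (inlined lookup) and txq_mp2mr_reg() (out of
line registration in mlx5_mr.c).

#include <stdint.h>
#include <stddef.h>

/* Hypothetical cache entry: one (mempool, lkey) pair per slot. */
struct mr_cache_entry {
	const void *pool; /* memory pool pointer used as lookup key */
	uint32_t lkey;    /* cached memory region key */
};

#define MR_CACHE_SIZE 8
static struct mr_cache_entry mr_cache[MR_CACHE_SIZE];

/* Slow path, kept out of line so the system calls behind registration
 * never end up inlined into the burst function. */
static uint32_t mr_cache_register(const void *pool, unsigned int idx)
	__attribute__((noinline));

/* Fast path: a short linear scan meant to be inlined into the TX burst
 * loop; the slow path only runs on a cache miss. */
static inline uint32_t
mr_cache_lookup(const void *pool)
{
	unsigned int i;

	for (i = 0; i != MR_CACHE_SIZE; ++i) {
		if (mr_cache[i].pool == NULL)
			break; /* first free slot, pool not registered yet */
		if (mr_cache[i].pool == pool)
			return mr_cache[i].lkey; /* hit: no registration */
	}
	return mr_cache_register(pool, i);
}

static uint32_t
mr_cache_register(const void *pool, unsigned int idx)
{
	/* Stand-in for the real work (ibv_reg_mr() plus eviction of the
	 * oldest entry when the table is full, as txq_mp2mr_reg() does). */
	uint32_t lkey = (uint32_t)(uintptr_t)pool; /* fake key, sketch only */

	if (idx == MR_CACHE_SIZE)
		idx = MR_CACHE_SIZE - 1; /* table full: reuse the last slot */
	mr_cache[idx].pool = pool;
	mr_cache[idx].lkey = lkey;
	return lkey;
}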

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/Makefile | 1 +
drivers/net/mlx5/mlx5_mr.c | 277 +++++++++++++++++++++++++++++++++++++++++++
drivers/net/mlx5/mlx5_rxtx.c | 209 ++------------------------------
drivers/net/mlx5/mlx5_rxtx.h | 8 +-
4 files changed, 295 insertions(+), 200 deletions(-)
create mode 100644 drivers/net/mlx5/mlx5_mr.c

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 92bfa07..1dba3de 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -47,6 +47,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_vlan.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_stats.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rss.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_fdir.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mr.c

# Dependencies.
DEPDIRS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += lib/librte_ether
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
new file mode 100644
index 0000000..7c3e87f
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -0,0 +1,277 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2016 6WIND S.A.
+ * Copyright 2016 Mellanox.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-pedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-pedantic"
+#endif
+
+/* DPDK headers don't like -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-pedantic"
+#endif
+#include <rte_mempool.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-pedantic"
+#endif
+
+#include "mlx5.h"
+#include "mlx5_rxtx.h"
+
+struct mlx5_check_mempool_data {
+ int ret;
+ char *start;
+ char *end;
+};
+
+/* Called by mlx5_check_mempool() when iterating the memory chunks. */
+static void mlx5_check_mempool_cb(struct rte_mempool *mp,
+ void *opaque, struct rte_mempool_memhdr *memhdr,
+ unsigned mem_idx)
+{
+ struct mlx5_check_mempool_data *data = opaque;
+
+ (void)mp;
+ (void)mem_idx;
+
+ /* It already failed, skip the next chunks. */
+ if (data->ret != 0)
+ return;
+ /* It is the first chunk. */
+ if (data->start == NULL && data->end == NULL) {
+ data->start = memhdr->addr;
+ data->end = data->start + memhdr->len;
+ return;
+ }
+ if (data->end == memhdr->addr) {
+ data->end += memhdr->len;
+ return;
+ }
+ if (data->start == (char *)memhdr->addr + memhdr->len) {
+ data->start -= memhdr->len;
+ return;
+ }
+ /* Error, mempool is not virtually contiguous. */
+ data->ret = -1;
+}
+
+/**
+ * Check if a mempool can be used: it must be virtually contiguous.
+ *
+ * @param[in] mp
+ * Pointer to memory pool.
+ * @param[out] start
+ * Pointer to the start address of the mempool virtual memory area
+ * @param[out] end
+ * Pointer to the end address of the mempool virtual memory area
+ *
+ * @return
+ * 0 on success (mempool is virtually contiguous), -1 on error.
+ */
+static int mlx5_check_mempool(struct rte_mempool *mp, uintptr_t *start,
+ uintptr_t *end)
+{
+ struct mlx5_check_mempool_data data;
+
+ memset(&data, 0, sizeof(data));
+ rte_mempool_mem_iter(mp, mlx5_check_mempool_cb, &data);
+ *start = (uintptr_t)data.start;
+ *end = (uintptr_t)data.end;
+
+ return data.ret;
+}
+
+/**
+ * Register mempool as a memory region.
+ *
+ * @param pd
+ * Pointer to protection domain.
+ * @param mp
+ * Pointer to memory pool.
+ *
+ * @return
+ * Memory region pointer, NULL in case of error.
+ */
+struct ibv_mr *
+mlx5_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
+{
+ const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+ uintptr_t start;
+ uintptr_t end;
+ unsigned int i;
+
+ if (mlx5_check_mempool(mp, &start, &end) != 0) {
+ ERROR("mempool %p: not virtually contiguous",
+ (void *)mp);
+ return NULL;
+ }
+
+ DEBUG("mempool %p area start=%p end=%p size=%zu",
+ (void *)mp, (void *)start, (void *)end,
+ (size_t)(end - start));
+ /* Round start and end to page boundary if found in memory segments. */
+ for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
+ uintptr_t addr = (uintptr_t)ms[i].addr;
+ size_t len = ms[i].len;
+ unsigned int align = ms[i].hugepage_sz;
+
+ if ((start > addr) && (start < addr + len))
+ start = RTE_ALIGN_FLOOR(start, align);
+ if ((end > addr) && (end < addr + len))
+ end = RTE_ALIGN_CEIL(end, align);
+ }
+ DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
+ (void *)mp, (void *)start, (void *)end,
+ (size_t)(end - start));
+ return ibv_reg_mr(pd,
+ (void *)start,
+ end - start,
+ IBV_ACCESS_LOCAL_WRITE);
+}
+
+/**
+ * Register a Memory Region (MR) <-> Memory Pool (MP) association in
+ * txq->mp2mr[]. If mp2mr[] is full, remove an entry first.
+ *
+ * This function should only be called by txq_mp2mr().
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param[in] mp
+ * Memory Pool for which a Memory Region lkey must be returned.
+ * @param idx
+ * Index of the next available entry.
+ *
+ * @return
+ * mr->lkey on success, (uint32_t)-1 on failure.
+ */
+uint32_t
+txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx)
+{
+ struct ibv_mr *mr;
+
+ /* Add a new entry, register MR first. */
+ DEBUG("%p: discovered new memory pool \"%s\" (%p)",
+ (void *)txq, mp->name, (void *)mp);
+ mr = mlx5_mp2mr(txq->priv->pd, mp);
+ if (unlikely(mr == NULL)) {
+ DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
+ (void *)txq);
+ return (uint32_t)-1;
+ }
+ if (unlikely(idx == RTE_DIM(txq->mp2mr))) {
+ /* Table is full, remove oldest entry. */
+ DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
+ (void *)txq);
+ --idx;
+ claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
+ memmove(&txq->mp2mr[0], &txq->mp2mr[1],
+ (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
+ }
+ /* Store the new entry. */
+ txq->mp2mr[idx].mp = mp;
+ txq->mp2mr[idx].mr = mr;
+ txq->mp2mr[idx].lkey = mr->lkey;
+ DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
+ (void *)txq, mp->name, (void *)mp, txq->mp2mr[idx].lkey);
+ return txq->mp2mr[idx].lkey;
+}
+
+struct txq_mp2mr_mbuf_check_data {
+ int ret;
+};
+
+/**
+ * Callback function for rte_mempool_obj_iter() to check whether a given
+ * mempool object looks like a mbuf.
+ *
+ * @param[in] mp
+ * The mempool pointer
+ * @param[in] arg
+ * Context data (struct txq_mp2mr_mbuf_check_data). Contains the
+ * return value.
+ * @param[in] obj
+ * Object address.
+ * @param index
+ * Object index, unused.
+ */
+static void
+txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj,
+ uint32_t index __rte_unused)
+{
+ struct txq_mp2mr_mbuf_check_data *data = arg;
+ struct rte_mbuf *buf = obj;
+
+ /* Check whether mbuf structure fits element size and whether mempool
+ * pointer is valid. */
+ if (sizeof(*buf) > mp->elt_size || buf->pool != mp)
+ data->ret = -1;
+}
+
+/**
+ * Iterator function for rte_mempool_walk() to register existing mempools and
+ * fill the MP to MR cache of a TX queue.
+ *
+ * @param[in] mp
+ * Memory Pool to register.
+ * @param *arg
+ * Pointer to TX queue structure.
+ */
+void
+txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
+{
+ struct txq *txq = arg;
+ struct txq_mp2mr_mbuf_check_data data = {
+ .ret = 0,
+ };
+ unsigned int i;
+
+ /* Register mempool only if the first element looks like a mbuf. */
+ if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 ||
+ data.ret == -1)
+ return;
+ for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
+ if (unlikely(txq->mp2mr[i].mp == NULL)) {
+ /* Unknown MP, add a new MR for it. */
+ break;
+ }
+ if (txq->mp2mr[i].mp == mp)
+ return;
+ }
+ txq_mp2mr_reg(txq, mp, i);
+}
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 9cb1dfa..616cf7a 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -140,121 +140,6 @@ txq_complete(struct txq *txq)
return 0;
}

-struct mlx5_check_mempool_data {
- int ret;
- char *start;
- char *end;
-};
-
-/* Called by mlx5_check_mempool() when iterating the memory chunks. */
-static void mlx5_check_mempool_cb(struct rte_mempool *mp,
- void *opaque, struct rte_mempool_memhdr *memhdr,
- unsigned mem_idx)
-{
- struct mlx5_check_mempool_data *data = opaque;
-
- (void)mp;
- (void)mem_idx;
-
- /* It already failed, skip the next chunks. */
- if (data->ret != 0)
- return;
- /* It is the first chunk. */
- if (data->start == NULL && data->end == NULL) {
- data->start = memhdr->addr;
- data->end = data->start + memhdr->len;
- return;
- }
- if (data->end == memhdr->addr) {
- data->end += memhdr->len;
- return;
- }
- if (data->start == (char *)memhdr->addr + memhdr->len) {
- data->start -= memhdr->len;
- return;
- }
- /* Error, mempool is not virtually contigous. */
- data->ret = -1;
-}
-
-/**
- * Check if a mempool can be used: it must be virtually contiguous.
- *
- * @param[in] mp
- * Pointer to memory pool.
- * @param[out] start
- * Pointer to the start address of the mempool virtual memory area
- * @param[out] end
- * Pointer to the end address of the mempool virtual memory area
- *
- * @return
- * 0 on success (mempool is virtually contiguous), -1 on error.
- */
-static int mlx5_check_mempool(struct rte_mempool *mp, uintptr_t *start,
- uintptr_t *end)
-{
- struct mlx5_check_mempool_data data;
-
- memset(&data, 0, sizeof(data));
- rte_mempool_mem_iter(mp, mlx5_check_mempool_cb, &data);
- *start = (uintptr_t)data.start;
- *end = (uintptr_t)data.end;
-
- return data.ret;
-}
-
-/* For best performance, this function should not be inlined. */
-struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, struct rte_mempool *)
- __attribute__((noinline));
-
-/**
- * Register mempool as a memory region.
- *
- * @param pd
- * Pointer to protection domain.
- * @param mp
- * Pointer to memory pool.
- *
- * @return
- * Memory region pointer, NULL in case of error.
- */
-struct ibv_mr *
-mlx5_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
-{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
- uintptr_t start;
- uintptr_t end;
- unsigned int i;
-
- if (mlx5_check_mempool(mp, &start, &end) != 0) {
- ERROR("mempool %p: not virtually contiguous",
- (void *)mp);
- return NULL;
- }
-
- DEBUG("mempool %p area start=%p end=%p size=%zu",
- (void *)mp, (void *)start, (void *)end,
- (size_t)(end - start));
- /* Round start and end to page boundary if found in memory segments. */
- for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
- uintptr_t addr = (uintptr_t)ms[i].addr;
- size_t len = ms[i].len;
- unsigned int align = ms[i].hugepage_sz;
-
- if ((start > addr) && (start < addr + len))
- start = RTE_ALIGN_FLOOR(start, align);
- if ((end > addr) && (end < addr + len))
- end = RTE_ALIGN_CEIL(end, align);
- }
- DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
- (void *)mp, (void *)start, (void *)end,
- (size_t)(end - start));
- return ibv_reg_mr(pd,
- (void *)start,
- end - start,
- IBV_ACCESS_LOCAL_WRITE);
-}
-
/**
* Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
* the cloned mbuf is allocated is returned instead.
@@ -273,6 +158,10 @@ txq_mb2mp(struct rte_mbuf *buf)
return buf->pool;
}

+static inline uint32_t
+txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+ __attribute__((always_inline));
+
/**
* Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
* Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
@@ -286,11 +175,11 @@ txq_mb2mp(struct rte_mbuf *buf)
* @return
* mr->lkey on success, (uint32_t)-1 on failure.
*/
-static uint32_t
+static inline uint32_t
txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
{
unsigned int i;
- struct ibv_mr *mr;
+ uint32_t lkey = (uint32_t)-1;

for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
if (unlikely(txq->mp2mr[i].mp == NULL)) {
@@ -300,89 +189,13 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
if (txq->mp2mr[i].mp == mp) {
assert(txq->mp2mr[i].lkey != (uint32_t)-1);
assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
- return txq->mp2mr[i].lkey;
+ lkey = txq->mp2mr[i].lkey;
+ break;
}
}
- /* Add a new entry, register MR first. */
- DEBUG("%p: discovered new memory pool \"%s\" (%p)",
- (void *)txq, mp->name, (void *)mp);
- mr = mlx5_mp2mr(txq->priv->pd, mp);
- if (unlikely(mr == NULL)) {
- DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
- (void *)txq);
- return (uint32_t)-1;
- }
- if (unlikely(i == RTE_DIM(txq->mp2mr))) {
- /* Table is full, remove oldest entry. */
- DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
- (void *)txq);
- --i;
- claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
- memmove(&txq->mp2mr[0], &txq->mp2mr[1],
- (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
- }
- /* Store the new entry. */
- txq->mp2mr[i].mp = mp;
- txq->mp2mr[i].mr = mr;
- txq->mp2mr[i].lkey = mr->lkey;
- DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
- (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
- return txq->mp2mr[i].lkey;
-}
-
-struct txq_mp2mr_mbuf_check_data {
- int ret;
-};
-
-/**
- * Callback function for rte_mempool_obj_iter() to check whether a given
- * mempool object looks like a mbuf.
- *
- * @param[in] mp
- * The mempool pointer
- * @param[in] arg
- * Context data (struct txq_mp2mr_mbuf_check_data). Contains the
- * return value.
- * @param[in] obj
- * Object address.
- * @param index
- * Object index, unused.
- */
-static void
-txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj,
- uint32_t index __rte_unused)
-{
- struct txq_mp2mr_mbuf_check_data *data = arg;
- struct rte_mbuf *buf = obj;
-
- /* Check whether mbuf structure fits element size and whether mempool
- * pointer is valid. */
- if (sizeof(*buf) > mp->elt_size || buf->pool != mp)
- data->ret = -1;
-}
-
-/**
- * Iterator function for rte_mempool_walk() to register existing mempools and
- * fill the MP to MR cache of a TX queue.
- *
- * @param[in] mp
- * Memory Pool to register.
- * @param *arg
- * Pointer to TX queue structure.
- */
-void
-txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
-{
- struct txq *txq = arg;
- struct txq_mp2mr_mbuf_check_data data = {
- .ret = 0,
- };
-
- /* Register mempool only if the first element looks like a mbuf. */
- if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 ||
- data.ret == -1)
- return;
- txq_mp2mr(txq, mp);
+ if (unlikely(lkey == (uint32_t)-1))
+ lkey = txq_mp2mr_reg(txq, mp, i);
+ return lkey;
}

/**
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 47f6299..462eddf 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -337,12 +337,16 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);

/* mlx5_rxtx.c */

-struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, struct rte_mempool *);
-void txq_mp2mr_iter(struct rte_mempool *, void *);
uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_rx_burst_sp(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);

+/* mlx5_mr.c */
+
+struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, struct rte_mempool *);
+void txq_mp2mr_iter(struct rte_mempool *, void *);
+uint32_t txq_mp2mr_reg(struct txq *, struct rte_mempool *, unsigned int);
+
#endif /* RTE_PMD_MLX5_RXTX_H_ */
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:47:49 UTC
This is done in preparation for bypassing Verbs entirely in the data path
as a performance improvement. TX gather cannot be maintained during the
transition and will be reimplemented later.

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_ethdev.c | 2 +-
drivers/net/mlx5/mlx5_rxtx.c | 315 ++++++++---------------------------------
drivers/net/mlx5/mlx5_rxtx.h | 17 ---
drivers/net/mlx5/mlx5_txq.c | 49 ++-----
4 files changed, 69 insertions(+), 314 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index d2a63b8..29aec49 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1261,7 +1261,7 @@ mlx5_secondary_data_setup(struct priv *priv)
if (txq != NULL) {
if (txq_setup(priv->dev,
txq,
- primary_txq->elts_n * MLX5_PMD_SGE_WR_N,
+ primary_txq->elts_n,
primary_txq->socket,
NULL) == 0) {
txq->stats.idx = primary_txq->stats.idx;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 616cf7a..6e184c3 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -228,156 +228,6 @@ insert_vlan_sw(struct rte_mbuf *buf)
return 0;
}

-#if MLX5_PMD_SGE_WR_N > 1
-
-/**
- * Copy scattered mbuf contents to a single linear buffer.
- *
- * @param[out] linear
- * Linear output buffer.
- * @param[in] buf
- * Scattered input buffer.
- *
- * @return
- * Number of bytes copied to the output buffer or 0 if not large enough.
- */
-static unsigned int
-linearize_mbuf(linear_t *linear, struct rte_mbuf *buf)
-{
- unsigned int size = 0;
- unsigned int offset;
-
- do {
- unsigned int len = DATA_LEN(buf);
-
- offset = size;
- size += len;
- if (unlikely(size > sizeof(*linear)))
- return 0;
- memcpy(&(*linear)[offset],
- rte_pktmbuf_mtod(buf, uint8_t *),
- len);
- buf = NEXT(buf);
- } while (buf != NULL);
- return size;
-}
-
-/**
- * Handle scattered buffers for mlx5_tx_burst().
- *
- * @param txq
- * TX queue structure.
- * @param segs
- * Number of segments in buf.
- * @param elt
- * TX queue element to fill.
- * @param[in] buf
- * Buffer to process.
- * @param elts_head
- * Index of the linear buffer to use if necessary (normally txq->elts_head).
- * @param[out] sges
- * Array filled with SGEs on success.
- *
- * @return
- * A structure containing the processed packet size in bytes and the
- * number of SGEs. Both fields are set to (unsigned int)-1 in case of
- * failure.
- */
-static struct tx_burst_sg_ret {
- unsigned int length;
- unsigned int num;
-}
-tx_burst_sg(struct txq *txq, unsigned int segs, struct txq_elt *elt,
- struct rte_mbuf *buf, unsigned int elts_head,
- struct ibv_sge (*sges)[MLX5_PMD_SGE_WR_N])
-{
- unsigned int sent_size = 0;
- unsigned int j;
- int linearize = 0;
-
- /* When there are too many segments, extra segments are
- * linearized in the last SGE. */
- if (unlikely(segs > RTE_DIM(*sges))) {
- segs = (RTE_DIM(*sges) - 1);
- linearize = 1;
- }
- /* Update element. */
- elt->buf = buf;
- /* Register segments as SGEs. */
- for (j = 0; (j != segs); ++j) {
- struct ibv_sge *sge = &(*sges)[j];
- uint32_t lkey;
-
- /* Retrieve Memory Region key for this memory pool. */
- lkey = txq_mp2mr(txq, txq_mb2mp(buf));
- if (unlikely(lkey == (uint32_t)-1)) {
- /* MR does not exist. */
- DEBUG("%p: unable to get MP <-> MR association",
- (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
- /* Update SGE. */
- sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
- if (txq->priv->sriov)
- rte_prefetch0((volatile void *)
- (uintptr_t)sge->addr);
- sge->length = DATA_LEN(buf);
- sge->lkey = lkey;
- sent_size += sge->length;
- buf = NEXT(buf);
- }
- /* If buf is not NULL here and is not going to be linearized,
- * nb_segs is not valid. */
- assert(j == segs);
- assert((buf == NULL) || (linearize));
- /* Linearize extra segments. */
- if (linearize) {
- struct ibv_sge *sge = &(*sges)[segs];
- linear_t *linear = &(*txq->elts_linear)[elts_head];
- unsigned int size = linearize_mbuf(linear, buf);
-
- assert(segs == (RTE_DIM(*sges) - 1));
- if (size == 0) {
- /* Invalid packet. */
- DEBUG("%p: packet too large to be linearized.",
- (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
- /* If MLX5_PMD_SGE_WR_N is 1, free mbuf immediately. */
- if (RTE_DIM(*sges) == 1) {
- do {
- struct rte_mbuf *next = NEXT(buf);
-
- rte_pktmbuf_free_seg(buf);
- buf = next;
- } while (buf != NULL);
- elt->buf = NULL;
- }
- /* Update SGE. */
- sge->addr = (uintptr_t)&(*linear)[0];
- sge->length = size;
- sge->lkey = txq->mr_linear->lkey;
- sent_size += size;
- /* Include last segment. */
- segs++;
- }
- return (struct tx_burst_sg_ret){
- .length = sent_size,
- .num = segs,
- };
-stop:
- return (struct tx_burst_sg_ret){
- .length = -1,
- .num = -1,
- };
-}
-
-#endif /* MLX5_PMD_SGE_WR_N > 1 */
-
/**
* DPDK callback for TX.
*
@@ -424,14 +274,14 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
unsigned int elts_head_next =
(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
struct txq_elt *elt = &(*txq->elts)[elts_head];
- unsigned int segs = NB_SEGS(buf);
-#ifdef MLX5_PMD_SOFT_COUNTERS
- unsigned int sent_size = 0;
-#endif
uint32_t send_flags = 0;
#ifdef HAVE_VERBS_VLAN_INSERTION
int insert_vlan = 0;
#endif /* HAVE_VERBS_VLAN_INSERTION */
+ uintptr_t addr;
+ uint32_t length;
+ uint32_t lkey;
+ uintptr_t buf_next_addr;

if (i + 1 < max)
rte_prefetch0(buf_next);
@@ -464,126 +314,81 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
goto stop;
}
}
- if (likely(segs == 1)) {
- uintptr_t addr;
- uint32_t length;
- uint32_t lkey;
- uintptr_t buf_next_addr;
-
- /* Retrieve buffer information. */
- addr = rte_pktmbuf_mtod(buf, uintptr_t);
- length = DATA_LEN(buf);
- /* Update element. */
- elt->buf = buf;
- if (txq->priv->sriov)
- rte_prefetch0((volatile void *)
- (uintptr_t)addr);
- /* Prefetch next buffer data. */
- if (i + 1 < max) {
- buf_next_addr =
- rte_pktmbuf_mtod(buf_next, uintptr_t);
- rte_prefetch0((volatile void *)
- (uintptr_t)buf_next_addr);
- }
- /* Put packet into send queue. */
+ /* Retrieve buffer information. */
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ length = DATA_LEN(buf);
+ /* Update element. */
+ elt->buf = buf;
+ if (txq->priv->sriov)
+ rte_prefetch0((volatile void *)
+ (uintptr_t)addr);
+ /* Prefetch next buffer data. */
+ if (i + 1 < max) {
+ buf_next_addr =
+ rte_pktmbuf_mtod(buf_next, uintptr_t);
+ rte_prefetch0((volatile void *)
+ (uintptr_t)buf_next_addr);
+ }
+ /* Put packet into send queue. */
#if MLX5_PMD_MAX_INLINE > 0
- if (length <= txq->max_inline) {
+ if (length <= txq->max_inline) {
#ifdef HAVE_VERBS_VLAN_INSERTION
- if (insert_vlan)
- err = txq->send_pending_inline_vlan
- (txq->qp,
- (void *)addr,
- length,
- send_flags,
- &buf->vlan_tci);
- else
-#endif /* HAVE_VERBS_VLAN_INSERTION */
- err = txq->send_pending_inline
- (txq->qp,
- (void *)addr,
- length,
- send_flags);
- } else
-#endif
- {
- /* Retrieve Memory Region key for this
- * memory pool. */
- lkey = txq_mp2mr(txq, txq_mb2mp(buf));
- if (unlikely(lkey == (uint32_t)-1)) {
- /* MR does not exist. */
- DEBUG("%p: unable to get MP <-> MR"
- " association", (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
-#ifdef HAVE_VERBS_VLAN_INSERTION
- if (insert_vlan)
- err = txq->send_pending_vlan
- (txq->qp,
- addr,
- length,
- lkey,
- send_flags,
- &buf->vlan_tci);
- else
+ if (insert_vlan)
+ err = txq->send_pending_inline_vlan
+ (txq->qp,
+ (void *)addr,
+ length,
+ send_flags,
+ &buf->vlan_tci);
+ else
#endif /* HAVE_VERBS_VLAN_INSERTION */
- err = txq->send_pending
- (txq->qp,
- addr,
- length,
- lkey,
- send_flags);
- }
- if (unlikely(err))
- goto stop;
-#ifdef MLX5_PMD_SOFT_COUNTERS
- sent_size += length;
+ err = txq->send_pending_inline
+ (txq->qp,
+ (void *)addr,
+ length,
+ send_flags);
+ } else
#endif
- } else {
-#if MLX5_PMD_SGE_WR_N > 1
- struct ibv_sge sges[MLX5_PMD_SGE_WR_N];
- struct tx_burst_sg_ret ret;
-
- ret = tx_burst_sg(txq, segs, elt, buf, elts_head,
- &sges);
- if (ret.length == (unsigned int)-1)
+ {
+ /* Retrieve Memory Region key for this
+ * memory pool. */
+ lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+ if (unlikely(lkey == (uint32_t)-1)) {
+ /* MR does not exist. */
+ DEBUG("%p: unable to get MP <-> MR"
+ " association", (void *)txq);
+ /* Clean up TX element. */
+ elt->buf = NULL;
goto stop;
- /* Put SG list into send queue. */
+ }
#ifdef HAVE_VERBS_VLAN_INSERTION
if (insert_vlan)
- err = txq->send_pending_sg_list_vlan
+ err = txq->send_pending_vlan
(txq->qp,
- sges,
- ret.num,
+ addr,
+ length,
+ lkey,
send_flags,
&buf->vlan_tci);
else
#endif /* HAVE_VERBS_VLAN_INSERTION */
- err = txq->send_pending_sg_list
+ err = txq->send_pending
(txq->qp,
- sges,
- ret.num,
+ addr,
+ length,
+ lkey,
send_flags);
- if (unlikely(err))
- goto stop;
-#ifdef MLX5_PMD_SOFT_COUNTERS
- sent_size += ret.length;
-#endif
-#else /* MLX5_PMD_SGE_WR_N > 1 */
- DEBUG("%p: TX scattered buffers support not"
- " compiled in", (void *)txq);
- goto stop;
-#endif /* MLX5_PMD_SGE_WR_N > 1 */
}
- elts_head = elts_head_next;
- buf = buf_next;
+ if (unlikely(err))
+ goto stop;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent bytes counter. */
- txq->stats.obytes += sent_size;
+ txq->stats.obytes += length;
#endif
- }
stop:
+ elts_head = elts_head_next;
+ buf = buf_next;
+ }
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 462eddf..8358ccb 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -242,14 +242,6 @@ struct txq_elt {
struct rte_mbuf *buf;
};

-/* Linear buffer type. It is used when transmitting buffers with too many
- * segments that do not fit the hardware queue (see max_send_sge).
- * Extra segments are copied (linearized) in such buffers, replacing the
- * last SGE during TX.
- * The size is arbitrary but large enough to hold a jumbo frame with
- * 8 segments considering mbuf.buf_len is about 2048 bytes. */
-typedef uint8_t linear_t[16384];
-
/* TX queue descriptor. */
struct txq {
struct priv *priv; /* Back pointer to private data. */
@@ -264,12 +256,6 @@ struct txq {
int (*send_pending_inline_vlan)();
#endif
#endif
-#if MLX5_PMD_SGE_WR_N > 1
- int (*send_pending_sg_list)();
-#ifdef HAVE_VERBS_VLAN_INSERTION
- int (*send_pending_sg_list_vlan)();
-#endif
-#endif
int (*send_flush)(struct ibv_qp *qp);
struct ibv_cq *cq; /* Completion Queue. */
struct ibv_qp *qp; /* Queue Pair. */
@@ -289,9 +275,6 @@ struct txq {
uint32_t lkey; /* mr->lkey */
} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
struct mlx5_txq_stats stats; /* TX queue counters. */
- /* Elements used only for init part are here. */
- linear_t (*elts_linear)[]; /* Linearized buffers. */
- struct ibv_mr *mr_linear; /* Memory Region for linearized buffers. */
#ifdef HAVE_VERBS_VLAN_INSERTION
struct ibv_exp_qp_burst_family_v1 *if_qp; /* QP burst interface. */
#else
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index e20df21..5a248c9 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -82,26 +82,13 @@ txq_alloc_elts(struct txq *txq, unsigned int elts_n)
unsigned int i;
struct txq_elt (*elts)[elts_n] =
rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq->socket);
- linear_t (*elts_linear)[elts_n] =
- rte_calloc_socket("TXQ", 1, sizeof(*elts_linear), 0,
- txq->socket);
- struct ibv_mr *mr_linear = NULL;
int ret = 0;

- if ((elts == NULL) || (elts_linear == NULL)) {
+ if (elts == NULL) {
ERROR("%p: can't allocate packets array", (void *)txq);
ret = ENOMEM;
goto error;
}
- mr_linear =
- ibv_reg_mr(txq->priv->pd, elts_linear, sizeof(*elts_linear),
- IBV_ACCESS_LOCAL_WRITE);
- if (mr_linear == NULL) {
- ERROR("%p: unable to configure MR, ibv_reg_mr() failed",
- (void *)txq);
- ret = EINVAL;
- goto error;
- }
for (i = 0; (i != elts_n); ++i) {
struct txq_elt *elt = &(*elts)[i];

@@ -119,15 +106,9 @@ txq_alloc_elts(struct txq *txq, unsigned int elts_n)
((MLX5_PMD_TX_PER_COMP_REQ < (elts_n / 4)) ?
MLX5_PMD_TX_PER_COMP_REQ : (elts_n / 4));
txq->elts_comp_cd = txq->elts_comp_cd_init;
- txq->elts_linear = elts_linear;
- txq->mr_linear = mr_linear;
assert(ret == 0);
return 0;
error:
- if (mr_linear != NULL)
- claim_zero(ibv_dereg_mr(mr_linear));
-
- rte_free(elts_linear);
rte_free(elts);

DEBUG("%p: failed, freed everything", (void *)txq);
@@ -148,8 +129,6 @@ txq_free_elts(struct txq *txq)
unsigned int elts_head = txq->elts_head;
unsigned int elts_tail = txq->elts_tail;
struct txq_elt (*elts)[elts_n] = txq->elts;
- linear_t (*elts_linear)[elts_n] = txq->elts_linear;
- struct ibv_mr *mr_linear = txq->mr_linear;

DEBUG("%p: freeing WRs", (void *)txq);
txq->elts_n = 0;
@@ -159,12 +138,7 @@ txq_free_elts(struct txq *txq)
txq->elts_comp_cd = 0;
txq->elts_comp_cd_init = 0;
txq->elts = NULL;
- txq->elts_linear = NULL;
- txq->mr_linear = NULL;
- if (mr_linear != NULL)
- claim_zero(ibv_dereg_mr(mr_linear));

- rte_free(elts_linear);
if (elts == NULL)
return;
while (elts_tail != elts_head) {
@@ -286,12 +260,14 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
int ret = 0;

(void)conf; /* Thresholds configuration (ignored). */
- if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
- ERROR("%p: invalid number of TX descriptors (must be a"
- " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
+ if (desc == 0) {
+ ERROR("%p: invalid number of TX descriptors", (void *)dev);
+ return EINVAL;
+ }
+ if (MLX5_PMD_SGE_WR_N > 1) {
+ ERROR("%p: TX gather is not supported", (void *)dev);
return EINVAL;
}
- desc /= MLX5_PMD_SGE_WR_N;
/* MRs will be registered in mp2mr[] later. */
attr.rd = (struct ibv_exp_res_domain_init_attr){
.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
@@ -332,10 +308,7 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
priv->device_attr.max_qp_wr :
desc),
/* Max number of scatter/gather elements in a WR. */
- .max_send_sge = ((priv->device_attr.max_sge <
- MLX5_PMD_SGE_WR_N) ?
- priv->device_attr.max_sge :
- MLX5_PMD_SGE_WR_N),
+ .max_send_sge = 1,
#if MLX5_PMD_MAX_INLINE > 0
.max_inline_data = MLX5_PMD_MAX_INLINE,
#endif
@@ -440,12 +413,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
txq->send_pending_inline_vlan = txq->if_qp->send_pending_inline_vlan;
#endif
#endif
-#if MLX5_PMD_SGE_WR_N > 1
- txq->send_pending_sg_list = txq->if_qp->send_pending_sg_list;
-#ifdef HAVE_VERBS_VLAN_INSERTION
- txq->send_pending_sg_list_vlan = txq->if_qp->send_pending_sg_list_vlan;
-#endif
-#endif
txq->send_pending = txq->if_qp->send_pending;
#ifdef HAVE_VERBS_VLAN_INSERTION
txq->send_pending_vlan = txq->if_qp->send_pending_vlan;
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:47:50 UTC
This is done in preparation for bypassing Verbs entirely in the data path
as a performance improvement. RX scatter cannot be maintained during the
transition and will be reimplemented later.

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_ethdev.c | 31 +---
drivers/net/mlx5/mlx5_rxq.c | 314 ++++++-----------------------------------
drivers/net/mlx5/mlx5_rxtx.c | 211 +--------------------------
drivers/net/mlx5/mlx5_rxtx.h | 13 +-
4 files changed, 53 insertions(+), 516 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 29aec49..bab826c 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -624,8 +624,7 @@ mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)

};

- if (dev->rx_pkt_burst == mlx5_rx_burst ||
- dev->rx_pkt_burst == mlx5_rx_burst_sp)
+ if (dev->rx_pkt_burst == mlx5_rx_burst)
return ptypes;
return NULL;
}
@@ -763,19 +762,11 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
mb_len = rte_pktmbuf_data_room_size(rxq->mp);
assert(mb_len >= RTE_PKTMBUF_HEADROOM);
sp = (max_frame_len > (mb_len - RTE_PKTMBUF_HEADROOM));
- /* Provide new values to rxq_setup(). */
- dev->data->dev_conf.rxmode.jumbo_frame = sp;
- dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len;
- ret = rxq_rehash(dev, rxq);
- if (ret) {
- /* Force SP RX if that queue requires it and abort. */
- if (rxq->sp)
- rx_func = mlx5_rx_burst_sp;
- break;
+ if (sp) {
+ ERROR("%p: RX scatter is not supported", (void *)dev);
+ ret = ENOTSUP;
+ goto out;
}
- /* Scattered burst function takes priority. */
- if (rxq->sp)
- rx_func = mlx5_rx_burst_sp;
}
/* Burst functions can now be called again. */
rte_wmb();
@@ -1104,22 +1095,12 @@ priv_set_link(struct priv *priv, int up)
{
struct rte_eth_dev *dev = priv->dev;
int err;
- unsigned int i;

if (up) {
err = priv_set_flags(priv, ~IFF_UP, IFF_UP);
if (err)
return err;
- for (i = 0; i < priv->rxqs_n; i++)
- if ((*priv->rxqs)[i]->sp)
- break;
- /* Check if an sp queue exists.
- * Note: Some old frames might be received.
- */
- if (i == priv->rxqs_n)
- dev->rx_pkt_burst = mlx5_rx_burst;
- else
- dev->rx_pkt_burst = mlx5_rx_burst_sp;
+ dev->rx_pkt_burst = mlx5_rx_burst;
dev->tx_pkt_burst = mlx5_tx_burst;
} else {
err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP);
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 0bcf55b..38ff9fd 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -634,145 +634,6 @@ priv_rehash_flows(struct priv *priv)
}

/**
- * Allocate RX queue elements with scattered packets support.
- *
- * @param rxq
- * Pointer to RX queue structure.
- * @param elts_n
- * Number of elements to allocate.
- * @param[in] pool
- * If not NULL, fetch buffers from this array instead of allocating them
- * with rte_pktmbuf_alloc().
- *
- * @return
- * 0 on success, errno value on failure.
- */
-static int
-rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n,
- struct rte_mbuf **pool)
-{
- unsigned int i;
- struct rxq_elt_sp (*elts)[elts_n] =
- rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
- rxq->socket);
- int ret = 0;
-
- if (elts == NULL) {
- ERROR("%p: can't allocate packets array", (void *)rxq);
- ret = ENOMEM;
- goto error;
- }
- /* For each WR (packet). */
- for (i = 0; (i != elts_n); ++i) {
- unsigned int j;
- struct rxq_elt_sp *elt = &(*elts)[i];
- struct ibv_sge (*sges)[RTE_DIM(elt->sges)] = &elt->sges;
-
- /* These two arrays must have the same size. */
- assert(RTE_DIM(elt->sges) == RTE_DIM(elt->bufs));
- /* For each SGE (segment). */
- for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
- struct ibv_sge *sge = &(*sges)[j];
- struct rte_mbuf *buf;
-
- if (pool != NULL) {
- buf = *(pool++);
- assert(buf != NULL);
- rte_pktmbuf_reset(buf);
- } else
- buf = rte_pktmbuf_alloc(rxq->mp);
- if (buf == NULL) {
- assert(pool == NULL);
- ERROR("%p: empty mbuf pool", (void *)rxq);
- ret = ENOMEM;
- goto error;
- }
- elt->bufs[j] = buf;
- /* Headroom is reserved by rte_pktmbuf_alloc(). */
- assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
- /* Buffer is supposed to be empty. */
- assert(rte_pktmbuf_data_len(buf) == 0);
- assert(rte_pktmbuf_pkt_len(buf) == 0);
- /* sge->addr must be able to store a pointer. */
- assert(sizeof(sge->addr) >= sizeof(uintptr_t));
- if (j == 0) {
- /* The first SGE keeps its headroom. */
- sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
- sge->length = (buf->buf_len -
- RTE_PKTMBUF_HEADROOM);
- } else {
- /* Subsequent SGEs lose theirs. */
- assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
- SET_DATA_OFF(buf, 0);
- sge->addr = (uintptr_t)buf->buf_addr;
- sge->length = buf->buf_len;
- }
- sge->lkey = rxq->mr->lkey;
- /* Redundant check for tailroom. */
- assert(sge->length == rte_pktmbuf_tailroom(buf));
- }
- }
- DEBUG("%p: allocated and configured %u WRs (%zu segments)",
- (void *)rxq, elts_n, (elts_n * RTE_DIM((*elts)[0].sges)));
- rxq->elts_n = elts_n;
- rxq->elts_head = 0;
- rxq->elts.sp = elts;
- assert(ret == 0);
- return 0;
-error:
- if (elts != NULL) {
- assert(pool == NULL);
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- unsigned int j;
- struct rxq_elt_sp *elt = &(*elts)[i];
-
- for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
- struct rte_mbuf *buf = elt->bufs[j];
-
- if (buf != NULL)
- rte_pktmbuf_free_seg(buf);
- }
- }
- rte_free(elts);
- }
- DEBUG("%p: failed, freed everything", (void *)rxq);
- assert(ret > 0);
- return ret;
-}
-
-/**
- * Free RX queue elements with scattered packets support.
- *
- * @param rxq
- * Pointer to RX queue structure.
- */
-static void
-rxq_free_elts_sp(struct rxq *rxq)
-{
- unsigned int i;
- unsigned int elts_n = rxq->elts_n;
- struct rxq_elt_sp (*elts)[elts_n] = rxq->elts.sp;
-
- DEBUG("%p: freeing WRs", (void *)rxq);
- rxq->elts_n = 0;
- rxq->elts.sp = NULL;
- if (elts == NULL)
- return;
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- unsigned int j;
- struct rxq_elt_sp *elt = &(*elts)[i];
-
- for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
- struct rte_mbuf *buf = elt->bufs[j];
-
- if (buf != NULL)
- rte_pktmbuf_free_seg(buf);
- }
- }
- rte_free(elts);
-}
-
-/**
* Allocate RX queue elements.
*
* @param rxq
@@ -838,7 +699,7 @@ rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
(void *)rxq, elts_n);
rxq->elts_n = elts_n;
rxq->elts_head = 0;
- rxq->elts.no_sp = elts;
+ rxq->elts = elts;
assert(ret == 0);
return 0;
error:
@@ -869,11 +730,11 @@ rxq_free_elts(struct rxq *rxq)
{
unsigned int i;
unsigned int elts_n = rxq->elts_n;
- struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp;
+ struct rxq_elt (*elts)[elts_n] = rxq->elts;

DEBUG("%p: freeing WRs", (void *)rxq);
rxq->elts_n = 0;
- rxq->elts.no_sp = NULL;
+ rxq->elts = NULL;
if (elts == NULL)
return;
for (i = 0; (i != RTE_DIM(*elts)); ++i) {
@@ -900,10 +761,7 @@ rxq_cleanup(struct rxq *rxq)
struct ibv_exp_release_intf_params params;

DEBUG("cleaning up %p", (void *)rxq);
- if (rxq->sp)
- rxq_free_elts_sp(rxq);
- else
- rxq_free_elts(rxq);
+ rxq_free_elts(rxq);
rxq->poll = NULL;
rxq->recv = NULL;
if (rxq->if_wq != NULL) {
@@ -973,12 +831,12 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
struct rte_mbuf **pool;
unsigned int i, k;
struct ibv_exp_wq_attr mod;
- unsigned int mb_len = rte_pktmbuf_data_room_size(rxq->mp);
+ struct rxq_elt (*elts)[tmpl.elts_n];
int err;

DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
/* Number of descriptors and mbufs currently allocated. */
- desc_n = (tmpl.elts_n * (tmpl.sp ? MLX5_PMD_SGE_WR_N : 1));
+ desc_n = tmpl.elts_n;
mbuf_n = desc_n;
/* Toggle RX checksum offload if hardware supports it. */
if (priv->hw_csum) {
@@ -989,22 +847,6 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
rxq->csum_l2tun = tmpl.csum_l2tun;
}
- /* Enable scattered packets support for this queue if necessary. */
- assert(mb_len >= RTE_PKTMBUF_HEADROOM);
- if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
- (dev->data->dev_conf.rxmode.max_rx_pkt_len >
- (mb_len - RTE_PKTMBUF_HEADROOM))) {
- tmpl.sp = 1;
- desc_n /= MLX5_PMD_SGE_WR_N;
- } else
- tmpl.sp = 0;
- DEBUG("%p: %s scattered packets support (%u WRs)",
- (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc_n);
- /* If scatter mode is the same as before, nothing to do. */
- if (tmpl.sp == rxq->sp) {
- DEBUG("%p: nothing to do", (void *)dev);
- return 0;
- }
/* From now on, any failure will render the queue unusable.
* Reinitialize WQ. */
mod = (struct ibv_exp_wq_attr){
@@ -1025,35 +867,18 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
}
/* Snatch mbufs from original queue. */
k = 0;
- if (rxq->sp) {
- struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- struct rxq_elt_sp *elt = &(*elts)[i];
- unsigned int j;
-
- for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
- assert(elt->bufs[j] != NULL);
- pool[k++] = elt->bufs[j];
- }
- }
- } else {
- struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct rte_mbuf *buf = elt->buf;
+ elts = rxq->elts;
+ for (i = 0; (i != RTE_DIM(*elts)); ++i) {
+ struct rxq_elt *elt = &(*elts)[i];
+ struct rte_mbuf *buf = elt->buf;

- pool[k++] = buf;
- }
+ pool[k++] = buf;
}
assert(k == mbuf_n);
tmpl.elts_n = 0;
- tmpl.elts.sp = NULL;
- assert((void *)&tmpl.elts.sp == (void *)&tmpl.elts.no_sp);
- err = ((tmpl.sp) ?
- rxq_alloc_elts_sp(&tmpl, desc_n, pool) :
- rxq_alloc_elts(&tmpl, desc_n, pool));
+ tmpl.elts = NULL;
+ assert((void *)&tmpl.elts == NULL);
+ err = rxq_alloc_elts(&tmpl, desc_n, pool);
if (err) {
ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
rte_free(pool);
@@ -1061,12 +886,11 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
return err;
}
assert(tmpl.elts_n == desc_n);
- assert(tmpl.elts.sp != NULL);
rte_free(pool);
/* Clean up original data. */
rxq->elts_n = 0;
- rte_free(rxq->elts.sp);
- rxq->elts.sp = NULL;
+ rte_free(rxq->elts);
+ rxq->elts = NULL;
/* Change queue state to ready. */
mod = (struct ibv_exp_wq_attr){
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
@@ -1080,28 +904,14 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
}
/* Post SGEs. */
assert(tmpl.if_wq != NULL);
- if (tmpl.sp) {
- struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- err = tmpl.if_wq->recv_sg_list
- (tmpl.wq,
- (*elts)[i].sges,
- RTE_DIM((*elts)[i].sges));
- if (err)
- break;
- }
- } else {
- struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- err = tmpl.if_wq->recv_burst(
- tmpl.wq,
- &(*elts)[i].sge,
- 1);
- if (err)
- break;
- }
+ elts = tmpl.elts;
+ for (i = 0; (i != RTE_DIM(*elts)); ++i) {
+ err = tmpl.if_wq->recv_burst(
+ tmpl.wq,
+ &(*elts)[i].sge,
+ 1);
+ if (err)
+ break;
}
if (err) {
ERROR("%p: failed to post SGEs with error %d",
@@ -1110,10 +920,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
err = EIO;
goto error;
}
- if (tmpl.sp)
- tmpl.recv = tmpl.if_wq->recv_sg_list;
- else
- tmpl.recv = tmpl.if_wq->recv_burst;
+ tmpl.recv = tmpl.if_wq->recv_burst;
error:
*rxq = tmpl;
assert(err >= 0);
@@ -1159,31 +966,26 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
} attr;
enum ibv_exp_query_intf_status status;
unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
+ struct rxq_elt (*elts)[desc];
int ret = 0;
unsigned int i;
unsigned int cq_size = desc;

(void)conf; /* Thresholds configuration (ignored). */
- if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
- ERROR("%p: invalid number of RX descriptors (must be a"
- " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
+ if (desc == 0) {
+ ERROR("%p: invalid number of RX descriptors", (void *)dev);
return EINVAL;
}
+ if (MLX5_PMD_SGE_WR_N > 1) {
+ ERROR("%p: RX scatter is not supported", (void *)dev);
+ return ENOTSUP;
+ }
/* Toggle RX checksum offload if hardware supports it. */
if (priv->hw_csum)
tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
if (priv->hw_csum_l2tun)
tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- /* Enable scattered packets support for this queue if necessary. */
- assert(mb_len >= RTE_PKTMBUF_HEADROOM);
- if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
- (dev->data->dev_conf.rxmode.max_rx_pkt_len >
- (mb_len - RTE_PKTMBUF_HEADROOM))) {
- tmpl.sp = 1;
- desc /= MLX5_PMD_SGE_WR_N;
- }
- DEBUG("%p: %s scattered packets support (%u WRs)",
- (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
+ (void)mb_len; /* I'll be back! */
/* Use the entire RX mempool as the memory region. */
tmpl.mr = mlx5_mp2mr(priv->pd, mp);
if (tmpl.mr == NULL) {
@@ -1232,10 +1034,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
priv->device_attr.max_qp_wr :
(int)cq_size),
/* Max number of scatter/gather elements in a WR. */
- .max_recv_sge = ((priv->device_attr.max_sge <
- MLX5_PMD_SGE_WR_N) ?
- priv->device_attr.max_sge :
- MLX5_PMD_SGE_WR_N),
+ .max_recv_sge = 1,
.pd = priv->pd,
.cq = tmpl.cq,
.comp_mask =
@@ -1297,10 +1096,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
(void *)dev, strerror(ret));
goto error;
}
- if (tmpl.sp)
- ret = rxq_alloc_elts_sp(&tmpl, desc, NULL);
- else
- ret = rxq_alloc_elts(&tmpl, desc, NULL);
+ ret = rxq_alloc_elts(&tmpl, desc, NULL);
if (ret) {
ERROR("%p: RXQ allocation failed: %s",
(void *)dev, strerror(ret));
@@ -1346,28 +1142,14 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
goto error;
}
/* Post SGEs. */
- if (tmpl.sp) {
- struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- ret = tmpl.if_wq->recv_sg_list
- (tmpl.wq,
- (*elts)[i].sges,
- RTE_DIM((*elts)[i].sges));
- if (ret)
- break;
- }
- } else {
- struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- ret = tmpl.if_wq->recv_burst(
- tmpl.wq,
- &(*elts)[i].sge,
- 1);
- if (ret)
- break;
- }
+ elts = tmpl.elts;
+ for (i = 0; (i != RTE_DIM(*elts)); ++i) {
+ ret = tmpl.if_wq->recv_burst(
+ tmpl.wq,
+ &(*elts)[i].sge,
+ 1);
+ if (ret)
+ break;
}
if (ret) {
ERROR("%p: failed to post SGEs with error %d",
@@ -1388,10 +1170,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
rxq->poll = rxq->if_cq->poll_length_flags;
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
- if (rxq->sp)
- rxq->recv = rxq->if_wq->recv_sg_list;
- else
- rxq->recv = rxq->if_wq->recv_burst;
+ rxq->recv = rxq->if_wq->recv_burst;
return 0;
error:
rxq_cleanup(&tmpl);
@@ -1466,10 +1245,7 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
(void *)dev, (void *)rxq);
(*priv->rxqs)[idx] = rxq;
/* Update receive callback. */
- if (rxq->sp)
- dev->rx_pkt_burst = mlx5_rx_burst_sp;
- else
- dev->rx_pkt_burst = mlx5_rx_burst;
+ dev->rx_pkt_burst = mlx5_rx_burst;
}
priv_unlock(priv);
return -ret;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 6e184c3..07d95eb 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -502,215 +502,8 @@ rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
}

/**
- * DPDK callback for RX with scattered packets support.
- *
- * @param dpdk_rxq
- * Generic pointer to RX queue structure.
- * @param[out] pkts
- * Array to store received packets.
- * @param pkts_n
- * Maximum number of packets in array.
- *
- * @return
- * Number of packets successfully received (<= pkts_n).
- */
-uint16_t
-mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
- struct rxq *rxq = (struct rxq *)dpdk_rxq;
- struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
- const unsigned int elts_n = rxq->elts_n;
- unsigned int elts_head = rxq->elts_head;
- unsigned int i;
- unsigned int pkts_ret = 0;
- int ret;
-
- if (unlikely(!rxq->sp))
- return mlx5_rx_burst(dpdk_rxq, pkts, pkts_n);
- if (unlikely(elts == NULL)) /* See RTE_DEV_CMD_SET_MTU. */
- return 0;
- for (i = 0; (i != pkts_n); ++i) {
- struct rxq_elt_sp *elt = &(*elts)[elts_head];
- unsigned int len;
- unsigned int pkt_buf_len;
- struct rte_mbuf *pkt_buf = NULL; /* Buffer returned in pkts. */
- struct rte_mbuf **pkt_buf_next = &pkt_buf;
- unsigned int seg_headroom = RTE_PKTMBUF_HEADROOM;
- unsigned int j = 0;
- uint32_t flags;
- uint16_t vlan_tci;
-
- /* Sanity checks. */
- assert(elts_head < rxq->elts_n);
- assert(rxq->elts_head < rxq->elts_n);
- ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
- if (unlikely(ret < 0)) {
- struct ibv_wc wc;
- int wcs_n;
-
- DEBUG("rxq=%p, poll_length() failed (ret=%d)",
- (void *)rxq, ret);
- /* ibv_poll_cq() must be used in case of failure. */
- wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
- if (unlikely(wcs_n == 0))
- break;
- if (unlikely(wcs_n < 0)) {
- DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
- (void *)rxq, wcs_n);
- break;
- }
- assert(wcs_n == 1);
- if (unlikely(wc.status != IBV_WC_SUCCESS)) {
- /* Whatever, just repost the offending WR. */
- DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
- " completion status (%d): %s",
- (void *)rxq, wc.wr_id, wc.status,
- ibv_wc_status_str(wc.status));
-#ifdef MLX5_PMD_SOFT_COUNTERS
- /* Increment dropped packets counter. */
- ++rxq->stats.idropped;
-#endif
- goto repost;
- }
- ret = wc.byte_len;
- }
- if (ret == 0)
- break;
- assert(ret >= (rxq->crc_present << 2));
- len = ret - (rxq->crc_present << 2);
- pkt_buf_len = len;
- /*
- * Replace spent segments with new ones, concatenate and
- * return them as pkt_buf.
- */
- while (1) {
- struct ibv_sge *sge = &elt->sges[j];
- struct rte_mbuf *seg = elt->bufs[j];
- struct rte_mbuf *rep;
- unsigned int seg_tailroom;
-
- assert(seg != NULL);
- /*
- * Fetch initial bytes of packet descriptor into a
- * cacheline while allocating rep.
- */
- rte_prefetch0(seg);
- rep = rte_mbuf_raw_alloc(rxq->mp);
- if (unlikely(rep == NULL)) {
- /*
- * Unable to allocate a replacement mbuf,
- * repost WR.
- */
- DEBUG("rxq=%p: can't allocate a new mbuf",
- (void *)rxq);
- if (pkt_buf != NULL) {
- *pkt_buf_next = NULL;
- rte_pktmbuf_free(pkt_buf);
- }
- /* Increment out of memory counters. */
- ++rxq->stats.rx_nombuf;
- ++rxq->priv->dev->data->rx_mbuf_alloc_failed;
- goto repost;
- }
-#ifndef NDEBUG
- /* Poison user-modifiable fields in rep. */
- NEXT(rep) = (void *)((uintptr_t)-1);
- SET_DATA_OFF(rep, 0xdead);
- DATA_LEN(rep) = 0xd00d;
- PKT_LEN(rep) = 0xdeadd00d;
- NB_SEGS(rep) = 0x2a;
- PORT(rep) = 0x2a;
- rep->ol_flags = -1;
-#endif
- assert(rep->buf_len == seg->buf_len);
- /* Reconfigure sge to use rep instead of seg. */
- assert(sge->lkey == rxq->mr->lkey);
- sge->addr = ((uintptr_t)rep->buf_addr + seg_headroom);
- elt->bufs[j] = rep;
- ++j;
- /* Update pkt_buf if it's the first segment, or link
- * seg to the previous one and update pkt_buf_next. */
- *pkt_buf_next = seg;
- pkt_buf_next = &NEXT(seg);
- /* Update seg information. */
- seg_tailroom = (seg->buf_len - seg_headroom);
- assert(sge->length == seg_tailroom);
- SET_DATA_OFF(seg, seg_headroom);
- if (likely(len <= seg_tailroom)) {
- /* Last segment. */
- DATA_LEN(seg) = len;
- PKT_LEN(seg) = len;
- /* Sanity check. */
- assert(rte_pktmbuf_headroom(seg) ==
- seg_headroom);
- assert(rte_pktmbuf_tailroom(seg) ==
- (seg_tailroom - len));
- break;
- }
- DATA_LEN(seg) = seg_tailroom;
- PKT_LEN(seg) = seg_tailroom;
- /* Sanity check. */
- assert(rte_pktmbuf_headroom(seg) == seg_headroom);
- assert(rte_pktmbuf_tailroom(seg) == 0);
- /* Fix len and clear headroom for next segments. */
- len -= seg_tailroom;
- seg_headroom = 0;
- }
- /* Update head and tail segments. */
- *pkt_buf_next = NULL;
- assert(pkt_buf != NULL);
- assert(j != 0);
- NB_SEGS(pkt_buf) = j;
- PORT(pkt_buf) = rxq->port_id;
- PKT_LEN(pkt_buf) = pkt_buf_len;
- if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
- pkt_buf->packet_type = rxq_cq_to_pkt_type(flags);
- pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
- if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
- pkt_buf->ol_flags |= PKT_RX_VLAN_PKT;
- pkt_buf->vlan_tci = vlan_tci;
- }
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
- }
-
- /* Return packet. */
- *(pkts++) = pkt_buf;
- ++pkts_ret;
-#ifdef MLX5_PMD_SOFT_COUNTERS
- /* Increment bytes counter. */
- rxq->stats.ibytes += pkt_buf_len;
-#endif
-repost:
- ret = rxq->recv(rxq->wq, elt->sges, RTE_DIM(elt->sges));
- if (unlikely(ret)) {
- /* Inability to repost WRs is fatal. */
- DEBUG("%p: recv_sg_list(): failed (ret=%d)",
- (void *)rxq->priv,
- ret);
- abort();
- }
- if (++elts_head >= elts_n)
- elts_head = 0;
- continue;
- }
- if (unlikely(i == 0))
- return 0;
- rxq->elts_head = elts_head;
-#ifdef MLX5_PMD_SOFT_COUNTERS
- /* Increment packets counter. */
- rxq->stats.ipackets += pkts_ret;
-#endif
- return pkts_ret;
-}
-
-/**
* DPDK callback for RX.
*
- * The following function is the same as mlx5_rx_burst_sp(), except it doesn't
- * manage scattered packets. Improves performance when MRU is lower than the
- * size of the first segment.
- *
* @param dpdk_rxq
* Generic pointer to RX queue structure.
* @param[out] pkts
@@ -725,7 +518,7 @@ uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct rxq *rxq = (struct rxq *)dpdk_rxq;
- struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
+ struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
const unsigned int elts_n = rxq->elts_n;
unsigned int elts_head = rxq->elts_head;
struct ibv_sge sges[pkts_n];
@@ -733,8 +526,6 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
unsigned int pkts_ret = 0;
int ret;

- if (unlikely(rxq->sp))
- return mlx5_rx_burst_sp(dpdk_rxq, pkts, pkts_n);
for (i = 0; (i != pkts_n); ++i) {
struct rxq_elt *elt = &(*elts)[elts_head];
unsigned int len;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 8358ccb..2e1f83b 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -81,12 +81,6 @@ struct mlx5_txq_stats {
uint64_t odropped; /**< Total of packets not sent when TX ring full. */
};

-/* RX element (scattered packets). */
-struct rxq_elt_sp {
- struct ibv_sge sges[MLX5_PMD_SGE_WR_N]; /* Scatter/Gather Elements. */
- struct rte_mbuf *bufs[MLX5_PMD_SGE_WR_N]; /* SGEs buffers. */
-};
-
/* RX element. */
struct rxq_elt {
struct ibv_sge sge; /* Scatter/Gather Element. */
@@ -112,15 +106,11 @@ struct rxq {
unsigned int port_id; /* Port ID for incoming packets. */
unsigned int elts_n; /* (*elts)[] length. */
unsigned int elts_head; /* Current index in (*elts)[]. */
- unsigned int sp:1; /* Use scattered RX elements. */
unsigned int csum:1; /* Enable checksum offloading. */
unsigned int csum_l2tun:1; /* Same for L2 tunnels. */
unsigned int vlan_strip:1; /* Enable VLAN stripping. */
unsigned int crc_present:1; /* CRC must be subtracted. */
- union {
- struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */
- struct rxq_elt (*no_sp)[]; /* RX elements. */
- } elts;
+ struct rxq_elt (*elts)[]; /* RX elements. */
unsigned int socket; /* CPU socket ID for allocations. */
struct mlx5_rxq_stats stats; /* RX queue counters. */
struct ibv_exp_res_domain *rd; /* Resource Domain. */
@@ -321,7 +311,6 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
/* mlx5_rxtx.c */

uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
-uint16_t mlx5_rx_burst_sp(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:47:51 UTC
There is no scatter/gather support anymore; CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N
no longer serves any purpose and can be removed.

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
config/common_base | 1 -
doc/guides/nics/mlx5.rst | 7 -------
drivers/net/mlx5/Makefile | 4 ----
drivers/net/mlx5/mlx5_defs.h | 5 -----
drivers/net/mlx5/mlx5_rxq.c | 4 ----
drivers/net/mlx5/mlx5_txq.c | 4 ----
6 files changed, 25 deletions(-)

diff --git a/config/common_base b/config/common_base
index 47c26f6..a4a3a3a 100644
--- a/config/common_base
+++ b/config/common_base
@@ -207,7 +207,6 @@ CONFIG_RTE_LIBRTE_MLX4_SOFT_COUNTERS=1
#
CONFIG_RTE_LIBRTE_MLX5_PMD=n
CONFIG_RTE_LIBRTE_MLX5_DEBUG=n
-CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N=4
CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE=0
CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE=8

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index d9196d1..84c35a0 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -114,13 +114,6 @@ These options can be modified in the ``.config`` file.
adds additional run-time checks and debugging messages at the cost of
lower performance.

-- ``CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N`` (default **4**)
-
- Number of scatter/gather elements (SGEs) per work request (WR). Lowering
- this number improves performance but also limits the ability to receive
- scattered packets (packets that do not fit a single mbuf). The default
- value is a safe tradeoff.
-
- ``CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE`` (default **0**)

Amount of data to be inlined during TX operations. Improves latency.
diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 1dba3de..9a26269 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -84,10 +84,6 @@ else
CFLAGS += -DNDEBUG -UPEDANTIC
endif

-ifdef CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N
-CFLAGS += -DMLX5_PMD_SGE_WR_N=$(CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N)
-endif
-
ifdef CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE
CFLAGS += -DMLX5_PMD_MAX_INLINE=$(CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE)
endif
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 09207d9..da1c90e 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -54,11 +54,6 @@
/* RSS Indirection table size. */
#define RSS_INDIRECTION_TABLE_SIZE 256

-/* Maximum number of Scatter/Gather Elements per Work Request. */
-#ifndef MLX5_PMD_SGE_WR_N
-#define MLX5_PMD_SGE_WR_N 4
-#endif
-
/* Maximum size for inline data. */
#ifndef MLX5_PMD_MAX_INLINE
#define MLX5_PMD_MAX_INLINE 0
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 38ff9fd..4000624 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -976,10 +976,6 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
ERROR("%p: invalid number of RX descriptors", (void *)dev);
return EINVAL;
}
- if (MLX5_PMD_SGE_WR_N > 1) {
- ERROR("%p: RX scatter is not supported", (void *)dev);
- return ENOTSUP;
- }
/* Toggle RX checksum offload if hardware supports it. */
if (priv->hw_csum)
tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 5a248c9..59974c5 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -264,10 +264,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
ERROR("%p: invalid number of TX descriptors", (void *)dev);
return EINVAL;
}
- if (MLX5_PMD_SGE_WR_N > 1) {
- ERROR("%p: TX gather is not supported", (void *)dev);
- return EINVAL;
- }
/* MRs will be registered in mp2mr[] later. */
attr.rd = (struct ibv_exp_res_domain_init_attr){
.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:47:53 UTC
To keep the data path as efficient as possible, move fields that are only
useful to the control path into a new structure, txq_ctrl.
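
For readers unfamiliar with the pattern: the hot data-path structure is
embedded as a member of a larger control structure, so burst functions only
ever see a struct txq pointer while control-path code recovers the enclosing
txq_ctrl with container_of(). The following is only a minimal sketch of that
layout; the real structures carry many more fields, and the driver provides
its own container_of() helper, so the local definition exists purely to make
the snippet compile on its own.

/* Minimal sketch of the txq/txq_ctrl split; field names are trimmed down
 * and container_of() is defined locally for the sake of the example. */
#include <stddef.h>
#include <stdio.h>

#ifndef container_of
#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))
#endif

struct txq {                    /* Hot fields touched on every TX burst. */
        unsigned int elts_head;
        unsigned int elts_tail;
};

struct txq_ctrl {               /* Cold fields used only for setup/teardown... */
        unsigned int socket;
        struct txq txq;         /* ...with the data-path structure embedded. */
};

/* Control-path code receives the data-path pointer (the one registered with
 * ethdev) and recovers its wrapper. */
static void
txq_release_example(struct txq *txq)
{
        struct txq_ctrl *ctrl = container_of(txq, struct txq_ctrl, txq);

        printf("releasing txq_ctrl allocated on socket %u\n", ctrl->socket);
}

int
main(void)
{
        struct txq_ctrl ctrl = { .socket = 0 };

        txq_release_example(&ctrl.txq);
        return 0;
}

The payoff is that struct txq stays small enough to fit in a few cache lines
and never drags Verbs control objects into the burst-time working set.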

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5.c | 21 +++--
drivers/net/mlx5/mlx5_ethdev.c | 27 +++---
drivers/net/mlx5/mlx5_mr.c | 39 ++++----
drivers/net/mlx5/mlx5_rxtx.h | 9 +-
drivers/net/mlx5/mlx5_txq.c | 198 +++++++++++++++++++++--------------------
5 files changed, 158 insertions(+), 136 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 67a541c..cc30463 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -98,7 +98,6 @@ static void
mlx5_dev_close(struct rte_eth_dev *dev)
{
struct priv *priv = mlx5_get_priv(dev);
- void *tmp;
unsigned int i;

priv_lock(priv);
@@ -122,12 +121,13 @@ mlx5_dev_close(struct rte_eth_dev *dev)
/* XXX race condition if mlx5_rx_burst() is still running. */
usleep(1000);
for (i = 0; (i != priv->rxqs_n); ++i) {
- tmp = (*priv->rxqs)[i];
- if (tmp == NULL)
+ struct rxq *rxq = (*priv->rxqs)[i];
+
+ if (rxq == NULL)
continue;
(*priv->rxqs)[i] = NULL;
- rxq_cleanup(tmp);
- rte_free(tmp);
+ rxq_cleanup(rxq);
+ rte_free(rxq);
}
priv->rxqs_n = 0;
priv->rxqs = NULL;
@@ -136,12 +136,15 @@ mlx5_dev_close(struct rte_eth_dev *dev)
/* XXX race condition if mlx5_tx_burst() is still running. */
usleep(1000);
for (i = 0; (i != priv->txqs_n); ++i) {
- tmp = (*priv->txqs)[i];
- if (tmp == NULL)
+ struct txq *txq = (*priv->txqs)[i];
+ struct txq_ctrl *txq_ctrl;
+
+ if (txq == NULL)
continue;
+ txq_ctrl = container_of(txq, struct txq_ctrl, txq);
(*priv->txqs)[i] = NULL;
- txq_cleanup(tmp);
- rte_free(tmp);
+ txq_cleanup(txq_ctrl);
+ rte_free(txq_ctrl);
}
priv->txqs_n = 0;
priv->txqs = NULL;
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index bab826c..3710bba 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1233,28 +1233,31 @@ mlx5_secondary_data_setup(struct priv *priv)
/* TX queues. */
for (i = 0; i != nb_tx_queues; ++i) {
struct txq *primary_txq = (*sd->primary_priv->txqs)[i];
- struct txq *txq;
+ struct txq_ctrl *primary_txq_ctrl;
+ struct txq_ctrl *txq_ctrl;

if (primary_txq == NULL)
continue;
- txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0,
- primary_txq->socket);
- if (txq != NULL) {
+ primary_txq_ctrl = container_of(primary_txq,
+ struct txq_ctrl, txq);
+ txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl), 0,
+ primary_txq_ctrl->socket);
+ if (txq_ctrl != NULL) {
if (txq_setup(priv->dev,
- txq,
+ primary_txq_ctrl,
primary_txq->elts_n,
- primary_txq->socket,
+ primary_txq_ctrl->socket,
NULL) == 0) {
- txq->stats.idx = primary_txq->stats.idx;
- tx_queues[i] = txq;
+ txq_ctrl->txq.stats.idx = primary_txq->stats.idx;
+ tx_queues[i] = &txq_ctrl->txq;
continue;
}
- rte_free(txq);
+ rte_free(txq_ctrl);
}
while (i) {
- txq = tx_queues[--i];
- txq_cleanup(txq);
- rte_free(txq);
+ txq_ctrl = tx_queues[--i];
+ txq_cleanup(txq_ctrl);
+ rte_free(txq_ctrl);
}
goto error;
}
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index 7c3e87f..79d5568 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -183,33 +183,36 @@ mlx5_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
uint32_t
txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx)
{
+ struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq);
struct ibv_mr *mr;

/* Add a new entry, register MR first. */
DEBUG("%p: discovered new memory pool \"%s\" (%p)",
- (void *)txq, mp->name, (void *)mp);
- mr = mlx5_mp2mr(txq->priv->pd, mp);
+ (void *)txq_ctrl, mp->name, (void *)mp);
+ mr = mlx5_mp2mr(txq_ctrl->txq.priv->pd, mp);
if (unlikely(mr == NULL)) {
DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
- (void *)txq);
+ (void *)txq_ctrl);
return (uint32_t)-1;
}
- if (unlikely(idx == RTE_DIM(txq->mp2mr))) {
+ if (unlikely(idx == RTE_DIM(txq_ctrl->txq.mp2mr))) {
/* Table is full, remove oldest entry. */
DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
- (void *)txq);
+ (void *)txq_ctrl);
--idx;
- claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
- memmove(&txq->mp2mr[0], &txq->mp2mr[1],
- (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
+ claim_zero(ibv_dereg_mr(txq_ctrl->txq.mp2mr[0].mr));
+ memmove(&txq_ctrl->txq.mp2mr[0], &txq_ctrl->txq.mp2mr[1],
+ (sizeof(txq_ctrl->txq.mp2mr) -
+ sizeof(txq_ctrl->txq.mp2mr[0])));
}
/* Store the new entry. */
- txq->mp2mr[idx].mp = mp;
- txq->mp2mr[idx].mr = mr;
- txq->mp2mr[idx].lkey = mr->lkey;
+ txq_ctrl->txq.mp2mr[idx].mp = mp;
+ txq_ctrl->txq.mp2mr[idx].mr = mr;
+ txq_ctrl->txq.mp2mr[idx].lkey = mr->lkey;
DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
- (void *)txq, mp->name, (void *)mp, txq->mp2mr[idx].lkey);
- return txq->mp2mr[idx].lkey;
+ (void *)txq_ctrl, mp->name, (void *)mp,
+ txq_ctrl->txq.mp2mr[idx].lkey);
+ return txq_ctrl->txq.mp2mr[idx].lkey;
}

struct txq_mp2mr_mbuf_check_data {
@@ -255,7 +258,7 @@ txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj,
void
txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
{
- struct txq *txq = arg;
+ struct txq_ctrl *txq_ctrl = arg;
struct txq_mp2mr_mbuf_check_data data = {
.ret = 0,
};
@@ -265,13 +268,13 @@ txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 ||
data.ret == -1)
return;
- for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
- if (unlikely(txq->mp2mr[i].mp == NULL)) {
+ for (i = 0; (i != RTE_DIM(txq_ctrl->txq.mp2mr)); ++i) {
+ if (unlikely(txq_ctrl->txq.mp2mr[i].mp == NULL)) {
/* Unknown MP, add a new MR for it. */
break;
}
- if (txq->mp2mr[i].mp == mp)
+ if (txq_ctrl->txq.mp2mr[i].mp == mp)
return;
}
- txq_mp2mr_reg(txq, mp, i);
+ txq_mp2mr_reg(&txq_ctrl->txq, mp, i);
}
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 3a353b0..5baefcb 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -256,6 +256,10 @@ struct txq {
uint32_t lkey; /* mr->lkey */
} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
struct mlx5_txq_stats stats; /* TX queue counters. */
+} __rte_cache_aligned;
+
+/* TX queue control descriptor. */
+struct txq_ctrl {
#ifdef HAVE_VERBS_VLAN_INSERTION
struct ibv_exp_qp_burst_family_v1 *if_qp; /* QP burst interface. */
#else
@@ -264,6 +268,7 @@ struct txq {
struct ibv_exp_cq_family *if_cq; /* CQ interface. */
struct ibv_exp_res_domain *rd; /* Resource Domain. */
unsigned int socket; /* CPU socket ID for allocations. */
+ struct txq txq; /* Data path structure. */
};

/* mlx5_rxq.c */
@@ -291,8 +296,8 @@ uint16_t mlx5_rx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);

/* mlx5_txq.c */

-void txq_cleanup(struct txq *);
-int txq_setup(struct rte_eth_dev *, struct txq *, uint16_t, unsigned int,
+void txq_cleanup(struct txq_ctrl *);
+int txq_setup(struct rte_eth_dev *, struct txq_ctrl *, uint16_t, unsigned int,
const struct rte_eth_txconf *);
int mlx5_tx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
const struct rte_eth_txconf *);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 75da65b..4683775 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -68,7 +68,7 @@
/**
* Allocate TX queue elements.
*
- * @param txq
+ * @param txq_ctrl
* Pointer to TX queue structure.
* @param elts_n
* Number of elements to allocate.
@@ -77,15 +77,15 @@
* 0 on success, errno value on failure.
*/
static int
-txq_alloc_elts(struct txq *txq, unsigned int elts_n)
+txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
{
unsigned int i;
struct txq_elt (*elts)[elts_n] =
- rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq->socket);
+ rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq_ctrl->socket);
int ret = 0;

if (elts == NULL) {
- ERROR("%p: can't allocate packets array", (void *)txq);
+ ERROR("%p: can't allocate packets array", (void *)txq_ctrl);
ret = ENOMEM;
goto error;
}
@@ -94,24 +94,24 @@ txq_alloc_elts(struct txq *txq, unsigned int elts_n)

elt->buf = NULL;
}
- DEBUG("%p: allocated and configured %u WRs", (void *)txq, elts_n);
- txq->elts_n = elts_n;
- txq->elts = elts;
- txq->elts_head = 0;
- txq->elts_tail = 0;
- txq->elts_comp = 0;
+ DEBUG("%p: allocated and configured %u WRs", (void *)txq_ctrl, elts_n);
+ txq_ctrl->txq.elts_n = elts_n;
+ txq_ctrl->txq.elts = elts;
+ txq_ctrl->txq.elts_head = 0;
+ txq_ctrl->txq.elts_tail = 0;
+ txq_ctrl->txq.elts_comp = 0;
/* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
* at least 4 times per ring. */
- txq->elts_comp_cd_init =
+ txq_ctrl->txq.elts_comp_cd_init =
((MLX5_PMD_TX_PER_COMP_REQ < (elts_n / 4)) ?
MLX5_PMD_TX_PER_COMP_REQ : (elts_n / 4));
- txq->elts_comp_cd = txq->elts_comp_cd_init;
+ txq_ctrl->txq.elts_comp_cd = txq_ctrl->txq.elts_comp_cd_init;
assert(ret == 0);
return 0;
error:
rte_free(elts);

- DEBUG("%p: failed, freed everything", (void *)txq);
+ DEBUG("%p: failed, freed everything", (void *)txq_ctrl);
assert(ret > 0);
return ret;
}
@@ -119,25 +119,25 @@ error:
/**
* Free TX queue elements.
*
- * @param txq
+ * @param txq_ctrl
* Pointer to TX queue structure.
*/
static void
-txq_free_elts(struct txq *txq)
+txq_free_elts(struct txq_ctrl *txq_ctrl)
{
- unsigned int elts_n = txq->elts_n;
- unsigned int elts_head = txq->elts_head;
- unsigned int elts_tail = txq->elts_tail;
- struct txq_elt (*elts)[elts_n] = txq->elts;
+ unsigned int elts_n = txq_ctrl->txq.elts_n;
+ unsigned int elts_head = txq_ctrl->txq.elts_head;
+ unsigned int elts_tail = txq_ctrl->txq.elts_tail;
+ struct txq_elt (*elts)[elts_n] = txq_ctrl->txq.elts;

- DEBUG("%p: freeing WRs", (void *)txq);
- txq->elts_n = 0;
- txq->elts_head = 0;
- txq->elts_tail = 0;
- txq->elts_comp = 0;
- txq->elts_comp_cd = 0;
- txq->elts_comp_cd_init = 0;
- txq->elts = NULL;
+ DEBUG("%p: freeing WRs", (void *)txq_ctrl);
+ txq_ctrl->txq.elts_n = 0;
+ txq_ctrl->txq.elts_head = 0;
+ txq_ctrl->txq.elts_tail = 0;
+ txq_ctrl->txq.elts_comp = 0;
+ txq_ctrl->txq.elts_comp_cd = 0;
+ txq_ctrl->txq.elts_comp_cd_init = 0;
+ txq_ctrl->txq.elts = NULL;

if (elts == NULL)
return;
@@ -161,63 +161,63 @@ txq_free_elts(struct txq *txq)
*
* Destroy objects, free allocated memory and reset the structure for reuse.
*
- * @param txq
+ * @param txq_ctrl
* Pointer to TX queue structure.
*/
void
-txq_cleanup(struct txq *txq)
+txq_cleanup(struct txq_ctrl *txq_ctrl)
{
struct ibv_exp_release_intf_params params;
size_t i;

- DEBUG("cleaning up %p", (void *)txq);
- txq_free_elts(txq);
- txq->poll_cnt = NULL;
- txq->send_flush = NULL;
- if (txq->if_qp != NULL) {
- assert(txq->priv != NULL);
- assert(txq->priv->ctx != NULL);
- assert(txq->qp != NULL);
+ DEBUG("cleaning up %p", (void *)txq_ctrl);
+ txq_free_elts(txq_ctrl);
+ txq_ctrl->txq.poll_cnt = NULL;
+ txq_ctrl->txq.send_flush = NULL;
+ if (txq_ctrl->if_qp != NULL) {
+ assert(txq_ctrl->txq.priv != NULL);
+ assert(txq_ctrl->txq.priv->ctx != NULL);
+ assert(txq_ctrl->txq.qp != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(txq->priv->ctx,
- txq->if_qp,
+ claim_zero(ibv_exp_release_intf(txq_ctrl->txq.priv->ctx,
+ txq_ctrl->if_qp,
&params));
}
- if (txq->if_cq != NULL) {
- assert(txq->priv != NULL);
- assert(txq->priv->ctx != NULL);
- assert(txq->cq != NULL);
+ if (txq_ctrl->if_cq != NULL) {
+ assert(txq_ctrl->txq.priv != NULL);
+ assert(txq_ctrl->txq.priv->ctx != NULL);
+ assert(txq_ctrl->txq.cq != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(txq->priv->ctx,
- txq->if_cq,
+ claim_zero(ibv_exp_release_intf(txq_ctrl->txq.priv->ctx,
+ txq_ctrl->if_cq,
&params));
}
- if (txq->qp != NULL)
- claim_zero(ibv_destroy_qp(txq->qp));
- if (txq->cq != NULL)
- claim_zero(ibv_destroy_cq(txq->cq));
- if (txq->rd != NULL) {
+ if (txq_ctrl->txq.qp != NULL)
+ claim_zero(ibv_destroy_qp(txq_ctrl->txq.qp));
+ if (txq_ctrl->txq.cq != NULL)
+ claim_zero(ibv_destroy_cq(txq_ctrl->txq.cq));
+ if (txq_ctrl->rd != NULL) {
struct ibv_exp_destroy_res_domain_attr attr = {
.comp_mask = 0,
};

- assert(txq->priv != NULL);
- assert(txq->priv->ctx != NULL);
- claim_zero(ibv_exp_destroy_res_domain(txq->priv->ctx,
- txq->rd,
+ assert(txq_ctrl->txq.priv != NULL);
+ assert(txq_ctrl->txq.priv->ctx != NULL);
+ claim_zero(ibv_exp_destroy_res_domain(txq_ctrl->txq.priv->ctx,
+ txq_ctrl->rd,
&attr));
}
- for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
- if (txq->mp2mr[i].mp == NULL)
+ for (i = 0; (i != RTE_DIM(txq_ctrl->txq.mp2mr)); ++i) {
+ if (txq_ctrl->txq.mp2mr[i].mp == NULL)
break;
- assert(txq->mp2mr[i].mr != NULL);
- claim_zero(ibv_dereg_mr(txq->mp2mr[i].mr));
+ assert(txq_ctrl->txq.mp2mr[i].mr != NULL);
+ claim_zero(ibv_dereg_mr(txq_ctrl->txq.mp2mr[i].mr));
}
- memset(txq, 0, sizeof(*txq));
+ memset(txq_ctrl, 0, sizeof(*txq_ctrl));
}

/**
@@ -225,7 +225,7 @@ txq_cleanup(struct txq *txq)
*
* @param dev
* Pointer to Ethernet device structure.
- * @param txq
+ * @param txq_ctrl
* Pointer to TX queue structure.
* @param desc
* Number of descriptors to configure in queue.
@@ -238,13 +238,15 @@ txq_cleanup(struct txq *txq)
* 0 on success, errno value on failure.
*/
int
-txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
+txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
unsigned int socket, const struct rte_eth_txconf *conf)
{
struct priv *priv = mlx5_get_priv(dev);
- struct txq tmpl = {
- .priv = priv,
- .socket = socket
+ struct txq_ctrl tmpl = {
+ .socket = socket,
+ .txq = {
+ .priv = priv,
+ },
};
union {
struct ibv_exp_query_intf_params params;
@@ -279,8 +281,8 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
.res_domain = tmpl.rd,
};
- tmpl.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq);
- if (tmpl.cq == NULL) {
+ tmpl.txq.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq);
+ if (tmpl.txq.cq == NULL) {
ret = ENOMEM;
ERROR("%p: CQ creation failure: %s",
(void *)dev, strerror(ret));
@@ -292,9 +294,9 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
priv->device_attr.max_sge);
attr.init = (struct ibv_exp_qp_init_attr){
/* CQ to be associated with the send queue. */
- .send_cq = tmpl.cq,
+ .send_cq = tmpl.txq.cq,
/* CQ to be associated with the receive queue. */
- .recv_cq = tmpl.cq,
+ .recv_cq = tmpl.txq.cq,
.cap = {
/* Max number of outstanding WRs. */
.max_send_wr = ((priv->device_attr.max_qp_wr < desc) ?
@@ -312,8 +314,8 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
IBV_EXP_QP_INIT_ATTR_RES_DOMAIN),
};
- tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
- if (tmpl.qp == NULL) {
+ tmpl.txq.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
+ if (tmpl.txq.qp == NULL) {
ret = (errno ? errno : EINVAL);
ERROR("%p: QP creation failure: %s",
(void *)dev, strerror(ret));
@@ -325,7 +327,7 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
/* Primary port number. */
.port_num = priv->port
};
- ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod,
+ ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod,
(IBV_EXP_QP_STATE | IBV_EXP_QP_PORT));
if (ret) {
ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
@@ -341,14 +343,14 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
attr.mod = (struct ibv_exp_qp_attr){
.qp_state = IBV_QPS_RTR
};
- ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
+ ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod, IBV_EXP_QP_STATE);
if (ret) {
ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
(void *)dev, strerror(ret));
goto error;
}
attr.mod.qp_state = IBV_QPS_RTS;
- ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
+ ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod, IBV_EXP_QP_STATE);
if (ret) {
ERROR("%p: QP state to IBV_QPS_RTS failed: %s",
(void *)dev, strerror(ret));
@@ -357,7 +359,7 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
.intf = IBV_EXP_INTF_CQ,
- .obj = tmpl.cq,
+ .obj = tmpl.txq.cq,
};
tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
if (tmpl.if_cq == NULL) {
@@ -369,7 +371,7 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
.intf = IBV_EXP_INTF_QP_BURST,
- .obj = tmpl.qp,
+ .obj = tmpl.txq.qp,
#ifdef HAVE_VERBS_VLAN_INSERTION
.intf_version = 1,
#endif
@@ -389,18 +391,18 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
goto error;
}
/* Clean up txq in case we're reinitializing it. */
- DEBUG("%p: cleaning-up old txq just in case", (void *)txq);
- txq_cleanup(txq);
- *txq = tmpl;
- txq->poll_cnt = txq->if_cq->poll_cnt;
- txq->send_pending = txq->if_qp->send_pending;
+ DEBUG("%p: cleaning-up old txq just in case", (void *)txq_ctrl);
+ txq_cleanup(txq_ctrl);
+ *txq_ctrl = tmpl;
+ txq_ctrl->txq.poll_cnt = txq_ctrl->if_cq->poll_cnt;
+ txq_ctrl->txq.send_pending = txq_ctrl->if_qp->send_pending;
#ifdef HAVE_VERBS_VLAN_INSERTION
- txq->send_pending_vlan = txq->if_qp->send_pending_vlan;
+ txq_ctrl->txq.send_pending_vlan = txq_ctrl->if_qp->send_pending_vlan;
#endif
- txq->send_flush = txq->if_qp->send_flush;
- DEBUG("%p: txq updated with %p", (void *)txq, (void *)&tmpl);
+ txq_ctrl->txq.send_flush = txq_ctrl->if_qp->send_flush;
+ DEBUG("%p: txq updated with %p", (void *)txq_ctrl, (void *)&tmpl);
/* Pre-register known mempools. */
- rte_mempool_walk(txq_mp2mr_iter, txq);
+ rte_mempool_walk(txq_mp2mr_iter, txq_ctrl);
assert(ret == 0);
return 0;
error:
@@ -432,12 +434,15 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
{
struct priv *priv = dev->data->dev_private;
struct txq *txq = (*priv->txqs)[idx];
+ struct txq_ctrl *txq_ctrl;
int ret;

if (mlx5_is_secondary())
return -E_RTE_SECONDARY;

priv_lock(priv);
+ if (txq)
+ txq_ctrl = container_of(txq, struct txq_ctrl, txq);
DEBUG("%p: configuring queue %u for %u descriptors",
(void *)dev, idx, desc);
if (idx >= priv->txqs_n) {
@@ -454,24 +459,25 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
return -EEXIST;
}
(*priv->txqs)[idx] = NULL;
- txq_cleanup(txq);
+ txq_cleanup(txq_ctrl);
} else {
- txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0, socket);
- if (txq == NULL) {
+ txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl),
+ 0, socket);
+ if (txq_ctrl == NULL) {
ERROR("%p: unable to allocate queue index %u",
(void *)dev, idx);
priv_unlock(priv);
return -ENOMEM;
}
}
- ret = txq_setup(dev, txq, desc, socket, conf);
+ ret = txq_setup(dev, txq_ctrl, desc, socket, conf);
if (ret)
- rte_free(txq);
+ rte_free(txq_ctrl);
else {
- txq->stats.idx = idx;
+ txq_ctrl->txq.stats.idx = idx;
DEBUG("%p: adding TX queue %p to list",
- (void *)dev, (void *)txq);
- (*priv->txqs)[idx] = txq;
+ (void *)dev, (void *)txq_ctrl);
+ (*priv->txqs)[idx] = &txq_ctrl->txq;
/* Update send callback. */
dev->tx_pkt_burst = mlx5_tx_burst;
}
@@ -489,6 +495,7 @@ void
mlx5_tx_queue_release(void *dpdk_txq)
{
struct txq *txq = (struct txq *)dpdk_txq;
+ struct txq_ctrl *txq_ctrl;
struct priv *priv;
unsigned int i;

@@ -497,17 +504,18 @@ mlx5_tx_queue_release(void *dpdk_txq)

if (txq == NULL)
return;
+ txq_ctrl = container_of(txq, struct txq_ctrl, txq);
priv = txq->priv;
priv_lock(priv);
for (i = 0; (i != priv->txqs_n); ++i)
if ((*priv->txqs)[i] == txq) {
DEBUG("%p: removing TX queue %p from list",
- (void *)priv->dev, (void *)txq);
+ (void *)priv->dev, (void *)txq_ctrl);
(*priv->txqs)[i] = NULL;
break;
}
- txq_cleanup(txq);
- rte_free(txq);
+ txq_cleanup(txq_ctrl);
+ rte_free(txq_ctrl);
priv_unlock(priv);
}
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:47:52 UTC
Inline TX will be fully managed by the PMD after Verbs is bypassed in the
data path. Remove the current code until then.
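
As background, "inline TX" means copying a small packet's payload directly
into the send descriptor so the NIC does not have to DMA-read the mbuf, at
the cost of a memcpy and a larger descriptor; larger packets are still posted
by address and lkey. The sketch below is illustrative only, with hypothetical
structure and function names standing in for the Verbs
send_pending_inline()/send_pending() calls removed by this patch.

/* Illustration of the inline-vs-gather decision; names are made up for the
 * example and are not the PMD's API. */
#include <stdint.h>
#include <string.h>

struct example_wqe {                    /* Stand-in for a send descriptor. */
        uint8_t inline_data[64];        /* Room for inlined payload bytes. */
        uint64_t addr;                  /* Otherwise: buffer DMA address... */
        uint32_t length;
        uint32_t lkey;                  /* ...and its memory-region key. */
};

void
example_post_send(struct example_wqe *wqe, const void *buf, uint32_t len,
                  uint32_t lkey, uint32_t max_inline)
{
        if (len <= max_inline) {
                /* Small packet: payload travels inside the descriptor. */
                memcpy(wqe->inline_data, buf, len);
                wqe->length = len;
        } else {
                /* Large packet: descriptor references the buffer by lkey. */
                wqe->addr = (uintptr_t)buf;
                wqe->length = len;
                wqe->lkey = lkey;
        }
}

Once the data path no longer goes through Verbs, the same decision can be
made directly when building the hardware descriptor, which is why the
Verbs-based branch is dropped here instead of being kept in parallel.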

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
config/common_base | 1 -
doc/guides/nics/mlx5.rst | 10 ------
drivers/net/mlx5/Makefile | 4 ---
drivers/net/mlx5/mlx5_defs.h | 5 ---
drivers/net/mlx5/mlx5_rxtx.c | 73 +++++++++++++++-----------------------------
drivers/net/mlx5/mlx5_rxtx.h | 9 ------
drivers/net/mlx5/mlx5_txq.c | 16 ----------
7 files changed, 25 insertions(+), 93 deletions(-)

diff --git a/config/common_base b/config/common_base
index a4a3a3a..2d6832f 100644
--- a/config/common_base
+++ b/config/common_base
@@ -207,7 +207,6 @@ CONFIG_RTE_LIBRTE_MLX4_SOFT_COUNTERS=1
#
CONFIG_RTE_LIBRTE_MLX5_PMD=n
CONFIG_RTE_LIBRTE_MLX5_DEBUG=n
-CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE=0
CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE=8

#
diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 84c35a0..77fa957 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -114,16 +114,6 @@ These options can be modified in the ``.config`` file.
adds additional run-time checks and debugging messages at the cost of
lower performance.

-- ``CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE`` (default **0**)
-
- Amount of data to be inlined during TX operations. Improves latency.
- Can improve PPS performance when PCI backpressure is detected and may be
- useful for scenarios involving heavy traffic on many queues.
-
- Since the additional software logic necessary to handle this mode can
- lower performance when there is no backpressure, it is not enabled by
- default.
-
- ``CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE`` (default **8**)

Maximum number of cached memory pools (MPs) per TX queue. Each MP from
diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 9a26269..798859c 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -84,10 +84,6 @@ else
CFLAGS += -DNDEBUG -UPEDANTIC
endif

-ifdef CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE
-CFLAGS += -DMLX5_PMD_MAX_INLINE=$(CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE)
-endif
-
ifdef CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE
CFLAGS += -DMLX5_PMD_TX_MP_CACHE=$(CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE)
endif
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index da1c90e..9a19835 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -54,11 +54,6 @@
/* RSS Indirection table size. */
#define RSS_INDIRECTION_TABLE_SIZE 256

-/* Maximum size for inline data. */
-#ifndef MLX5_PMD_MAX_INLINE
-#define MLX5_PMD_MAX_INLINE 0
-#endif
-
/*
* Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP
* from which buffers are to be transmitted will have to be mapped by this
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 07d95eb..4ba88ea 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -329,56 +329,33 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
rte_prefetch0((volatile void *)
(uintptr_t)buf_next_addr);
}
- /* Put packet into send queue. */
-#if MLX5_PMD_MAX_INLINE > 0
- if (length <= txq->max_inline) {
-#ifdef HAVE_VERBS_VLAN_INSERTION
- if (insert_vlan)
- err = txq->send_pending_inline_vlan
- (txq->qp,
- (void *)addr,
- length,
- send_flags,
- &buf->vlan_tci);
- else
-#endif /* HAVE_VERBS_VLAN_INSERTION */
- err = txq->send_pending_inline
- (txq->qp,
- (void *)addr,
- length,
- send_flags);
- } else
-#endif
- {
- /* Retrieve Memory Region key for this
- * memory pool. */
- lkey = txq_mp2mr(txq, txq_mb2mp(buf));
- if (unlikely(lkey == (uint32_t)-1)) {
- /* MR does not exist. */
- DEBUG("%p: unable to get MP <-> MR"
- " association", (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
+ /* Retrieve Memory Region key for this memory pool. */
+ lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+ if (unlikely(lkey == (uint32_t)-1)) {
+ /* MR does not exist. */
+ DEBUG("%p: unable to get MP <-> MR"
+ " association", (void *)txq);
+ /* Clean up TX element. */
+ elt->buf = NULL;
+ goto stop;
+ }
#ifdef HAVE_VERBS_VLAN_INSERTION
- if (insert_vlan)
- err = txq->send_pending_vlan
- (txq->qp,
- addr,
- length,
- lkey,
- send_flags,
- &buf->vlan_tci);
- else
+ if (insert_vlan)
+ err = txq->send_pending_vlan
+ (txq->qp,
+ addr,
+ length,
+ lkey,
+ send_flags,
+ &buf->vlan_tci);
+ else
#endif /* HAVE_VERBS_VLAN_INSERTION */
- err = txq->send_pending
- (txq->qp,
- addr,
- length,
- lkey,
- send_flags);
- }
+ err = txq->send_pending
+ (txq->qp,
+ addr,
+ length,
+ lkey,
+ send_flags);
if (unlikely(err))
goto stop;
#ifdef MLX5_PMD_SOFT_COUNTERS
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 2e1f83b..3a353b0 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -240,19 +240,10 @@ struct txq {
#ifdef HAVE_VERBS_VLAN_INSERTION
int (*send_pending_vlan)();
#endif
-#if MLX5_PMD_MAX_INLINE > 0
- int (*send_pending_inline)();
-#ifdef HAVE_VERBS_VLAN_INSERTION
- int (*send_pending_inline_vlan)();
-#endif
-#endif
int (*send_flush)(struct ibv_qp *qp);
struct ibv_cq *cq; /* Completion Queue. */
struct ibv_qp *qp; /* Queue Pair. */
struct txq_elt (*elts)[]; /* TX elements. */
-#if MLX5_PMD_MAX_INLINE > 0
- uint32_t max_inline; /* Max inline send size <= MLX5_PMD_MAX_INLINE. */
-#endif
unsigned int elts_n; /* (*elts)[] length. */
unsigned int elts_head; /* Current index in (*elts)[]. */
unsigned int elts_tail; /* First element awaiting completion. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 59974c5..75da65b 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -173,9 +173,6 @@ txq_cleanup(struct txq *txq)
DEBUG("cleaning up %p", (void *)txq);
txq_free_elts(txq);
txq->poll_cnt = NULL;
-#if MLX5_PMD_MAX_INLINE > 0
- txq->send_pending_inline = NULL;
-#endif
txq->send_flush = NULL;
if (txq->if_qp != NULL) {
assert(txq->priv != NULL);
@@ -305,9 +302,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
desc),
/* Max number of scatter/gather elements in a WR. */
.max_send_sge = 1,
-#if MLX5_PMD_MAX_INLINE > 0
- .max_inline_data = MLX5_PMD_MAX_INLINE,
-#endif
},
.qp_type = IBV_QPT_RAW_PACKET,
/* Do *NOT* enable this, completions events are managed per
@@ -325,10 +319,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
(void *)dev, strerror(ret));
goto error;
}
-#if MLX5_PMD_MAX_INLINE > 0
- /* ibv_create_qp() updates this value. */
- tmpl.max_inline = attr.init.cap.max_inline_data;
-#endif
attr.mod = (struct ibv_exp_qp_attr){
/* Move the QP to this state. */
.qp_state = IBV_QPS_INIT,
@@ -403,12 +393,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
txq_cleanup(txq);
*txq = tmpl;
txq->poll_cnt = txq->if_cq->poll_cnt;
-#if MLX5_PMD_MAX_INLINE > 0
- txq->send_pending_inline = txq->if_qp->send_pending_inline;
-#ifdef HAVE_VERBS_VLAN_INSERTION
- txq->send_pending_inline_vlan = txq->if_qp->send_pending_inline_vlan;
-#endif
-#endif
txq->send_pending = txq->if_qp->send_pending;
#ifdef HAVE_VERBS_VLAN_INSERTION
txq->send_pending_vlan = txq->if_qp->send_pending_vlan;
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:47:54 UTC
To keep the data path as efficient as possible, move fields that are only
useful to the control path into a new structure, rxq_ctrl.
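
The split mirrors the TX side. Below is only a sketch of why the data-path
structure is kept separate and cache aligned; ALIGNED stands in for
__rte_cache_aligned and the two-cache-line bound is an arbitrary example,
not a constraint taken from the driver.

/* Sketch only: illustrates keeping control objects out of the hot struct. */
#define EXAMPLE_CACHE_LINE 64
#define ALIGNED __attribute__((aligned(EXAMPLE_CACHE_LINE)))

struct rxq {                    /* Hot fields read on every RX burst. */
        unsigned int elts_n;
        unsigned int elts_head;
        unsigned int port_id;
        void *elts;
} ALIGNED;

struct rxq_ctrl {               /* Cold fields: Verbs objects, socket, ... */
        void *mr;
        void *rd;
        unsigned int socket;
        struct rxq rxq;         /* Embedded data-path structure. */
};

/* Keeping Verbs objects out of struct rxq bounds the burst-time working
 * set; a build-time check documents the intent. */
_Static_assert(sizeof(struct rxq) <= 2 * EXAMPLE_CACHE_LINE,
               "example rxq grew beyond two cache lines");

As on the TX side, mlx5_rx_queue_release() uses container_of() to get from
the rxq pointer registered with ethdev back to its rxq_ctrl wrapper.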

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5.c | 6 +-
drivers/net/mlx5/mlx5_fdir.c | 8 +-
drivers/net/mlx5/mlx5_rxq.c | 250 ++++++++++++++++++++++---------------------
drivers/net/mlx5/mlx5_rxtx.c | 1 -
drivers/net/mlx5/mlx5_rxtx.h | 13 ++-
5 files changed, 148 insertions(+), 130 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index cc30463..95279bd 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -122,12 +122,14 @@ mlx5_dev_close(struct rte_eth_dev *dev)
usleep(1000);
for (i = 0; (i != priv->rxqs_n); ++i) {
struct rxq *rxq = (*priv->rxqs)[i];
+ struct rxq_ctrl *rxq_ctrl;

if (rxq == NULL)
continue;
+ rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
(*priv->rxqs)[i] = NULL;
- rxq_cleanup(rxq);
- rte_free(rxq);
+ rxq_cleanup(rxq_ctrl);
+ rte_free(rxq_ctrl);
}
priv->rxqs_n = 0;
priv->rxqs = NULL;
diff --git a/drivers/net/mlx5/mlx5_fdir.c b/drivers/net/mlx5/mlx5_fdir.c
index 63e43ad..e3b97ba 100644
--- a/drivers/net/mlx5/mlx5_fdir.c
+++ b/drivers/net/mlx5/mlx5_fdir.c
@@ -424,7 +424,9 @@ create_flow:
static struct fdir_queue *
priv_get_fdir_queue(struct priv *priv, uint16_t idx)
{
- struct fdir_queue *fdir_queue = &(*priv->rxqs)[idx]->fdir_queue;
+ struct rxq_ctrl *rxq_ctrl =
+ container_of((*priv->rxqs)[idx], struct rxq_ctrl, rxq);
+ struct fdir_queue *fdir_queue = &rxq_ctrl->fdir_queue;
struct ibv_exp_rwq_ind_table *ind_table = NULL;
struct ibv_qp *qp = NULL;
struct ibv_exp_rwq_ind_table_init_attr ind_init_attr;
@@ -629,8 +631,10 @@ priv_fdir_disable(struct priv *priv)
/* Run on every RX queue to destroy related flow director QP and
* indirection table. */
for (i = 0; (i != priv->rxqs_n); i++) {
- fdir_queue = &(*priv->rxqs)[i]->fdir_queue;
+ struct rxq_ctrl *rxq_ctrl =
+ container_of((*priv->rxqs)[i], struct rxq_ctrl, rxq);

+ fdir_queue = &rxq_ctrl->fdir_queue;
if (fdir_queue->qp != NULL) {
claim_zero(ibv_destroy_qp(fdir_queue->qp));
fdir_queue->qp = NULL;
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 4000624..8d32e74 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -636,7 +636,7 @@ priv_rehash_flows(struct priv *priv)
/**
* Allocate RX queue elements.
*
- * @param rxq
+ * @param rxq_ctrl
* Pointer to RX queue structure.
* @param elts_n
* Number of elements to allocate.
@@ -648,16 +648,17 @@ priv_rehash_flows(struct priv *priv)
* 0 on success, errno value on failure.
*/
static int
-rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
+rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
+ struct rte_mbuf **pool)
{
unsigned int i;
struct rxq_elt (*elts)[elts_n] =
rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
- rxq->socket);
+ rxq_ctrl->socket);
int ret = 0;

if (elts == NULL) {
- ERROR("%p: can't allocate packets array", (void *)rxq);
+ ERROR("%p: can't allocate packets array", (void *)rxq_ctrl);
ret = ENOMEM;
goto error;
}
@@ -672,10 +673,10 @@ rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
assert(buf != NULL);
rte_pktmbuf_reset(buf);
} else
- buf = rte_pktmbuf_alloc(rxq->mp);
+ buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
if (buf == NULL) {
assert(pool == NULL);
- ERROR("%p: empty mbuf pool", (void *)rxq);
+ ERROR("%p: empty mbuf pool", (void *)rxq_ctrl);
ret = ENOMEM;
goto error;
}
@@ -691,15 +692,15 @@ rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
sge->addr = (uintptr_t)
((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
- sge->lkey = rxq->mr->lkey;
+ sge->lkey = rxq_ctrl->mr->lkey;
/* Redundant check for tailroom. */
assert(sge->length == rte_pktmbuf_tailroom(buf));
}
DEBUG("%p: allocated and configured %u single-segment WRs",
- (void *)rxq, elts_n);
- rxq->elts_n = elts_n;
- rxq->elts_head = 0;
- rxq->elts = elts;
+ (void *)rxq_ctrl, elts_n);
+ rxq_ctrl->rxq.elts_n = elts_n;
+ rxq_ctrl->rxq.elts_head = 0;
+ rxq_ctrl->rxq.elts = elts;
assert(ret == 0);
return 0;
error:
@@ -714,7 +715,7 @@ error:
}
rte_free(elts);
}
- DEBUG("%p: failed, freed everything", (void *)rxq);
+ DEBUG("%p: failed, freed everything", (void *)rxq_ctrl);
assert(ret > 0);
return ret;
}
@@ -722,19 +723,19 @@ error:
/**
* Free RX queue elements.
*
- * @param rxq
+ * @param rxq_ctrl
* Pointer to RX queue structure.
*/
static void
-rxq_free_elts(struct rxq *rxq)
+rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
{
unsigned int i;
- unsigned int elts_n = rxq->elts_n;
- struct rxq_elt (*elts)[elts_n] = rxq->elts;
+ unsigned int elts_n = rxq_ctrl->rxq.elts_n;
+ struct rxq_elt (*elts)[elts_n] = rxq_ctrl->rxq.elts;

- DEBUG("%p: freeing WRs", (void *)rxq);
- rxq->elts_n = 0;
- rxq->elts = NULL;
+ DEBUG("%p: freeing WRs", (void *)rxq_ctrl);
+ rxq_ctrl->rxq.elts_n = 0;
+ rxq_ctrl->rxq.elts = NULL;
if (elts == NULL)
return;
for (i = 0; (i != RTE_DIM(*elts)); ++i) {
@@ -752,58 +753,58 @@ rxq_free_elts(struct rxq *rxq)
*
* Destroy objects, free allocated memory and reset the structure for reuse.
*
- * @param rxq
+ * @param rxq_ctrl
* Pointer to RX queue structure.
*/
void
-rxq_cleanup(struct rxq *rxq)
+rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
{
struct ibv_exp_release_intf_params params;

- DEBUG("cleaning up %p", (void *)rxq);
- rxq_free_elts(rxq);
- rxq->poll = NULL;
- rxq->recv = NULL;
- if (rxq->if_wq != NULL) {
- assert(rxq->priv != NULL);
- assert(rxq->priv->ctx != NULL);
- assert(rxq->wq != NULL);
+ DEBUG("cleaning up %p", (void *)rxq_ctrl);
+ rxq_free_elts(rxq_ctrl);
+ rxq_ctrl->rxq.poll = NULL;
+ rxq_ctrl->rxq.recv = NULL;
+ if (rxq_ctrl->if_wq != NULL) {
+ assert(rxq_ctrl->rxq.priv != NULL);
+ assert(rxq_ctrl->rxq.priv->ctx != NULL);
+ assert(rxq_ctrl->rxq.wq != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
- rxq->if_wq,
+ claim_zero(ibv_exp_release_intf(rxq_ctrl->rxq.priv->ctx,
+ rxq_ctrl->if_wq,
&params));
}
- if (rxq->if_cq != NULL) {
- assert(rxq->priv != NULL);
- assert(rxq->priv->ctx != NULL);
- assert(rxq->cq != NULL);
+ if (rxq_ctrl->if_cq != NULL) {
+ assert(rxq_ctrl->rxq.priv != NULL);
+ assert(rxq_ctrl->rxq.priv->ctx != NULL);
+ assert(rxq_ctrl->rxq.cq != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
- rxq->if_cq,
+ claim_zero(ibv_exp_release_intf(rxq_ctrl->rxq.priv->ctx,
+ rxq_ctrl->if_cq,
&params));
}
- if (rxq->wq != NULL)
- claim_zero(ibv_exp_destroy_wq(rxq->wq));
- if (rxq->cq != NULL)
- claim_zero(ibv_destroy_cq(rxq->cq));
- if (rxq->rd != NULL) {
+ if (rxq_ctrl->rxq.wq != NULL)
+ claim_zero(ibv_exp_destroy_wq(rxq_ctrl->rxq.wq));
+ if (rxq_ctrl->rxq.cq != NULL)
+ claim_zero(ibv_destroy_cq(rxq_ctrl->rxq.cq));
+ if (rxq_ctrl->rd != NULL) {
struct ibv_exp_destroy_res_domain_attr attr = {
.comp_mask = 0,
};

- assert(rxq->priv != NULL);
- assert(rxq->priv->ctx != NULL);
- claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx,
- rxq->rd,
+ assert(rxq_ctrl->rxq.priv != NULL);
+ assert(rxq_ctrl->rxq.priv->ctx != NULL);
+ claim_zero(ibv_exp_destroy_res_domain(rxq_ctrl->rxq.priv->ctx,
+ rxq_ctrl->rd,
&attr));
}
- if (rxq->mr != NULL)
- claim_zero(ibv_dereg_mr(rxq->mr));
- memset(rxq, 0, sizeof(*rxq));
+ if (rxq_ctrl->mr != NULL)
+ claim_zero(ibv_dereg_mr(rxq_ctrl->mr));
+ memset(rxq_ctrl, 0, sizeof(*rxq_ctrl));
}

/**
@@ -815,37 +816,37 @@ rxq_cleanup(struct rxq *rxq)
*
* @param dev
* Pointer to Ethernet device structure.
- * @param rxq
+ * @param rxq_ctrl
* RX queue pointer.
*
* @return
* 0 on success, errno value on failure.
*/
int
-rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
+rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
{
- struct priv *priv = rxq->priv;
- struct rxq tmpl = *rxq;
+ struct priv *priv = rxq_ctrl->rxq.priv;
+ struct rxq_ctrl tmpl = *rxq_ctrl;
unsigned int mbuf_n;
unsigned int desc_n;
struct rte_mbuf **pool;
unsigned int i, k;
struct ibv_exp_wq_attr mod;
- struct rxq_elt (*elts)[tmpl.elts_n];
+ struct rxq_elt (*elts)[tmpl.rxq.elts_n];
int err;

- DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
+ DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq_ctrl);
/* Number of descriptors and mbufs currently allocated. */
- desc_n = tmpl.elts_n;
+ desc_n = tmpl.rxq.elts_n;
mbuf_n = desc_n;
/* Toggle RX checksum offload if hardware supports it. */
if (priv->hw_csum) {
- tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- rxq->csum = tmpl.csum;
+ tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
+ rxq_ctrl->rxq.csum = tmpl.rxq.csum;
}
if (priv->hw_csum_l2tun) {
- tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- rxq->csum_l2tun = tmpl.csum_l2tun;
+ tmpl.rxq.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
+ rxq_ctrl->rxq.csum_l2tun = tmpl.rxq.csum_l2tun;
}
/* From now on, any failure will render the queue unusable.
* Reinitialize WQ. */
@@ -853,7 +854,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RESET,
};
- err = ibv_exp_modify_wq(tmpl.wq, &mod);
+ err = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
if (err) {
ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
assert(err > 0);
@@ -867,7 +868,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
}
/* Snatch mbufs from original queue. */
k = 0;
- elts = rxq->elts;
+ elts = rxq_ctrl->rxq.elts;
for (i = 0; (i != RTE_DIM(*elts)); ++i) {
struct rxq_elt *elt = &(*elts)[i];
struct rte_mbuf *buf = elt->buf;
@@ -875,9 +876,9 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
pool[k++] = buf;
}
assert(k == mbuf_n);
- tmpl.elts_n = 0;
- tmpl.elts = NULL;
- assert((void *)&tmpl.elts == NULL);
+ tmpl.rxq.elts_n = 0;
+ tmpl.rxq.elts = NULL;
+ assert((void *)&tmpl.rxq.elts == NULL);
err = rxq_alloc_elts(&tmpl, desc_n, pool);
if (err) {
ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
@@ -885,18 +886,18 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
assert(err > 0);
return err;
}
- assert(tmpl.elts_n == desc_n);
+ assert(tmpl.rxq.elts_n == desc_n);
rte_free(pool);
/* Clean up original data. */
- rxq->elts_n = 0;
- rte_free(rxq->elts);
- rxq->elts = NULL;
+ rxq_ctrl->rxq.elts_n = 0;
+ rte_free(rxq_ctrl->rxq.elts);
+ rxq_ctrl->rxq.elts = NULL;
/* Change queue state to ready. */
mod = (struct ibv_exp_wq_attr){
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RDY,
};
- err = ibv_exp_modify_wq(tmpl.wq, &mod);
+ err = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
if (err) {
ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
(void *)dev, strerror(err));
@@ -904,10 +905,10 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
}
/* Post SGEs. */
assert(tmpl.if_wq != NULL);
- elts = tmpl.elts;
+ elts = tmpl.rxq.elts;
for (i = 0; (i != RTE_DIM(*elts)); ++i) {
err = tmpl.if_wq->recv_burst(
- tmpl.wq,
+ tmpl.rxq.wq,
&(*elts)[i].sge,
1);
if (err)
@@ -920,9 +921,9 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
err = EIO;
goto error;
}
- tmpl.recv = tmpl.if_wq->recv_burst;
+ tmpl.rxq.recv = tmpl.if_wq->recv_burst;
error:
- *rxq = tmpl;
+ *rxq_ctrl = tmpl;
assert(err >= 0);
return err;
}
@@ -932,7 +933,7 @@ error:
*
* @param dev
* Pointer to Ethernet device structure.
- * @param rxq
+ * @param rxq_ctrl
* Pointer to RX queue structure.
* @param desc
* Number of descriptors to configure in queue.
@@ -947,15 +948,17 @@ error:
* 0 on success, errno value on failure.
*/
int
-rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
+rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
unsigned int socket, const struct rte_eth_rxconf *conf,
struct rte_mempool *mp)
{
struct priv *priv = dev->data->dev_private;
- struct rxq tmpl = {
- .priv = priv,
- .mp = mp,
- .socket = socket
+ struct rxq_ctrl tmpl = {
+ .socket = socket,
+ .rxq = {
+ .priv = priv,
+ .mp = mp,
+ },
};
struct ibv_exp_wq_attr mod;
union {
@@ -978,9 +981,9 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
}
/* Toggle RX checksum offload if hardware supports it. */
if (priv->hw_csum)
- tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
+ tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
if (priv->hw_csum_l2tun)
- tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
+ tmpl.rxq.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
(void)mb_len; /* I'll be back! */
/* Use the entire RX mempool as the memory region. */
tmpl.mr = mlx5_mp2mr(priv->pd, mp);
@@ -1007,9 +1010,9 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
.res_domain = tmpl.rd,
};
- tmpl.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0,
- &attr.cq);
- if (tmpl.cq == NULL) {
+ tmpl.rxq.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0,
+ &attr.cq);
+ if (tmpl.rxq.cq == NULL) {
ret = ENOMEM;
ERROR("%p: CQ creation failure: %s",
(void *)dev, strerror(ret));
@@ -1020,8 +1023,8 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
DEBUG("priv->device_attr.max_sge is %d",
priv->device_attr.max_sge);
/* Configure VLAN stripping. */
- tmpl.vlan_strip = (priv->hw_vlan_strip &&
- !!dev->data->dev_conf.rxmode.hw_vlan_strip);
+ tmpl.rxq.vlan_strip = (priv->hw_vlan_strip &&
+ !!dev->data->dev_conf.rxmode.hw_vlan_strip);
attr.wq = (struct ibv_exp_wq_init_attr){
.wq_context = NULL, /* Could be useful in the future. */
.wq_type = IBV_EXP_WQT_RQ,
@@ -1032,7 +1035,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
/* Max number of scatter/gather elements in a WR. */
.max_recv_sge = 1,
.pd = priv->pd,
- .cq = tmpl.cq,
+ .cq = tmpl.rxq.cq,
.comp_mask =
IBV_EXP_CREATE_WQ_RES_DOMAIN |
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
@@ -1041,7 +1044,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
0,
.res_domain = tmpl.rd,
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
- .vlan_offloads = (tmpl.vlan_strip ?
+ .vlan_offloads = (tmpl.rxq.vlan_strip ?
IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
0),
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
@@ -1050,24 +1053,24 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
#ifdef HAVE_VERBS_FCS
/* By default, FCS (CRC) is stripped by hardware. */
if (dev->data->dev_conf.rxmode.hw_strip_crc) {
- tmpl.crc_present = 0;
+ tmpl.rxq.crc_present = 0;
} else if (priv->hw_fcs_strip) {
/* Ask HW/Verbs to leave CRC in place when supported. */
attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS;
attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
- tmpl.crc_present = 1;
+ tmpl.rxq.crc_present = 1;
} else {
WARN("%p: CRC stripping has been disabled but will still"
" be performed by hardware, make sure MLNX_OFED and"
" firmware are up to date",
(void *)dev);
- tmpl.crc_present = 0;
+ tmpl.rxq.crc_present = 0;
}
DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
" incoming frames to hide it",
(void *)dev,
- tmpl.crc_present ? "disabled" : "enabled",
- tmpl.crc_present << 2);
+ tmpl.rxq.crc_present ? "disabled" : "enabled",
+ tmpl.rxq.crc_present << 2);
#endif /* HAVE_VERBS_FCS */

#ifdef HAVE_VERBS_RX_END_PADDING
@@ -1075,7 +1078,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
; /* Nothing else to do. */
else if (priv->hw_padding) {
INFO("%p: enabling packet padding on queue %p",
- (void *)dev, (void *)rxq);
+ (void *)dev, (void *)rxq_ctrl);
attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING;
attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
} else
@@ -1085,8 +1088,8 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
(void *)dev);
#endif /* HAVE_VERBS_RX_END_PADDING */

- tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
- if (tmpl.wq == NULL) {
+ tmpl.rxq.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
+ if (tmpl.rxq.wq == NULL) {
ret = (errno ? errno : EINVAL);
ERROR("%p: WQ creation failure: %s",
(void *)dev, strerror(ret));
@@ -1099,15 +1102,15 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
goto error;
}
/* Save port ID. */
- tmpl.port_id = dev->data->port_id;
- DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
+ tmpl.rxq.port_id = dev->data->port_id;
+ DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
.intf_version = 1,
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
.intf = IBV_EXP_INTF_CQ,
- .obj = tmpl.cq,
+ .obj = tmpl.rxq.cq,
};
tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
if (tmpl.if_cq == NULL) {
@@ -1118,7 +1121,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
.intf = IBV_EXP_INTF_WQ,
- .obj = tmpl.wq,
+ .obj = tmpl.rxq.wq,
};
tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
if (tmpl.if_wq == NULL) {
@@ -1131,17 +1134,17 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RDY,
};
- ret = ibv_exp_modify_wq(tmpl.wq, &mod);
+ ret = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
if (ret) {
ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
(void *)dev, strerror(ret));
goto error;
}
/* Post SGEs. */
- elts = tmpl.elts;
+ elts = tmpl.rxq.elts;
for (i = 0; (i != RTE_DIM(*elts)); ++i) {
ret = tmpl.if_wq->recv_burst(
- tmpl.wq,
+ tmpl.rxq.wq,
&(*elts)[i].sge,
1);
if (ret)
@@ -1155,18 +1158,18 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
goto error;
}
/* Clean up rxq in case we're reinitializing it. */
- DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
- rxq_cleanup(rxq);
- *rxq = tmpl;
- DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
+ DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
+ rxq_cleanup(rxq_ctrl);
+ *rxq_ctrl = tmpl;
+ DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
assert(ret == 0);
/* Assign function in queue. */
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
- rxq->poll = rxq->if_cq->poll_length_flags_cvlan;
+ rxq_ctrl->rxq.poll = rxq_ctrl->if_cq->poll_length_flags_cvlan;
#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
- rxq->poll = rxq->if_cq->poll_length_flags;
+ rxq_ctrl->rxq.poll = rxq_ctrl->if_cq->poll_length_flags;
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
- rxq->recv = rxq->if_wq->recv_burst;
+ rxq_ctrl->rxq.recv = rxq_ctrl->if_wq->recv_burst;
return 0;
error:
rxq_cleanup(&tmpl);
@@ -1200,12 +1203,14 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
{
struct priv *priv = dev->data->dev_private;
struct rxq *rxq = (*priv->rxqs)[idx];
+ struct rxq_ctrl *rxq_ctrl;
int ret;

if (mlx5_is_secondary())
return -E_RTE_SECONDARY;

priv_lock(priv);
+ rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
DEBUG("%p: configuring queue %u for %u descriptors",
(void *)dev, idx, desc);
if (idx >= priv->rxqs_n) {
@@ -1222,24 +1227,25 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
return -EEXIST;
}
(*priv->rxqs)[idx] = NULL;
- rxq_cleanup(rxq);
+ rxq_cleanup(rxq_ctrl);
} else {
- rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket);
- if (rxq == NULL) {
+ rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl), 0,
+ socket);
+ if (rxq_ctrl == NULL) {
ERROR("%p: unable to allocate queue index %u",
(void *)dev, idx);
priv_unlock(priv);
return -ENOMEM;
}
}
- ret = rxq_setup(dev, rxq, desc, socket, conf, mp);
+ ret = rxq_setup(dev, rxq_ctrl, desc, socket, conf, mp);
if (ret)
- rte_free(rxq);
+ rte_free(rxq_ctrl);
else {
- rxq->stats.idx = idx;
+ rxq_ctrl->rxq.stats.idx = idx;
DEBUG("%p: adding RX queue %p to list",
- (void *)dev, (void *)rxq);
- (*priv->rxqs)[idx] = rxq;
+ (void *)dev, (void *)rxq_ctrl);
+ (*priv->rxqs)[idx] = &rxq_ctrl->rxq;
/* Update receive callback. */
dev->rx_pkt_burst = mlx5_rx_burst;
}
@@ -1257,6 +1263,7 @@ void
mlx5_rx_queue_release(void *dpdk_rxq)
{
struct rxq *rxq = (struct rxq *)dpdk_rxq;
+ struct rxq_ctrl *rxq_ctrl;
struct priv *priv;
unsigned int i;

@@ -1265,6 +1272,7 @@ mlx5_rx_queue_release(void *dpdk_rxq)

if (rxq == NULL)
return;
+ rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
priv = rxq->priv;
priv_lock(priv);
for (i = 0; (i != priv->rxqs_n); ++i)
@@ -1274,8 +1282,8 @@ mlx5_rx_queue_release(void *dpdk_rxq)
(*priv->rxqs)[i] = NULL;
break;
}
- rxq_cleanup(rxq);
- rte_free(rxq);
+ rxq_cleanup(rxq_ctrl);
+ rte_free(rxq_ctrl);
priv_unlock(priv);
}

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 4ba88ea..f0b42e9 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -574,7 +574,6 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)

/* Reconfigure sge to use rep instead of seg. */
elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
- assert(elt->sge.lkey == rxq->mr->lkey);
elt->buf = rep;

/* Add SGE to array for repost. */
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 5baefcb..2c5e447 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -111,8 +111,11 @@ struct rxq {
unsigned int vlan_strip:1; /* Enable VLAN stripping. */
unsigned int crc_present:1; /* CRC must be subtracted. */
struct rxq_elt (*elts)[]; /* RX elements. */
- unsigned int socket; /* CPU socket ID for allocations. */
struct mlx5_rxq_stats stats; /* RX queue counters. */
+} __rte_cache_aligned;
+
+/* RX queue control descriptor. */
+struct rxq_ctrl {
struct ibv_exp_res_domain *rd; /* Resource Domain. */
struct fdir_queue fdir_queue; /* Flow director queue. */
struct ibv_mr *mr; /* Memory Region (for mp). */
@@ -122,6 +125,8 @@ struct rxq {
#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
struct ibv_exp_cq_family *if_cq; /* CQ interface. */
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+ unsigned int socket; /* CPU socket ID for allocations. */
+ struct rxq rxq; /* Data path structure. */
};

/* Hash RX queue types. */
@@ -285,9 +290,9 @@ int priv_create_hash_rxqs(struct priv *);
void priv_destroy_hash_rxqs(struct priv *);
int priv_allow_flow_type(struct priv *, enum hash_rxq_flow_type);
int priv_rehash_flows(struct priv *);
-void rxq_cleanup(struct rxq *);
-int rxq_rehash(struct rte_eth_dev *, struct rxq *);
-int rxq_setup(struct rte_eth_dev *, struct rxq *, uint16_t, unsigned int,
+void rxq_cleanup(struct rxq_ctrl *);
+int rxq_rehash(struct rte_eth_dev *, struct rxq_ctrl *);
+int rxq_setup(struct rte_eth_dev *, struct rxq_ctrl *, uint16_t, unsigned int,
const struct rte_eth_rxconf *, struct rte_mempool *);
int mlx5_rx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
const struct rte_eth_rxconf *, struct rte_mempool *);
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:47:55 UTC
The latest version of Mellanox OFED exposes hardware definitions necessary
to implement data path operations that bypass Verbs. Update the minimum
version requirement to MLNX_OFED >= 3.3 and clean up compatibility checks
for previous releases.
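
To make the Makefile changes below easier to follow: the mlx5_autoconf.h rule
runs auto-config-h.sh, which tries to compile small test snippets against the
listed header and emits a HAVE_* macro for each symbol found; driver code then
guards optional paths with those macros. A hedged sketch of the consuming side
follows; the surrounding code is illustrative, and the fallback comment only
notes that the macro normally comes from the generated header.

/* Sketch of how a probed HAVE_* macro gates optional code; in the driver the
 * macro is provided by the generated mlx5_autoconf.h. */
#include <stdio.h>

/* #include "mlx5_autoconf.h" in the real source tree. */

static int
cqe_compression_available(void)
{
#ifdef HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE
        return 1;       /* verbs_exp.h exposes IBV_EXP_CQ_COMPRESSED_CQE. */
#else
        return 0;       /* Older MLNX_OFED: the feature is compiled out. */
#endif
}

int
main(void)
{
        printf("CQE compression %s at build time\n",
               cqe_compression_available() ? "available" : "unavailable");
        return 0;
}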

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
doc/guides/nics/mlx5.rst | 44 +++---------------------------------------
drivers/net/mlx5/Makefile | 39 ++++++++-----------------------------
drivers/net/mlx5/mlx5.c | 23 ----------------------
drivers/net/mlx5/mlx5.h | 5 +++++
drivers/net/mlx5/mlx5_defs.h | 9 ---------
drivers/net/mlx5/mlx5_fdir.c | 10 ----------
drivers/net/mlx5/mlx5_rxmode.c | 8 --------
drivers/net/mlx5/mlx5_rxq.c | 30 ----------------------------
drivers/net/mlx5/mlx5_rxtx.c | 4 ----
drivers/net/mlx5/mlx5_rxtx.h | 8 --------
drivers/net/mlx5/mlx5_txq.c | 2 --
drivers/net/mlx5/mlx5_vlan.c | 3 ---
12 files changed, 16 insertions(+), 169 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 77fa957..3a07928 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -125,16 +125,6 @@ These options can be modified in the ``.config`` file.
Environment variables
~~~~~~~~~~~~~~~~~~~~~

-- ``MLX5_ENABLE_CQE_COMPRESSION``
-
- A nonzero value lets ConnectX-4 return smaller completion entries to
- improve performance when PCI backpressure is detected. It is most useful
- for scenarios involving heavy traffic on many queues.
-
- Since the additional software logic necessary to handle this mode can
- lower performance when there is no backpressure, it is not enabled by
- default.
-
- ``MLX5_PMD_ENABLE_PADDING``

Enables HW packet padding in PCI bus transactions.
@@ -211,40 +201,12 @@ DPDK and must be installed separately:

Currently supported by DPDK:

-- Mellanox OFED **3.1-1.0.3**, **3.1-1.5.7.1** or **3.2-2.0.0.0** depending
- on usage.
-
- The following features are supported with version **3.1-1.5.7.1** and
- above only:
-
- - IPv6, UPDv6, TCPv6 RSS.
- - RX checksum offloads.
- - IBM POWER8.
-
- The following features are supported with version **3.2-2.0.0.0** and
- above only:
-
- - Flow director.
- - RX VLAN stripping.
- - TX VLAN insertion.
- - RX CRC stripping configuration.
+- Mellanox OFED **3.3-1.0.0.0**.

- Minimum firmware version:

- With MLNX_OFED **3.1-1.0.3**:
-
- - ConnectX-4: **12.12.1240**
- - ConnectX-4 Lx: **14.12.1100**
-
- With MLNX_OFED **3.1-1.5.7.1**:
-
- - ConnectX-4: **12.13.0144**
- - ConnectX-4 Lx: **14.13.0144**
-
- With MLNX_OFED **3.2-2.0.0.0**:
-
- - ConnectX-4: **12.14.2036**
- - ConnectX-4 Lx: **14.14.2036**
+ - ConnectX-4: **12.16.1006**
+ - ConnectX-4 Lx: **14.16.1006**

Getting Mellanox OFED
~~~~~~~~~~~~~~~~~~~~~
diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 798859c..a63d6b3 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -102,42 +102,19 @@ endif
mlx5_autoconf.h: $(RTE_SDK)/scripts/auto-config-h.sh
$Q $(RM) -f -- '$@'
$Q sh -- '$<' '$@' \
- HAVE_EXP_QUERY_DEVICE \
- infiniband/verbs.h \
- type 'struct ibv_exp_device_attr' $(AUTOCONF_OUTPUT)
- $Q sh -- '$<' '$@' \
- HAVE_FLOW_SPEC_IPV6 \
- infiniband/verbs.h \
- type 'struct ibv_exp_flow_spec_ipv6' $(AUTOCONF_OUTPUT)
- $Q sh -- '$<' '$@' \
- HAVE_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR \
- infiniband/verbs.h \
- enum IBV_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR \
- $(AUTOCONF_OUTPUT)
- $Q sh -- '$<' '$@' \
- HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS \
- infiniband/verbs.h \
- enum IBV_EXP_DEVICE_ATTR_VLAN_OFFLOADS \
- $(AUTOCONF_OUTPUT)
- $Q sh -- '$<' '$@' \
- HAVE_EXP_CQ_RX_TCP_PACKET \
+ HAVE_VERBS_VLAN_INSERTION \
infiniband/verbs.h \
- enum IBV_EXP_CQ_RX_TCP_PACKET \
+ enum IBV_EXP_RECEIVE_WQ_CVLAN_INSERTION \
$(AUTOCONF_OUTPUT)
$Q sh -- '$<' '$@' \
- HAVE_VERBS_FCS \
- infiniband/verbs.h \
- enum IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS \
+ HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE \
+ infiniband/verbs_exp.h \
+ enum IBV_EXP_CQ_COMPRESSED_CQE \
$(AUTOCONF_OUTPUT)
$Q sh -- '$<' '$@' \
- HAVE_VERBS_RX_END_PADDING \
- infiniband/verbs.h \
- enum IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING \
- $(AUTOCONF_OUTPUT)
- $Q sh -- '$<' '$@' \
- HAVE_VERBS_VLAN_INSERTION \
- infiniband/verbs.h \
- enum IBV_EXP_RECEIVE_WQ_CVLAN_INSERTION \
+ HAVE_VERBS_MLX5_ETH_VLAN_INLINE_HEADER_SIZE \
+ infiniband/mlx5_hw.h \
+ enum MLX5_ETH_VLAN_INLINE_HEADER_SIZE \
$(AUTOCONF_OUTPUT)

$(SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD):.c=.o): mlx5_autoconf.h
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 95279bd..e9cc38a 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -195,17 +195,13 @@ static const struct eth_dev_ops mlx5_dev_ops = {
.mac_addr_add = mlx5_mac_addr_add,
.mac_addr_set = mlx5_mac_addr_set,
.mtu_set = mlx5_dev_set_mtu,
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
.vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
.vlan_offload_set = mlx5_vlan_offload_set,
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
.reta_update = mlx5_dev_rss_reta_update,
.reta_query = mlx5_dev_rss_reta_query,
.rss_hash_update = mlx5_rss_hash_update,
.rss_hash_conf_get = mlx5_rss_hash_conf_get,
-#ifdef MLX5_FDIR_SUPPORT
.filter_ctrl = mlx5_dev_filter_ctrl,
-#endif /* MLX5_FDIR_SUPPORT */
};

static struct {
@@ -352,24 +348,16 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
struct ibv_pd *pd = NULL;
struct priv *priv = NULL;
struct rte_eth_dev *eth_dev;
-#ifdef HAVE_EXP_QUERY_DEVICE
struct ibv_exp_device_attr exp_device_attr;
-#endif /* HAVE_EXP_QUERY_DEVICE */
struct ether_addr mac;
uint16_t num_vfs = 0;

-#ifdef HAVE_EXP_QUERY_DEVICE
exp_device_attr.comp_mask =
IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS |
IBV_EXP_DEVICE_ATTR_RX_HASH |
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
IBV_EXP_DEVICE_ATTR_VLAN_OFFLOADS |
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
-#ifdef HAVE_VERBS_RX_END_PADDING
IBV_EXP_DEVICE_ATTR_RX_PAD_END_ALIGN |
-#endif /* HAVE_VERBS_RX_END_PADDING */
0;
-#endif /* HAVE_EXP_QUERY_DEVICE */

DEBUG("using port %u (%08" PRIx32 ")", port, test);

@@ -420,7 +408,6 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
priv->port = port;
priv->pd = pd;
priv->mtu = ETHER_MTU;
-#ifdef HAVE_EXP_QUERY_DEVICE
if (ibv_exp_query_device(ctx, &exp_device_attr)) {
ERROR("ibv_exp_query_device() failed");
goto port_error;
@@ -446,30 +433,20 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
priv->ind_table_max_size = RSS_INDIRECTION_TABLE_SIZE;
DEBUG("maximum RX indirection table size is %u",
priv->ind_table_max_size);
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
priv->hw_vlan_strip = !!(exp_device_attr.wq_vlan_offloads_cap &
IBV_EXP_RECEIVE_WQ_CVLAN_STRIP);
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
DEBUG("VLAN stripping is %ssupported",
(priv->hw_vlan_strip ? "" : "not "));

-#ifdef HAVE_VERBS_FCS
priv->hw_fcs_strip = !!(exp_device_attr.exp_device_cap_flags &
IBV_EXP_DEVICE_SCATTER_FCS);
-#endif /* HAVE_VERBS_FCS */
DEBUG("FCS stripping configuration is %ssupported",
(priv->hw_fcs_strip ? "" : "not "));

-#ifdef HAVE_VERBS_RX_END_PADDING
priv->hw_padding = !!exp_device_attr.rx_pad_end_addr_align;
-#endif /* HAVE_VERBS_RX_END_PADDING */
DEBUG("hardware RX end alignment padding is %ssupported",
(priv->hw_padding ? "" : "not "));

-#else /* HAVE_EXP_QUERY_DEVICE */
- priv->ind_table_max_size = RSS_INDIRECTION_TABLE_SIZE;
-#endif /* HAVE_EXP_QUERY_DEVICE */
-
priv_get_num_vfs(priv, &num_vfs);
priv->sriov = (num_vfs || sriov);
priv->mps = mps;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index dccc18d..4170e3b 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -69,6 +69,11 @@
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"

+#if !defined(HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE) || \
+ !defined(HAVE_VERBS_MLX5_ETH_VLAN_INLINE_HEADER_SIZE)
+#error Mellanox OFED >= 3.3 is required, please refer to the documentation.
+#endif
+
enum {
PCI_VENDOR_ID_MELLANOX = 0x15b3,
};
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 9a19835..8d2ec7a 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -76,13 +76,4 @@
/* Alarm timeout. */
#define MLX5_ALARM_TIMEOUT_US 100000

-/*
- * Extended flow priorities necessary to support flow director are available
- * since MLNX_OFED 3.2. Considering this version adds support for VLAN
- * offloads as well, their availability means flow director can be used.
- */
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
-#define MLX5_FDIR_SUPPORT 1
-#endif
-
#endif /* RTE_PMD_MLX5_DEFS_H_ */
diff --git a/drivers/net/mlx5/mlx5_fdir.c b/drivers/net/mlx5/mlx5_fdir.c
index e3b97ba..1850218 100644
--- a/drivers/net/mlx5/mlx5_fdir.c
+++ b/drivers/net/mlx5/mlx5_fdir.c
@@ -122,7 +122,6 @@ fdir_filter_to_flow_desc(const struct rte_eth_fdir_filter *fdir_filter,
case RTE_ETH_FLOW_NONFRAG_IPV4_OTHER:
desc->type = HASH_RXQ_IPV4;
break;
-#ifdef HAVE_FLOW_SPEC_IPV6
case RTE_ETH_FLOW_NONFRAG_IPV6_UDP:
desc->type = HASH_RXQ_UDPV6;
break;
@@ -132,7 +131,6 @@ fdir_filter_to_flow_desc(const struct rte_eth_fdir_filter *fdir_filter,
case RTE_ETH_FLOW_NONFRAG_IPV6_OTHER:
desc->type = HASH_RXQ_IPV6;
break;
-#endif /* HAVE_FLOW_SPEC_IPV6 */
default:
break;
}
@@ -147,7 +145,6 @@ fdir_filter_to_flow_desc(const struct rte_eth_fdir_filter *fdir_filter,
desc->src_ip[0] = fdir_filter->input.flow.ip4_flow.src_ip;
desc->dst_ip[0] = fdir_filter->input.flow.ip4_flow.dst_ip;
break;
-#ifdef HAVE_FLOW_SPEC_IPV6
case RTE_ETH_FLOW_NONFRAG_IPV6_UDP:
case RTE_ETH_FLOW_NONFRAG_IPV6_TCP:
desc->src_port = fdir_filter->input.flow.udp6_flow.src_port;
@@ -161,7 +158,6 @@ fdir_filter_to_flow_desc(const struct rte_eth_fdir_filter *fdir_filter,
fdir_filter->input.flow.ipv6_flow.dst_ip,
sizeof(desc->dst_ip));
break;
-#endif /* HAVE_FLOW_SPEC_IPV6 */
default:
break;
}
@@ -211,7 +207,6 @@ priv_fdir_overlap(const struct priv *priv,
(desc2->dst_ip[0] & mask->ipv4_mask.dst_ip)))
return 0;
break;
-#ifdef HAVE_FLOW_SPEC_IPV6
case HASH_RXQ_IPV6:
case HASH_RXQ_UDPV6:
case HASH_RXQ_TCPV6:
@@ -222,7 +217,6 @@ priv_fdir_overlap(const struct priv *priv,
(desc2->dst_ip[i] & mask->ipv6_mask.dst_ip[i])))
return 0;
break;
-#endif /* HAVE_FLOW_SPEC_IPV6 */
default:
break;
}
@@ -258,9 +252,7 @@ priv_fdir_flow_add(struct priv *priv,
uintptr_t spec_offset = (uintptr_t)&data->spec;
struct ibv_exp_flow_spec_eth *spec_eth;
struct ibv_exp_flow_spec_ipv4 *spec_ipv4;
-#ifdef HAVE_FLOW_SPEC_IPV6
struct ibv_exp_flow_spec_ipv6 *spec_ipv6;
-#endif /* HAVE_FLOW_SPEC_IPV6 */
struct ibv_exp_flow_spec_tcp_udp *spec_tcp_udp;
struct mlx5_fdir_filter *iter_fdir_filter;
unsigned int i;
@@ -334,7 +326,6 @@ priv_fdir_flow_add(struct priv *priv,

spec_offset += spec_ipv4->size;
break;
-#ifdef HAVE_FLOW_SPEC_IPV6
case HASH_RXQ_IPV6:
case HASH_RXQ_UDPV6:
case HASH_RXQ_TCPV6:
@@ -368,7 +359,6 @@ priv_fdir_flow_add(struct priv *priv,

spec_offset += spec_ipv6->size;
break;
-#endif /* HAVE_FLOW_SPEC_IPV6 */
default:
ERROR("invalid flow attribute type");
return EINVAL;
diff --git a/drivers/net/mlx5/mlx5_rxmode.c b/drivers/net/mlx5/mlx5_rxmode.c
index 3a55f63..51e2aca 100644
--- a/drivers/net/mlx5/mlx5_rxmode.c
+++ b/drivers/net/mlx5/mlx5_rxmode.c
@@ -67,11 +67,9 @@ static const struct special_flow_init special_flow_init[] = {
1 << HASH_RXQ_TCPV4 |
1 << HASH_RXQ_UDPV4 |
1 << HASH_RXQ_IPV4 |
-#ifdef HAVE_FLOW_SPEC_IPV6
1 << HASH_RXQ_TCPV6 |
1 << HASH_RXQ_UDPV6 |
1 << HASH_RXQ_IPV6 |
-#endif /* HAVE_FLOW_SPEC_IPV6 */
1 << HASH_RXQ_ETH |
0,
.per_vlan = 0,
@@ -82,10 +80,8 @@ static const struct special_flow_init special_flow_init[] = {
.hash_types =
1 << HASH_RXQ_UDPV4 |
1 << HASH_RXQ_IPV4 |
-#ifdef HAVE_FLOW_SPEC_IPV6
1 << HASH_RXQ_UDPV6 |
1 << HASH_RXQ_IPV6 |
-#endif /* HAVE_FLOW_SPEC_IPV6 */
1 << HASH_RXQ_ETH |
0,
.per_vlan = 0,
@@ -96,15 +92,12 @@ static const struct special_flow_init special_flow_init[] = {
.hash_types =
1 << HASH_RXQ_UDPV4 |
1 << HASH_RXQ_IPV4 |
-#ifdef HAVE_FLOW_SPEC_IPV6
1 << HASH_RXQ_UDPV6 |
1 << HASH_RXQ_IPV6 |
-#endif /* HAVE_FLOW_SPEC_IPV6 */
1 << HASH_RXQ_ETH |
0,
.per_vlan = 1,
},
-#ifdef HAVE_FLOW_SPEC_IPV6
[HASH_RXQ_FLOW_TYPE_IPV6MULTI] = {
.dst_mac_val = "\x33\x33\x00\x00\x00\x00",
.dst_mac_mask = "\xff\xff\x00\x00\x00\x00",
@@ -115,7 +108,6 @@ static const struct special_flow_init special_flow_init[] = {
0,
.per_vlan = 1,
},
-#endif /* HAVE_FLOW_SPEC_IPV6 */
};

/**
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 8d32e74..7db4ce7 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -105,7 +105,6 @@ const struct hash_rxq_init hash_rxq_init[] = {
},
.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
},
-#ifdef HAVE_FLOW_SPEC_IPV6
[HASH_RXQ_TCPV6] = {
.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
IBV_EXP_RX_HASH_DST_IPV6 |
@@ -144,7 +143,6 @@ const struct hash_rxq_init hash_rxq_init[] = {
},
.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
},
-#endif /* HAVE_FLOW_SPEC_IPV6 */
[HASH_RXQ_ETH] = {
.hash_fields = 0,
.dpdk_rss_hf = 0,
@@ -168,17 +166,11 @@ static const struct ind_table_init ind_table_init[] = {
1 << HASH_RXQ_TCPV4 |
1 << HASH_RXQ_UDPV4 |
1 << HASH_RXQ_IPV4 |
-#ifdef HAVE_FLOW_SPEC_IPV6
1 << HASH_RXQ_TCPV6 |
1 << HASH_RXQ_UDPV6 |
1 << HASH_RXQ_IPV6 |
-#endif /* HAVE_FLOW_SPEC_IPV6 */
0,
-#ifdef HAVE_FLOW_SPEC_IPV6
.hash_types_n = 6,
-#else /* HAVE_FLOW_SPEC_IPV6 */
- .hash_types_n = 3,
-#endif /* HAVE_FLOW_SPEC_IPV6 */
},
{
.max_size = 1,
@@ -243,12 +235,8 @@ priv_flow_attr(struct priv *priv, struct ibv_exp_flow_attr *flow_attr,
init = &hash_rxq_init[type];
*flow_attr = (struct ibv_exp_flow_attr){
.type = IBV_EXP_FLOW_ATTR_NORMAL,
-#ifdef MLX5_FDIR_SUPPORT
/* Priorities < 3 are reserved for flow director. */
.priority = init->flow_priority + 3,
-#else /* MLX5_FDIR_SUPPORT */
- .priority = init->flow_priority,
-#endif /* MLX5_FDIR_SUPPORT */
.num_of_specs = 0,
.port = priv->port,
.flags = 0,
@@ -589,9 +577,7 @@ priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
case HASH_RXQ_FLOW_TYPE_ALLMULTI:
return !!priv->allmulti_req;
case HASH_RXQ_FLOW_TYPE_BROADCAST:
-#ifdef HAVE_FLOW_SPEC_IPV6
case HASH_RXQ_FLOW_TYPE_IPV6MULTI:
-#endif /* HAVE_FLOW_SPEC_IPV6 */
/* If allmulti is enabled, broadcast and ipv6multi
* are unnecessary. */
return !priv->allmulti_req;
@@ -1038,19 +1024,13 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
.cq = tmpl.rxq.cq,
.comp_mask =
IBV_EXP_CREATE_WQ_RES_DOMAIN |
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
0,
.res_domain = tmpl.rd,
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
.vlan_offloads = (tmpl.rxq.vlan_strip ?
IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
0),
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
};
-
-#ifdef HAVE_VERBS_FCS
/* By default, FCS (CRC) is stripped by hardware. */
if (dev->data->dev_conf.rxmode.hw_strip_crc) {
tmpl.rxq.crc_present = 0;
@@ -1071,9 +1051,6 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
(void *)dev,
tmpl.rxq.crc_present ? "disabled" : "enabled",
tmpl.rxq.crc_present << 2);
-#endif /* HAVE_VERBS_FCS */
-
-#ifdef HAVE_VERBS_RX_END_PADDING
if (!mlx5_getenv_int("MLX5_PMD_ENABLE_PADDING"))
; /* Nothing else to do. */
else if (priv->hw_padding) {
@@ -1086,7 +1063,6 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
" supported, make sure MLNX_OFED and firmware are"
" up to date",
(void *)dev);
-#endif /* HAVE_VERBS_RX_END_PADDING */

tmpl.rxq.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
if (tmpl.rxq.wq == NULL) {
@@ -1106,9 +1082,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
.intf_version = 1,
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
.intf = IBV_EXP_INTF_CQ,
.obj = tmpl.rxq.cq,
};
@@ -1164,11 +1138,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
assert(ret == 0);
/* Assign function in queue. */
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
rxq_ctrl->rxq.poll = rxq_ctrl->if_cq->poll_length_flags_cvlan;
-#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
- rxq_ctrl->rxq.poll = rxq_ctrl->if_cq->poll_length_flags;
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
rxq_ctrl->rxq.recv = rxq_ctrl->if_wq->recv_burst;
return 0;
error:
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index f0b42e9..6a0d707 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -452,11 +452,9 @@ rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
TRANSPOSE(~flags,
IBV_EXP_CQ_RX_IP_CSUM_OK,
PKT_RX_IP_CKSUM_BAD);
-#ifdef HAVE_EXP_CQ_RX_TCP_PACKET
/* Set L4 checksum flag only for TCP/UDP packets. */
if (flags &
(IBV_EXP_CQ_RX_TCP_PACKET | IBV_EXP_CQ_RX_UDP_PACKET))
-#endif /* HAVE_EXP_CQ_RX_TCP_PACKET */
ol_flags |=
TRANSPOSE(~flags,
IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK,
@@ -589,12 +587,10 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
seg->packet_type = rxq_cq_to_pkt_type(flags);
seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
seg->ol_flags |= PKT_RX_VLAN_PKT;
seg->vlan_tci = vlan_tci;
}
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
}
/* Return packet. */
*(pkts++) = seg;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 2c5e447..570345b 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -120,11 +120,7 @@ struct rxq_ctrl {
struct fdir_queue fdir_queue; /* Flow director queue. */
struct ibv_mr *mr; /* Memory Region (for mp). */
struct ibv_exp_wq_family *if_wq; /* WQ burst interface. */
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
struct ibv_exp_cq_family_v1 *if_cq; /* CQ interface. */
-#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
- struct ibv_exp_cq_family *if_cq; /* CQ interface. */
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
unsigned int socket; /* CPU socket ID for allocations. */
struct rxq rxq; /* Data path structure. */
};
@@ -134,11 +130,9 @@ enum hash_rxq_type {
HASH_RXQ_TCPV4,
HASH_RXQ_UDPV4,
HASH_RXQ_IPV4,
-#ifdef HAVE_FLOW_SPEC_IPV6
HASH_RXQ_TCPV6,
HASH_RXQ_UDPV6,
HASH_RXQ_IPV6,
-#endif /* HAVE_FLOW_SPEC_IPV6 */
HASH_RXQ_ETH,
};

@@ -169,9 +163,7 @@ struct hash_rxq_init {
} hdr;
struct ibv_exp_flow_spec_tcp_udp tcp_udp;
struct ibv_exp_flow_spec_ipv4 ipv4;
-#ifdef HAVE_FLOW_SPEC_IPV6
struct ibv_exp_flow_spec_ipv6 ipv6;
-#endif /* HAVE_FLOW_SPEC_IPV6 */
struct ibv_exp_flow_spec_eth eth;
} flow_spec; /* Flow specification template. */
const struct hash_rxq_init *underlayer; /* Pointer to underlayer. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 4683775..9f3a33b 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -375,13 +375,11 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
#ifdef HAVE_VERBS_VLAN_INSERTION
.intf_version = 1,
#endif
-#ifdef HAVE_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR
/* Enable multi-packet send if supported. */
.family_flags =
(priv->mps ?
IBV_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR :
0),
-#endif
};
tmpl.if_qp = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
if (tmpl.if_qp == NULL) {
diff --git a/drivers/net/mlx5/mlx5_vlan.c b/drivers/net/mlx5/mlx5_vlan.c
index ff40538..3b9b771 100644
--- a/drivers/net/mlx5/mlx5_vlan.c
+++ b/drivers/net/mlx5/mlx5_vlan.c
@@ -144,7 +144,6 @@ static void
priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on)
{
struct rxq *rxq = (*priv->rxqs)[idx];
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
struct ibv_exp_wq_attr mod;
uint16_t vlan_offloads =
(on ? IBV_EXP_RECEIVE_WQ_CVLAN_STRIP : 0) |
@@ -165,8 +164,6 @@ priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on)
return;
}

-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
-
/* Update related bits in RX queue. */
rxq->vlan_strip = !!on;
}
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:47:57 UTC
Permalink
The intent is to replace the remaining compile-time options and environment
variables with a common means of runtime configuration. This commit only
adds the kvargs handling code; subsequent commits will update the rest.
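
As an illustration of how this hook is meant to be used once real parameters
exist, here is a minimal standalone sketch. The parameter name
"rxq_cqe_comp_en" and the helper functions are hypothetical; only the
rte_kvargs calls correspond to the code added by this patch.

	#include <stdio.h>
	#include <errno.h>
	#include <rte_kvargs.h>

	/* Hypothetical handler: report the key/value pair it receives. */
	static int
	example_check(const char *key, const char *val, void *opaque)
	{
		(void)opaque;
		printf("devarg %s=%s\n", key, val);
		return 0;
	}

	/* Parse a devargs string such as "rxq_cqe_comp_en=1". */
	static int
	example_parse(const char *args)
	{
		static const char *params[] = { "rxq_cqe_comp_en", NULL };
		struct rte_kvargs *kvlist = rte_kvargs_parse(args, params);
		int ret = 0;

		if (kvlist == NULL)
			return EINVAL; /* unknown key or malformed string */
		if (rte_kvargs_count(kvlist, params[0]))
			ret = rte_kvargs_process(kvlist, params[0],
						 example_check, NULL);
		rte_kvargs_free(kvlist);
		return ret;
	}

At runtime such parameters would typically reach the PMD through the EAL
device white-list option, e.g. "-w 0000:05:00.0,rxq_cqe_comp_en=1" (the PCI
address is only an example).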

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 72 insertions(+)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index e9cc38a..62e6e16 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -37,6 +37,7 @@
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
+#include <errno.h>
#include <net/if.h>

/* Verbs header. */
@@ -57,6 +58,7 @@
#include <rte_ethdev.h>
#include <rte_pci.h>
#include <rte_common.h>
+#include <rte_kvargs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif
@@ -237,6 +239,70 @@ mlx5_dev_idx(struct rte_pci_addr *pci_addr)
return ret;
}

+/**
+ * Verify and store value for device argument.
+ *
+ * @param[in] key
+ * Key argument to verify.
+ * @param[in] val
+ * Value associated with key.
+ * @param opaque
+ * User data.
+ *
+ * @return
+ * 0 on success, errno value on failure.
+ */
+static int
+mlx5_args_check(const char *key, const char *val, void *opaque)
+{
+ struct priv *priv = opaque;
+
+ /* No parameters are expected at the moment. */
+ (void)priv;
+ (void)val;
+ WARN("%s: unknown parameter", key);
+ return EINVAL;
+}
+
+/**
+ * Parse device parameters.
+ *
+ * @param priv
+ * Pointer to private structure.
+ * @param devargs
+ * Device arguments structure.
+ *
+ * @return
+ * 0 on success, errno value on failure.
+ */
+static int
+mlx5_args(struct priv *priv, struct rte_devargs *devargs)
+{
+ static const char *params[] = {
+ NULL,
+ };
+ struct rte_kvargs *kvlist;
+ int ret = 0;
+ int i;
+
+ if (devargs == NULL)
+ return 0;
+ kvlist = rte_kvargs_parse(devargs->args, params);
+ if (kvlist == NULL)
+ return 0;
+ /* Process parameters. */
+ for (i = 0; (i != RTE_DIM(params)); ++i) {
+ if (rte_kvargs_count(kvlist, params[i])) {
+ ret = rte_kvargs_process(kvlist, params[i],
+ mlx5_args_check, priv);
+ if (ret != 0)
+ return ret;
+ }
+ }
+ rte_kvargs_free(kvlist);
+ return 0;
+}
+
static struct eth_driver mlx5_driver;

/**
@@ -408,6 +474,12 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
priv->port = port;
priv->pd = pd;
priv->mtu = ETHER_MTU;
+ err = mlx5_args(priv, pci_dev->devargs);
+ if (err) {
+ ERROR("failed to process device arguments: %s",
+ strerror(err));
+ goto port_error;
+ }
if (ibv_exp_query_device(ctx, &exp_device_attr)) {
ERROR("ibv_exp_query_device() failed");
goto port_error;
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:47:56 UTC
Permalink
These structures and macros extend those exposed by libmlx5 (in mlx5_hw.h)
to let the PMD manage work queue and completion queue elements directly.
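
To make the intent of the helper macros clearer, below is a minimal sketch
(not part of the patch) of how they are typically combined when scanning a
completion queue. It mirrors the ownership test used by the RX refactoring
later in this series and assumes cqe_n is a power of two; error opcodes are
ignored for brevity.

	/* Return 1 when the CQE at consumer index ci is valid and owned by
	 * software, 0 when it still belongs to hardware or is invalid. */
	static inline int
	cqe_ready(volatile struct mlx5_cqe64 *cqe, uint16_t ci,
		  unsigned int cqe_n)
	{
		uint8_t op_own = cqe->op_own;

		/* The owner bit toggles each time the index wraps. */
		if (MLX5_CQE_OWNER(op_own) == !(ci & cqe_n))
			return 0;
		return MLX5_CQE_OPCODE(op_own) != MLX5_CQE_INVALID;
	}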

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_prm.h | 155 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 155 insertions(+)
create mode 100644 drivers/net/mlx5/mlx5_prm.h

diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
new file mode 100644
index 0000000..c4fb1c2
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -0,0 +1,155 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2016 6WIND S.A.
+ * Copyright 2016 Mellanox.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef RTE_PMD_MLX5_PRM_H_
+#define RTE_PMD_MLX5_PRM_H_
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-pedantic"
+#endif
+#include <infiniband/mlx5_hw.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-pedantic"
+#endif
+
+/* Get CQE owner bit. */
+#define MLX5_CQE_OWNER(op_own) ((op_own) & MLX5_CQE_OWNER_MASK)
+
+/* Get CQE format. */
+#define MLX5_CQE_FORMAT(op_own) (((op_own) & MLX5E_CQE_FORMAT_MASK) >> 2)
+
+/* Get CQE opcode. */
+#define MLX5_CQE_OPCODE(op_own) (((op_own) & 0xf0) >> 4)
+
+/* Get CQE solicited event. */
+#define MLX5_CQE_SE(op_own) (((op_own) >> 1) & 1)
+
+/* Invalidate a CQE. */
+#define MLX5_CQE_INVALIDATE (MLX5_CQE_INVALID << 4)
+
+/* CQE value to inform that VLAN is stripped. */
+#define MLX5_CQE_VLAN_STRIPPED 0x1
+
+/* Maximum number of packets a multi-packet WQE can handle. */
+#define MLX5_MPW_DSEG_MAX 5
+
+/* Room for inline data in regular work queue element. */
+#define MLX5_WQE64_INL_DATA 12
+
+/* Room for inline data in multi-packet WQE. */
+#define MLX5_MWQE64_INL_DATA 28
+
+/* Subset of struct mlx5_wqe_eth_seg. */
+struct mlx5_wqe_eth_seg_small {
+ uint32_t rsvd0;
+ uint8_t cs_flags;
+ uint8_t rsvd1;
+ uint16_t mss;
+ uint32_t rsvd2;
+ uint16_t inline_hdr_sz;
+};
+
+/* Regular WQE. */
+struct mlx5_wqe_regular {
+ union {
+ struct mlx5_wqe_ctrl_seg ctrl;
+ uint32_t data[4];
+ } ctrl;
+ struct mlx5_wqe_eth_seg eseg;
+ struct mlx5_wqe_data_seg dseg;
+} __rte_aligned(64);
+
+/* Inline WQE. */
+struct mlx5_wqe_inl {
+ union {
+ struct mlx5_wqe_ctrl_seg ctrl;
+ uint32_t data[4];
+ } ctrl;
+ struct mlx5_wqe_eth_seg eseg;
+ uint32_t byte_cnt;
+ uint8_t data[MLX5_WQE64_INL_DATA];
+} __rte_aligned(64);
+
+/* Multi-packet WQE. */
+struct mlx5_wqe_mpw {
+ union {
+ struct mlx5_wqe_ctrl_seg ctrl;
+ uint32_t data[4];
+ } ctrl;
+ struct mlx5_wqe_eth_seg_small eseg;
+ struct mlx5_wqe_data_seg dseg[2];
+} __rte_aligned(64);
+
+/* Multi-packet WQE with inline. */
+struct mlx5_wqe_mpw_inl {
+ union {
+ struct mlx5_wqe_ctrl_seg ctrl;
+ uint32_t data[4];
+ } ctrl;
+ struct mlx5_wqe_eth_seg_small eseg;
+ uint32_t byte_cnt;
+ uint8_t data[MLX5_MWQE64_INL_DATA];
+} __rte_aligned(64);
+
+/* Union of all WQE types. */
+union mlx5_wqe {
+ struct mlx5_wqe_regular wqe;
+ struct mlx5_wqe_inl inl;
+ struct mlx5_wqe_mpw mpw;
+ struct mlx5_wqe_mpw_inl mpw_inl;
+ uint8_t data[64];
+};
+
+/* MPW session status. */
+enum mlx5_mpw_state {
+ MLX5_MPW_STATE_OPENED,
+ MLX5_MPW_INL_STATE_OPENED,
+ MLX5_MPW_STATE_CLOSED,
+};
+
+/* MPW session descriptor. */
+struct mlx5_mpw {
+ enum mlx5_mpw_state state;
+ unsigned int pkts_n;
+ unsigned int len;
+ unsigned int total_len;
+ volatile union mlx5_wqe *wqe;
+ union {
+ volatile struct mlx5_wqe_data_seg *dseg[MLX5_MPW_DSEG_MAX];
+ volatile uint8_t *raw;
+ } data;
+};
+
+#endif /* RTE_PMD_MLX5_PRM_H_ */
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:47:58 UTC
Permalink
These wrappers are meant to prevent code duplication in later commits.
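
For context, a hedged sketch of how the TX wrapper is expected to evolve;
the test on priv->mps and the mlx5_tx_burst_mpw name are assumptions based
on later patches in this series, not part of this commit.

	void
	priv_select_tx_function(struct priv *priv)
	{
		/* Default burst function. */
		priv->dev->tx_pkt_burst = mlx5_tx_burst;
		/* Later commits can switch to a specialized variant when
		 * the device supports multi-packet send (assumed name). */
		if (priv->mps)
			priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw;
	}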

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5.h | 2 ++
drivers/net/mlx5/mlx5_ethdev.c | 34 ++++++++++++++++++++++++++++------
drivers/net/mlx5/mlx5_txq.c | 2 +-
3 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 4170e3b..382aac5 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -197,6 +197,8 @@ void priv_dev_interrupt_handler_install(struct priv *, struct rte_eth_dev *);
int mlx5_set_link_down(struct rte_eth_dev *dev);
int mlx5_set_link_up(struct rte_eth_dev *dev);
struct priv *mlx5_secondary_data_setup(struct priv *priv);
+void priv_select_tx_function(struct priv *);
+void priv_select_rx_function(struct priv *);

/* mlx5_mac.c */

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 3710bba..c612b31 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1100,8 +1100,8 @@ priv_set_link(struct priv *priv, int up)
err = priv_set_flags(priv, ~IFF_UP, IFF_UP);
if (err)
return err;
- dev->rx_pkt_burst = mlx5_rx_burst;
- dev->tx_pkt_burst = mlx5_tx_burst;
+ priv_select_tx_function(priv);
+ priv_select_rx_function(priv);
} else {
err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP);
if (err)
@@ -1290,13 +1290,11 @@ mlx5_secondary_data_setup(struct priv *priv)
rte_mb();
priv->dev->data = &sd->data;
rte_mb();
- priv->dev->tx_pkt_burst = mlx5_tx_burst;
- priv->dev->rx_pkt_burst = removed_rx_burst;
+ priv_select_tx_function(priv);
+ priv_select_rx_function(priv);
priv_unlock(priv);
end:
/* More sanity checks. */
- assert(priv->dev->tx_pkt_burst == mlx5_tx_burst);
- assert(priv->dev->rx_pkt_burst == removed_rx_burst);
assert(priv->dev->data == &sd->data);
rte_spinlock_unlock(&sd->lock);
return priv;
@@ -1307,3 +1305,27 @@ error:
rte_spinlock_unlock(&sd->lock);
return NULL;
}
+
+/**
+ * Configure the TX function to use.
+ *
+ * @param priv
+ * Pointer to private structure.
+ */
+void
+priv_select_tx_function(struct priv *priv)
+{
+ priv->dev->tx_pkt_burst = mlx5_tx_burst;
+}
+
+/**
+ * Configure the RX function to use.
+ *
+ * @param priv
+ * Pointer to private structure.
+ */
+void
+priv_select_rx_function(struct priv *priv)
+{
+ priv->dev->rx_pkt_burst = mlx5_rx_burst;
+}
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 9f3a33b..d7cc39d 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -477,7 +477,7 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
(void *)dev, (void *)txq_ctrl);
(*priv->txqs)[idx] = &txq_ctrl->txq;
/* Update send callback. */
- dev->tx_pkt_burst = mlx5_tx_burst;
+ priv_select_tx_function(priv);
}
priv_unlock(priv);
return -ret;
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:47:59 UTC
Permalink
Bypass Verbs to improve RX performance.
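
A condensed sketch of the receive loop this patch introduces is shown below;
checksum/VLAN handling, statistics and mbuf field resets are omitted and it
relies on the definitions added in the diff, so refer to mlx5_rx_burst() for
the authoritative version.

	static uint16_t
	rx_burst_sketch(struct rxq *rxq, struct rte_mbuf **pkts,
			uint16_t pkts_n)
	{
		const unsigned int wqe_cnt = rxq->elts_n - 1;
		unsigned int rq_ci = rxq->rq_ci;
		uint16_t i;

		for (i = 0; i != pkts_n; ++i) {
			unsigned int idx = rq_ci & wqe_cnt;
			struct rte_mbuf *pkt = (*rxq->elts)[idx];
			struct rte_mbuf *rep = rte_mbuf_raw_alloc(rxq->mp);
			unsigned int len;

			if (rep == NULL)
				break;
			len = rx_poll_len(rxq); /* consumes one CQE */
			if (len == 0) {
				__rte_mbuf_raw_free(rep);
				break;
			}
			/* Swap buffers: hand "rep" to the NIC, return "pkt". */
			(*rxq->wqes)[idx].addr =
				htonll((uintptr_t)rep->buf_addr +
				       RTE_PKTMBUF_HEADROOM);
			(*rxq->elts)[idx] = rep;
			PKT_LEN(pkt) = len;
			DATA_LEN(pkt) = len;
			pkts[i] = pkt;
			++rq_ci;
		}
		/* Publish progress: CQ doorbell first, then RQ doorbell. */
		rxq->rq_ci = rq_ci;
		rte_wmb();
		*rxq->cq_db = htonl(rxq->cq_ci);
		rte_wmb();
		*rxq->rq_db = htonl(rxq->rq_ci);
		return i;
	}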

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Yaacov Hazan <***@mellanox.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_ethdev.c | 4 +-
drivers/net/mlx5/mlx5_fdir.c | 2 +-
drivers/net/mlx5/mlx5_rxq.c | 291 +++++++++++++++++++----------------------
drivers/net/mlx5/mlx5_rxtx.c | 288 +++++++++++++++++++++-------------------
drivers/net/mlx5/mlx5_rxtx.h | 37 +++---
drivers/net/mlx5/mlx5_vlan.c | 3 +-
6 files changed, 310 insertions(+), 315 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index c612b31..4cfcbd5 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1263,7 +1263,9 @@ mlx5_secondary_data_setup(struct priv *priv)
}
/* RX queues. */
for (i = 0; i != nb_rx_queues; ++i) {
- struct rxq *primary_rxq = (*sd->primary_priv->rxqs)[i];
+ struct rxq_ctrl *primary_rxq =
+ container_of((*sd->primary_priv->rxqs)[i],
+ struct rxq_ctrl, rxq);

if (primary_rxq == NULL)
continue;
diff --git a/drivers/net/mlx5/mlx5_fdir.c b/drivers/net/mlx5/mlx5_fdir.c
index 1850218..73eb00e 100644
--- a/drivers/net/mlx5/mlx5_fdir.c
+++ b/drivers/net/mlx5/mlx5_fdir.c
@@ -431,7 +431,7 @@ priv_get_fdir_queue(struct priv *priv, uint16_t idx)
ind_init_attr = (struct ibv_exp_rwq_ind_table_init_attr){
.pd = priv->pd,
.log_ind_tbl_size = 0,
- .ind_tbl = &((*priv->rxqs)[idx]->wq),
+ .ind_tbl = &rxq_ctrl->wq,
.comp_mask = 0,
};

diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 7db4ce7..ac2b69f 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -43,6 +43,8 @@
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <infiniband/verbs.h>
+#include <infiniband/arch.h>
+#include <infiniband/mlx5_hw.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif
@@ -373,8 +375,13 @@ priv_create_hash_rxqs(struct priv *priv)
DEBUG("indirection table extended to assume %u WQs",
priv->reta_idx_n);
}
- for (i = 0; (i != priv->reta_idx_n); ++i)
- wqs[i] = (*priv->rxqs)[(*priv->reta_idx)[i]]->wq;
+ for (i = 0; (i != priv->reta_idx_n); ++i) {
+ struct rxq_ctrl *rxq_ctrl;
+
+ rxq_ctrl = container_of((*priv->rxqs)[(*priv->reta_idx)[i]],
+ struct rxq_ctrl, rxq);
+ wqs[i] = rxq_ctrl->wq;
+ }
/* Get number of hash RX queues to configure. */
for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
hash_rxqs_n += ind_table_init[i].hash_types_n;
@@ -638,21 +645,13 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
struct rte_mbuf **pool)
{
unsigned int i;
- struct rxq_elt (*elts)[elts_n] =
- rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
- rxq_ctrl->socket);
int ret = 0;

- if (elts == NULL) {
- ERROR("%p: can't allocate packets array", (void *)rxq_ctrl);
- ret = ENOMEM;
- goto error;
- }
/* For each WR (packet). */
for (i = 0; (i != elts_n); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct ibv_sge *sge = &(*elts)[i].sge;
struct rte_mbuf *buf;
+ volatile struct mlx5_wqe_data_seg *scat =
+ &(*rxq_ctrl->rxq.wqes)[i];

if (pool != NULL) {
buf = *(pool++);
@@ -666,40 +665,36 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
ret = ENOMEM;
goto error;
}
- elt->buf = buf;
/* Headroom is reserved by rte_pktmbuf_alloc(). */
assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
/* Buffer is supposed to be empty. */
assert(rte_pktmbuf_data_len(buf) == 0);
assert(rte_pktmbuf_pkt_len(buf) == 0);
- /* sge->addr must be able to store a pointer. */
- assert(sizeof(sge->addr) >= sizeof(uintptr_t));
- /* SGE keeps its headroom. */
- sge->addr = (uintptr_t)
- ((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
- sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
- sge->lkey = rxq_ctrl->mr->lkey;
- /* Redundant check for tailroom. */
- assert(sge->length == rte_pktmbuf_tailroom(buf));
+ assert(!buf->next);
+ PORT(buf) = rxq_ctrl->rxq.port_id;
+ DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
+ PKT_LEN(buf) = DATA_LEN(buf);
+ NB_SEGS(buf) = 1;
+ /* scat->addr must be able to store a pointer. */
+ assert(sizeof(scat->addr) >= sizeof(uintptr_t));
+ *scat = (struct mlx5_wqe_data_seg){
+ .addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
+ .byte_count = htonl(DATA_LEN(buf)),
+ .lkey = htonl(rxq_ctrl->mr->lkey),
+ };
+ (*rxq_ctrl->rxq.elts)[i] = buf;
}
DEBUG("%p: allocated and configured %u single-segment WRs",
(void *)rxq_ctrl, elts_n);
- rxq_ctrl->rxq.elts_n = elts_n;
- rxq_ctrl->rxq.elts_head = 0;
- rxq_ctrl->rxq.elts = elts;
assert(ret == 0);
return 0;
error:
- if (elts != NULL) {
- assert(pool == NULL);
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct rte_mbuf *buf = elt->buf;
-
- if (buf != NULL)
- rte_pktmbuf_free_seg(buf);
- }
- rte_free(elts);
+ assert(pool == NULL);
+ elts_n = i;
+ for (i = 0; (i != elts_n); ++i) {
+ if ((*rxq_ctrl->rxq.elts)[i] != NULL)
+ rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
+ (*rxq_ctrl->rxq.elts)[i] = NULL;
}
DEBUG("%p: failed, freed everything", (void *)rxq_ctrl);
assert(ret > 0);
@@ -716,22 +711,16 @@ static void
rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
{
unsigned int i;
- unsigned int elts_n = rxq_ctrl->rxq.elts_n;
- struct rxq_elt (*elts)[elts_n] = rxq_ctrl->rxq.elts;

DEBUG("%p: freeing WRs", (void *)rxq_ctrl);
- rxq_ctrl->rxq.elts_n = 0;
- rxq_ctrl->rxq.elts = NULL;
- if (elts == NULL)
+ if (rxq_ctrl->rxq.elts == NULL)
return;
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct rte_mbuf *buf = elt->buf;

- if (buf != NULL)
- rte_pktmbuf_free_seg(buf);
+ for (i = 0; (i != rxq_ctrl->rxq.elts_n); ++i) {
+ if ((*rxq_ctrl->rxq.elts)[i] != NULL)
+ rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
+ (*rxq_ctrl->rxq.elts)[i] = NULL;
}
- rte_free(elts);
}

/**
@@ -749,42 +738,40 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)

DEBUG("cleaning up %p", (void *)rxq_ctrl);
rxq_free_elts(rxq_ctrl);
- rxq_ctrl->rxq.poll = NULL;
- rxq_ctrl->rxq.recv = NULL;
if (rxq_ctrl->if_wq != NULL) {
- assert(rxq_ctrl->rxq.priv != NULL);
- assert(rxq_ctrl->rxq.priv->ctx != NULL);
- assert(rxq_ctrl->rxq.wq != NULL);
+ assert(rxq_ctrl->priv != NULL);
+ assert(rxq_ctrl->priv->ctx != NULL);
+ assert(rxq_ctrl->wq != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(rxq_ctrl->rxq.priv->ctx,
+ claim_zero(ibv_exp_release_intf(rxq_ctrl->priv->ctx,
rxq_ctrl->if_wq,
&params));
}
if (rxq_ctrl->if_cq != NULL) {
- assert(rxq_ctrl->rxq.priv != NULL);
- assert(rxq_ctrl->rxq.priv->ctx != NULL);
- assert(rxq_ctrl->rxq.cq != NULL);
+ assert(rxq_ctrl->priv != NULL);
+ assert(rxq_ctrl->priv->ctx != NULL);
+ assert(rxq_ctrl->cq != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(rxq_ctrl->rxq.priv->ctx,
+ claim_zero(ibv_exp_release_intf(rxq_ctrl->priv->ctx,
rxq_ctrl->if_cq,
&params));
}
- if (rxq_ctrl->rxq.wq != NULL)
- claim_zero(ibv_exp_destroy_wq(rxq_ctrl->rxq.wq));
- if (rxq_ctrl->rxq.cq != NULL)
- claim_zero(ibv_destroy_cq(rxq_ctrl->rxq.cq));
+ if (rxq_ctrl->wq != NULL)
+ claim_zero(ibv_exp_destroy_wq(rxq_ctrl->wq));
+ if (rxq_ctrl->cq != NULL)
+ claim_zero(ibv_destroy_cq(rxq_ctrl->cq));
if (rxq_ctrl->rd != NULL) {
struct ibv_exp_destroy_res_domain_attr attr = {
.comp_mask = 0,
};

- assert(rxq_ctrl->rxq.priv != NULL);
- assert(rxq_ctrl->rxq.priv->ctx != NULL);
- claim_zero(ibv_exp_destroy_res_domain(rxq_ctrl->rxq.priv->ctx,
+ assert(rxq_ctrl->priv != NULL);
+ assert(rxq_ctrl->priv->ctx != NULL);
+ claim_zero(ibv_exp_destroy_res_domain(rxq_ctrl->priv->ctx,
rxq_ctrl->rd,
&attr));
}
@@ -811,14 +798,13 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
int
rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
{
- struct priv *priv = rxq_ctrl->rxq.priv;
+ struct priv *priv = rxq_ctrl->priv;
struct rxq_ctrl tmpl = *rxq_ctrl;
unsigned int mbuf_n;
unsigned int desc_n;
struct rte_mbuf **pool;
unsigned int i, k;
struct ibv_exp_wq_attr mod;
- struct rxq_elt (*elts)[tmpl.rxq.elts_n];
int err;

DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq_ctrl);
@@ -840,7 +826,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RESET,
};
- err = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
+ err = ibv_exp_modify_wq(tmpl.wq, &mod);
if (err) {
ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
assert(err > 0);
@@ -854,60 +840,33 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
}
/* Snatch mbufs from original queue. */
k = 0;
- elts = rxq_ctrl->rxq.elts;
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct rte_mbuf *buf = elt->buf;
-
- pool[k++] = buf;
- }
+ for (i = 0; (i != desc_n); ++i)
+ pool[k++] = (*rxq_ctrl->rxq.elts)[i];
assert(k == mbuf_n);
- tmpl.rxq.elts_n = 0;
- tmpl.rxq.elts = NULL;
- assert((void *)&tmpl.rxq.elts == NULL);
- err = rxq_alloc_elts(&tmpl, desc_n, pool);
- if (err) {
- ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
- rte_free(pool);
- assert(err > 0);
- return err;
- }
- assert(tmpl.rxq.elts_n == desc_n);
rte_free(pool);
- /* Clean up original data. */
- rxq_ctrl->rxq.elts_n = 0;
- rte_free(rxq_ctrl->rxq.elts);
- rxq_ctrl->rxq.elts = NULL;
/* Change queue state to ready. */
mod = (struct ibv_exp_wq_attr){
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RDY,
};
- err = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
+ err = ibv_exp_modify_wq(tmpl.wq, &mod);
if (err) {
ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
(void *)dev, strerror(err));
goto error;
}
/* Post SGEs. */
- assert(tmpl.if_wq != NULL);
- elts = tmpl.rxq.elts;
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- err = tmpl.if_wq->recv_burst(
- tmpl.rxq.wq,
- &(*elts)[i].sge,
- 1);
- if (err)
- break;
- }
+ err = rxq_alloc_elts(&tmpl, desc_n, pool);
if (err) {
- ERROR("%p: failed to post SGEs with error %d",
- (void *)dev, err);
- /* Set err because it does not contain a valid errno value. */
- err = EIO;
- goto error;
+ ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
+ rte_free(pool);
+ assert(err > 0);
+ return err;
}
- tmpl.rxq.recv = tmpl.if_wq->recv_burst;
+ /* Update doorbell counter. */
+ rxq_ctrl->rxq.rq_ci = desc_n;
+ rte_wmb();
+ *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
error:
*rxq_ctrl = tmpl;
assert(err >= 0);
@@ -915,6 +874,36 @@ error:
}

/**
+ * Initialize RX queue.
+ *
+ * @param tmpl
+ * Pointer to RX queue control template.
+ * @param rxq_ctrl
+ * Pointer to RX queue control.
+ */
+static inline void
+rxq_setup(struct rxq_ctrl *tmpl, struct rxq_ctrl *rxq_ctrl)
+{
+ struct ibv_cq *ibcq = tmpl->cq;
+ struct mlx5_cq *cq = to_mxxx(cq, cq);
+ struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
+
+ tmpl->rxq.rq_db = rwq->rq.db;
+ tmpl->rxq.cq_ci = 0;
+ tmpl->rxq.rq_ci = 0;
+ tmpl->rxq.cq_db = cq->dbrec;
+ tmpl->rxq.wqes =
+ (volatile struct mlx5_wqe_data_seg (*)[])
+ (uintptr_t)rwq->rq.buff;
+ tmpl->rxq.cqes =
+ (volatile struct mlx5_cqe64 (*)[])
+ (uintptr_t)cq->active_buf->buf;
+ tmpl->rxq.elts =
+ (struct rte_mbuf *(*)[tmpl->rxq.elts_n])
+ ((uintptr_t)rxq_ctrl + sizeof(*rxq_ctrl));
+}
+
+/**
* Configure a RX queue.
*
* @param dev
@@ -934,15 +923,16 @@ error:
* 0 on success, errno value on failure.
*/
int
-rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
- unsigned int socket, const struct rte_eth_rxconf *conf,
- struct rte_mempool *mp)
+rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
+ uint16_t desc, unsigned int socket,
+ const struct rte_eth_rxconf *conf, struct rte_mempool *mp)
{
struct priv *priv = dev->data->dev_private;
struct rxq_ctrl tmpl = {
+ .priv = priv,
.socket = socket,
.rxq = {
- .priv = priv,
+ .elts_n = desc,
.mp = mp,
},
};
@@ -952,17 +942,16 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
struct ibv_exp_cq_init_attr cq;
struct ibv_exp_res_domain_init_attr rd;
struct ibv_exp_wq_init_attr wq;
+ struct ibv_exp_cq_attr cq_attr;
} attr;
enum ibv_exp_query_intf_status status;
unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
- struct rxq_elt (*elts)[desc];
int ret = 0;
- unsigned int i;
- unsigned int cq_size = desc;

(void)conf; /* Thresholds configuration (ignored). */
if (desc == 0) {
- ERROR("%p: invalid number of RX descriptors", (void *)dev);
+ ERROR("%p: invalid number of RX descriptors (must be a"
+ " multiple of 2)", (void *)dev);
return EINVAL;
}
/* Toggle RX checksum offload if hardware supports it. */
@@ -996,9 +985,9 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
.res_domain = tmpl.rd,
};
- tmpl.rxq.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0,
- &attr.cq);
- if (tmpl.rxq.cq == NULL) {
+ tmpl.cq = ibv_exp_create_cq(priv->ctx, desc - 1, NULL, NULL, 0,
+ &attr.cq);
+ if (tmpl.cq == NULL) {
ret = ENOMEM;
ERROR("%p: CQ creation failure: %s",
(void *)dev, strerror(ret));
@@ -1015,13 +1004,13 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
.wq_context = NULL, /* Could be useful in the future. */
.wq_type = IBV_EXP_WQT_RQ,
/* Max number of outstanding WRs. */
- .max_recv_wr = ((priv->device_attr.max_qp_wr < (int)cq_size) ?
+ .max_recv_wr = ((priv->device_attr.max_qp_wr < (int)desc) ?
priv->device_attr.max_qp_wr :
- (int)cq_size),
+ (int)desc),
/* Max number of scatter/gather elements in a WR. */
.max_recv_sge = 1,
.pd = priv->pd,
- .cq = tmpl.rxq.cq,
+ .cq = tmpl.cq,
.comp_mask =
IBV_EXP_CREATE_WQ_RES_DOMAIN |
IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
@@ -1064,19 +1053,13 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
" up to date",
(void *)dev);

- tmpl.rxq.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
- if (tmpl.rxq.wq == NULL) {
+ tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
+ if (tmpl.wq == NULL) {
ret = (errno ? errno : EINVAL);
ERROR("%p: WQ creation failure: %s",
(void *)dev, strerror(ret));
goto error;
}
- ret = rxq_alloc_elts(&tmpl, desc, NULL);
- if (ret) {
- ERROR("%p: RXQ allocation failed: %s",
- (void *)dev, strerror(ret));
- goto error;
- }
/* Save port ID. */
tmpl.rxq.port_id = dev->data->port_id;
DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
@@ -1084,7 +1067,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
.intf_scope = IBV_EXP_INTF_GLOBAL,
.intf_version = 1,
.intf = IBV_EXP_INTF_CQ,
- .obj = tmpl.rxq.cq,
+ .obj = tmpl.cq,
};
tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
if (tmpl.if_cq == NULL) {
@@ -1095,7 +1078,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
.intf = IBV_EXP_INTF_WQ,
- .obj = tmpl.rxq.wq,
+ .obj = tmpl.wq,
};
tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
if (tmpl.if_wq == NULL) {
@@ -1108,38 +1091,29 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RDY,
};
- ret = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
+ ret = ibv_exp_modify_wq(tmpl.wq, &mod);
if (ret) {
ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
(void *)dev, strerror(ret));
goto error;
}
- /* Post SGEs. */
- elts = tmpl.rxq.elts;
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- ret = tmpl.if_wq->recv_burst(
- tmpl.rxq.wq,
- &(*elts)[i].sge,
- 1);
- if (ret)
- break;
- }
+ rxq_setup(&tmpl, rxq_ctrl);
+ ret = rxq_alloc_elts(&tmpl, desc, NULL);
if (ret) {
- ERROR("%p: failed to post SGEs with error %d",
- (void *)dev, ret);
- /* Set ret because it does not contain a valid errno value. */
- ret = EIO;
+ ERROR("%p: RXQ allocation failed: %s",
+ (void *)dev, strerror(ret));
goto error;
}
/* Clean up rxq in case we're reinitializing it. */
DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
rxq_cleanup(rxq_ctrl);
*rxq_ctrl = tmpl;
+ /* Update doorbell counter. */
+ rxq_ctrl->rxq.rq_ci = desc;
+ rte_wmb();
+ *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
assert(ret == 0);
- /* Assign function in queue. */
- rxq_ctrl->rxq.poll = rxq_ctrl->if_cq->poll_length_flags_cvlan;
- rxq_ctrl->rxq.recv = rxq_ctrl->if_wq->recv_burst;
return 0;
error:
rxq_cleanup(&tmpl);
@@ -1173,14 +1147,19 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
{
struct priv *priv = dev->data->dev_private;
struct rxq *rxq = (*priv->rxqs)[idx];
- struct rxq_ctrl *rxq_ctrl;
+ struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
int ret;

if (mlx5_is_secondary())
return -E_RTE_SECONDARY;

priv_lock(priv);
- rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
+ if (!rte_is_power_of_2(desc)) {
+ desc = 1 << log2above(desc);
+ WARN("%p: increased number of descriptors in RX queue %u"
+ " to the next power of two (%d)",
+ (void *)dev, idx, desc);
+ }
DEBUG("%p: configuring queue %u for %u descriptors",
(void *)dev, idx, desc);
if (idx >= priv->rxqs_n) {
@@ -1199,8 +1178,9 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
(*priv->rxqs)[idx] = NULL;
rxq_cleanup(rxq_ctrl);
} else {
- rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl), 0,
- socket);
+ rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl) +
+ desc * sizeof(struct rte_mbuf *),
+ 0, socket);
if (rxq_ctrl == NULL) {
ERROR("%p: unable to allocate queue index %u",
(void *)dev, idx);
@@ -1208,7 +1188,7 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
return -ENOMEM;
}
}
- ret = rxq_setup(dev, rxq_ctrl, desc, socket, conf, mp);
+ ret = rxq_ctrl_setup(dev, rxq_ctrl, desc, socket, conf, mp);
if (ret)
rte_free(rxq_ctrl);
else {
@@ -1243,12 +1223,12 @@ mlx5_rx_queue_release(void *dpdk_rxq)
if (rxq == NULL)
return;
rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
- priv = rxq->priv;
+ priv = rxq_ctrl->priv;
priv_lock(priv);
for (i = 0; (i != priv->rxqs_n); ++i)
if ((*priv->rxqs)[i] == rxq) {
DEBUG("%p: removing RX queue %p from list",
- (void *)priv->dev, (void *)rxq);
+ (void *)priv->dev, (void *)rxq_ctrl);
(*priv->rxqs)[i] = NULL;
break;
}
@@ -1278,7 +1258,8 @@ mlx5_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts,
uint16_t pkts_n)
{
struct rxq *rxq = dpdk_rxq;
- struct priv *priv = mlx5_secondary_data_setup(rxq->priv);
+ struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
+ struct priv *priv = mlx5_secondary_data_setup(rxq_ctrl->priv);
struct priv *primary_priv;
unsigned int index;

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 6a0d707..7d74074 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -42,6 +42,8 @@
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <infiniband/verbs.h>
+#include <infiniband/mlx5_hw.h>
+#include <infiniband/arch.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif
@@ -55,7 +57,7 @@
#include <rte_prefetch.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>
-#include <rte_memory.h>
+#include <rte_ether.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif
@@ -65,6 +67,47 @@
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
+#include "mlx5_prm.h"
+
+static inline volatile struct mlx5_cqe64 *
+get_cqe64(volatile struct mlx5_cqe64 cqes[],
+ unsigned int cqes_n, uint16_t *ci)
+ __attribute__((always_inline));
+
+static inline int
+rx_poll_len(struct rxq *rxq) __attribute__((always_inline));
+
+static volatile struct mlx5_cqe64 *
+get_cqe64(volatile struct mlx5_cqe64 cqes[],
+ unsigned int cqes_n, uint16_t *ci)
+{
+ volatile struct mlx5_cqe64 *cqe;
+ uint16_t idx = *ci;
+ uint8_t op_own;
+
+ cqe = &cqes[idx & (cqes_n - 1)];
+ op_own = cqe->op_own;
+ if (unlikely((op_own & MLX5_CQE_OWNER_MASK) == !(idx & cqes_n))) {
+ return NULL;
+ } else if (unlikely(op_own & 0x80)) {
+ switch (op_own >> 4) {
+ case MLX5_CQE_INVALID:
+ return NULL; /* No CQE */
+ case MLX5_CQE_REQ_ERR:
+ return cqe;
+ case MLX5_CQE_RESP_ERR:
+ ++(*ci);
+ return NULL;
+ default:
+ return NULL;
+ }
+ }
+ if (cqe) {
+ *ci = idx + 1;
+ return cqe;
+ }
+ return NULL;
+}

/**
* Manage TX completions.
@@ -390,8 +433,8 @@ stop:
/**
* Translate RX completion flags to packet type.
*
- * @param flags
- * RX completion flags returned by poll_length_flags().
+ * @param[in] cqe
+ * Pointer to CQE.
*
* @note: fix mlx5_dev_supported_ptypes_get() if any change here.
*
@@ -399,11 +442,13 @@ stop:
* Packet type for struct rte_mbuf.
*/
static inline uint32_t
-rxq_cq_to_pkt_type(uint32_t flags)
+rxq_cq_to_pkt_type(volatile struct mlx5_cqe64 *cqe)
{
uint32_t pkt_type;
+ uint8_t flags = cqe->l4_hdr_type_etc;
+ uint8_t info = cqe->rsvd0[0];

- if (flags & IBV_EXP_CQ_RX_TUNNEL_PACKET)
+ if (info & IBV_EXP_CQ_RX_TUNNEL_PACKET)
pkt_type =
TRANSPOSE(flags,
IBV_EXP_CQ_RX_OUTER_IPV4_PACKET,
@@ -420,11 +465,11 @@ rxq_cq_to_pkt_type(uint32_t flags)
else
pkt_type =
TRANSPOSE(flags,
- IBV_EXP_CQ_RX_IPV4_PACKET,
- RTE_PTYPE_L3_IPV4) |
+ MLX5_CQE_L3_HDR_TYPE_IPV6,
+ RTE_PTYPE_L3_IPV6) |
TRANSPOSE(flags,
- IBV_EXP_CQ_RX_IPV6_PACKET,
- RTE_PTYPE_L3_IPV6);
+ MLX5_CQE_L3_HDR_TYPE_IPV4,
+ RTE_PTYPE_L3_IPV4);
return pkt_type;
}

@@ -433,50 +478,69 @@ rxq_cq_to_pkt_type(uint32_t flags)
*
* @param[in] rxq
* Pointer to RX queue structure.
- * @param flags
- * RX completion flags returned by poll_length_flags().
+ * @param[in] cqe
+ * Pointer to CQE.
*
* @return
* Offload flags (ol_flags) for struct rte_mbuf.
*/
static inline uint32_t
-rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
+rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe)
{
uint32_t ol_flags = 0;
+ uint8_t l3_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L3_HDR_TYPE_MASK;
+ uint8_t l4_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L4_HDR_TYPE_MASK;
+ uint8_t info = cqe->rsvd0[0];

- if (rxq->csum) {
- /* Set IP checksum flag only for IPv4/IPv6 packets. */
- if (flags &
- (IBV_EXP_CQ_RX_IPV4_PACKET | IBV_EXP_CQ_RX_IPV6_PACKET))
- ol_flags |=
- TRANSPOSE(~flags,
- IBV_EXP_CQ_RX_IP_CSUM_OK,
- PKT_RX_IP_CKSUM_BAD);
- /* Set L4 checksum flag only for TCP/UDP packets. */
- if (flags &
- (IBV_EXP_CQ_RX_TCP_PACKET | IBV_EXP_CQ_RX_UDP_PACKET))
- ol_flags |=
- TRANSPOSE(~flags,
- IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK,
- PKT_RX_L4_CKSUM_BAD);
- }
+ if ((l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV4) ||
+ (l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV6))
+ ol_flags |=
+ (!(cqe->hds_ip_ext & MLX5_CQE_L3_OK) *
+ PKT_RX_IP_CKSUM_BAD);
+ if ((l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP) ||
+ (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_EMP_ACK) ||
+ (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_ACK) ||
+ (l4_hdr == MLX5_CQE_L4_HDR_TYPE_UDP))
+ ol_flags |=
+ (!(cqe->hds_ip_ext & MLX5_CQE_L4_OK) *
+ PKT_RX_L4_CKSUM_BAD);
/*
* PKT_RX_IP_CKSUM_BAD and PKT_RX_L4_CKSUM_BAD are used in place
* of PKT_RX_EIP_CKSUM_BAD because the latter is not functional
* (its value is 0).
*/
- if ((flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
+ if ((info & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
ol_flags |=
- TRANSPOSE(~flags,
+ TRANSPOSE(~cqe->l4_hdr_type_etc,
IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK,
PKT_RX_IP_CKSUM_BAD) |
- TRANSPOSE(~flags,
+ TRANSPOSE(~cqe->l4_hdr_type_etc,
IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK,
PKT_RX_L4_CKSUM_BAD);
return ol_flags;
}

/**
+ * Get size of the next packet.
+ *
+ * @param rxq
+ * RX queue to fetch packet from.
+ *
+ * @return
+ * Packet size in bytes.
+ */
+static inline int __attribute__((always_inline))
+rx_poll_len(struct rxq *rxq)
+{
+ volatile struct mlx5_cqe64 *cqe;
+
+ cqe = get_cqe64(*rxq->cqes, rxq->elts_n, &rxq->cq_ci);
+ if (cqe)
+ return ntohl(cqe->byte_cnt);
+ return 0;
+}
+
+/**
* DPDK callback for RX.
*
* @param dpdk_rxq
@@ -492,133 +556,81 @@ rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
- struct rxq *rxq = (struct rxq *)dpdk_rxq;
- struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
- const unsigned int elts_n = rxq->elts_n;
- unsigned int elts_head = rxq->elts_head;
- struct ibv_sge sges[pkts_n];
- unsigned int i;
+ struct rxq *rxq = dpdk_rxq;
unsigned int pkts_ret = 0;
- int ret;
+ unsigned int i;
+ unsigned int rq_ci = rxq->rq_ci;
+ const unsigned int elts_n = rxq->elts_n;
+ const unsigned int wqe_cnt = elts_n - 1;

for (i = 0; (i != pkts_n); ++i) {
- struct rxq_elt *elt = &(*elts)[elts_head];
- unsigned int len;
- struct rte_mbuf *seg = elt->buf;
+ unsigned int idx = rq_ci & wqe_cnt;
struct rte_mbuf *rep;
- uint32_t flags;
- uint16_t vlan_tci;
-
- /* Sanity checks. */
- assert(seg != NULL);
- assert(elts_head < rxq->elts_n);
- assert(rxq->elts_head < rxq->elts_n);
- /*
- * Fetch initial bytes of packet descriptor into a
- * cacheline while allocating rep.
- */
- rte_mbuf_prefetch_part1(seg);
- rte_mbuf_prefetch_part2(seg);
- ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
- if (unlikely(ret < 0)) {
- struct ibv_wc wc;
- int wcs_n;
-
- DEBUG("rxq=%p, poll_length() failed (ret=%d)",
- (void *)rxq, ret);
- /* ibv_poll_cq() must be used in case of failure. */
- wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
- if (unlikely(wcs_n == 0))
- break;
- if (unlikely(wcs_n < 0)) {
- DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
- (void *)rxq, wcs_n);
- break;
- }
- assert(wcs_n == 1);
- if (unlikely(wc.status != IBV_WC_SUCCESS)) {
- /* Whatever, just repost the offending WR. */
- DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
- " completion status (%d): %s",
- (void *)rxq, wc.wr_id, wc.status,
- ibv_wc_status_str(wc.status));
-#ifdef MLX5_PMD_SOFT_COUNTERS
- /* Increment dropped packets counter. */
- ++rxq->stats.idropped;
-#endif
- /* Add SGE to array for repost. */
- sges[i] = elt->sge;
- goto repost;
- }
- ret = wc.byte_len;
- }
- if (ret == 0)
- break;
- assert(ret >= (rxq->crc_present << 2));
- len = ret - (rxq->crc_present << 2);
+ struct rte_mbuf *pkt;
+ unsigned int len;
+ volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
+ volatile struct mlx5_cqe64 *cqe =
+ &(*rxq->cqes)[rxq->cq_ci & wqe_cnt];
+
+ pkt = (*rxq->elts)[idx];
+ rte_prefetch0(cqe);
rep = rte_mbuf_raw_alloc(rxq->mp);
if (unlikely(rep == NULL)) {
- /*
- * Unable to allocate a replacement mbuf,
- * repost WR.
- */
- DEBUG("rxq=%p: can't allocate a new mbuf",
- (void *)rxq);
- /* Increment out of memory counters. */
++rxq->stats.rx_nombuf;
- ++rxq->priv->dev->data->rx_mbuf_alloc_failed;
- goto repost;
+ break;
}
-
- /* Reconfigure sge to use rep instead of seg. */
- elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
- elt->buf = rep;
-
- /* Add SGE to array for repost. */
- sges[i] = elt->sge;
-
- /* Update seg information. */
- SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM);
- NB_SEGS(seg) = 1;
- PORT(seg) = rxq->port_id;
- NEXT(seg) = NULL;
- PKT_LEN(seg) = len;
- DATA_LEN(seg) = len;
- if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
- seg->packet_type = rxq_cq_to_pkt_type(flags);
- seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
- if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
- seg->ol_flags |= PKT_RX_VLAN_PKT;
- seg->vlan_tci = vlan_tci;
+ SET_DATA_OFF(rep, RTE_PKTMBUF_HEADROOM);
+ NB_SEGS(rep) = 1;
+ PORT(rep) = rxq->port_id;
+ NEXT(rep) = NULL;
+ len = rx_poll_len(rxq);
+ if (unlikely(len == 0)) {
+ __rte_mbuf_raw_free(rep);
+ break;
+ }
+ /* Fill NIC descriptor with the new buffer. The lkey and size
+ * of the buffers are already known, only the buffer address
+ * changes. */
+ wqe->addr = htonll((uintptr_t)rep->buf_addr +
+ RTE_PKTMBUF_HEADROOM);
+ (*rxq->elts)[idx] = rep;
+ /* Update pkt information. */
+ if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip |
+ rxq->crc_present) {
+ if (rxq->csum) {
+ pkt->packet_type = rxq_cq_to_pkt_type(cqe);
+ pkt->ol_flags = rxq_cq_to_ol_flags(rxq, cqe);
+ }
+ if (cqe->l4_hdr_type_etc & MLX5_CQE_VLAN_STRIPPED) {
+ pkt->ol_flags |= PKT_RX_VLAN_PKT;
+ pkt->vlan_tci = ntohs(cqe->vlan_info);
}
+ if (rxq->crc_present)
+ len -= ETHER_CRC_LEN;
}
- /* Return packet. */
- *(pkts++) = seg;
- ++pkts_ret;
+ PKT_LEN(pkt) = len;
+ DATA_LEN(pkt) = len;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment bytes counter. */
rxq->stats.ibytes += len;
#endif
-repost:
- if (++elts_head >= elts_n)
- elts_head = 0;
- continue;
+ /* Return packet. */
+ *(pkts++) = pkt;
+ ++pkts_ret;
+ ++rq_ci;
}
- if (unlikely(i == 0))
+ if (unlikely((i == 0) && (rq_ci == rxq->rq_ci)))
return 0;
/* Repost WRs. */
#ifdef DEBUG_RECV
DEBUG("%p: reposting %u WRs", (void *)rxq, i);
#endif
- ret = rxq->recv(rxq->wq, sges, i);
- if (unlikely(ret)) {
- /* Inability to repost WRs is fatal. */
- DEBUG("%p: recv_burst(): failed (ret=%d)",
- (void *)rxq->priv,
- ret);
- abort();
- }
- rxq->elts_head = elts_head;
+ /* Update the consumer index. */
+ rxq->rq_ci = rq_ci;
+ rte_wmb();
+ *rxq->cq_db = htonl(rxq->cq_ci);
+ rte_wmb();
+ *rxq->rq_db = htonl(rxq->rq_ci);
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment packets counter. */
rxq->stats.ipackets += pkts_ret;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 570345b..7e4e7cf 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -43,6 +43,7 @@
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <infiniband/verbs.h>
+#include <infiniband/mlx5_hw.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif
@@ -81,12 +82,6 @@ struct mlx5_txq_stats {
uint64_t odropped; /**< Total of packets not sent when TX ring full. */
};

-/* RX element. */
-struct rxq_elt {
- struct ibv_sge sge; /* Scatter/Gather Element. */
- struct rte_mbuf *buf; /* SGE buffer. */
-};
-
/* Flow director queue structure. */
struct fdir_queue {
struct ibv_qp *qp; /* Associated RX QP. */
@@ -97,25 +92,28 @@ struct priv;

/* RX queue descriptor. */
struct rxq {
- struct priv *priv; /* Back pointer to private data. */
- struct rte_mempool *mp; /* Memory Pool for allocations. */
- struct ibv_cq *cq; /* Completion Queue. */
- struct ibv_exp_wq *wq; /* Work Queue. */
- int32_t (*poll)(); /* Verbs poll function. */
- int32_t (*recv)(); /* Verbs receive function. */
- unsigned int port_id; /* Port ID for incoming packets. */
- unsigned int elts_n; /* (*elts)[] length. */
- unsigned int elts_head; /* Current index in (*elts)[]. */
unsigned int csum:1; /* Enable checksum offloading. */
unsigned int csum_l2tun:1; /* Same for L2 tunnels. */
unsigned int vlan_strip:1; /* Enable VLAN stripping. */
unsigned int crc_present:1; /* CRC must be subtracted. */
- struct rxq_elt (*elts)[]; /* RX elements. */
- struct mlx5_rxq_stats stats; /* RX queue counters. */
+ uint16_t rq_ci;
+ uint16_t cq_ci;
+ uint16_t elts_n;
+ uint16_t port_id;
+ volatile struct mlx5_wqe_data_seg(*wqes)[];
+ volatile struct mlx5_cqe64(*cqes)[];
+ volatile uint32_t *rq_db;
+ volatile uint32_t *cq_db;
+ struct rte_mbuf *(*elts)[];
+ struct rte_mempool *mp;
+ struct mlx5_rxq_stats stats;
} __rte_cache_aligned;

/* RX queue control descriptor. */
struct rxq_ctrl {
+ struct priv *priv; /* Back pointer to private data. */
+ struct ibv_cq *cq; /* Completion Queue. */
+ struct ibv_exp_wq *wq; /* Work Queue. */
struct ibv_exp_res_domain *rd; /* Resource Domain. */
struct fdir_queue fdir_queue; /* Flow director queue. */
struct ibv_mr *mr; /* Memory Region (for mp). */
@@ -284,8 +282,9 @@ int priv_allow_flow_type(struct priv *, enum hash_rxq_flow_type);
int priv_rehash_flows(struct priv *);
void rxq_cleanup(struct rxq_ctrl *);
int rxq_rehash(struct rte_eth_dev *, struct rxq_ctrl *);
-int rxq_setup(struct rte_eth_dev *, struct rxq_ctrl *, uint16_t, unsigned int,
- const struct rte_eth_rxconf *, struct rte_mempool *);
+int rxq_ctrl_setup(struct rte_eth_dev *, struct rxq_ctrl *, uint16_t,
+ unsigned int, const struct rte_eth_rxconf *,
+ struct rte_mempool *);
int mlx5_rx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
const struct rte_eth_rxconf *, struct rte_mempool *);
void mlx5_rx_queue_release(void *);
diff --git a/drivers/net/mlx5/mlx5_vlan.c b/drivers/net/mlx5/mlx5_vlan.c
index 3b9b771..4719e69 100644
--- a/drivers/net/mlx5/mlx5_vlan.c
+++ b/drivers/net/mlx5/mlx5_vlan.c
@@ -144,6 +144,7 @@ static void
priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on)
{
struct rxq *rxq = (*priv->rxqs)[idx];
+ struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
struct ibv_exp_wq_attr mod;
uint16_t vlan_offloads =
(on ? IBV_EXP_RECEIVE_WQ_CVLAN_STRIP : 0) |
@@ -157,7 +158,7 @@ priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on)
.vlan_offloads = vlan_offloads,
};

- err = ibv_exp_modify_wq(rxq->wq, &mod);
+ err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
if (err) {
ERROR("%p: failed to modified stripping mode: %s",
(void *)priv, strerror(err));
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:48:00 UTC
Permalink
Bypass Verbs to improve TX performance.

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Yaacov Hazan <***@mellanox.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/Makefile | 5 -
drivers/net/mlx5/mlx5_ethdev.c | 10 +-
drivers/net/mlx5/mlx5_mr.c | 4 +-
drivers/net/mlx5/mlx5_rxtx.c | 359 ++++++++++++++++++++++-------------------
drivers/net/mlx5/mlx5_rxtx.h | 53 +++---
drivers/net/mlx5/mlx5_txq.c | 210 ++++++++++++------------
6 files changed, 334 insertions(+), 307 deletions(-)

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index a63d6b3..9b4455b 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -102,11 +102,6 @@ endif
mlx5_autoconf.h: $(RTE_SDK)/scripts/auto-config-h.sh
$Q $(RM) -f -- '$@'
$Q sh -- '$<' '$@' \
- HAVE_VERBS_VLAN_INSERTION \
- infiniband/verbs.h \
- enum IBV_EXP_RECEIVE_WQ_CVLAN_INSERTION \
- $(AUTOCONF_OUTPUT)
- $Q sh -- '$<' '$@' \
HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE \
infiniband/verbs_exp.h \
enum IBV_EXP_CQ_COMPRESSED_CQE \
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 4cfcbd5..aaa6c16 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1243,11 +1243,11 @@ mlx5_secondary_data_setup(struct priv *priv)
txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl), 0,
primary_txq_ctrl->socket);
if (txq_ctrl != NULL) {
- if (txq_setup(priv->dev,
- primary_txq_ctrl,
- primary_txq->elts_n,
- primary_txq_ctrl->socket,
- NULL) == 0) {
+ if (txq_ctrl_setup(priv->dev,
+ primary_txq_ctrl,
+ primary_txq->elts_n,
+ primary_txq_ctrl->socket,
+ NULL) == 0) {
txq_ctrl->txq.stats.idx = primary_txq->stats.idx;
tx_queues[i] = &txq_ctrl->txq;
continue;
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index 79d5568..e5e8a04 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -189,7 +189,7 @@ txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx)
/* Add a new entry, register MR first. */
DEBUG("%p: discovered new memory pool \"%s\" (%p)",
(void *)txq_ctrl, mp->name, (void *)mp);
- mr = mlx5_mp2mr(txq_ctrl->txq.priv->pd, mp);
+ mr = mlx5_mp2mr(txq_ctrl->priv->pd, mp);
if (unlikely(mr == NULL)) {
DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
(void *)txq_ctrl);
@@ -208,7 +208,7 @@ txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx)
/* Store the new entry. */
txq_ctrl->txq.mp2mr[idx].mp = mp;
txq_ctrl->txq.mp2mr[idx].mr = mr;
- txq_ctrl->txq.mp2mr[idx].lkey = mr->lkey;
+ txq_ctrl->txq.mp2mr[idx].lkey = htonl(mr->lkey);
DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
(void *)txq_ctrl, mp->name, (void *)mp,
txq_ctrl->txq.mp2mr[idx].lkey);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 7d74074..cee6067 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -119,68 +119,52 @@ get_cqe64(volatile struct mlx5_cqe64 cqes[],
*
* @param txq
* Pointer to TX queue structure.
- *
- * @return
- * 0 on success, -1 on failure.
*/
-static int
+static void
txq_complete(struct txq *txq)
{
- unsigned int elts_comp = txq->elts_comp;
- unsigned int elts_tail = txq->elts_tail;
- unsigned int elts_free = txq->elts_tail;
const unsigned int elts_n = txq->elts_n;
- int wcs_n;
-
- if (unlikely(elts_comp == 0))
- return 0;
-#ifdef DEBUG_SEND
- DEBUG("%p: processing %u work requests completions",
- (void *)txq, elts_comp);
-#endif
- wcs_n = txq->poll_cnt(txq->cq, elts_comp);
- if (unlikely(wcs_n == 0))
- return 0;
- if (unlikely(wcs_n < 0)) {
- DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
- (void *)txq, wcs_n);
- return -1;
+ const unsigned int cqe_n = txq->cqe_n;
+ uint16_t elts_free = txq->elts_tail;
+ uint16_t elts_tail;
+ uint16_t cq_ci = txq->cq_ci;
+ unsigned int wqe_ci = (unsigned int)-1;
+ int ret = 0;
+
+ while (ret == 0) {
+ volatile struct mlx5_cqe64 *cqe;
+
+ cqe = get_cqe64(*txq->cqes, cqe_n, &cq_ci);
+ if (cqe == NULL)
+ break;
+ wqe_ci = ntohs(cqe->wqe_counter);
}
- elts_comp -= wcs_n;
- assert(elts_comp <= txq->elts_comp);
- /*
- * Assume WC status is successful as nothing can be done about it
- * anyway.
- */
- elts_tail += wcs_n * txq->elts_comp_cd_init;
- if (elts_tail >= elts_n)
- elts_tail -= elts_n;
-
- while (elts_free != elts_tail) {
- struct txq_elt *elt = &(*txq->elts)[elts_free];
+ if (unlikely(wqe_ci == (unsigned int)-1))
+ return;
+ /* Free buffers. */
+ elts_tail = (wqe_ci + 1) & (elts_n - 1);
+ do {
+ struct rte_mbuf *elt = (*txq->elts)[elts_free];
unsigned int elts_free_next =
- (((elts_free + 1) == elts_n) ? 0 : elts_free + 1);
- struct rte_mbuf *tmp = elt->buf;
- struct txq_elt *elt_next = &(*txq->elts)[elts_free_next];
+ (elts_free + 1) & (elts_n - 1);
+ struct rte_mbuf *elt_next = (*txq->elts)[elts_free_next];

#ifndef NDEBUG
/* Poisoning. */
- memset(elt, 0x66, sizeof(*elt));
+ memset(&(*txq->elts)[elts_free],
+ 0x66,
+ sizeof((*txq->elts)[elts_free]));
#endif
- RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
- /* Faster than rte_pktmbuf_free(). */
- do {
- struct rte_mbuf *next = NEXT(tmp);
-
- rte_pktmbuf_free_seg(tmp);
- tmp = next;
- } while (tmp != NULL);
+ RTE_MBUF_PREFETCH_TO_FREE(elt_next);
+ /* Only one segment needs to be freed. */
+ rte_pktmbuf_free_seg(elt);
elts_free = elts_free_next;
- }
-
+ } while (elts_free != elts_tail);
+ txq->cq_ci = cq_ci;
txq->elts_tail = elts_tail;
- txq->elts_comp = elts_comp;
- return 0;
+ /* Update the consumer index. */
+ rte_wmb();
+ *txq->cq_db = htonl(cq_ci);
}

/**
@@ -231,7 +215,8 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
}
if (txq->mp2mr[i].mp == mp) {
assert(txq->mp2mr[i].lkey != (uint32_t)-1);
- assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
+ assert(htonl(txq->mp2mr[i].mr->lkey) ==
+ txq->mp2mr[i].lkey);
lkey = txq->mp2mr[i].lkey;
break;
}
@@ -242,33 +227,136 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
}

/**
- * Insert VLAN using mbuf headroom space.
- *
- * @param buf
- * Buffer for VLAN insertion.
+ * Write a regular WQE.
*
- * @return
- * 0 on success, errno value on failure.
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param wqe
+ * Pointer to the WQE to fill.
+ * @param addr
+ * Buffer data address.
+ * @param length
+ * Packet length.
+ * @param lkey
+ * Memory region lkey.
*/
-static inline int
-insert_vlan_sw(struct rte_mbuf *buf)
+static inline void
+mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
+ uintptr_t addr, uint32_t length, uint32_t lkey)
{
- uintptr_t addr;
- uint32_t vlan;
- uint16_t head_room_len = rte_pktmbuf_headroom(buf);
+ wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+ wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
+ wqe->wqe.ctrl.data[3] = 0;
+ wqe->inl.eseg.rsvd0 = 0;
+ wqe->inl.eseg.rsvd1 = 0;
+ wqe->inl.eseg.mss = 0;
+ wqe->inl.eseg.rsvd2 = 0;
+ wqe->wqe.eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE);
+ /* Copy the first 16 bytes into inline header. */
+ rte_memcpy((uint8_t *)(uintptr_t)wqe->wqe.eseg.inline_hdr_start,
+ (uint8_t *)(uintptr_t)addr,
+ MLX5_ETH_INLINE_HEADER_SIZE);
+ addr += MLX5_ETH_INLINE_HEADER_SIZE;
+ length -= MLX5_ETH_INLINE_HEADER_SIZE;
+ /* Store remaining data in data segment. */
+ wqe->wqe.dseg.byte_count = htonl(length);
+ wqe->wqe.dseg.lkey = lkey;
+ wqe->wqe.dseg.addr = htonll(addr);
+ /* Increment consumer index. */
+ ++txq->wqe_ci;
+}

- if (head_room_len < 4)
- return EINVAL;
+/**
+ * Write a regular WQE with VLAN.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param wqe
+ * Pointer to the WQE to fill.
+ * @param addr
+ * Buffer data address.
+ * @param length
+ * Packet length.
+ * @param lkey
+ * Memory region lkey.
+ * @param vlan_tci
+ * VLAN field to insert in packet.
+ */
+static inline void
+mlx5_wqe_write_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
+ uintptr_t addr, uint32_t length, uint32_t lkey,
+ uint16_t vlan_tci)
+{
+ uint32_t vlan = htonl(0x81000000 | vlan_tci);
+
+ wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+ wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
+ wqe->wqe.ctrl.data[3] = 0;
+ wqe->inl.eseg.rsvd0 = 0;
+ wqe->inl.eseg.rsvd1 = 0;
+ wqe->inl.eseg.mss = 0;
+ wqe->inl.eseg.rsvd2 = 0;
+ wqe->wqe.eseg.inline_hdr_sz = htons(MLX5_ETH_VLAN_INLINE_HEADER_SIZE);
+ /*
+ * Copy 12 bytes of source & destination MAC address.
+ * Copy 4 bytes of VLAN.
+ * Copy 2 bytes of Ether type.
+ */
+ rte_memcpy((uint8_t *)(uintptr_t)wqe->wqe.eseg.inline_hdr_start,
+ (uint8_t *)(uintptr_t)addr, 12);
+ rte_memcpy((uint8_t *)((uintptr_t)wqe->wqe.eseg.inline_hdr_start + 12),
+ &vlan, sizeof(vlan));
+ rte_memcpy((uint8_t *)((uintptr_t)wqe->wqe.eseg.inline_hdr_start + 16),
+ (uint8_t *)((uintptr_t)addr + 12), 2);
+ addr += MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
+ length -= MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
+ /* Store remaining data in data segment. */
+ wqe->wqe.dseg.byte_count = htonl(length);
+ wqe->wqe.dseg.lkey = lkey;
+ wqe->wqe.dseg.addr = htonll(addr);
+ /* Increment consumer index. */
+ ++txq->wqe_ci;
+}

- addr = rte_pktmbuf_mtod(buf, uintptr_t);
- vlan = htonl(0x81000000 | buf->vlan_tci);
- memmove((void *)(addr - 4), (void *)addr, 12);
- memcpy((void *)(addr + 8), &vlan, sizeof(vlan));
+/**
+ * Ring TX queue doorbell.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ */
+static inline void
+mlx5_tx_dbrec(struct txq *txq)
+{
+ uint8_t *dst = (uint8_t *)((uintptr_t)txq->bf_reg + txq->bf_offset);
+ uint32_t data[4] = {
+ htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND),
+ htonl(txq->qp_num_8s),
+ 0,
+ 0,
+ };
+ rte_wmb();
+ *txq->qp_db = htonl(txq->wqe_ci);
+ /* Ensure ordering between DB record and BF copy. */
+ rte_wmb();
+ rte_mov16(dst, (uint8_t *)data);
+ txq->bf_offset ^= txq->bf_buf_size;
+}

- SET_DATA_OFF(buf, head_room_len - 4);
- DATA_LEN(buf) += 4;
+/**
+ * Prefetch a CQE.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param cqe_ci
+ * CQE consumer index.
+ */
+static inline void
+tx_prefetch_cqe(struct txq *txq, uint16_t ci)
+{
+ volatile struct mlx5_cqe64 *cqe;

- return 0;
+ cqe = &(*txq->cqes)[ci & (txq->cqe_n - 1)];
+ rte_prefetch0(cqe);
}

/**
@@ -288,18 +376,21 @@ uint16_t
mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct txq *txq = (struct txq *)dpdk_txq;
- unsigned int elts_head = txq->elts_head;
+ uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
- unsigned int elts_comp_cd = txq->elts_comp_cd;
- unsigned int elts_comp = 0;
unsigned int i;
unsigned int max;
- int err;
- struct rte_mbuf *buf = pkts[0];
+ volatile union mlx5_wqe *wqe;
+ struct rte_mbuf *buf;

- assert(elts_comp_cd != 0);
+ if (unlikely(!pkts_n))
+ return 0;
+ buf = pkts[0];
/* Prefetch first packet cacheline. */
+ tx_prefetch_cqe(txq, txq->cq_ci);
+ tx_prefetch_cqe(txq, txq->cq_ci + 1);
rte_prefetch0(buf);
+ /* Start processing. */
txq_complete(txq);
max = (elts_n - (elts_head - txq->elts_tail));
if (max > elts_n)
@@ -313,101 +404,51 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (max > pkts_n)
max = pkts_n;
for (i = 0; (i != max); ++i) {
- struct rte_mbuf *buf_next = pkts[i + 1];
- unsigned int elts_head_next =
- (((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
- struct txq_elt *elt = &(*txq->elts)[elts_head];
- uint32_t send_flags = 0;
-#ifdef HAVE_VERBS_VLAN_INSERTION
- int insert_vlan = 0;
-#endif /* HAVE_VERBS_VLAN_INSERTION */
+ unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
uintptr_t addr;
uint32_t length;
uint32_t lkey;
- uintptr_t buf_next_addr;

+ wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
+ rte_prefetch0(wqe);
if (i + 1 < max)
- rte_prefetch0(buf_next);
- /* Request TX completion. */
- if (unlikely(--elts_comp_cd == 0)) {
- elts_comp_cd = txq->elts_comp_cd_init;
- ++elts_comp;
- send_flags |= IBV_EXP_QP_BURST_SIGNALED;
- }
- /* Should we enable HW CKSUM offload */
- if (buf->ol_flags &
- (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
- send_flags |= IBV_EXP_QP_BURST_IP_CSUM;
- /* HW does not support checksum offloads at arbitrary
- * offsets but automatically recognizes the packet
- * type. For inner L3/L4 checksums, only VXLAN (UDP)
- * tunnels are currently supported. */
- if (RTE_ETH_IS_TUNNEL_PKT(buf->packet_type))
- send_flags |= IBV_EXP_QP_BURST_TUNNEL;
- }
- if (buf->ol_flags & PKT_TX_VLAN_PKT) {
-#ifdef HAVE_VERBS_VLAN_INSERTION
- if (!txq->priv->mps)
- insert_vlan = 1;
- else
-#endif /* HAVE_VERBS_VLAN_INSERTION */
- {
- err = insert_vlan_sw(buf);
- if (unlikely(err))
- goto stop;
- }
- }
+ rte_prefetch0(pkts[i + 1]);
/* Retrieve buffer information. */
addr = rte_pktmbuf_mtod(buf, uintptr_t);
length = DATA_LEN(buf);
/* Update element. */
- elt->buf = buf;
- if (txq->priv->sriov)
- rte_prefetch0((volatile void *)
- (uintptr_t)addr);
+ (*txq->elts)[elts_head] = buf;
/* Prefetch next buffer data. */
- if (i + 1 < max) {
- buf_next_addr =
- rte_pktmbuf_mtod(buf_next, uintptr_t);
- rte_prefetch0((volatile void *)
- (uintptr_t)buf_next_addr);
- }
+ if (i + 1 < max)
+ rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 1],
+ volatile void *));
/* Retrieve Memory Region key for this memory pool. */
lkey = txq_mp2mr(txq, txq_mb2mp(buf));
- if (unlikely(lkey == (uint32_t)-1)) {
- /* MR does not exist. */
- DEBUG("%p: unable to get MP <-> MR"
- " association", (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
-#ifdef HAVE_VERBS_VLAN_INSERTION
- if (insert_vlan)
- err = txq->send_pending_vlan
- (txq->qp,
- addr,
- length,
- lkey,
- send_flags,
- &buf->vlan_tci);
+ if (buf->ol_flags & PKT_TX_VLAN_PKT)
+ mlx5_wqe_write_vlan(txq, wqe, addr, length, lkey,
+ buf->vlan_tci);
else
-#endif /* HAVE_VERBS_VLAN_INSERTION */
- err = txq->send_pending
- (txq->qp,
- addr,
- length,
- lkey,
- send_flags);
- if (unlikely(err))
- goto stop;
+ mlx5_wqe_write(txq, wqe, addr, length, lkey);
+ /* Request completion if needed. */
+ if (unlikely(--txq->elts_comp == 0)) {
+ wqe->wqe.ctrl.data[2] = htonl(8);
+ txq->elts_comp = txq->elts_comp_cd_init;
+ } else
+ wqe->wqe.ctrl.data[2] = 0;
+ /* Should we enable HW CKSUM offload */
+ if (buf->ol_flags &
+ (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
+ wqe->wqe.eseg.cs_flags =
+ MLX5_ETH_WQE_L3_CSUM |
+ MLX5_ETH_WQE_L4_CSUM;
+ } else
+ wqe->wqe.eseg.cs_flags = 0;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent bytes counter. */
txq->stats.obytes += length;
#endif
-stop:
elts_head = elts_head_next;
- buf = buf_next;
+ buf = pkts[i + 1];
}
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
@@ -417,16 +458,8 @@ stop:
txq->stats.opackets += i;
#endif
/* Ring QP doorbell. */
- err = txq->send_flush(txq->qp);
- if (unlikely(err)) {
- /* A nonzero value is not supposed to be returned.
- * Nothing can be done about it. */
- DEBUG("%p: send_flush() failed with error %d",
- (void *)txq, err);
- }
+ mlx5_tx_dbrec(txq);
txq->elts_head = elts_head;
- txq->elts_comp += elts_comp;
- txq->elts_comp_cd = elts_comp_cd;
return i;
}

diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 7e4e7cf..3c1c5a5 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -62,6 +62,7 @@
#include "mlx5.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
+#include "mlx5_prm.h"

struct mlx5_rxq_stats {
unsigned int idx; /**< Mapping index. */
@@ -222,44 +223,40 @@ struct hash_rxq {
[MLX5_MAX_SPECIAL_FLOWS][MLX5_MAX_VLAN_IDS];
};

-/* TX element. */
-struct txq_elt {
- struct rte_mbuf *buf;
-};
-
/* TX queue descriptor. */
struct txq {
- struct priv *priv; /* Back pointer to private data. */
- int32_t (*poll_cnt)(struct ibv_cq *cq, uint32_t max);
- int (*send_pending)();
-#ifdef HAVE_VERBS_VLAN_INSERTION
- int (*send_pending_vlan)();
-#endif
- int (*send_flush)(struct ibv_qp *qp);
- struct ibv_cq *cq; /* Completion Queue. */
- struct ibv_qp *qp; /* Queue Pair. */
- struct txq_elt (*elts)[]; /* TX elements. */
- unsigned int elts_n; /* (*elts)[] length. */
- unsigned int elts_head; /* Current index in (*elts)[]. */
- unsigned int elts_tail; /* First element awaiting completion. */
- unsigned int elts_comp; /* Number of completion requests. */
- unsigned int elts_comp_cd; /* Countdown for next completion request. */
- unsigned int elts_comp_cd_init; /* Initial value for countdown. */
+ uint16_t elts_head; /* Current index in (*elts)[]. */
+ uint16_t elts_tail; /* First element awaiting completion. */
+ uint16_t elts_comp_cd_init; /* Initial value for countdown. */
+ uint16_t elts_comp; /* Elements before asking a completion. */
+ uint16_t elts_n; /* (*elts)[] length. */
+ uint16_t cq_ci; /* Consumer index for completion queue. */
+ uint16_t cqe_n; /* Number of CQ elements. */
+ uint16_t wqe_ci; /* Consumer index for work queue. */
+ uint16_t wqe_n; /* Number of WQ elements. */
+ uint16_t bf_offset; /* Blueflame offset. */
+ uint16_t bf_buf_size; /* Blueflame size. */
+ volatile struct mlx5_cqe64 (*cqes)[]; /* Completion queue. */
+ volatile union mlx5_wqe (*wqes)[]; /* Work queue. */
+ volatile uint32_t *qp_db; /* Work queue doorbell. */
+ volatile uint32_t *cq_db; /* Completion queue doorbell. */
+ volatile void *bf_reg; /* Blueflame register. */
struct {
const struct rte_mempool *mp; /* Cached Memory Pool. */
struct ibv_mr *mr; /* Memory Region (for mp). */
- uint32_t lkey; /* mr->lkey */
+ uint32_t lkey; /* htonl(mr->lkey) */
} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
+ struct rte_mbuf *(*elts)[]; /* TX elements. */
struct mlx5_txq_stats stats; /* TX queue counters. */
+ uint32_t qp_num_8s; /* QP number shifted by 8. */
} __rte_cache_aligned;

/* TX queue control descriptor. */
struct txq_ctrl {
-#ifdef HAVE_VERBS_VLAN_INSERTION
- struct ibv_exp_qp_burst_family_v1 *if_qp; /* QP burst interface. */
-#else
+ struct priv *priv; /* Back pointer to private data. */
+ struct ibv_cq *cq; /* Completion Queue. */
+ struct ibv_qp *qp; /* Queue Pair. */
struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
-#endif
struct ibv_exp_cq_family *if_cq; /* CQ interface. */
struct ibv_exp_res_domain *rd; /* Resource Domain. */
unsigned int socket; /* CPU socket ID for allocations. */
@@ -293,8 +290,8 @@ uint16_t mlx5_rx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
/* mlx5_txq.c */

void txq_cleanup(struct txq_ctrl *);
-int txq_setup(struct rte_eth_dev *, struct txq_ctrl *, uint16_t, unsigned int,
- const struct rte_eth_txconf *);
+int txq_ctrl_setup(struct rte_eth_dev *, struct txq_ctrl *, uint16_t,
+ unsigned int, const struct rte_eth_txconf *);
int mlx5_tx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
const struct rte_eth_txconf *);
void mlx5_tx_queue_release(void *);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index d7cc39d..95c6f2b 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -60,6 +60,7 @@
#endif

#include "mlx5_utils.h"
+#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
@@ -72,48 +73,22 @@
* Pointer to TX queue structure.
* @param elts_n
* Number of elements to allocate.
- *
- * @return
- * 0 on success, errno value on failure.
*/
-static int
+static void
txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
{
unsigned int i;
- struct txq_elt (*elts)[elts_n] =
- rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq_ctrl->socket);
- int ret = 0;

- if (elts == NULL) {
- ERROR("%p: can't allocate packets array", (void *)txq_ctrl);
- ret = ENOMEM;
- goto error;
- }
- for (i = 0; (i != elts_n); ++i) {
- struct txq_elt *elt = &(*elts)[i];
+ for (i = 0; (i != elts_n); ++i)
+ (*txq_ctrl->txq.elts)[i] = NULL;
+ for (i = 0; (i != txq_ctrl->txq.wqe_n); ++i) {
+ volatile union mlx5_wqe *wqe = &(*txq_ctrl->txq.wqes)[i];

- elt->buf = NULL;
+ memset((void *)(uintptr_t)wqe, 0x0, sizeof(*wqe));
}
DEBUG("%p: allocated and configured %u WRs", (void *)txq_ctrl, elts_n);
- txq_ctrl->txq.elts_n = elts_n;
- txq_ctrl->txq.elts = elts;
txq_ctrl->txq.elts_head = 0;
txq_ctrl->txq.elts_tail = 0;
- txq_ctrl->txq.elts_comp = 0;
- /* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
- * at least 4 times per ring. */
- txq_ctrl->txq.elts_comp_cd_init =
- ((MLX5_PMD_TX_PER_COMP_REQ < (elts_n / 4)) ?
- MLX5_PMD_TX_PER_COMP_REQ : (elts_n / 4));
- txq_ctrl->txq.elts_comp_cd = txq_ctrl->txq.elts_comp_cd_init;
- assert(ret == 0);
- return 0;
-error:
- rte_free(elts);
-
- DEBUG("%p: failed, freed everything", (void *)txq_ctrl);
- assert(ret > 0);
- return ret;
}

/**
@@ -128,32 +103,26 @@ txq_free_elts(struct txq_ctrl *txq_ctrl)
unsigned int elts_n = txq_ctrl->txq.elts_n;
unsigned int elts_head = txq_ctrl->txq.elts_head;
unsigned int elts_tail = txq_ctrl->txq.elts_tail;
- struct txq_elt (*elts)[elts_n] = txq_ctrl->txq.elts;
+ struct rte_mbuf *(*elts)[elts_n] = txq_ctrl->txq.elts;

DEBUG("%p: freeing WRs", (void *)txq_ctrl);
- txq_ctrl->txq.elts_n = 0;
txq_ctrl->txq.elts_head = 0;
txq_ctrl->txq.elts_tail = 0;
- txq_ctrl->txq.elts_comp = 0;
- txq_ctrl->txq.elts_comp_cd = 0;
- txq_ctrl->txq.elts_comp_cd_init = 0;
- txq_ctrl->txq.elts = NULL;

- if (elts == NULL)
- return;
while (elts_tail != elts_head) {
- struct txq_elt *elt = &(*elts)[elts_tail];
+ struct rte_mbuf *elt = (*elts)[elts_tail];

- assert(elt->buf != NULL);
- rte_pktmbuf_free(elt->buf);
+ assert(elt != NULL);
+ rte_pktmbuf_free(elt);
#ifndef NDEBUG
/* Poisoning. */
- memset(elt, 0x77, sizeof(*elt));
+ memset(&(*elts)[elts_tail],
+ 0x77,
+ sizeof((*elts)[elts_tail]));
#endif
if (++elts_tail == elts_n)
elts_tail = 0;
}
- rte_free(elts);
}

/**
@@ -172,42 +141,40 @@ txq_cleanup(struct txq_ctrl *txq_ctrl)

DEBUG("cleaning up %p", (void *)txq_ctrl);
txq_free_elts(txq_ctrl);
- txq_ctrl->txq.poll_cnt = NULL;
- txq_ctrl->txq.send_flush = NULL;
if (txq_ctrl->if_qp != NULL) {
- assert(txq_ctrl->txq.priv != NULL);
- assert(txq_ctrl->txq.priv->ctx != NULL);
- assert(txq_ctrl->txq.qp != NULL);
+ assert(txq_ctrl->priv != NULL);
+ assert(txq_ctrl->priv->ctx != NULL);
+ assert(txq_ctrl->qp != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(txq_ctrl->txq.priv->ctx,
+ claim_zero(ibv_exp_release_intf(txq_ctrl->priv->ctx,
txq_ctrl->if_qp,
&params));
}
if (txq_ctrl->if_cq != NULL) {
- assert(txq_ctrl->txq.priv != NULL);
- assert(txq_ctrl->txq.priv->ctx != NULL);
- assert(txq_ctrl->txq.cq != NULL);
+ assert(txq_ctrl->priv != NULL);
+ assert(txq_ctrl->priv->ctx != NULL);
+ assert(txq_ctrl->cq != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(txq_ctrl->txq.priv->ctx,
+ claim_zero(ibv_exp_release_intf(txq_ctrl->priv->ctx,
txq_ctrl->if_cq,
&params));
}
- if (txq_ctrl->txq.qp != NULL)
- claim_zero(ibv_destroy_qp(txq_ctrl->txq.qp));
- if (txq_ctrl->txq.cq != NULL)
- claim_zero(ibv_destroy_cq(txq_ctrl->txq.cq));
+ if (txq_ctrl->qp != NULL)
+ claim_zero(ibv_destroy_qp(txq_ctrl->qp));
+ if (txq_ctrl->cq != NULL)
+ claim_zero(ibv_destroy_cq(txq_ctrl->cq));
if (txq_ctrl->rd != NULL) {
struct ibv_exp_destroy_res_domain_attr attr = {
.comp_mask = 0,
};

- assert(txq_ctrl->txq.priv != NULL);
- assert(txq_ctrl->txq.priv->ctx != NULL);
- claim_zero(ibv_exp_destroy_res_domain(txq_ctrl->txq.priv->ctx,
+ assert(txq_ctrl->priv != NULL);
+ assert(txq_ctrl->priv->ctx != NULL);
+ claim_zero(ibv_exp_destroy_res_domain(txq_ctrl->priv->ctx,
txq_ctrl->rd,
&attr));
}
@@ -221,6 +188,40 @@ txq_cleanup(struct txq_ctrl *txq_ctrl)
}

/**
+ * Initialize TX queue.
+ *
+ * @param tmpl
+ * Pointer to TX queue control template.
+ * @param txq_ctrl
+ * Pointer to TX queue control.
+ */
+static inline void
+txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
+{
+ struct mlx5_qp *qp = to_mqp(tmpl->qp);
+ struct ibv_cq *ibcq = tmpl->cq;
+ struct mlx5_cq *cq = to_mxxx(cq, cq);
+
+ tmpl->txq.cqe_n = ibcq->cqe + 1;
+ tmpl->txq.qp_num_8s = qp->ctrl_seg.qp_num << 8;
+ tmpl->txq.wqes =
+ (volatile union mlx5_wqe (*)[])
+ (uintptr_t)qp->gen_data.sqstart;
+ tmpl->txq.wqe_n = qp->sq.wqe_cnt;
+ tmpl->txq.qp_db = &qp->gen_data.db[MLX5_SND_DBR];
+ tmpl->txq.bf_reg = qp->gen_data.bf->reg;
+ tmpl->txq.bf_offset = qp->gen_data.bf->offset;
+ tmpl->txq.bf_buf_size = qp->gen_data.bf->buf_size;
+ tmpl->txq.cq_db = cq->dbrec;
+ tmpl->txq.cqes =
+ (volatile struct mlx5_cqe64 (*)[])
+ (uintptr_t)cq->active_buf->buf;
+ tmpl->txq.elts =
+ (struct rte_mbuf *(*)[tmpl->txq.elts_n])
+ ((uintptr_t)txq_ctrl + sizeof(*txq_ctrl));
+}
+
+/**
* Configure a TX queue.
*
* @param dev
@@ -238,15 +239,14 @@ txq_cleanup(struct txq_ctrl *txq_ctrl)
* 0 on success, errno value on failure.
*/
int
-txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
- unsigned int socket, const struct rte_eth_txconf *conf)
+txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
+ uint16_t desc, unsigned int socket,
+ const struct rte_eth_txconf *conf)
{
struct priv *priv = mlx5_get_priv(dev);
struct txq_ctrl tmpl = {
+ .priv = priv,
.socket = socket,
- .txq = {
- .priv = priv,
- },
};
union {
struct ibv_exp_query_intf_params params;
@@ -254,15 +254,19 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
struct ibv_exp_res_domain_init_attr rd;
struct ibv_exp_cq_init_attr cq;
struct ibv_exp_qp_attr mod;
+ struct ibv_exp_cq_attr cq_attr;
} attr;
enum ibv_exp_query_intf_status status;
int ret = 0;

(void)conf; /* Thresholds configuration (ignored). */
- if (desc == 0) {
- ERROR("%p: invalid number of TX descriptors", (void *)dev);
- return EINVAL;
- }
+ tmpl.txq.elts_n = desc;
+ /* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
+ * at least 4 times per ring. */
+ tmpl.txq.elts_comp_cd_init =
+ ((MLX5_PMD_TX_PER_COMP_REQ < (desc / 4)) ?
+ MLX5_PMD_TX_PER_COMP_REQ : (desc / 4));
+ tmpl.txq.elts_comp = tmpl.txq.elts_comp_cd_init;
/* MRs will be registered in mp2mr[] later. */
attr.rd = (struct ibv_exp_res_domain_init_attr){
.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
@@ -281,8 +285,10 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
.res_domain = tmpl.rd,
};
- tmpl.txq.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq);
- if (tmpl.txq.cq == NULL) {
+ tmpl.cq = ibv_exp_create_cq(priv->ctx,
+ (desc / tmpl.txq.elts_comp_cd_init) - 1,
+ NULL, NULL, 0, &attr.cq);
+ if (tmpl.cq == NULL) {
ret = ENOMEM;
ERROR("%p: CQ creation failure: %s",
(void *)dev, strerror(ret));
@@ -294,9 +300,9 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
priv->device_attr.max_sge);
attr.init = (struct ibv_exp_qp_init_attr){
/* CQ to be associated with the send queue. */
- .send_cq = tmpl.txq.cq,
+ .send_cq = tmpl.cq,
/* CQ to be associated with the receive queue. */
- .recv_cq = tmpl.txq.cq,
+ .recv_cq = tmpl.cq,
.cap = {
/* Max number of outstanding WRs. */
.max_send_wr = ((priv->device_attr.max_qp_wr < desc) ?
@@ -314,8 +320,8 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
IBV_EXP_QP_INIT_ATTR_RES_DOMAIN),
};
- tmpl.txq.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
- if (tmpl.txq.qp == NULL) {
+ tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
+ if (tmpl.qp == NULL) {
ret = (errno ? errno : EINVAL);
ERROR("%p: QP creation failure: %s",
(void *)dev, strerror(ret));
@@ -327,30 +333,26 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
/* Primary port number. */
.port_num = priv->port
};
- ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod,
+ ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod,
(IBV_EXP_QP_STATE | IBV_EXP_QP_PORT));
if (ret) {
ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
(void *)dev, strerror(ret));
goto error;
}
- ret = txq_alloc_elts(&tmpl, desc);
- if (ret) {
- ERROR("%p: TXQ allocation failed: %s",
- (void *)dev, strerror(ret));
- goto error;
- }
+ txq_setup(&tmpl, txq_ctrl);
+ txq_alloc_elts(&tmpl, desc);
attr.mod = (struct ibv_exp_qp_attr){
.qp_state = IBV_QPS_RTR
};
- ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod, IBV_EXP_QP_STATE);
+ ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
if (ret) {
ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
(void *)dev, strerror(ret));
goto error;
}
attr.mod.qp_state = IBV_QPS_RTS;
- ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod, IBV_EXP_QP_STATE);
+ ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
if (ret) {
ERROR("%p: QP state to IBV_QPS_RTS failed: %s",
(void *)dev, strerror(ret));
@@ -359,7 +361,7 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
.intf = IBV_EXP_INTF_CQ,
- .obj = tmpl.txq.cq,
+ .obj = tmpl.cq,
};
tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
if (tmpl.if_cq == NULL) {
@@ -371,10 +373,8 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
.intf = IBV_EXP_INTF_QP_BURST,
- .obj = tmpl.txq.qp,
-#ifdef HAVE_VERBS_VLAN_INSERTION
.intf_version = 1,
-#endif
+ .obj = tmpl.qp,
/* Enable multi-packet send if supported. */
.family_flags =
(priv->mps ?
@@ -392,12 +392,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
DEBUG("%p: cleaning-up old txq just in case", (void *)txq_ctrl);
txq_cleanup(txq_ctrl);
*txq_ctrl = tmpl;
- txq_ctrl->txq.poll_cnt = txq_ctrl->if_cq->poll_cnt;
- txq_ctrl->txq.send_pending = txq_ctrl->if_qp->send_pending;
-#ifdef HAVE_VERBS_VLAN_INSERTION
- txq_ctrl->txq.send_pending_vlan = txq_ctrl->if_qp->send_pending_vlan;
-#endif
- txq_ctrl->txq.send_flush = txq_ctrl->if_qp->send_flush;
DEBUG("%p: txq updated with %p", (void *)txq_ctrl, (void *)&tmpl);
/* Pre-register known mempools. */
rte_mempool_walk(txq_mp2mr_iter, txq_ctrl);
@@ -432,15 +426,19 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
{
struct priv *priv = dev->data->dev_private;
struct txq *txq = (*priv->txqs)[idx];
- struct txq_ctrl *txq_ctrl;
+ struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq);
int ret;

if (mlx5_is_secondary())
return -E_RTE_SECONDARY;

priv_lock(priv);
- if (txq)
- txq_ctrl = container_of(txq, struct txq_ctrl, txq);
+ if (!rte_is_power_of_2(desc)) {
+ desc = 1 << log2above(desc);
+ WARN("%p: increased number of descriptors in TX queue %u"
+ " to the next power of two (%d)",
+ (void *)dev, idx, desc);
+ }
DEBUG("%p: configuring queue %u for %u descriptors",
(void *)dev, idx, desc);
if (idx >= priv->txqs_n) {
@@ -459,8 +457,11 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
(*priv->txqs)[idx] = NULL;
txq_cleanup(txq_ctrl);
} else {
- txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl),
- 0, socket);
+ txq_ctrl =
+ rte_calloc_socket("TXQ", 1,
+ sizeof(*txq_ctrl) +
+ desc * sizeof(struct rte_mbuf *),
+ 0, socket);
if (txq_ctrl == NULL) {
ERROR("%p: unable to allocate queue index %u",
(void *)dev, idx);
@@ -468,7 +469,7 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
return -ENOMEM;
}
}
- ret = txq_setup(dev, txq_ctrl, desc, socket, conf);
+ ret = txq_ctrl_setup(dev, txq_ctrl, desc, socket, conf);
if (ret)
rte_free(txq_ctrl);
else {
@@ -503,7 +504,7 @@ mlx5_tx_queue_release(void *dpdk_txq)
if (txq == NULL)
return;
txq_ctrl = container_of(txq, struct txq_ctrl, txq);
- priv = txq->priv;
+ priv = txq_ctrl->priv;
priv_lock(priv);
for (i = 0; (i != priv->txqs_n); ++i)
if ((*priv->txqs)[i] == txq) {
@@ -538,7 +539,8 @@ mlx5_tx_burst_secondary_setup(void *dpdk_txq, struct rte_mbuf **pkts,
uint16_t pkts_n)
{
struct txq *txq = dpdk_txq;
- struct priv *priv = mlx5_secondary_data_setup(txq->priv);
+ struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq);
+ struct priv *priv = mlx5_secondary_data_setup(txq_ctrl->priv);
struct priv *primary_priv;
unsigned int index;
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:48:01 UTC
Permalink
Mini (compressed) CQEs are returned by the NIC when PCI back pressure is
detected. In that case, the first CQE64 contains common packet information,
followed by a number of CQE8 entries providing the rest, and by a matching
number of empty CQE64 entries to be used by software for decompression.

Before decompression:

      0            1            2           6           7           8
  +-------+   +---------+   +-------+   +-------+   +-------+   +-------+
  | CQE64 |   |  CQE64  |   | CQE64 |   | CQE64 |   | CQE64 |   | CQE64 |
  |-------|   |---------|   |-------|   |-------|   |-------|   |-------|
  | ..... |   | cqe8[0] |   |       | . |       |   |       |   | ..... |
  | ..... |   | cqe8[1] |   |       | . |       |   |       |   | ..... |
  | ..... |   | ....... |   |       | . |       |   |       |   | ..... |
  | ..... |   | cqe8[7] |   |       |   |       |   |       |   | ..... |
  +-------+   +---------+   +-------+   +-------+   +-------+   +-------+

After decompression:

      0           1     ...     8
  +-------+   +-------+     +-------+
  | CQE64 |   | CQE64 |     | CQE64 |
  |-------|   |-------|     |-------|
  | ..... |   | ..... |  .  | ..... |
  | ..... |   | ..... |  .  | ..... |
  | ..... |   | ..... |  .  | ..... |
  | ..... |   | ..... |     | ..... |
  +-------+   +-------+     +-------+

This patch does not perform the entire decompression step, as doing so would
be too expensive; instead, the first CQE64 is consumed and an internal
context is maintained to interpret the following CQE8 entries directly.

Intermediate empty CQE64 entries are handed back to HW without further
processing.
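
For illustration only (this snippet is not part of the patch and its names
are made up for the example), a minimal C sketch of the same idea: keep a
small context over the compressed block and return one byte count per call
instead of expanding every CQE up front. Endianness conversion and the
hardware layout (mini CQEs grouped by 8 inside CQE64 slots) are deliberately
omitted.

    #include <stdint.h>

    /* Simplified mini CQE: only the byte count is relevant here. */
    struct mini_cqe8 {
            uint32_t byte_cnt;
    };

    /* Context kept across calls while a compressed block is consumed. */
    struct zip_ctx {
            const struct mini_cqe8 *mc; /* flat array of mini CQEs */
            uint32_t count;             /* mini CQEs in the block */
            uint32_t ai;                /* next entry to consume */
    };

    /* Start consuming a block of "count" mini CQEs. */
    static void
    zip_open(struct zip_ctx *zip, const struct mini_cqe8 *mc, uint32_t count)
    {
            zip->mc = mc;
            zip->count = count;
            zip->ai = 0;
    }

    /* Return the next packet length, or 0 once the block is exhausted. */
    static uint32_t
    zip_next(struct zip_ctx *zip)
    {
            if (zip->ai == zip->count)
                    return 0;
            /* The real ring groups mini CQEs by 8 and leaves empty CQE64
             * slots for software; a flat array ignores that detail. */
            return zip->mc[zip->ai++].byte_cnt;
    }

The actual implementation below additionally invalidates the consumed CQE64
slots and hands them back to hardware once the block is fully processed.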

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Olga Shern <***@mellanox.com>
---
doc/guides/nics/mlx5.rst | 6 +
drivers/net/mlx5/mlx5.c | 25 ++++-
drivers/net/mlx5/mlx5.h | 1 +
drivers/net/mlx5/mlx5_rxq.c | 9 +-
drivers/net/mlx5/mlx5_rxtx.c | 259 ++++++++++++++++++++++++++++++++-----------
drivers/net/mlx5/mlx5_rxtx.h | 11 ++
drivers/net/mlx5/mlx5_txq.c | 5 +
7 files changed, 247 insertions(+), 69 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 3a07928..756153b 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -148,6 +148,12 @@ Run-time configuration

- **ethtool** operations on related kernel interfaces also affect the PMD.

+- ``rxq_cqe_comp_en`` parameter [int]
+
+ A nonzero value enables the compression of CQE on RX side. This feature
+ allows to save PCI bandwidth and improve performance at the cost of a
+ slightly higher CPU usage. Enabled by default.
+
Prerequisites
-------------

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 62e6e16..9bb08b6 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -69,6 +69,9 @@
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"

+/* Device parameter to enable RX completion queue compression. */
+#define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"
+
/**
* Retrieve integer value from environment variable.
*
@@ -256,12 +259,21 @@ static int
mlx5_args_check(const char *key, const char *val, void *opaque)
{
struct priv *priv = opaque;
+ unsigned long tmp;

- /* No parameters are expected at the moment. */
- (void)priv;
- (void)val;
- WARN("%s: unknown parameter", key);
- return EINVAL;
+ errno = 0;
+ tmp = strtoul(val, NULL, 0);
+ if (errno) {
+ WARN("%s: \"%s\" is not a valid integer", key, val);
+ return errno;
+ }
+ if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0)
+ priv->cqe_comp = !!tmp;
+ else {
+ WARN("%s: unknown parameter", key);
+ return EINVAL;
+ }
+ return 0;
}

/**
@@ -279,7 +291,7 @@ static int
mlx5_args(struct priv *priv, struct rte_devargs *devargs)
{
static const char *params[] = {
- NULL,
+ MLX5_RXQ_CQE_COMP_EN,
};
struct rte_kvargs *kvlist;
int ret = 0;
@@ -474,6 +486,7 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
priv->port = port;
priv->pd = pd;
priv->mtu = ETHER_MTU;
+ priv->cqe_comp = 1; /* Enable compression by default. */
err = mlx5_args(priv, pci_dev->devargs);
if (err) {
ERROR("failed to process device arguments: %s",
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 382aac5..3344360 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -112,6 +112,7 @@ struct priv {
unsigned int hw_padding:1; /* End alignment padding is supported. */
unsigned int sriov:1; /* This is a VF or PF with VF devices. */
unsigned int mps:1; /* Whether multi-packet send is supported. */
+ unsigned int cqe_comp:1; /* Whether CQE compression is enabled. */
unsigned int pending_alarm:1; /* An alarm is pending. */
/* RX/TX queues. */
unsigned int rxqs_n; /* RX queues array size. */
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index ac2b69f..b3972ff 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -889,6 +889,7 @@ rxq_setup(struct rxq_ctrl *tmpl, struct rxq_ctrl *rxq_ctrl)
struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);

tmpl->rxq.rq_db = rwq->rq.db;
+ tmpl->rxq.cqe_n = ibcq->cqe + 1;
tmpl->rxq.cq_ci = 0;
tmpl->rxq.rq_ci = 0;
tmpl->rxq.cq_db = cq->dbrec;
@@ -946,6 +947,7 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
} attr;
enum ibv_exp_query_intf_status status;
unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
+ unsigned int cqe_n = desc - 1;
int ret = 0;

(void)conf; /* Thresholds configuration (ignored). */
@@ -985,7 +987,12 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
.res_domain = tmpl.rd,
};
- tmpl.cq = ibv_exp_create_cq(priv->ctx, desc - 1, NULL, NULL, 0,
+ if (priv->cqe_comp) {
+ attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS;
+ attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE;
+ cqe_n = (desc * 2) - 1; /* Double the number of CQEs. */
+ }
+ tmpl.cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, NULL, 0,
&attr.cq);
if (tmpl.cq == NULL) {
ret = ENOMEM;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index cee6067..05b9c88 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -69,44 +69,85 @@
#include "mlx5_defs.h"
#include "mlx5_prm.h"

-static inline volatile struct mlx5_cqe64 *
-get_cqe64(volatile struct mlx5_cqe64 cqes[],
- unsigned int cqes_n, uint16_t *ci)
- __attribute__((always_inline));
+#ifndef NDEBUG
+
+/**
+ * Verify or set magic value in CQE.
+ *
+ * @param cqe
+ * Pointer to CQE.
+ *
+ * @return
+ * 0 the first time.
+ */
+static inline int
+check_cqe64_seen(volatile struct mlx5_cqe64 *cqe)
+{
+ static const uint8_t magic[] = "seen";
+ volatile uint8_t (*buf)[sizeof(cqe->rsvd40)] = &cqe->rsvd40;
+ int ret = 1;
+ unsigned int i;
+
+ for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i)
+ if (!ret || !(ret = ((*buf)[i] == magic[i])))
+ (*buf)[i] = magic[i];
+ return ret;
+}
+
+#endif /* NDEBUG */

static inline int
-rx_poll_len(struct rxq *rxq) __attribute__((always_inline));
+check_cqe64(volatile struct mlx5_cqe64 *cqe,
+ unsigned int cqes_n, const uint16_t ci)
+ __attribute__((always_inline));

-static volatile struct mlx5_cqe64 *
-get_cqe64(volatile struct mlx5_cqe64 cqes[],
- unsigned int cqes_n, uint16_t *ci)
+/**
+ * Check whether CQE is valid.
+ *
+ * @param cqe
+ * Pointer to CQE.
+ * @param cqes_n
+ * Size of completion queue.
+ * @param ci
+ * Consumer index.
+ *
+ * @return
+ * 0 on success, 1 on failure.
+ */
+static inline int
+check_cqe64(volatile struct mlx5_cqe64 *cqe,
+ unsigned int cqes_n, const uint16_t ci)
{
- volatile struct mlx5_cqe64 *cqe;
- uint16_t idx = *ci;
- uint8_t op_own;
-
- cqe = &cqes[idx & (cqes_n - 1)];
- op_own = cqe->op_own;
- if (unlikely((op_own & MLX5_CQE_OWNER_MASK) == !(idx & cqes_n))) {
- return NULL;
- } else if (unlikely(op_own & 0x80)) {
- switch (op_own >> 4) {
- case MLX5_CQE_INVALID:
- return NULL; /* No CQE */
- case MLX5_CQE_REQ_ERR:
- return cqe;
- case MLX5_CQE_RESP_ERR:
- ++(*ci);
- return NULL;
- default:
- return NULL;
- }
- }
- if (cqe) {
- *ci = idx + 1;
- return cqe;
+ uint16_t idx = ci & cqes_n;
+ uint8_t op_own = cqe->op_own;
+ uint8_t op_owner = MLX5_CQE_OWNER(op_own);
+ uint8_t op_code = MLX5_CQE_OPCODE(op_own);
+
+ if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID)))
+ return 1; /* No CQE. */
+#ifndef NDEBUG
+ if ((op_code == MLX5_CQE_RESP_ERR) ||
+ (op_code == MLX5_CQE_REQ_ERR)) {
+ volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe;
+ uint8_t syndrome = err_cqe->syndrome;
+
+ if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) ||
+ (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR))
+ return 0;
+ if (!check_cqe64_seen(cqe))
+ ERROR("unexpected CQE error %u (0x%02x)"
+ " syndrome 0x%02x",
+ op_code, op_code, syndrome);
+ return 1;
+ } else if ((op_code != MLX5_CQE_RESP_SEND) &&
+ (op_code != MLX5_CQE_REQ)) {
+ if (!check_cqe64_seen(cqe))
+ ERROR("unexpected CQE opcode %u (0x%02x)",
+ op_code, op_code);
+ return 1;
}
- return NULL;
+#endif /* NDEBUG */
+ return 0;
}

/**
@@ -125,20 +166,34 @@ txq_complete(struct txq *txq)
{
const unsigned int elts_n = txq->elts_n;
const unsigned int cqe_n = txq->cqe_n;
+ const unsigned int cqe_cnt = cqe_n - 1;
uint16_t elts_free = txq->elts_tail;
uint16_t elts_tail;
uint16_t cq_ci = txq->cq_ci;
unsigned int wqe_ci = (unsigned int)-1;
- int ret = 0;

- while (ret == 0) {
- volatile struct mlx5_cqe64 *cqe;
+ do {
+ unsigned int idx = cq_ci & cqe_cnt;
+ volatile struct mlx5_cqe64 *cqe = &(*txq->cqes)[idx];

- cqe = get_cqe64(*txq->cqes, cqe_n, &cq_ci);
- if (cqe == NULL)
+ if (check_cqe64(cqe, cqe_n, cq_ci) == 1)
break;
+#ifndef NDEBUG
+ if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) {
+ if (!check_cqe64_seen(cqe))
+ ERROR("unexpected compressed CQE, TX stopped");
+ return;
+ }
+ if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) ||
+ (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) {
+ if (!check_cqe64_seen(cqe))
+ ERROR("unexpected error CQE, TX stopped");
+ return;
+ }
+#endif /* NDEBUG */
wqe_ci = ntohs(cqe->wqe_counter);
- }
+ ++cq_ci;
+ } while (1);
if (unlikely(wqe_ci == (unsigned int)-1))
return;
/* Free buffers. */
@@ -507,6 +562,97 @@ rxq_cq_to_pkt_type(volatile struct mlx5_cqe64 *cqe)
}

/**
+ * Get size of the next packet for a given CQE. For compressed CQEs, the
+ * consumer index is updated only once all packets of the current one have
+ * been processed.
+ *
+ * @param rxq
+ * Pointer to RX queue.
+ * @param cqe
+ * CQE to process.
+ *
+ * @return
+ * Packet size in bytes (0 if there is none), -1 in case of completion
+ * with error.
+ */
+static inline int
+mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe,
+ uint16_t cqe_cnt)
+{
+ struct rxq_zip *zip = &rxq->zip;
+ uint16_t cqe_n = cqe_cnt + 1;
+ int len = 0;
+
+ /* Process compressed data in the CQE and mini arrays. */
+ if (zip->ai) {
+ volatile struct mlx5_mini_cqe8 (*mc)[8] =
+ (volatile struct mlx5_mini_cqe8 (*)[8])
+ (uintptr_t)&(*rxq->cqes)[zip->ca & cqe_cnt];
+
+ len = ntohl((*mc)[zip->ai & 7].byte_cnt);
+ if ((++zip->ai & 7) == 0) {
+ /* Increment consumer index to skip the number of
+ * CQEs consumed. Hardware leaves holes in the CQ
+ * ring for software use. */
+ zip->ca = zip->na;
+ zip->na += 8;
+ }
+ if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
+ uint16_t idx = rxq->cq_ci;
+ uint16_t end = zip->cq_ci;
+
+ while (idx != end) {
+ (*rxq->cqes)[idx & cqe_cnt].op_own =
+ MLX5_CQE_INVALIDATE;
+ ++idx;
+ }
+ rxq->cq_ci = zip->cq_ci;
+ zip->ai = 0;
+ }
+ /* No compressed data, get next CQE and verify if it is compressed. */
+ } else {
+ int ret;
+ int8_t op_own;
+
+ ret = check_cqe64(cqe, cqe_n, rxq->cq_ci);
+ if (unlikely(ret == 1))
+ return 0;
+ ++rxq->cq_ci;
+ op_own = cqe->op_own;
+ if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
+ volatile struct mlx5_mini_cqe8 (*mc)[8] =
+ (volatile struct mlx5_mini_cqe8 (*)[8])
+ (uintptr_t)&(*rxq->cqes)[rxq->cq_ci &
+ cqe_cnt];
+
+ /* Fix endianness. */
+ zip->cqe_cnt = ntohl(cqe->byte_cnt);
+ /*
+ * Current mini array position is the one returned by
+ * check_cqe64().
+ *
+ * If completion comprises several mini arrays, as a
+ * special case the second one is located 7 CQEs after
+ * the initial CQE instead of 8 for subsequent ones.
+ */
+ zip->ca = rxq->cq_ci & cqe_cnt;
+ zip->na = zip->ca + 7;
+ /* Compute the next non compressed CQE. */
+ --rxq->cq_ci;
+ zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
+ /* Get packet size to return. */
+ len = ntohl((*mc)[0].byte_cnt);
+ zip->ai = 1;
+ } else
+ len = ntohl(cqe->byte_cnt);
+ /* Error while receiving packet. */
+ if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
+ return -1;
+ }
+ return len;
+}
+
+/**
* Translate RX completion flags to offload flags.
*
* @param[in] rxq
@@ -554,26 +700,6 @@ rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe)
}

/**
- * Get size of the next packet.
- *
- * @param rxq
- * RX queue to fetch packet from.
- *
- * @return
- * Packet size in bytes.
- */
-static inline int __attribute__((always_inline))
-rx_poll_len(struct rxq *rxq)
-{
- volatile struct mlx5_cqe64 *cqe;
-
- cqe = get_cqe64(*rxq->cqes, rxq->elts_n, &rxq->cq_ci);
- if (cqe)
- return ntohl(cqe->byte_cnt);
- return 0;
-}
-
-/**
* DPDK callback for RX.
*
* @param dpdk_rxq
@@ -595,15 +721,16 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
unsigned int rq_ci = rxq->rq_ci;
const unsigned int elts_n = rxq->elts_n;
const unsigned int wqe_cnt = elts_n - 1;
+ const unsigned int cqe_cnt = rxq->cqe_n - 1;

for (i = 0; (i != pkts_n); ++i) {
unsigned int idx = rq_ci & wqe_cnt;
+ int len;
struct rte_mbuf *rep;
struct rte_mbuf *pkt;
- unsigned int len;
volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
volatile struct mlx5_cqe64 *cqe =
- &(*rxq->cqes)[rxq->cq_ci & wqe_cnt];
+ &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];

pkt = (*rxq->elts)[idx];
rte_prefetch0(cqe);
@@ -616,11 +743,18 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
NB_SEGS(rep) = 1;
PORT(rep) = rxq->port_id;
NEXT(rep) = NULL;
- len = rx_poll_len(rxq);
+ len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt);
if (unlikely(len == 0)) {
__rte_mbuf_raw_free(rep);
break;
}
+ if (unlikely(len == -1)) {
+ /* RX error, packet is likely too large. */
+ __rte_mbuf_raw_free(rep);
+ ++rxq->stats.idropped;
+ --i;
+ goto skip;
+ }
/* Fill NIC descriptor with the new buffer. The lkey and size
* of the buffers are already known, only the buffer address
* changes. */
@@ -650,6 +784,7 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
/* Return packet. */
*(pkts++) = pkt;
++pkts_ret;
+ skip:
++rq_ci;
}
if (unlikely((i == 0) && (rq_ci == rxq->rq_ci)))
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 3c1c5a5..792a5f0 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -91,6 +91,15 @@ struct fdir_queue {

struct priv;

+/* Compressed CQE context. */
+struct rxq_zip {
+ uint16_t ai; /* Array index. */
+ uint16_t ca; /* Current array index. */
+ uint16_t na; /* Next array index. */
+ uint16_t cq_ci; /* The next CQE. */
+ uint32_t cqe_cnt; /* Number of CQEs. */
+};
+
/* RX queue descriptor. */
struct rxq {
unsigned int csum:1; /* Enable checksum offloading. */
@@ -100,9 +109,11 @@ struct rxq {
uint16_t rq_ci;
uint16_t cq_ci;
uint16_t elts_n;
+ uint16_t cqe_n; /* Number of CQ elements. */
uint16_t port_id;
volatile struct mlx5_wqe_data_seg(*wqes)[];
volatile struct mlx5_cqe64(*cqes)[];
+ struct rxq_zip zip; /* Compressed context. */
volatile uint32_t *rq_db;
volatile uint32_t *cq_db;
struct rte_mbuf *(*elts)[];
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 95c6f2b..ee2db12 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -259,6 +259,11 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
enum ibv_exp_query_intf_status status;
int ret = 0;

+ if (mlx5_getenv_int("MLX5_ENABLE_CQE_COMPRESSION")) {
+ ret = ENOTSUP;
+ ERROR("MLX5_ENABLE_CQE_COMPRESSION must never be set");
+ goto error;
+ }
(void)conf; /* Thresholds configuration (ignored). */
tmpl.txq.elts_n = desc;
/* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:48:02 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

Replacing the variable countdown (which depends on the number of
descriptors) with a fixed relative threshold known at compile time improves
performance by reducing the TX queue structure footprint and the amount of
code to manage completions during a burst.

Completions are now requested at most once per burst after the threshold is
reached.
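
As a rough sketch of the arithmetic (again not part of the patch; names are
simplified), the per-burst decision boils down to a running counter compared
against the fixed threshold, reset whenever a completion is requested:

    #include <stdint.h>

    #define TX_COMP_THRESH 32 /* fixed threshold, assumed power of two */

    /*
     * Called once per burst with the number of descriptors just posted.
     * Returns nonzero when the last WQE of the burst should request a
     * completion; otherwise the counter keeps accumulating.
     */
    static inline int
    tx_need_completion(uint16_t *elts_comp, unsigned int posted)
    {
            unsigned int comp = *elts_comp + posted;

            if (comp >= TX_COMP_THRESH) {
                    *elts_comp = 0;
                    return 1;
            }
            *elts_comp = (uint16_t)comp;
            return 0;
    }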

Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Nelio Laranjeiro <***@6wind.com>
---
drivers/net/mlx5/mlx5_defs.h | 7 +++++--
drivers/net/mlx5/mlx5_rxtx.c | 42 ++++++++++++++++++++++++------------------
drivers/net/mlx5/mlx5_rxtx.h | 5 ++---
drivers/net/mlx5/mlx5_txq.c | 19 ++++++++++++-------
4 files changed, 43 insertions(+), 30 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 8d2ec7a..cc2a6f3 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -48,8 +48,11 @@
/* Maximum number of special flows. */
#define MLX5_MAX_SPECIAL_FLOWS 4

-/* Request send completion once in every 64 sends, might be less. */
-#define MLX5_PMD_TX_PER_COMP_REQ 64
+/*
+ * Request TX completion every time descriptors reach this threshold since
+ * the previous request. Must be a power of two for performance reasons.
+ */
+#define MLX5_TX_COMP_THRESH 32

/* RSS Indirection table size. */
#define RSS_INDIRECTION_TABLE_SIZE 256
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 05b9c88..1495a53 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -154,9 +154,6 @@ check_cqe64(volatile struct mlx5_cqe64 *cqe,
* Manage TX completions.
*
* When sending a burst, mlx5_tx_burst() posts several WRs.
- * To improve performance, a completion event is only required once every
- * MLX5_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
- * for other WRs, but this information would not be used anyway.
*
* @param txq
* Pointer to TX queue structure.
@@ -170,14 +167,16 @@ txq_complete(struct txq *txq)
uint16_t elts_free = txq->elts_tail;
uint16_t elts_tail;
uint16_t cq_ci = txq->cq_ci;
- unsigned int wqe_ci = (unsigned int)-1;
+ volatile struct mlx5_cqe64 *cqe = NULL;
+ volatile union mlx5_wqe *wqe;

do {
- unsigned int idx = cq_ci & cqe_cnt;
- volatile struct mlx5_cqe64 *cqe = &(*txq->cqes)[idx];
+ volatile struct mlx5_cqe64 *tmp;

- if (check_cqe64(cqe, cqe_n, cq_ci) == 1)
+ tmp = &(*txq->cqes)[cq_ci & cqe_cnt];
+ if (check_cqe64(tmp, cqe_n, cq_ci))
break;
+ cqe = tmp;
#ifndef NDEBUG
if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) {
if (!check_cqe64_seen(cqe))
@@ -191,14 +190,15 @@ txq_complete(struct txq *txq)
return;
}
#endif /* NDEBUG */
- wqe_ci = ntohs(cqe->wqe_counter);
++cq_ci;
} while (1);
- if (unlikely(wqe_ci == (unsigned int)-1))
+ if (unlikely(cqe == NULL))
return;
+ wqe = &(*txq->wqes)[htons(cqe->wqe_counter) & (txq->wqe_n - 1)];
+ elts_tail = wqe->wqe.ctrl.data[3];
+ assert(elts_tail < txq->wqe_n);
/* Free buffers. */
- elts_tail = (wqe_ci + 1) & (elts_n - 1);
- do {
+ while (elts_free != elts_tail) {
struct rte_mbuf *elt = (*txq->elts)[elts_free];
unsigned int elts_free_next =
(elts_free + 1) & (elts_n - 1);
@@ -214,7 +214,7 @@ txq_complete(struct txq *txq)
/* Only one segment needs to be freed. */
rte_pktmbuf_free_seg(elt);
elts_free = elts_free_next;
- } while (elts_free != elts_tail);
+ }
txq->cq_ci = cq_ci;
txq->elts_tail = elts_tail;
/* Update the consumer index. */
@@ -435,6 +435,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
const unsigned int elts_n = txq->elts_n;
unsigned int i;
unsigned int max;
+ unsigned int comp;
volatile union mlx5_wqe *wqe;
struct rte_mbuf *buf;

@@ -484,12 +485,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
buf->vlan_tci);
else
mlx5_wqe_write(txq, wqe, addr, length, lkey);
- /* Request completion if needed. */
- if (unlikely(--txq->elts_comp == 0)) {
- wqe->wqe.ctrl.data[2] = htonl(8);
- txq->elts_comp = txq->elts_comp_cd_init;
- } else
- wqe->wqe.ctrl.data[2] = 0;
+ wqe->wqe.ctrl.data[2] = 0;
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
@@ -508,6 +504,16 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
+ /* Check whether completion threshold has been reached. */
+ comp = txq->elts_comp + i;
+ if (comp >= MLX5_TX_COMP_THRESH) {
+ /* Request completion on last WQE. */
+ wqe->wqe.ctrl.data[2] = htonl(8);
+ /* Save elts_head in unused "immediate" field of WQE. */
+ wqe->wqe.ctrl.data[3] = elts_head;
+ txq->elts_comp = 0;
+ } else
+ txq->elts_comp = comp;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent packets counter. */
txq->stats.opackets += i;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 792a5f0..0de6bd5 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -238,8 +238,7 @@ struct hash_rxq {
struct txq {
uint16_t elts_head; /* Current index in (*elts)[]. */
uint16_t elts_tail; /* First element awaiting completion. */
- uint16_t elts_comp_cd_init; /* Initial value for countdown. */
- uint16_t elts_comp; /* Elements before asking a completion. */
+ uint16_t elts_comp; /* Counter since last completion request. */
uint16_t elts_n; /* (*elts)[] length. */
uint16_t cq_ci; /* Consumer index for completion queue. */
uint16_t cqe_n; /* Number of CQ elements. */
@@ -247,6 +246,7 @@ struct txq {
uint16_t wqe_n; /* Number of WQ elements. */
uint16_t bf_offset; /* Blueflame offset. */
uint16_t bf_buf_size; /* Blueflame size. */
+ uint32_t qp_num_8s; /* QP number shifted by 8. */
volatile struct mlx5_cqe64 (*cqes)[]; /* Completion queue. */
volatile union mlx5_wqe (*wqes)[]; /* Work queue. */
volatile uint32_t *qp_db; /* Work queue doorbell. */
@@ -259,7 +259,6 @@ struct txq {
} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
struct rte_mbuf *(*elts)[]; /* TX elements. */
struct mlx5_txq_stats stats; /* TX queue counters. */
- uint32_t qp_num_8s; /* QP number shifted by 8. */
} __rte_cache_aligned;

/* TX queue control descriptor. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index ee2db12..4b8b3e0 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -89,6 +89,7 @@ txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
DEBUG("%p: allocated and configured %u WRs", (void *)txq_ctrl, elts_n);
txq_ctrl->txq.elts_head = 0;
txq_ctrl->txq.elts_tail = 0;
+ txq_ctrl->txq.elts_comp = 0;
}

/**
@@ -108,6 +109,7 @@ txq_free_elts(struct txq_ctrl *txq_ctrl)
DEBUG("%p: freeing WRs", (void *)txq_ctrl);
txq_ctrl->txq.elts_head = 0;
txq_ctrl->txq.elts_tail = 0;
+ txq_ctrl->txq.elts_comp = 0;

while (elts_tail != elts_head) {
struct rte_mbuf *elt = (*elts)[elts_tail];
@@ -265,13 +267,8 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
goto error;
}
(void)conf; /* Thresholds configuration (ignored). */
+ assert(desc > MLX5_TX_COMP_THRESH);
tmpl.txq.elts_n = desc;
- /* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
- * at least 4 times per ring. */
- tmpl.txq.elts_comp_cd_init =
- ((MLX5_PMD_TX_PER_COMP_REQ < (desc / 4)) ?
- MLX5_PMD_TX_PER_COMP_REQ : (desc / 4));
- tmpl.txq.elts_comp = tmpl.txq.elts_comp_cd_init;
/* MRs will be registered in mp2mr[] later. */
attr.rd = (struct ibv_exp_res_domain_init_attr){
.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
@@ -291,7 +288,8 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
.res_domain = tmpl.rd,
};
tmpl.cq = ibv_exp_create_cq(priv->ctx,
- (desc / tmpl.txq.elts_comp_cd_init) - 1,
+ (((desc / MLX5_TX_COMP_THRESH) - 1) ?
+ ((desc / MLX5_TX_COMP_THRESH) - 1) : 1),
NULL, NULL, 0, &attr.cq);
if (tmpl.cq == NULL) {
ret = ENOMEM;
@@ -438,6 +436,13 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
return -E_RTE_SECONDARY;

priv_lock(priv);
+ if (desc <= MLX5_TX_COMP_THRESH) {
+ WARN("%p: number of descriptors requested for TX queue %u"
+ " must be higher than MLX5_TX_COMP_THRESH, using"
+ " %u instead of %u",
+ (void *)dev, idx, MLX5_TX_COMP_THRESH + 1, desc);
+ desc = MLX5_TX_COMP_THRESH + 1;
+ }
if (!rte_is_power_of_2(desc)) {
desc = 1 << log2above(desc);
WARN("%p: increased number of descriptors in TX queue %u"
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:48:03 UTC
Permalink
From: Yaacov Hazan <***@mellanox.com>

Implement the send inline feature, which copies packet data directly into
WQEs to improve latency. The maximum packet size and the minimum number of TX
queues required to qualify for inline send are user-configurable.

This feature is effective when HW causes a performance bottleneck.
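
For illustration only, a minimal sketch (not part of this patch) of how an
application could enable the feature through the device parameters added
below; the PCI address and threshold values are placeholders, assuming the
usual EAL "-w <PCI>,key=value" device-argument syntax:

	#include <rte_eal.h>

	int
	main(void)
	{
		char *args[] = {
			"app",
			"-w", "0000:05:00.0,txq_inline=128,txqs_min_inline=8",
		};

		if (rte_eal_init(3, args) < 0)
			return 1;
		/* ... regular port and TX queue setup follows ... */
		return 0;
	}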

Signed-off-by: Yaacov Hazan <***@mellanox.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Nelio Laranjeiro <***@6wind.com>
---
doc/guides/nics/mlx5.rst | 17 +++
drivers/net/mlx5/mlx5.c | 13 ++
drivers/net/mlx5/mlx5.h | 2 +
drivers/net/mlx5/mlx5_ethdev.c | 5 +
drivers/net/mlx5/mlx5_rxtx.c | 271 +++++++++++++++++++++++++++++++++++++++++
drivers/net/mlx5/mlx5_rxtx.h | 2 +
drivers/net/mlx5/mlx5_txq.c | 4 +
7 files changed, 314 insertions(+)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 756153b..9ada221 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -154,6 +154,23 @@ Run-time configuration
allows to save PCI bandwidth and improve performance at the cost of a
slightly higher CPU usage. Enabled by default.

+- ``txq_inline`` parameter [int]
+
+ Amount of data to be inlined during TX operations. Improves latency.
+ Can improve PPS performance when PCI back pressure is detected and may be
+ useful for scenarios involving heavy traffic on many queues.
+
+ It is not enabled by default (set to 0) since the additional software
+ logic necessary to handle this mode can lower performance when back
+ pressure is not expected.
+
+- ``txqs_min_inline`` parameter [int]
+
+  Enable inline send only when the number of TX queues is greater than or
+  equal to this value.
+
+ This option should be used in combination with ``txq_inline`` above.
+
Prerequisites
-------------

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 9bb08b6..4213286 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -72,6 +72,13 @@
/* Device parameter to enable RX completion queue compression. */
#define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"

+/* Device parameter to configure inline send. */
+#define MLX5_TXQ_INLINE "txq_inline"
+
+/* Device parameter to configure the number of TX queues threshold for
+ * enabling inline send. */
+#define MLX5_TXQS_MIN_INLINE "txqs_min_inline"
+
/**
* Retrieve integer value from environment variable.
*
@@ -269,6 +276,10 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
}
if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0)
priv->cqe_comp = !!tmp;
+ else if (strcmp(MLX5_TXQ_INLINE, key) == 0)
+ priv->txq_inline = tmp;
+ else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0)
+ priv->txqs_inline = tmp;
else {
WARN("%s: unknown parameter", key);
return EINVAL;
@@ -292,6 +303,8 @@ mlx5_args(struct priv *priv, struct rte_devargs *devargs)
{
static const char *params[] = {
MLX5_RXQ_CQE_COMP_EN,
+ MLX5_TXQ_INLINE,
+ MLX5_TXQS_MIN_INLINE,
};
struct rte_kvargs *kvlist;
int ret = 0;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 3344360..c99ef7e 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -114,6 +114,8 @@ struct priv {
unsigned int mps:1; /* Whether multi-packet send is supported. */
unsigned int cqe_comp:1; /* Whether CQE compression is enabled. */
unsigned int pending_alarm:1; /* An alarm is pending. */
+ unsigned int txq_inline; /* Maximum packet size for inlining. */
+ unsigned int txqs_inline; /* Queue number threshold for inlining. */
/* RX/TX queues. */
unsigned int rxqs_n; /* RX queues array size. */
unsigned int txqs_n; /* TX queues array size. */
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index aaa6c16..9dfb3ca 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1318,6 +1318,11 @@ void
priv_select_tx_function(struct priv *priv)
{
priv->dev->tx_pkt_burst = mlx5_tx_burst;
+ if (priv->txq_inline && (priv->txqs_n >= priv->txqs_inline)) {
+ priv->dev->tx_pkt_burst = mlx5_tx_burst_inline;
+ DEBUG("selected inline TX function (%u >= %u queues)",
+ priv->txqs_n, priv->txqs_inline);
+ }
}

/**
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 1495a53..1ccb69d 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -374,6 +374,139 @@ mlx5_wqe_write_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
}

/**
+ * Write an inline WQE.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param wqe
+ * Pointer to the WQE to fill.
+ * @param addr
+ * Buffer data address.
+ * @param length
+ * Packet length.
+ * @param lkey
+ * Memory region lkey.
+ */
+static inline void
+mlx5_wqe_write_inline(struct txq *txq, volatile union mlx5_wqe *wqe,
+ uintptr_t addr, uint32_t length)
+{
+ uint32_t size;
+ uint16_t wqe_cnt = txq->wqe_n - 1;
+ uint16_t wqe_ci = txq->wqe_ci + 1;
+
+ /* Copy the first 16 bytes into inline header. */
+ rte_memcpy((void *)(uintptr_t)wqe->inl.eseg.inline_hdr_start,
+ (void *)(uintptr_t)addr,
+ MLX5_ETH_INLINE_HEADER_SIZE);
+ addr += MLX5_ETH_INLINE_HEADER_SIZE;
+ length -= MLX5_ETH_INLINE_HEADER_SIZE;
+ size = 3 + ((4 + length + 15) / 16);
+ wqe->inl.byte_cnt = htonl(length | MLX5_INLINE_SEG);
+ rte_memcpy((void *)(uintptr_t)&wqe->inl.data[0],
+ (void *)addr, MLX5_WQE64_INL_DATA);
+ addr += MLX5_WQE64_INL_DATA;
+ length -= MLX5_WQE64_INL_DATA;
+ while (length) {
+ volatile union mlx5_wqe *wqe_next =
+ &(*txq->wqes)[wqe_ci & wqe_cnt];
+ uint32_t copy_bytes = (length > sizeof(*wqe)) ?
+ sizeof(*wqe) :
+ length;
+
+ rte_mov64((uint8_t *)(uintptr_t)&wqe_next->data[0],
+ (uint8_t *)addr);
+ addr += copy_bytes;
+ length -= copy_bytes;
+ ++wqe_ci;
+ }
+ assert(size < 64);
+ wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+ wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size);
+ wqe->inl.ctrl.data[3] = 0;
+ wqe->inl.eseg.rsvd0 = 0;
+ wqe->inl.eseg.rsvd1 = 0;
+ wqe->inl.eseg.mss = 0;
+ wqe->inl.eseg.rsvd2 = 0;
+ wqe->inl.eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE);
+ /* Increment consumer index. */
+ txq->wqe_ci = wqe_ci;
+}
+
+/**
+ * Write an inline WQE with VLAN.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param wqe
+ * Pointer to the WQE to fill.
+ * @param addr
+ * Buffer data address.
+ * @param length
+ * Packet length.
+ * @param lkey
+ * Memory region lkey.
+ * @param vlan_tci
+ * VLAN field to insert in packet.
+ */
+static inline void
+mlx5_wqe_write_inline_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
+ uintptr_t addr, uint32_t length, uint16_t vlan_tci)
+{
+ uint32_t size;
+ uint32_t wqe_cnt = txq->wqe_n - 1;
+ uint16_t wqe_ci = txq->wqe_ci + 1;
+ uint32_t vlan = htonl(0x81000000 | vlan_tci);
+
+ /*
+ * Copy 12 bytes of source & destination MAC address.
+ * Copy 4 bytes of VLAN.
+ * Copy 2 bytes of Ether type.
+ */
+ rte_memcpy((uint8_t *)(uintptr_t)wqe->inl.eseg.inline_hdr_start,
+ (uint8_t *)addr, 12);
+ rte_memcpy((uint8_t *)(uintptr_t)wqe->inl.eseg.inline_hdr_start + 12,
+ &vlan, sizeof(vlan));
+ rte_memcpy((uint8_t *)(uintptr_t)wqe->inl.eseg.inline_hdr_start + 16,
+ ((uint8_t *)addr + 12), 2);
+ addr += MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
+ length -= MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
+ size = (sizeof(wqe->inl.ctrl.ctrl) +
+ sizeof(wqe->inl.eseg) +
+ sizeof(wqe->inl.byte_cnt) +
+ length + 15) / 16;
+ wqe->inl.byte_cnt = htonl(length | MLX5_INLINE_SEG);
+ rte_memcpy((void *)(uintptr_t)&wqe->inl.data[0],
+ (void *)addr, MLX5_WQE64_INL_DATA);
+ addr += MLX5_WQE64_INL_DATA;
+ length -= MLX5_WQE64_INL_DATA;
+ while (length) {
+ volatile union mlx5_wqe *wqe_next =
+ &(*txq->wqes)[wqe_ci & wqe_cnt];
+ uint32_t copy_bytes = (length > sizeof(*wqe)) ?
+ sizeof(*wqe) :
+ length;
+
+ rte_mov64((uint8_t *)(uintptr_t)&wqe_next->data[0],
+ (uint8_t *)addr);
+ addr += copy_bytes;
+ length -= copy_bytes;
+ ++wqe_ci;
+ }
+ assert(size < 64);
+ wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+ wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size);
+ wqe->inl.ctrl.data[3] = 0;
+ wqe->inl.eseg.rsvd0 = 0;
+ wqe->inl.eseg.rsvd1 = 0;
+ wqe->inl.eseg.mss = 0;
+ wqe->inl.eseg.rsvd2 = 0;
+ wqe->inl.eseg.inline_hdr_sz = htons(MLX5_ETH_VLAN_INLINE_HEADER_SIZE);
+ /* Increment consumer index. */
+ txq->wqe_ci = wqe_ci;
+}
+
+/**
* Ring TX queue doorbell.
*
* @param txq
@@ -415,6 +548,23 @@ tx_prefetch_cqe(struct txq *txq, uint16_t ci)
}

/**
+ * Prefetch a WQE.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param wqe_ci
+ * WQE consumer index.
+ */
+static inline void
+tx_prefetch_wqe(struct txq *txq, uint16_t ci)
+{
+ volatile union mlx5_wqe *wqe;
+
+ wqe = &(*txq->wqes)[ci & (txq->wqe_n - 1)];
+ rte_prefetch0(wqe);
+}
+
+/**
* DPDK callback for TX.
*
* @param dpdk_txq
@@ -525,6 +675,127 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
}

/**
+ * DPDK callback for TX with inline support.
+ *
+ * @param dpdk_txq
+ * Generic pointer to TX queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ *
+ * @return
+ * Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ struct txq *txq = (struct txq *)dpdk_txq;
+ uint16_t elts_head = txq->elts_head;
+ const unsigned int elts_n = txq->elts_n;
+ unsigned int i;
+ unsigned int max;
+ unsigned int comp;
+ volatile union mlx5_wqe *wqe;
+ struct rte_mbuf *buf;
+ unsigned int max_inline = txq->max_inline;
+
+ if (unlikely(!pkts_n))
+ return 0;
+ buf = pkts[0];
+ /* Prefetch first packet cacheline. */
+ tx_prefetch_cqe(txq, txq->cq_ci);
+ tx_prefetch_cqe(txq, txq->cq_ci + 1);
+ rte_prefetch0(buf);
+ /* Start processing. */
+ txq_complete(txq);
+ max = (elts_n - (elts_head - txq->elts_tail));
+ if (max > elts_n)
+ max -= elts_n;
+ assert(max >= 1);
+ assert(max <= elts_n);
+ /* Always leave one free entry in the ring. */
+ --max;
+ if (max == 0)
+ return 0;
+ if (max > pkts_n)
+ max = pkts_n;
+ for (i = 0; (i != max); ++i) {
+ unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+ uintptr_t addr;
+ uint32_t length;
+ uint32_t lkey;
+
+ wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
+ tx_prefetch_wqe(txq, txq->wqe_ci);
+ tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+ if (i + 1 < max)
+ rte_prefetch0(pkts[i + 1]);
+ /* Should we enable HW CKSUM offload */
+ if (buf->ol_flags &
+ (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
+ wqe->inl.eseg.cs_flags =
+ MLX5_ETH_WQE_L3_CSUM |
+ MLX5_ETH_WQE_L4_CSUM;
+ } else
+ wqe->inl.eseg.cs_flags = 0;
+ /* Retrieve buffer information. */
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ length = DATA_LEN(buf);
+ /* Update element. */
+ (*txq->elts)[elts_head] = buf;
+ /* Prefetch next buffer data. */
+ if (i + 1 < max)
+ rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 1],
+ volatile void *));
+ if (length <= max_inline) {
+ if (buf->ol_flags & PKT_TX_VLAN_PKT)
+ mlx5_wqe_write_inline_vlan(txq, wqe,
+ addr, length,
+ buf->vlan_tci);
+ else
+ mlx5_wqe_write_inline(txq, wqe, addr, length);
+ } else {
+ /* Retrieve Memory Region key for this memory pool. */
+ lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+ if (buf->ol_flags & PKT_TX_VLAN_PKT)
+ mlx5_wqe_write_vlan(txq, wqe, addr, length,
+ lkey, buf->vlan_tci);
+ else
+ mlx5_wqe_write(txq, wqe, addr, length, lkey);
+ }
+ wqe->inl.ctrl.data[2] = 0;
+ elts_head = elts_head_next;
+ buf = pkts[i + 1];
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment sent bytes counter. */
+ txq->stats.obytes += length;
+#endif
+ }
+ /* Take a shortcut if nothing must be sent. */
+ if (unlikely(i == 0))
+ return 0;
+ /* Check whether completion threshold has been reached. */
+ comp = txq->elts_comp + i;
+ if (comp >= MLX5_TX_COMP_THRESH) {
+ /* Request completion on last WQE. */
+ wqe->inl.ctrl.data[2] = htonl(8);
+ /* Save elts_head in unused "immediate" field of WQE. */
+ wqe->inl.ctrl.data[3] = elts_head;
+ txq->elts_comp = 0;
+ } else
+ txq->elts_comp = comp;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment sent packets counter. */
+ txq->stats.opackets += i;
+#endif
+ /* Ring QP doorbell. */
+ mlx5_tx_dbrec(txq);
+ txq->elts_head = elts_head;
+ return i;
+}
+
+/**
* Translate RX completion flags to packet type.
*
* @param[in] cqe
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 0de6bd5..10462ac 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -246,6 +246,7 @@ struct txq {
uint16_t wqe_n; /* Number of WQ elements. */
uint16_t bf_offset; /* Blueflame offset. */
uint16_t bf_buf_size; /* Blueflame size. */
+ uint16_t max_inline; /* Maximum size to inline in a WQE. */
uint32_t qp_num_8s; /* QP number shifted by 8. */
volatile struct mlx5_cqe64 (*cqes)[]; /* Completion queue. */
volatile union mlx5_wqe (*wqes)[]; /* Work queue. */
@@ -310,6 +311,7 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
/* mlx5_rxtx.c */

uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
+uint16_t mlx5_tx_burst_inline(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 4b8b3e0..e1f7280 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -323,6 +323,10 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
IBV_EXP_QP_INIT_ATTR_RES_DOMAIN),
};
+ if (priv->txq_inline && priv->txqs_n >= priv->txqs_inline) {
+ tmpl.txq.max_inline = priv->txq_inline;
+ attr.init.cap.max_inline_data = tmpl.txq.max_inline;
+ }
tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
if (tmpl.qp == NULL) {
ret = (errno ? errno : EINVAL);
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:48:04 UTC
Permalink
This feature enables the TX burst function to emit up to five packets using
only two WQEs on devices that support it, which saves PCI bandwidth and
improves performance.
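
As a rough, hypothetical illustration of the saving (not part of the patch,
assuming MLX5_MPW_DSEG_MAX is 5 and that a full MPW session closes in two
WQEs while a session of fewer than three packets fits in one, as in the code
below):

	#include <stdio.h>

	#define MPW_DSEG_MAX 5 /* packets carried by one MPW session */

	int
	main(void)
	{
		unsigned int pkts = 32;  /* same-sized packets in a burst */
		unsigned int full = pkts / MPW_DSEG_MAX;
		unsigned int rem = pkts % MPW_DSEG_MAX;
		/* Full sessions take 2 WQEs; a leftover session takes 1 WQE
		 * below 3 packets, 2 otherwise. */
		unsigned int wqes = full * 2 + (rem ? (rem < 3 ? 1 : 2) : 0);

		printf("%u packets: %u WQEs with MPW vs %u without\n",
		       pkts, wqes, pkts);
		return 0;
	}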

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Olga Shern <***@mellanox.com>
---
doc/guides/nics/mlx5.rst | 10 ++
drivers/net/mlx5/mlx5.c | 14 +-
drivers/net/mlx5/mlx5_ethdev.c | 15 +-
drivers/net/mlx5/mlx5_rxtx.c | 400 +++++++++++++++++++++++++++++++++++++++++
drivers/net/mlx5/mlx5_rxtx.h | 2 +
drivers/net/mlx5/mlx5_txq.c | 2 +-
6 files changed, 439 insertions(+), 4 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 9ada221..063c4a5 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -171,6 +171,16 @@ Run-time configuration

This option should be used in combination with ``txq_inline`` above.

+- ``txq_mpw_en`` parameter [int]
+
+ A nonzero value enables multi-packet send. This feature allows the TX
+ burst function to pack up to five packets in two descriptors in order to
+ save PCI bandwidth and improve performance at the cost of a slightly
+ higher CPU usage.
+
+ It is currently only supported on the ConnectX-4 Lx family of adapters.
+ Enabled by default.
+
Prerequisites
-------------

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 4213286..411486d 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -79,6 +79,9 @@
* enabling inline send. */
#define MLX5_TXQS_MIN_INLINE "txqs_min_inline"

+/* Device parameter to enable multi-packet send WQEs. */
+#define MLX5_TXQ_MPW_EN "txq_mpw_en"
+
/**
* Retrieve integer value from environment variable.
*
@@ -280,6 +283,8 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
priv->txq_inline = tmp;
else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0)
priv->txqs_inline = tmp;
+ else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0)
+ priv->mps = !!tmp;
else {
WARN("%s: unknown parameter", key);
return EINVAL;
@@ -305,6 +310,7 @@ mlx5_args(struct priv *priv, struct rte_devargs *devargs)
MLX5_RXQ_CQE_COMP_EN,
MLX5_TXQ_INLINE,
MLX5_TXQS_MIN_INLINE,
+ MLX5_TXQ_MPW_EN,
};
struct rte_kvargs *kvlist;
int ret = 0;
@@ -499,6 +505,7 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
priv->port = port;
priv->pd = pd;
priv->mtu = ETHER_MTU;
+ priv->mps = mps; /* Enable MPW by default if supported. */
priv->cqe_comp = 1; /* Enable compression by default. */
err = mlx5_args(priv, pci_dev->devargs);
if (err) {
@@ -547,7 +554,12 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)

priv_get_num_vfs(priv, &num_vfs);
priv->sriov = (num_vfs || sriov);
- priv->mps = mps;
+ if (priv->mps && !mps) {
+ ERROR("multi-packet send not supported on this device"
+ " (" MLX5_TXQ_MPW_EN ")");
+ err = ENOTSUP;
+ goto port_error;
+ }
/* Allocate and register default RSS hash keys. */
priv->rss_conf = rte_calloc(__func__, hash_rxq_init_n,
sizeof((*priv->rss_conf)[0]), 0);
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 9dfb3ca..1767fe4 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -585,7 +585,8 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
DEV_RX_OFFLOAD_UDP_CKSUM |
DEV_RX_OFFLOAD_TCP_CKSUM) :
0);
- info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT;
+ if (!priv->mps)
+ info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT;
if (priv->hw_csum)
info->tx_offload_capa |=
(DEV_TX_OFFLOAD_IPV4_CKSUM |
@@ -1318,7 +1319,17 @@ void
priv_select_tx_function(struct priv *priv)
{
priv->dev->tx_pkt_burst = mlx5_tx_burst;
- if (priv->txq_inline && (priv->txqs_n >= priv->txqs_inline)) {
+ /* Display warning for unsupported configurations. */
+ if (priv->sriov && priv->mps)
+ WARN("multi-packet send WQE cannot be used on a SR-IOV setup");
+ /* Select appropriate TX function. */
+ if ((priv->sriov == 0) && priv->mps && priv->txq_inline) {
+ priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw_inline;
+ DEBUG("selected MPW inline TX function");
+ } else if ((priv->sriov == 0) && priv->mps) {
+ priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw;
+ DEBUG("selected MPW TX function");
+ } else if (priv->txq_inline && (priv->txqs_n >= priv->txqs_inline)) {
priv->dev->tx_pkt_burst = mlx5_tx_burst_inline;
DEBUG("selected inline TX function (%u >= %u queues)",
priv->txqs_n, priv->txqs_inline);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 1ccb69d..b6ee47b 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -796,6 +796,406 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
}

/**
+ * Open a MPW session.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param mpw
+ * Pointer to MPW session structure.
+ * @param length
+ * Packet length.
+ */
+static inline void
+mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
+{
+ uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
+ volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
+ (volatile struct mlx5_wqe_data_seg (*)[])
+ (uintptr_t)&(*txq->wqes)[(idx + 1) & (txq->wqe_n - 1)];
+
+ mpw->state = MLX5_MPW_STATE_OPENED;
+ mpw->pkts_n = 0;
+ mpw->len = length;
+ mpw->total_len = 0;
+ mpw->wqe = &(*txq->wqes)[idx];
+ mpw->wqe->mpw.eseg.mss = htons(length);
+ mpw->wqe->mpw.eseg.inline_hdr_sz = 0;
+ mpw->wqe->mpw.eseg.rsvd0 = 0;
+ mpw->wqe->mpw.eseg.rsvd1 = 0;
+ mpw->wqe->mpw.eseg.rsvd2 = 0;
+ mpw->wqe->mpw.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
+ (txq->wqe_ci << 8) |
+ MLX5_OPCODE_LSO_MPW);
+ mpw->wqe->mpw.ctrl.data[2] = 0;
+ mpw->wqe->mpw.ctrl.data[3] = 0;
+ mpw->data.dseg[0] = &mpw->wqe->mpw.dseg[0];
+ mpw->data.dseg[1] = &mpw->wqe->mpw.dseg[1];
+ mpw->data.dseg[2] = &(*dseg)[0];
+ mpw->data.dseg[3] = &(*dseg)[1];
+ mpw->data.dseg[4] = &(*dseg)[2];
+}
+
+/**
+ * Close a MPW session.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param mpw
+ * Pointer to MPW session structure.
+ */
+static inline void
+mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
+{
+ unsigned int num = mpw->pkts_n;
+
+ /* Store size in multiple of 16 bytes. Control and Ethernet segments
+ * count as 2. */
+ mpw->wqe->mpw.ctrl.data[1] = htonl(txq->qp_num_8s | (2 + num));
+ mpw->state = MLX5_MPW_STATE_CLOSED;
+ if (num < 3)
+ ++txq->wqe_ci;
+ else
+ txq->wqe_ci += 2;
+ tx_prefetch_wqe(txq, txq->wqe_ci);
+ tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+}
+
+/**
+ * DPDK callback for TX with MPW support.
+ *
+ * @param dpdk_txq
+ * Generic pointer to TX queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ *
+ * @return
+ * Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ struct txq *txq = (struct txq *)dpdk_txq;
+ uint16_t elts_head = txq->elts_head;
+ const unsigned int elts_n = txq->elts_n;
+ unsigned int i;
+ unsigned int max;
+ unsigned int comp;
+ struct mlx5_mpw mpw = {
+ .state = MLX5_MPW_STATE_CLOSED,
+ };
+
+ /* Prefetch first packet cacheline. */
+ tx_prefetch_cqe(txq, txq->cq_ci);
+ tx_prefetch_wqe(txq, txq->wqe_ci);
+ tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+ /* Start processing. */
+ txq_complete(txq);
+ max = (elts_n - (elts_head - txq->elts_tail));
+ if (max > elts_n)
+ max -= elts_n;
+ assert(max >= 1);
+ assert(max <= elts_n);
+ /* Always leave one free entry in the ring. */
+ --max;
+ if (max == 0)
+ return 0;
+ if (max > pkts_n)
+ max = pkts_n;
+ for (i = 0; (i != max); ++i) {
+ struct rte_mbuf *buf = pkts[i];
+ volatile struct mlx5_wqe_data_seg *dseg;
+ unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+ uintptr_t addr;
+ uint32_t length;
+ uint32_t cs_flags = 0;
+
+ /* Should we enable HW CKSUM offload */
+ if (buf->ol_flags &
+ (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
+ cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
+ /* Retrieve buffer information. */
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ length = DATA_LEN(buf);
+ /* Update element. */
+ (*txq->elts)[elts_head] = buf;
+ /* Start new session if packet differs. */
+ if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
+ ((mpw.len != length) ||
+ (mpw.wqe->mpw.eseg.cs_flags != cs_flags)))
+ mlx5_mpw_close(txq, &mpw);
+ if (mpw.state == MLX5_MPW_STATE_CLOSED) {
+ mlx5_mpw_new(txq, &mpw, length);
+ mpw.wqe->mpw.eseg.cs_flags = cs_flags;
+ }
+ dseg = mpw.data.dseg[mpw.pkts_n];
+ *dseg = (struct mlx5_wqe_data_seg){
+ .byte_count = htonl(length),
+ .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+ .addr = htonll(addr),
+ };
+ ++mpw.pkts_n;
+ if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
+ mlx5_mpw_close(txq, &mpw);
+ elts_head = elts_head_next;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment sent bytes counter. */
+ txq->stats.obytes += length;
+#endif
+ }
+ /* Take a shortcut if nothing must be sent. */
+ if (unlikely(i == 0))
+ return 0;
+ /* Check whether completion threshold has been reached. */
+ comp = txq->elts_comp + i;
+ if (comp >= MLX5_TX_COMP_THRESH) {
+ volatile union mlx5_wqe *wqe = mpw.wqe;
+
+ /* Request completion on last WQE. */
+ wqe->mpw.ctrl.data[2] = htonl(8);
+ /* Save elts_head in unused "immediate" field of WQE. */
+ wqe->mpw.ctrl.data[3] = elts_head;
+ txq->elts_comp = 0;
+ } else
+ txq->elts_comp = comp;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment sent packets counter. */
+ txq->stats.opackets += i;
+#endif
+ /* Ring QP doorbell. */
+ if (mpw.state == MLX5_MPW_STATE_OPENED)
+ mlx5_mpw_close(txq, &mpw);
+ mlx5_tx_dbrec(txq);
+ txq->elts_head = elts_head;
+ return i;
+}
+
+/**
+ * Open a MPW inline session.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param mpw
+ * Pointer to MPW session structure.
+ * @param length
+ * Packet length.
+ */
+static inline void
+mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
+{
+ uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
+
+ mpw->state = MLX5_MPW_INL_STATE_OPENED;
+ mpw->pkts_n = 0;
+ mpw->len = length;
+ mpw->total_len = 0;
+ mpw->wqe = &(*txq->wqes)[idx];
+ mpw->wqe->mpw_inl.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
+ (txq->wqe_ci << 8) |
+ MLX5_OPCODE_LSO_MPW);
+ mpw->wqe->mpw_inl.ctrl.data[2] = 0;
+ mpw->wqe->mpw_inl.ctrl.data[3] = 0;
+ mpw->wqe->mpw_inl.eseg.mss = htons(length);
+ mpw->wqe->mpw_inl.eseg.inline_hdr_sz = 0;
+ mpw->wqe->mpw_inl.eseg.cs_flags = 0;
+ mpw->wqe->mpw_inl.eseg.rsvd0 = 0;
+ mpw->wqe->mpw_inl.eseg.rsvd1 = 0;
+ mpw->wqe->mpw_inl.eseg.rsvd2 = 0;
+ mpw->data.raw = &mpw->wqe->mpw_inl.data[0];
+}
+
+/**
+ * Close a MPW inline session.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param mpw
+ * Pointer to MPW session structure.
+ */
+static inline void
+mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw)
+{
+ unsigned int size;
+
+ size = sizeof(*mpw->wqe) - MLX5_MWQE64_INL_DATA + mpw->total_len;
+ /* Store size in multiple of 16 bytes. Control and Ethernet segments
+ * count as 2. */
+ mpw->wqe->mpw_inl.ctrl.data[1] =
+ htonl(txq->qp_num_8s | ((size + 15) / 16));
+ mpw->state = MLX5_MPW_STATE_CLOSED;
+ mpw->wqe->mpw_inl.byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
+ txq->wqe_ci += (size + (sizeof(*mpw->wqe) - 1)) / sizeof(*mpw->wqe);
+}
+
+/**
+ * DPDK callback for TX with MPW inline support.
+ *
+ * @param dpdk_txq
+ * Generic pointer to TX queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ *
+ * @return
+ * Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
+ uint16_t pkts_n)
+{
+ struct txq *txq = (struct txq *)dpdk_txq;
+ uint16_t elts_head = txq->elts_head;
+ const unsigned int elts_n = txq->elts_n;
+ unsigned int i;
+ unsigned int max;
+ unsigned int comp;
+ unsigned int inline_room = txq->max_inline;
+ struct mlx5_mpw mpw = {
+ .state = MLX5_MPW_STATE_CLOSED,
+ };
+
+ /* Prefetch first packet cacheline. */
+ tx_prefetch_cqe(txq, txq->cq_ci);
+ tx_prefetch_wqe(txq, txq->wqe_ci);
+ tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+ /* Start processing. */
+ txq_complete(txq);
+ max = (elts_n - (elts_head - txq->elts_tail));
+ if (max > elts_n)
+ max -= elts_n;
+ assert(max >= 1);
+ assert(max <= elts_n);
+ /* Always leave one free entry in the ring. */
+ --max;
+ if (max == 0)
+ return 0;
+ if (max > pkts_n)
+ max = pkts_n;
+ for (i = 0; (i != max); ++i) {
+ struct rte_mbuf *buf = pkts[i];
+ unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+ uintptr_t addr;
+ uint32_t length;
+ uint32_t cs_flags = 0;
+
+ /* Should we enable HW CKSUM offload */
+ if (buf->ol_flags &
+ (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
+ cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
+ /* Retrieve buffer information. */
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ length = DATA_LEN(buf);
+ /* Update element. */
+ (*txq->elts)[elts_head] = buf;
+ /* Start new session if packet differs. */
+ if (mpw.state == MLX5_MPW_STATE_OPENED) {
+ if ((mpw.len != length) ||
+ (mpw.wqe->mpw.eseg.cs_flags != cs_flags))
+ mlx5_mpw_close(txq, &mpw);
+ } else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
+ if ((mpw.len != length) ||
+ (length > inline_room) ||
+ (mpw.wqe->mpw_inl.eseg.cs_flags != cs_flags)) {
+ mlx5_mpw_inline_close(txq, &mpw);
+ inline_room = txq->max_inline;
+ }
+ }
+ if (mpw.state == MLX5_MPW_STATE_CLOSED) {
+ if (length > inline_room) {
+ mlx5_mpw_new(txq, &mpw, length);
+ mpw.wqe->mpw.eseg.cs_flags = cs_flags;
+ } else {
+ mlx5_mpw_inline_new(txq, &mpw, length);
+ mpw.wqe->mpw_inl.eseg.cs_flags = cs_flags;
+ }
+ }
+ if (mpw.state == MLX5_MPW_STATE_OPENED) {
+ volatile struct mlx5_wqe_data_seg *dseg;
+
+ assert(inline_room == txq->max_inline);
+ dseg = mpw.data.dseg[mpw.pkts_n];
+ *dseg = (struct mlx5_wqe_data_seg){
+ .byte_count = htonl(length),
+ .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+ .addr = htonll(addr),
+ };
+ ++mpw.pkts_n;
+ if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
+ mlx5_mpw_close(txq, &mpw);
+ } else {
+ unsigned int max;
+
+ assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
+ assert(length <= inline_room);
+ /* Maximum number of bytes before wrapping. */
+ max = ((uintptr_t)&(*txq->wqes)[txq->wqe_n] -
+ (uintptr_t)mpw.data.raw);
+ if (length > max) {
+ rte_memcpy((void *)(uintptr_t)mpw.data.raw,
+ (void *)addr,
+ max);
+ mpw.data.raw =
+ (volatile void *)&(*txq->wqes)[0];
+ rte_memcpy((void *)(uintptr_t)mpw.data.raw,
+ (void *)(addr + max),
+ length - max);
+ mpw.data.raw += length - max;
+ } else {
+ rte_memcpy((void *)(uintptr_t)mpw.data.raw,
+ (void *)addr,
+ length);
+ mpw.data.raw += length;
+ }
+ if ((uintptr_t)mpw.data.raw ==
+ (uintptr_t)&(*txq->wqes)[txq->wqe_n])
+ mpw.data.raw =
+ (volatile void *)&(*txq->wqes)[0];
+ ++mpw.pkts_n;
+ if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
+ mlx5_mpw_inline_close(txq, &mpw);
+ inline_room = txq->max_inline;
+ } else
+ inline_room -= length;
+ }
+ mpw.total_len += length;
+ elts_head = elts_head_next;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment sent bytes counter. */
+ txq->stats.obytes += length;
+#endif
+ }
+ /* Take a shortcut if nothing must be sent. */
+ if (unlikely(i == 0))
+ return 0;
+ /* Check whether completion threshold has been reached. */
+ comp = txq->elts_comp + i;
+ if (comp >= MLX5_TX_COMP_THRESH) {
+ volatile union mlx5_wqe *wqe = mpw.wqe;
+
+ /* Request completion on last WQE. */
+ wqe->mpw_inl.ctrl.data[2] = htonl(8);
+ /* Save elts_head in unused "immediate" field of WQE. */
+ wqe->mpw_inl.ctrl.data[3] = elts_head;
+ txq->elts_comp = 0;
+ } else
+ txq->elts_comp = comp;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment sent packets counter. */
+ txq->stats.opackets += i;
+#endif
+ /* Ring QP doorbell. */
+ if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
+ mlx5_mpw_inline_close(txq, &mpw);
+ else if (mpw.state == MLX5_MPW_STATE_OPENED)
+ mlx5_mpw_close(txq, &mpw);
+ mlx5_tx_dbrec(txq);
+ txq->elts_head = elts_head;
+ return i;
+}
+
+/**
* Translate RX completion flags to packet type.
*
* @param[in] cqe
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 10462ac..e3ad596 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -312,6 +312,8 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);

uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_tx_burst_inline(void *, struct rte_mbuf **, uint16_t);
+uint16_t mlx5_tx_burst_mpw(void *, struct rte_mbuf **, uint16_t);
+uint16_t mlx5_tx_burst_mpw_inline(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index e1f7280..15c8f73 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -384,7 +384,7 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
.obj = tmpl.qp,
/* Enable multi-packet send if supported. */
.family_flags =
- (priv->mps ?
+ ((priv->mps && !priv->sriov) ?
IBV_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR :
0),
};
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:48:05 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_txq.c | 5 +++++
1 file changed, 5 insertions(+)

diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 15c8f73..d013230 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -334,6 +334,11 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
(void *)dev, strerror(ret));
goto error;
}
+ DEBUG("TX queue capabilities: max_send_wr=%u, max_send_sge=%u,"
+ " max_inline_data=%u",
+ attr.init.cap.max_send_wr,
+ attr.init.cap.max_send_sge,
+ attr.init.cap.max_inline_data);
attr.mod = (struct ibv_exp_qp_attr){
/* Move the QP to this state. */
.qp_state = IBV_QPS_INIT,
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:48:06 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

The space necessary to store segmented packets cannot be known in advance
and must be verified for each of them.
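
A stand-alone sketch of the per-packet room check this patch introduces
(hypothetical helper, not from the patch): instead of clamping the burst size
up front, the remaining space is verified for every packet, so that later
patches can account for a variable number of segments.

	#include <stdint.h>

	static uint16_t
	burst_room_check(uint16_t free_entries, uint16_t pkts_n)
	{
		uint16_t i = 0;

		while (pkts_n--) {
			/* Always keep one ring entry unused. */
			if (free_entries < 1 + 1)
				break;
			--free_entries;
			++i;
		}
		return i; /* packets that fit in the ring */
	}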

Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_rxtx.c | 136 ++++++++++++++++++++++---------------------
1 file changed, 70 insertions(+), 66 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index b6ee47b..1478b2d 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -583,50 +583,49 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
struct txq *txq = (struct txq *)dpdk_txq;
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
- unsigned int i;
+ unsigned int i = 0;
unsigned int max;
unsigned int comp;
volatile union mlx5_wqe *wqe;
- struct rte_mbuf *buf;

if (unlikely(!pkts_n))
return 0;
- buf = pkts[0];
/* Prefetch first packet cacheline. */
tx_prefetch_cqe(txq, txq->cq_ci);
tx_prefetch_cqe(txq, txq->cq_ci + 1);
- rte_prefetch0(buf);
+ rte_prefetch0(*pkts);
/* Start processing. */
txq_complete(txq);
max = (elts_n - (elts_head - txq->elts_tail));
if (max > elts_n)
max -= elts_n;
- assert(max >= 1);
- assert(max <= elts_n);
- /* Always leave one free entry in the ring. */
- --max;
- if (max == 0)
- return 0;
- if (max > pkts_n)
- max = pkts_n;
- for (i = 0; (i != max); ++i) {
- unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+ do {
+ struct rte_mbuf *buf;
+ unsigned int elts_head_next;
uintptr_t addr;
uint32_t length;
uint32_t lkey;

+ /* Make sure there is enough room to store this packet and
+ * that one ring entry remains unused. */
+ if (max < 1 + 1)
+ break;
+ --max;
+ --pkts_n;
+ buf = *(pkts++);
+ elts_head_next = (elts_head + 1) & (elts_n - 1);
wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
rte_prefetch0(wqe);
- if (i + 1 < max)
- rte_prefetch0(pkts[i + 1]);
+ if (pkts_n)
+ rte_prefetch0(*pkts);
/* Retrieve buffer information. */
addr = rte_pktmbuf_mtod(buf, uintptr_t);
length = DATA_LEN(buf);
/* Update element. */
(*txq->elts)[elts_head] = buf;
/* Prefetch next buffer data. */
- if (i + 1 < max)
- rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 1],
+ if (pkts_n)
+ rte_prefetch0(rte_pktmbuf_mtod(*pkts,
volatile void *));
/* Retrieve Memory Region key for this memory pool. */
lkey = txq_mp2mr(txq, txq_mb2mp(buf));
@@ -649,8 +648,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
txq->stats.obytes += length;
#endif
elts_head = elts_head_next;
- buf = pkts[i + 1];
- }
+ ++i;
+ } while (pkts_n);
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
@@ -693,44 +692,43 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
struct txq *txq = (struct txq *)dpdk_txq;
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
- unsigned int i;
+ unsigned int i = 0;
unsigned int max;
unsigned int comp;
volatile union mlx5_wqe *wqe;
- struct rte_mbuf *buf;
unsigned int max_inline = txq->max_inline;

if (unlikely(!pkts_n))
return 0;
- buf = pkts[0];
/* Prefetch first packet cacheline. */
tx_prefetch_cqe(txq, txq->cq_ci);
tx_prefetch_cqe(txq, txq->cq_ci + 1);
- rte_prefetch0(buf);
+ rte_prefetch0(*pkts);
/* Start processing. */
txq_complete(txq);
max = (elts_n - (elts_head - txq->elts_tail));
if (max > elts_n)
max -= elts_n;
- assert(max >= 1);
- assert(max <= elts_n);
- /* Always leave one free entry in the ring. */
- --max;
- if (max == 0)
- return 0;
- if (max > pkts_n)
- max = pkts_n;
- for (i = 0; (i != max); ++i) {
- unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+ do {
+ struct rte_mbuf *buf;
+ unsigned int elts_head_next;
uintptr_t addr;
uint32_t length;
uint32_t lkey;

+ /* Make sure there is enough room to store this packet and
+ * that one ring entry remains unused. */
+ if (max < 1 + 1)
+ break;
+ --max;
+ --pkts_n;
+ buf = *(pkts++);
+ elts_head_next = (elts_head + 1) & (elts_n - 1);
wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
tx_prefetch_wqe(txq, txq->wqe_ci);
tx_prefetch_wqe(txq, txq->wqe_ci + 1);
- if (i + 1 < max)
- rte_prefetch0(pkts[i + 1]);
+ if (pkts_n)
+ rte_prefetch0(*pkts);
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
@@ -745,8 +743,8 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
/* Update element. */
(*txq->elts)[elts_head] = buf;
/* Prefetch next buffer data. */
- if (i + 1 < max)
- rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 1],
+ if (pkts_n)
+ rte_prefetch0(rte_pktmbuf_mtod(*pkts,
volatile void *));
if (length <= max_inline) {
if (buf->ol_flags & PKT_TX_VLAN_PKT)
@@ -766,12 +764,12 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
}
wqe->inl.ctrl.data[2] = 0;
elts_head = elts_head_next;
- buf = pkts[i + 1];
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent bytes counter. */
txq->stats.obytes += length;
#endif
- }
+ ++i;
+ } while (pkts_n);
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
@@ -879,13 +877,15 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
struct txq *txq = (struct txq *)dpdk_txq;
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
- unsigned int i;
+ unsigned int i = 0;
unsigned int max;
unsigned int comp;
struct mlx5_mpw mpw = {
.state = MLX5_MPW_STATE_CLOSED,
};

+ if (unlikely(!pkts_n))
+ return 0;
/* Prefetch first packet cacheline. */
tx_prefetch_cqe(txq, txq->cq_ci);
tx_prefetch_wqe(txq, txq->wqe_ci);
@@ -895,22 +895,22 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
max = (elts_n - (elts_head - txq->elts_tail));
if (max > elts_n)
max -= elts_n;
- assert(max >= 1);
- assert(max <= elts_n);
- /* Always leave one free entry in the ring. */
- --max;
- if (max == 0)
- return 0;
- if (max > pkts_n)
- max = pkts_n;
- for (i = 0; (i != max); ++i) {
- struct rte_mbuf *buf = pkts[i];
+ do {
+ struct rte_mbuf *buf;
volatile struct mlx5_wqe_data_seg *dseg;
- unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+ unsigned int elts_head_next;
uintptr_t addr;
uint32_t length;
uint32_t cs_flags = 0;

+ /* Make sure there is enough room to store this packet and
+ * that one ring entry remains unused. */
+ if (max < 1 + 1)
+ break;
+ --max;
+ --pkts_n;
+ buf = *(pkts++);
+ elts_head_next = (elts_head + 1) & (elts_n - 1);
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
@@ -943,7 +943,8 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
/* Increment sent bytes counter. */
txq->stats.obytes += length;
#endif
- }
+ ++i;
+ } while (pkts_n);
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
@@ -1048,7 +1049,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
struct txq *txq = (struct txq *)dpdk_txq;
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
- unsigned int i;
+ unsigned int i = 0;
unsigned int max;
unsigned int comp;
unsigned int inline_room = txq->max_inline;
@@ -1056,6 +1057,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
.state = MLX5_MPW_STATE_CLOSED,
};

+ if (unlikely(!pkts_n))
+ return 0;
/* Prefetch first packet cacheline. */
tx_prefetch_cqe(txq, txq->cq_ci);
tx_prefetch_wqe(txq, txq->wqe_ci);
@@ -1065,21 +1068,21 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
max = (elts_n - (elts_head - txq->elts_tail));
if (max > elts_n)
max -= elts_n;
- assert(max >= 1);
- assert(max <= elts_n);
- /* Always leave one free entry in the ring. */
- --max;
- if (max == 0)
- return 0;
- if (max > pkts_n)
- max = pkts_n;
- for (i = 0; (i != max); ++i) {
- struct rte_mbuf *buf = pkts[i];
- unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+ do {
+ struct rte_mbuf *buf;
+ unsigned int elts_head_next;
uintptr_t addr;
uint32_t length;
uint32_t cs_flags = 0;

+ /* Make sure there is enough room to store this packet and
+ * that one ring entry remains unused. */
+ if (max < 1 + 1)
+ break;
+ --max;
+ --pkts_n;
+ buf = *(pkts++);
+ elts_head_next = (elts_head + 1) & (elts_n - 1);
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
@@ -1165,7 +1168,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
/* Increment sent bytes counter. */
txq->stats.obytes += length;
#endif
- }
+ ++i;
+ } while (pkts_n);
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:48:07 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

Compared to its previous incarnation, the software limit on the number of
mbuf segments (previously MLX5_PMD_SGE_WR_N, set to 4 by default) is gone,
so the linearization code and the related buffers that permanently consumed
a non-negligible amount of memory to handle oversized mbufs are no longer
needed.

The resulting code is both lighter and faster.
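
A worked sketch of the descriptor accounting behind the refactored gather
path (sizes are assumptions matching this patch: 64-byte WQEs split into
16-byte units, with the control and Ethernet segments plus the first data
segment filling the first WQE, and every additional mbuf segment adding one
more unit that spills onto the next WQE when the current one is full):

	#include <stdio.h>

	int
	main(void)
	{
		unsigned int segs_n = 3;                  /* mbuf segments */
		unsigned int ds = 64 / 16 + (segs_n - 1); /* 16-byte units */
		unsigned int wqes = (ds + 3) / 4;         /* WQEs, rounded up */

		printf("%u segments -> %u units -> %u WQEs\n",
		       segs_n, ds, wqes);
		return 0;
	}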

Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Nelio Laranjeiro <***@6wind.com>
---
drivers/net/mlx5/mlx5_rxtx.c | 231 +++++++++++++++++++++++++++++++++----------
drivers/net/mlx5/mlx5_txq.c | 6 +-
2 files changed, 182 insertions(+), 55 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 1478b2d..53d2a57 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -301,6 +301,7 @@ mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
{
wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
+ wqe->wqe.ctrl.data[2] = 0;
wqe->wqe.ctrl.data[3] = 0;
wqe->inl.eseg.rsvd0 = 0;
wqe->inl.eseg.rsvd1 = 0;
@@ -346,6 +347,7 @@ mlx5_wqe_write_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,

wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
+ wqe->wqe.ctrl.data[2] = 0;
wqe->wqe.ctrl.data[3] = 0;
wqe->inl.eseg.rsvd0 = 0;
wqe->inl.eseg.rsvd1 = 0;
@@ -423,6 +425,7 @@ mlx5_wqe_write_inline(struct txq *txq, volatile union mlx5_wqe *wqe,
assert(size < 64);
wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size);
+ wqe->inl.ctrl.data[2] = 0;
wqe->inl.ctrl.data[3] = 0;
wqe->inl.eseg.rsvd0 = 0;
wqe->inl.eseg.rsvd1 = 0;
@@ -496,6 +499,7 @@ mlx5_wqe_write_inline_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
assert(size < 64);
wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size);
+ wqe->inl.ctrl.data[2] = 0;
wqe->inl.ctrl.data[3] = 0;
wqe->inl.eseg.rsvd0 = 0;
wqe->inl.eseg.rsvd1 = 0;
@@ -584,6 +588,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
unsigned int i = 0;
+ unsigned int j = 0;
unsigned int max;
unsigned int comp;
volatile union mlx5_wqe *wqe;
@@ -600,21 +605,25 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (max > elts_n)
max -= elts_n;
do {
- struct rte_mbuf *buf;
+ struct rte_mbuf *buf = *(pkts++);
unsigned int elts_head_next;
uintptr_t addr;
uint32_t length;
uint32_t lkey;
+ unsigned int segs_n = buf->nb_segs;
+ volatile struct mlx5_wqe_data_seg *dseg;
+ unsigned int ds = sizeof(*wqe) / 16;

/* Make sure there is enough room to store this packet and
* that one ring entry remains unused. */
- if (max < 1 + 1)
+ assert(segs_n);
+ if (max < segs_n + 1)
break;
- --max;
+ max -= segs_n;
--pkts_n;
- buf = *(pkts++);
elts_head_next = (elts_head + 1) & (elts_n - 1);
wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
+ dseg = &wqe->wqe.dseg;
rte_prefetch0(wqe);
if (pkts_n)
rte_prefetch0(*pkts);
@@ -634,7 +643,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
buf->vlan_tci);
else
mlx5_wqe_write(txq, wqe, addr, length, lkey);
- wqe->wqe.ctrl.data[2] = 0;
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
@@ -643,6 +651,35 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
MLX5_ETH_WQE_L4_CSUM;
} else
wqe->wqe.eseg.cs_flags = 0;
+ while (--segs_n) {
+ /* Spill on next WQE when the current one does not have
+			 * enough room left. Size of WQE must be a multiple
+ * of data segment size. */
+ assert(!(sizeof(*wqe) % sizeof(*dseg)));
+ if (!(ds % (sizeof(*wqe) / 16)))
+ dseg = (volatile void *)
+ &(*txq->wqes)[txq->wqe_ci++ &
+ (txq->wqe_n - 1)];
+ else
+ ++dseg;
+ ++ds;
+ buf = buf->next;
+ assert(buf);
+ /* Store segment information. */
+ dseg->byte_count = htonl(DATA_LEN(buf));
+ dseg->lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+ dseg->addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
+ (*txq->elts)[elts_head_next] = buf;
+ elts_head_next = (elts_head_next + 1) & (elts_n - 1);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ length += DATA_LEN(buf);
+#endif
+ ++j;
+ }
+ /* Update DS field in WQE. */
+ wqe->wqe.ctrl.data[1] &= htonl(0xffffffc0);
+ wqe->wqe.ctrl.data[1] |= htonl(ds & 0x3f);
+ elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent bytes counter. */
txq->stats.obytes += length;
@@ -654,7 +691,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (unlikely(i == 0))
return 0;
/* Check whether completion threshold has been reached. */
- comp = txq->elts_comp + i;
+ comp = txq->elts_comp + i + j;
if (comp >= MLX5_TX_COMP_THRESH) {
/* Request completion on last WQE. */
wqe->wqe.ctrl.data[2] = htonl(8);
@@ -693,6 +730,7 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
unsigned int i = 0;
+ unsigned int j = 0;
unsigned int max;
unsigned int comp;
volatile union mlx5_wqe *wqe;
@@ -710,21 +748,25 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (max > elts_n)
max -= elts_n;
do {
- struct rte_mbuf *buf;
+ struct rte_mbuf *buf = *(pkts++);
unsigned int elts_head_next;
uintptr_t addr;
uint32_t length;
uint32_t lkey;
+ unsigned int segs_n = buf->nb_segs;
+ volatile struct mlx5_wqe_data_seg *dseg;
+ unsigned int ds = sizeof(*wqe) / 16;

/* Make sure there is enough room to store this packet and
* that one ring entry remains unused. */
- if (max < 1 + 1)
+ assert(segs_n);
+ if (max < segs_n + 1)
break;
- --max;
+ max -= segs_n;
--pkts_n;
- buf = *(pkts++);
elts_head_next = (elts_head + 1) & (elts_n - 1);
wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
+ dseg = &wqe->wqe.dseg;
tx_prefetch_wqe(txq, txq->wqe_ci);
tx_prefetch_wqe(txq, txq->wqe_ci + 1);
if (pkts_n)
@@ -746,13 +788,14 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (pkts_n)
rte_prefetch0(rte_pktmbuf_mtod(*pkts,
volatile void *));
- if (length <= max_inline) {
+ if ((length <= max_inline) && (segs_n == 1)) {
if (buf->ol_flags & PKT_TX_VLAN_PKT)
mlx5_wqe_write_inline_vlan(txq, wqe,
addr, length,
buf->vlan_tci);
else
mlx5_wqe_write_inline(txq, wqe, addr, length);
+ goto skip_segs;
} else {
/* Retrieve Memory Region key for this memory pool. */
lkey = txq_mp2mr(txq, txq_mb2mp(buf));
@@ -762,7 +805,35 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
else
mlx5_wqe_write(txq, wqe, addr, length, lkey);
}
- wqe->inl.ctrl.data[2] = 0;
+ while (--segs_n) {
+ /* Spill on next WQE when the current one does not have
+			 * enough room left. Size of WQE must be a multiple
+ * of data segment size. */
+ assert(!(sizeof(*wqe) % sizeof(*dseg)));
+ if (!(ds % (sizeof(*wqe) / 16)))
+ dseg = (volatile void *)
+ &(*txq->wqes)[txq->wqe_ci++ &
+ (txq->wqe_n - 1)];
+ else
+ ++dseg;
+ ++ds;
+ buf = buf->next;
+ assert(buf);
+ /* Store segment information. */
+ dseg->byte_count = htonl(DATA_LEN(buf));
+ dseg->lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+ dseg->addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
+ (*txq->elts)[elts_head_next] = buf;
+ elts_head_next = (elts_head_next + 1) & (elts_n - 1);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ length += DATA_LEN(buf);
+#endif
+ ++j;
+ }
+ /* Update DS field in WQE. */
+ wqe->inl.ctrl.data[1] &= htonl(0xffffffc0);
+ wqe->inl.ctrl.data[1] |= htonl(ds & 0x3f);
+ skip_segs:
elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent bytes counter. */
@@ -774,7 +845,7 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (unlikely(i == 0))
return 0;
/* Check whether completion threshold has been reached. */
- comp = txq->elts_comp + i;
+ comp = txq->elts_comp + i + j;
if (comp >= MLX5_TX_COMP_THRESH) {
/* Request completion on last WQE. */
wqe->inl.ctrl.data[2] = htonl(8);
@@ -878,6 +949,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
unsigned int i = 0;
+ unsigned int j = 0;
unsigned int max;
unsigned int comp;
struct mlx5_mpw mpw = {
@@ -896,46 +968,67 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (max > elts_n)
max -= elts_n;
do {
- struct rte_mbuf *buf;
- volatile struct mlx5_wqe_data_seg *dseg;
+ struct rte_mbuf *buf = *(pkts++);
unsigned int elts_head_next;
- uintptr_t addr;
uint32_t length;
+ unsigned int segs_n = buf->nb_segs;
uint32_t cs_flags = 0;

/* Make sure there is enough room to store this packet and
* that one ring entry remains unused. */
- if (max < 1 + 1)
+ assert(segs_n);
+ if (max < segs_n + 1)
+ break;
+ /* Do not bother with large packets MPW cannot handle. */
+ if (segs_n > MLX5_MPW_DSEG_MAX)
break;
- --max;
+ max -= segs_n;
--pkts_n;
- buf = *(pkts++);
- elts_head_next = (elts_head + 1) & (elts_n - 1);
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
- /* Retrieve buffer information. */
- addr = rte_pktmbuf_mtod(buf, uintptr_t);
- length = DATA_LEN(buf);
- /* Update element. */
- (*txq->elts)[elts_head] = buf;
+ /* Retrieve packet information. */
+ length = PKT_LEN(buf);
+ assert(length);
/* Start new session if packet differs. */
if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
((mpw.len != length) ||
+ (segs_n != 1) ||
(mpw.wqe->mpw.eseg.cs_flags != cs_flags)))
mlx5_mpw_close(txq, &mpw);
if (mpw.state == MLX5_MPW_STATE_CLOSED) {
mlx5_mpw_new(txq, &mpw, length);
mpw.wqe->mpw.eseg.cs_flags = cs_flags;
}
- dseg = mpw.data.dseg[mpw.pkts_n];
- *dseg = (struct mlx5_wqe_data_seg){
- .byte_count = htonl(length),
- .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
- .addr = htonll(addr),
- };
- ++mpw.pkts_n;
+ /* Multi-segment packets must be alone in their MPW. */
+ assert((segs_n == 1) || (mpw.pkts_n == 0));
+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
+ length = 0;
+#endif
+ do {
+ volatile struct mlx5_wqe_data_seg *dseg;
+ uintptr_t addr;
+
+ elts_head_next = (elts_head + 1) & (elts_n - 1);
+ assert(buf);
+ (*txq->elts)[elts_head] = buf;
+ dseg = mpw.data.dseg[mpw.pkts_n];
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ *dseg = (struct mlx5_wqe_data_seg){
+ .byte_count = htonl(DATA_LEN(buf)),
+ .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+ .addr = htonll(addr),
+ };
+ elts_head = elts_head_next;
+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
+ length += DATA_LEN(buf);
+#endif
+ buf = buf->next;
+ ++mpw.pkts_n;
+ ++j;
+ } while (--segs_n);
+ assert(length == mpw.len);
if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
mlx5_mpw_close(txq, &mpw);
elts_head = elts_head_next;
@@ -949,7 +1042,8 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (unlikely(i == 0))
return 0;
/* Check whether completion threshold has been reached. */
- comp = txq->elts_comp + i;
+ /* "j" includes both packets and segments. */
+ comp = txq->elts_comp + j;
if (comp >= MLX5_TX_COMP_THRESH) {
volatile union mlx5_wqe *wqe = mpw.wqe;

@@ -1050,6 +1144,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
unsigned int i = 0;
+ unsigned int j = 0;
unsigned int max;
unsigned int comp;
unsigned int inline_room = txq->max_inline;
@@ -1069,36 +1164,38 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
if (max > elts_n)
max -= elts_n;
do {
- struct rte_mbuf *buf;
+ struct rte_mbuf *buf = *(pkts++);
unsigned int elts_head_next;
uintptr_t addr;
uint32_t length;
+ unsigned int segs_n = buf->nb_segs;
uint32_t cs_flags = 0;

/* Make sure there is enough room to store this packet and
* that one ring entry remains unused. */
- if (max < 1 + 1)
+ assert(segs_n);
+ if (max < segs_n + 1)
break;
- --max;
+ /* Do not bother with large packets MPW cannot handle. */
+ if (segs_n > MLX5_MPW_DSEG_MAX)
+ break;
+ max -= segs_n;
--pkts_n;
- buf = *(pkts++);
- elts_head_next = (elts_head + 1) & (elts_n - 1);
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
- /* Retrieve buffer information. */
- addr = rte_pktmbuf_mtod(buf, uintptr_t);
- length = DATA_LEN(buf);
- /* Update element. */
- (*txq->elts)[elts_head] = buf;
+ /* Retrieve packet information. */
+ length = PKT_LEN(buf);
/* Start new session if packet differs. */
if (mpw.state == MLX5_MPW_STATE_OPENED) {
if ((mpw.len != length) ||
+ (segs_n != 1) ||
(mpw.wqe->mpw.eseg.cs_flags != cs_flags))
mlx5_mpw_close(txq, &mpw);
} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
if ((mpw.len != length) ||
+ (segs_n != 1) ||
(length > inline_room) ||
(mpw.wqe->mpw_inl.eseg.cs_flags != cs_flags)) {
mlx5_mpw_inline_close(txq, &mpw);
@@ -1106,7 +1203,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
}
}
if (mpw.state == MLX5_MPW_STATE_CLOSED) {
- if (length > inline_room) {
+ if ((segs_n != 1) ||
+ (length > inline_room)) {
mlx5_mpw_new(txq, &mpw, length);
mpw.wqe->mpw.eseg.cs_flags = cs_flags;
} else {
@@ -1114,17 +1212,36 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
mpw.wqe->mpw_inl.eseg.cs_flags = cs_flags;
}
}
+ /* Multi-segment packets must be alone in their MPW. */
+ assert((segs_n == 1) || (mpw.pkts_n == 0));
if (mpw.state == MLX5_MPW_STATE_OPENED) {
- volatile struct mlx5_wqe_data_seg *dseg;
-
assert(inline_room == txq->max_inline);
- dseg = mpw.data.dseg[mpw.pkts_n];
- *dseg = (struct mlx5_wqe_data_seg){
- .byte_count = htonl(length),
- .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
- .addr = htonll(addr),
- };
- ++mpw.pkts_n;
+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
+ length = 0;
+#endif
+ do {
+ volatile struct mlx5_wqe_data_seg *dseg;
+
+ elts_head_next =
+ (elts_head + 1) & (elts_n - 1);
+ assert(buf);
+ (*txq->elts)[elts_head] = buf;
+ dseg = mpw.data.dseg[mpw.pkts_n];
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ *dseg = (struct mlx5_wqe_data_seg){
+ .byte_count = htonl(DATA_LEN(buf)),
+ .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+ .addr = htonll(addr),
+ };
+ elts_head = elts_head_next;
+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
+ length += DATA_LEN(buf);
+#endif
+ buf = buf->next;
+ ++mpw.pkts_n;
+ ++j;
+ } while (--segs_n);
+ assert(length == mpw.len);
if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
mlx5_mpw_close(txq, &mpw);
} else {
@@ -1132,6 +1249,10 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,

assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
assert(length <= inline_room);
+ assert(length == DATA_LEN(buf));
+ elts_head_next = (elts_head + 1) & (elts_n - 1);
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ (*txq->elts)[elts_head] = buf;
/* Maximum number of bytes before wrapping. */
max = ((uintptr_t)&(*txq->wqes)[txq->wqe_n] -
(uintptr_t)mpw.data.raw);
@@ -1156,6 +1277,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
mpw.data.raw =
(volatile void *)&(*txq->wqes)[0];
++mpw.pkts_n;
+ ++j;
if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
mlx5_mpw_inline_close(txq, &mpw);
inline_room = txq->max_inline;
@@ -1174,7 +1296,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
if (unlikely(i == 0))
return 0;
/* Check whether completion threshold has been reached. */
- comp = txq->elts_comp + i;
+ /* "j" includes both packets and segments. */
+ comp = txq->elts_comp + j;
if (comp >= MLX5_TX_COMP_THRESH) {
volatile union mlx5_wqe *wqe = mpw.wqe;

diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index d013230..38900c1 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -311,7 +311,11 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
.max_send_wr = ((priv->device_attr.max_qp_wr < desc) ?
priv->device_attr.max_qp_wr :
desc),
- /* Max number of scatter/gather elements in a WR. */
+ /* Max number of scatter/gather elements in a WR,
+ * must be 1 to prevent libmlx5 from trying to allocate
+ * too much memory. TX gather is not impacted by the
+ * priv->device_attr.max_sge limit and will still work
+ * properly. */
.max_send_sge = 1,
},
.qp_type = IBV_QPT_RAW_PACKET,
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:48:08 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

Since commit "mlx5: resurrect TX gather support", older GCC versions (such
as 4.8.5) may complain about the following:

mlx5_rxtx.c: In function `mlx5_tx_burst':
mlx5_rxtx.c:705:25: error: `wqe' may be used uninitialized in this
function [-Werror=maybe-uninitialized]

mlx5_rxtx.c: In function `mlx5_tx_burst_inline':
mlx5_rxtx.c:864:25: error: `wqe' may be used uninitialized in this
function [-Werror=maybe-uninitialized]

In both cases, this code cannot be reached when wqe is not initialized.

Considering older GCC versions are still widely used, work around this
issue by initializing wqe preemptively, even if it should not be necessary.
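
For illustration, a minimal standalone sketch (not driver code) of the same
pattern and workaround: the pointer is only read on paths where the loop has
assigned it, but an older compiler cannot always prove that, hence the
preemptive initialization.

/* Sketch only: "last" plays the role of "wqe" in mlx5_tx_burst().
 * It is read only when the loop ran at least once, yet GCC 4.8 may
 * still warn with -Wmaybe-uninitialized; initializing it to NULL
 * up front silences the warning without changing behavior. */
static int
last_item(const int *items, unsigned int n)
{
	const int *last = NULL; /* Workaround: preemptive initialization. */
	unsigned int i;

	for (i = 0; i != n; ++i)
		last = &items[i];
	if (i == 0)
		return 0; /* "last" is never read on this path. */
	return *last;
}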

Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_rxtx.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 53d2a57..f4af769 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -591,7 +591,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
unsigned int j = 0;
unsigned int max;
unsigned int comp;
- volatile union mlx5_wqe *wqe;
+ volatile union mlx5_wqe *wqe = NULL;

if (unlikely(!pkts_n))
return 0;
@@ -733,7 +733,7 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
unsigned int j = 0;
unsigned int max;
unsigned int comp;
- volatile union mlx5_wqe *wqe;
+ volatile union mlx5_wqe *wqe = NULL;
unsigned int max_inline = txq->max_inline;

if (unlikely(!pkts_n))
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:48:09 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

Toggling RX checksum offloads is already done at initialization time. This
code does not belong in rxq_rehash().

Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_rxq.c | 10 ----------
1 file changed, 10 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index b3972ff..20a236e 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -798,7 +798,6 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
int
rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
{
- struct priv *priv = rxq_ctrl->priv;
struct rxq_ctrl tmpl = *rxq_ctrl;
unsigned int mbuf_n;
unsigned int desc_n;
@@ -811,15 +810,6 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
/* Number of descriptors and mbufs currently allocated. */
desc_n = tmpl.rxq.elts_n;
mbuf_n = desc_n;
- /* Toggle RX checksum offload if hardware supports it. */
- if (priv->hw_csum) {
- tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- rxq_ctrl->rxq.csum = tmpl.rxq.csum;
- }
- if (priv->hw_csum_l2tun) {
- tmpl.rxq.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- rxq_ctrl->rxq.csum_l2tun = tmpl.rxq.csum_l2tun;
- }
/* From now on, any failure will render the queue unusable.
* Reinitialize WQ. */
mod = (struct ibv_exp_wq_attr){
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:48:10 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

The primary purpose of the rxq_rehash() function is to stop and restart
reception on a queue after re-posting buffers. This may fail if the array
that temporarily stores existing buffers for reuse cannot be allocated.

Update rxq_rehash() to work on the target queue directly (not through a
template copy) and avoid this allocation.

rxq_alloc_elts() is modified accordingly to take buffers from an existing
queue directly and update their refcount.

Unlike rxq_rehash(), rxq_setup() must work on a temporary structure but
should not allocate new mbufs from the pool while reinitializing an
existing queue. This is achieved by using the refcount-aware
rxq_alloc_elts() before overwriting queue data.
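
A rough sketch of the reuse flow this enables (simplified names, not the
actual driver code; it assumes the refcount handling added to
rxq_alloc_elts() in the diff below): buffers already owned by the queue get
an extra reference before being re-posted, so freeing them afterwards only
drops that extra reference instead of returning them to the pool.

#include <rte_mbuf.h>

/* Sketch: re-post the mbufs a queue already owns without touching the
 * mempool; the extra reference keeps them alive across the free. */
static void
reuse_posted_bufs(struct rte_mbuf **elts, unsigned int n)
{
	unsigned int i;

	for (i = 0; i != n; ++i) {
		rte_pktmbuf_reset(elts[i]);
		rte_pktmbuf_refcnt_update(elts[i], 1); /* refcnt 1 -> 2 */
	}
	for (i = 0; i != n; ++i)
		rte_pktmbuf_free_seg(elts[i]); /* refcnt 2 -> 1, still posted */
}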

Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_rxq.c | 94 ++++++++++++++++++++++++---------------------
1 file changed, 51 insertions(+), 43 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 20a236e..17a28e4 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -642,7 +642,7 @@ priv_rehash_flows(struct priv *priv)
*/
static int
rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
- struct rte_mbuf **pool)
+ struct rte_mbuf *(*pool)[])
{
unsigned int i;
int ret = 0;
@@ -654,9 +654,10 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
&(*rxq_ctrl->rxq.wqes)[i];

if (pool != NULL) {
- buf = *(pool++);
+ buf = (*pool)[i];
assert(buf != NULL);
rte_pktmbuf_reset(buf);
+ rte_pktmbuf_refcnt_update(buf, 1);
} else
buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
if (buf == NULL) {
@@ -781,7 +782,7 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
}

/**
- * Reconfigure a RX queue with new parameters.
+ * Reconfigure RX queue buffers.
*
* rxq_rehash() does not allocate mbufs, which, if not done from the right
* thread (such as a control thread), may corrupt the pool.
@@ -798,67 +799,48 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
int
rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
{
- struct rxq_ctrl tmpl = *rxq_ctrl;
- unsigned int mbuf_n;
- unsigned int desc_n;
- struct rte_mbuf **pool;
- unsigned int i, k;
+ unsigned int elts_n = rxq_ctrl->rxq.elts_n;
+ unsigned int i;
struct ibv_exp_wq_attr mod;
int err;

DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq_ctrl);
- /* Number of descriptors and mbufs currently allocated. */
- desc_n = tmpl.rxq.elts_n;
- mbuf_n = desc_n;
/* From now on, any failure will render the queue unusable.
* Reinitialize WQ. */
mod = (struct ibv_exp_wq_attr){
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RESET,
};
- err = ibv_exp_modify_wq(tmpl.wq, &mod);
+ err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
if (err) {
ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
assert(err > 0);
return err;
}
- /* Allocate pool. */
- pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0);
- if (pool == NULL) {
- ERROR("%p: cannot allocate memory", (void *)dev);
- return ENOBUFS;
- }
/* Snatch mbufs from original queue. */
- k = 0;
- for (i = 0; (i != desc_n); ++i)
- pool[k++] = (*rxq_ctrl->rxq.elts)[i];
- assert(k == mbuf_n);
- rte_free(pool);
+ claim_zero(rxq_alloc_elts(rxq_ctrl, elts_n, rxq_ctrl->rxq.elts));
+ for (i = 0; i != elts_n; ++i) {
+ struct rte_mbuf *buf = (*rxq_ctrl->rxq.elts)[i];
+
+ assert(rte_mbuf_refcnt_read(buf) == 2);
+ rte_pktmbuf_free_seg(buf);
+ }
/* Change queue state to ready. */
mod = (struct ibv_exp_wq_attr){
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RDY,
};
- err = ibv_exp_modify_wq(tmpl.wq, &mod);
+ err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
if (err) {
ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
(void *)dev, strerror(err));
goto error;
}
- /* Post SGEs. */
- err = rxq_alloc_elts(&tmpl, desc_n, pool);
- if (err) {
- ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
- rte_free(pool);
- assert(err > 0);
- return err;
- }
/* Update doorbell counter. */
- rxq_ctrl->rxq.rq_ci = desc_n;
+ rxq_ctrl->rxq.rq_ci = elts_n;
rte_wmb();
*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
error:
- *rxq_ctrl = tmpl;
assert(err >= 0);
return err;
}
@@ -868,16 +850,21 @@ error:
*
* @param tmpl
* Pointer to RX queue control template.
- * @param rxq_ctrl
- * Pointer to RX queue control.
+ *
+ * @return
+ * 0 on success, errno value on failure.
*/
-static inline void
-rxq_setup(struct rxq_ctrl *tmpl, struct rxq_ctrl *rxq_ctrl)
+static inline int
+rxq_setup(struct rxq_ctrl *tmpl)
{
struct ibv_cq *ibcq = tmpl->cq;
struct mlx5_cq *cq = to_mxxx(cq, cq);
struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
+ struct rte_mbuf *(*elts)[tmpl->rxq.elts_n] =
+ rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, tmpl->socket);

+ if (elts == NULL)
+ return ENOMEM;
tmpl->rxq.rq_db = rwq->rq.db;
tmpl->rxq.cqe_n = ibcq->cqe + 1;
tmpl->rxq.cq_ci = 0;
@@ -889,9 +876,8 @@ rxq_setup(struct rxq_ctrl *tmpl, struct rxq_ctrl *rxq_ctrl)
tmpl->rxq.cqes =
(volatile struct mlx5_cqe64 (*)[])
(uintptr_t)cq->active_buf->buf;
- tmpl->rxq.elts =
- (struct rte_mbuf *(*)[tmpl->rxq.elts_n])
- ((uintptr_t)rxq_ctrl + sizeof(*rxq_ctrl));
+ tmpl->rxq.elts = elts;
+ return 0;
}

/**
@@ -938,6 +924,7 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
enum ibv_exp_query_intf_status status;
unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
unsigned int cqe_n = desc - 1;
+ struct rte_mbuf *(*elts)[desc] = NULL;
int ret = 0;

(void)conf; /* Thresholds configuration (ignored). */
@@ -1094,8 +1081,19 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
(void *)dev, strerror(ret));
goto error;
}
- rxq_setup(&tmpl, rxq_ctrl);
- ret = rxq_alloc_elts(&tmpl, desc, NULL);
+ ret = rxq_setup(&tmpl);
+ if (ret) {
+ ERROR("%p: cannot initialize RX queue structure: %s",
+ (void *)dev, strerror(ret));
+ goto error;
+ }
+ /* Reuse buffers from original queue if possible. */
+ if (rxq_ctrl->rxq.elts_n) {
+ assert(rxq_ctrl->rxq.elts_n == desc);
+ assert(rxq_ctrl->rxq.elts != tmpl.rxq.elts);
+ ret = rxq_alloc_elts(&tmpl, desc, rxq_ctrl->rxq.elts);
+ } else
+ ret = rxq_alloc_elts(&tmpl, desc, NULL);
if (ret) {
ERROR("%p: RXQ allocation failed: %s",
(void *)dev, strerror(ret));
@@ -1104,6 +1102,14 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
/* Clean up rxq in case we're reinitializing it. */
DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
rxq_cleanup(rxq_ctrl);
+ /* Move mbuf pointers to dedicated storage area in RX queue. */
+ elts = (void *)(rxq_ctrl + 1);
+ rte_memcpy(elts, tmpl.rxq.elts, sizeof(*elts));
+#ifndef NDEBUG
+ memset(tmpl.rxq.elts, 0x55, sizeof(*elts));
+#endif
+ rte_free(tmpl.rxq.elts);
+ tmpl.rxq.elts = elts;
*rxq_ctrl = tmpl;
/* Update doorbell counter. */
rxq_ctrl->rxq.rq_ci = desc;
@@ -1113,7 +1119,9 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
assert(ret == 0);
return 0;
error:
+ elts = tmpl.rxq.elts;
rxq_cleanup(&tmpl);
+ rte_free(elts);
assert(ret > 0);
return ret;
}
--
2.1.4
Nelio Laranjeiro
2016-06-08 09:48:11 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

This commit brings back RX scatter, along with the related support in the
MTU update function. The maximum number of segments per packet is no longer
a fixed value (previously MLX5_PMD_SGE_WR_N, 4 by default), which hurt
performance when fewer segments were actually needed and limited the maximum
packet size that could be received with the default mbuf size (at most
8576 bytes).

These limitations are now lifted as the number of SGEs is derived from the
MTU (which implies MRU) at queue initialization and during MTU update.
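
As a hedged sketch of that derivation (log2above() is the driver helper,
approximated below; "headroom" stands for RTE_PKTMBUF_HEADROOM and "mb_len"
for the mempool data room size), the stored sges_n value is the log2 of the
per-packet SGE count, so the hardware always sees a power of two:

/* Sketch: sges_n such that (1 << sges_n) buffers of mb_len bytes,
 * minus the headroom kept in the first one, can hold a frame of
 * max_frame_len bytes. */
static unsigned int
sges_for_frame(unsigned int max_frame_len, unsigned int headroom,
	       unsigned int mb_len)
{
	unsigned int size = headroom + max_frame_len;
	unsigned int segs = (size / mb_len) + !!(size % mb_len);
	unsigned int sges_n = 0;

	while ((1u << sges_n) < segs) /* Round up to a power of two. */
		++sges_n;
	return sges_n;
}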

Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_ethdev.c | 84 ++++++++++++++++++++++----
drivers/net/mlx5/mlx5_rxq.c | 73 +++++++++++++++++-----
drivers/net/mlx5/mlx5_rxtx.c | 133 ++++++++++++++++++++++++-----------------
drivers/net/mlx5/mlx5_rxtx.h | 1 +
4 files changed, 211 insertions(+), 80 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 1767fe4..32af304 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -726,6 +726,9 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
unsigned int i;
uint16_t (*rx_func)(void *, struct rte_mbuf **, uint16_t) =
mlx5_rx_burst;
+ unsigned int max_frame_len;
+ int rehash;
+ int restart = priv->started;

if (mlx5_is_secondary())
return -E_RTE_SECONDARY;
@@ -739,7 +742,6 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
goto out;
} else
DEBUG("adapter port %u MTU set to %u", priv->port, mtu);
- priv->mtu = mtu;
/* Temporarily replace RX handler with a fake one, assuming it has not
* been copied elsewhere. */
dev->rx_pkt_burst = removed_rx_burst;
@@ -747,28 +749,88 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
* removed_rx_burst() instead. */
rte_wmb();
usleep(1000);
+ /* MTU does not include header and CRC. */
+ max_frame_len = ETHER_HDR_LEN + mtu + ETHER_CRC_LEN;
+ /* Check if at least one queue is going to need a SGE update. */
+ for (i = 0; i != priv->rxqs_n; ++i) {
+ struct rxq *rxq = (*priv->rxqs)[i];
+ unsigned int mb_len;
+ unsigned int size = RTE_PKTMBUF_HEADROOM + max_frame_len;
+ unsigned int sges_n;
+
+ if (rxq == NULL)
+ continue;
+ mb_len = rte_pktmbuf_data_room_size(rxq->mp);
+ assert(mb_len >= RTE_PKTMBUF_HEADROOM);
+ /* Determine the number of SGEs needed for a full packet
+ * and round it to the next power of two. */
+ sges_n = log2above((size / mb_len) + !!(size % mb_len));
+ if (sges_n != rxq->sges_n)
+ break;
+ }
+ /* If all queues have the right number of SGEs, a simple rehash
+ * of their buffers is enough, otherwise SGE information can only
+ * be updated in a queue by recreating it. All resources that depend
+ * on queues (flows, indirection tables) must be recreated as well in
+ * that case. */
+ rehash = (i == priv->rxqs_n);
+ if (!rehash) {
+ /* Clean up everything as with mlx5_dev_stop(). */
+ priv_special_flow_disable_all(priv);
+ priv_mac_addrs_disable(priv);
+ priv_destroy_hash_rxqs(priv);
+ priv_fdir_disable(priv);
+ priv_dev_interrupt_handler_uninstall(priv, dev);
+ }
+recover:
/* Reconfigure each RX queue. */
for (i = 0; (i != priv->rxqs_n); ++i) {
struct rxq *rxq = (*priv->rxqs)[i];
- unsigned int mb_len;
- unsigned int max_frame_len;
+ struct rxq_ctrl *rxq_ctrl =
+ container_of(rxq, struct rxq_ctrl, rxq);
int sp;
+ unsigned int mb_len;
+ unsigned int tmp;

if (rxq == NULL)
continue;
- /* Calculate new maximum frame length according to MTU and
- * toggle scattered support (sp) if necessary. */
- max_frame_len = (priv->mtu + ETHER_HDR_LEN +
- (ETHER_MAX_VLAN_FRAME_LEN - ETHER_MAX_LEN));
mb_len = rte_pktmbuf_data_room_size(rxq->mp);
assert(mb_len >= RTE_PKTMBUF_HEADROOM);
+ /* Toggle scattered support (sp) if necessary. */
sp = (max_frame_len > (mb_len - RTE_PKTMBUF_HEADROOM));
- if (sp) {
- ERROR("%p: RX scatter is not supported", (void *)dev);
- ret = ENOTSUP;
- goto out;
+ /* Provide new values to rxq_setup(). */
+ dev->data->dev_conf.rxmode.jumbo_frame = sp;
+ dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len;
+ if (rehash)
+ ret = rxq_rehash(dev, rxq_ctrl);
+ else
+ ret = rxq_ctrl_setup(dev, rxq_ctrl, rxq->elts_n,
+ rxq_ctrl->socket, NULL, rxq->mp);
+ if (!ret)
+ continue;
+ /* Attempt to roll back in case of error. */
+ tmp = (mb_len << rxq->sges_n) - RTE_PKTMBUF_HEADROOM;
+ if (max_frame_len != tmp) {
+ max_frame_len = tmp;
+ goto recover;
}
+ /* Double fault, disable RX. */
+ break;
}
+ /* Use a safe RX burst function in case of error, otherwise mimic
+ * mlx5_dev_start(). */
+ if (ret) {
+ ERROR("unable to reconfigure RX queues, RX disabled");
+ rx_func = removed_rx_burst;
+ } else if (restart &&
+ !rehash &&
+ !priv_create_hash_rxqs(priv) &&
+ !priv_rehash_flows(priv)) {
+ if (dev->data->dev_conf.fdir_conf.mode == RTE_FDIR_MODE_NONE)
+ priv_fdir_enable(priv);
+ priv_dev_interrupt_handler_install(priv, dev);
+ }
+ priv->mtu = mtu;
/* Burst functions can now be called again. */
rte_wmb();
dev->rx_pkt_burst = rx_func;
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 17a28e4..e8e29f2 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -644,10 +644,11 @@ static int
rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
struct rte_mbuf *(*pool)[])
{
+ const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
unsigned int i;
int ret = 0;

- /* For each WR (packet). */
+ /* Iterate on segments. */
for (i = 0; (i != elts_n); ++i) {
struct rte_mbuf *buf;
volatile struct mlx5_wqe_data_seg *scat =
@@ -672,6 +673,9 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
assert(rte_pktmbuf_data_len(buf) == 0);
assert(rte_pktmbuf_pkt_len(buf) == 0);
assert(!buf->next);
+ /* Only the first segment keeps headroom. */
+ if (i % sges_n)
+ SET_DATA_OFF(buf, 0);
PORT(buf) = rxq_ctrl->rxq.port_id;
DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
PKT_LEN(buf) = DATA_LEN(buf);
@@ -685,8 +689,8 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
};
(*rxq_ctrl->rxq.elts)[i] = buf;
}
- DEBUG("%p: allocated and configured %u single-segment WRs",
- (void *)rxq_ctrl, elts_n);
+ DEBUG("%p: allocated and configured %u segments (max %u packets)",
+ (void *)rxq_ctrl, elts_n, elts_n / (1 << rxq_ctrl->rxq.sges_n));
assert(ret == 0);
return 0;
error:
@@ -804,7 +808,9 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
struct ibv_exp_wq_attr mod;
int err;

- DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq_ctrl);
+ DEBUG("%p: rehashing queue %p with %u SGE(s) per packet",
+ (void *)dev, (void *)rxq_ctrl, 1 << rxq_ctrl->rxq.sges_n);
+ assert(!(elts_n % (1 << rxq_ctrl->rxq.sges_n)));
/* From now on, any failure will render the queue unusable.
* Reinitialize WQ. */
mod = (struct ibv_exp_wq_attr){
@@ -837,7 +843,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
goto error;
}
/* Update doorbell counter. */
- rxq_ctrl->rxq.rq_ci = elts_n;
+ rxq_ctrl->rxq.rq_ci = elts_n >> rxq_ctrl->rxq.sges_n;
rte_wmb();
*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
error:
@@ -928,9 +934,40 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
int ret = 0;

(void)conf; /* Thresholds configuration (ignored). */
- if (desc == 0) {
- ERROR("%p: invalid number of RX descriptors (must be a"
- " multiple of 2)", (void *)dev);
+ /* Enable scattered packets support for this queue if necessary. */
+ assert(mb_len >= RTE_PKTMBUF_HEADROOM);
+ if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
+ (dev->data->dev_conf.rxmode.max_rx_pkt_len >
+ (mb_len - RTE_PKTMBUF_HEADROOM))) {
+ unsigned int size =
+ RTE_PKTMBUF_HEADROOM +
+ dev->data->dev_conf.rxmode.max_rx_pkt_len;
+ unsigned int sges_n;
+
+ /* Determine the number of SGEs needed for a full packet
+ * and round it to the next power of two. */
+ sges_n = log2above((size / mb_len) + !!(size % mb_len));
+ tmpl.rxq.sges_n = sges_n;
+ /* Make sure rxq.sges_n did not overflow. */
+ size = mb_len * (1 << tmpl.rxq.sges_n);
+ size -= RTE_PKTMBUF_HEADROOM;
+ if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
+ ERROR("%p: too many SGEs (%u) needed to handle"
+ " requested maximum packet size %u",
+ (void *)dev,
+ 1 << sges_n,
+ dev->data->dev_conf.rxmode.max_rx_pkt_len);
+ return EOVERFLOW;
+ }
+ }
+ DEBUG("%p: maximum number of segments per packet: %u",
+ (void *)dev, 1 << tmpl.rxq.sges_n);
+ if (desc % (1 << tmpl.rxq.sges_n)) {
+ ERROR("%p: number of RX queue descriptors (%u) is not a"
+ " multiple of SGEs per packet (%u)",
+ (void *)dev,
+ desc,
+ 1 << tmpl.rxq.sges_n);
return EINVAL;
}
/* Toggle RX checksum offload if hardware supports it. */
@@ -938,7 +975,6 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
if (priv->hw_csum_l2tun)
tmpl.rxq.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- (void)mb_len; /* I'll be back! */
/* Use the entire RX mempool as the memory region. */
tmpl.mr = mlx5_mp2mr(priv->pd, mp);
if (tmpl.mr == NULL) {
@@ -988,11 +1024,9 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
.wq_context = NULL, /* Could be useful in the future. */
.wq_type = IBV_EXP_WQT_RQ,
/* Max number of outstanding WRs. */
- .max_recv_wr = ((priv->device_attr.max_qp_wr < (int)desc) ?
- priv->device_attr.max_qp_wr :
- (int)desc),
+ .max_recv_wr = desc >> tmpl.rxq.sges_n,
/* Max number of scatter/gather elements in a WR. */
- .max_recv_sge = 1,
+ .max_recv_sge = 1 << tmpl.rxq.sges_n,
.pd = priv->pd,
.cq = tmpl.cq,
.comp_mask =
@@ -1044,6 +1078,17 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
(void *)dev, strerror(ret));
goto error;
}
+ /* Make sure number of WRs*SGEs match expectations since a queue
+ * cannot allocate more than "desc" buffers. */
+ if (((int)attr.wq.max_recv_wr != (desc >> tmpl.rxq.sges_n)) ||
+ ((int)attr.wq.max_recv_sge != (1 << tmpl.rxq.sges_n))) {
+ ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
+ (void *)dev,
+ (desc >> tmpl.rxq.sges_n), (1 << tmpl.rxq.sges_n),
+ attr.wq.max_recv_wr, attr.wq.max_recv_sge);
+ ret = EINVAL;
+ goto error;
+ }
/* Save port ID. */
tmpl.rxq.port_id = dev->data->port_id;
DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
@@ -1112,7 +1157,7 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
tmpl.rxq.elts = elts;
*rxq_ctrl = tmpl;
/* Update doorbell counter. */
- rxq_ctrl->rxq.rq_ci = desc;
+ rxq_ctrl->rxq.rq_ci = desc >> rxq_ctrl->rxq.sges_n;
rte_wmb();
*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index f4af769..f2934b8 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1520,94 +1520,117 @@ uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct rxq *rxq = dpdk_rxq;
- unsigned int pkts_ret = 0;
- unsigned int i;
- unsigned int rq_ci = rxq->rq_ci;
- const unsigned int elts_n = rxq->elts_n;
- const unsigned int wqe_cnt = elts_n - 1;
+ const unsigned int wqe_cnt = rxq->elts_n - 1;
const unsigned int cqe_cnt = rxq->cqe_n - 1;
+ const unsigned int sges_n = rxq->sges_n;
+ struct rte_mbuf *pkt = NULL;
+ struct rte_mbuf *seg = NULL;
+ volatile struct mlx5_cqe64 *cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
+ unsigned int i = 0;
+ unsigned int rq_ci = rxq->rq_ci << sges_n;
+ int len;

- for (i = 0; (i != pkts_n); ++i) {
+ while (pkts_n) {
unsigned int idx = rq_ci & wqe_cnt;
- int len;
- struct rte_mbuf *rep;
- struct rte_mbuf *pkt;
volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
- volatile struct mlx5_cqe64 *cqe =
- &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
+ struct rte_mbuf *rep = (*rxq->elts)[idx];

- pkt = (*rxq->elts)[idx];
+ if (pkt)
+ NEXT(seg) = rep;
+ seg = rep;
+ rte_prefetch0(seg);
rte_prefetch0(cqe);
+ rte_prefetch0(wqe);
rep = rte_mbuf_raw_alloc(rxq->mp);
if (unlikely(rep == NULL)) {
+ while (pkt) {
+ seg = NEXT(pkt);
+ __rte_mbuf_raw_free(pkt);
+ pkt = seg;
+ }
++rxq->stats.rx_nombuf;
break;
}
- SET_DATA_OFF(rep, RTE_PKTMBUF_HEADROOM);
- NB_SEGS(rep) = 1;
- PORT(rep) = rxq->port_id;
- NEXT(rep) = NULL;
- len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt);
- if (unlikely(len == 0)) {
- __rte_mbuf_raw_free(rep);
- break;
- }
- if (unlikely(len == -1)) {
- /* RX error, packet is likely too large. */
- __rte_mbuf_raw_free(rep);
- ++rxq->stats.idropped;
- --i;
- goto skip;
+ if (!pkt) {
+ cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
+ len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt);
+ if (len == 0) {
+ __rte_mbuf_raw_free(rep);
+ break;
+ }
+ if (unlikely(len == -1)) {
+ /* RX error, packet is likely too large. */
+ __rte_mbuf_raw_free(rep);
+ ++rxq->stats.idropped;
+ goto skip;
+ }
+ pkt = seg;
+ assert(len >= (rxq->crc_present << 2));
+ /* Update packet information. */
+ if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip |
+ rxq->crc_present) {
+ if (rxq->csum) {
+ pkt->packet_type =
+ rxq_cq_to_pkt_type(cqe);
+ pkt->ol_flags =
+ rxq_cq_to_ol_flags(rxq, cqe);
+ }
+ if (cqe->l4_hdr_type_etc &
+ MLX5_CQE_VLAN_STRIPPED) {
+ pkt->ol_flags |= PKT_RX_VLAN_PKT;
+ pkt->vlan_tci = ntohs(cqe->vlan_info);
+ }
+ if (rxq->crc_present)
+ len -= ETHER_CRC_LEN;
+ }
+ PKT_LEN(pkt) = len;
}
+ DATA_LEN(rep) = DATA_LEN(seg);
+ PKT_LEN(rep) = PKT_LEN(seg);
+ SET_DATA_OFF(rep, DATA_OFF(seg));
+ NB_SEGS(rep) = NB_SEGS(seg);
+ PORT(rep) = PORT(seg);
+ NEXT(rep) = NULL;
+ (*rxq->elts)[idx] = rep;
/* Fill NIC descriptor with the new buffer. The lkey and size
* of the buffers are already known, only the buffer address
* changes. */
- wqe->addr = htonll((uintptr_t)rep->buf_addr +
- RTE_PKTMBUF_HEADROOM);
- (*rxq->elts)[idx] = rep;
- /* Update pkt information. */
- if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip |
- rxq->crc_present) {
- if (rxq->csum) {
- pkt->packet_type = rxq_cq_to_pkt_type(cqe);
- pkt->ol_flags = rxq_cq_to_ol_flags(rxq, cqe);
- }
- if (cqe->l4_hdr_type_etc & MLX5_CQE_VLAN_STRIPPED) {
- pkt->ol_flags |= PKT_RX_VLAN_PKT;
- pkt->vlan_tci = ntohs(cqe->vlan_info);
- }
- if (rxq->crc_present)
- len -= ETHER_CRC_LEN;
+ wqe->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t));
+ if (len > DATA_LEN(seg)) {
+ len -= DATA_LEN(seg);
+ ++NB_SEGS(pkt);
+ ++rq_ci;
+ continue;
}
- PKT_LEN(pkt) = len;
- DATA_LEN(pkt) = len;
+ DATA_LEN(seg) = len;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment bytes counter. */
- rxq->stats.ibytes += len;
+ rxq->stats.ibytes += PKT_LEN(pkt);
#endif
/* Return packet. */
*(pkts++) = pkt;
- ++pkts_ret;
+ pkt = NULL;
+ --pkts_n;
+ ++i;
skip:
+ /* Align consumer index to the next stride. */
+ rq_ci >>= sges_n;
++rq_ci;
+ rq_ci <<= sges_n;
}
- if (unlikely((i == 0) && (rq_ci == rxq->rq_ci)))
+ if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
return 0;
- /* Repost WRs. */
-#ifdef DEBUG_RECV
- DEBUG("%p: reposting %u WRs", (void *)rxq, i);
-#endif
/* Update the consumer index. */
- rxq->rq_ci = rq_ci;
+ rxq->rq_ci = rq_ci >> sges_n;
rte_wmb();
*rxq->cq_db = htonl(rxq->cq_ci);
rte_wmb();
*rxq->rq_db = htonl(rxq->rq_ci);
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment packets counter. */
- rxq->stats.ipackets += pkts_ret;
+ rxq->stats.ipackets += i;
#endif
- return pkts_ret;
+ return i;
}

/**
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index e3ad596..dfaf81a 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -106,6 +106,7 @@ struct rxq {
unsigned int csum_l2tun:1; /* Same for L2 tunnels. */
unsigned int vlan_strip:1; /* Enable VLAN stripping. */
unsigned int crc_present:1; /* CRC must be subtracted. */
+ unsigned int sges_n:2; /* Log 2 of SGEs (max buffers per packet). */
uint16_t rq_ci;
uint16_t cq_ci;
uint16_t elts_n;
--
2.1.4
Nélio Laranjeiro
2016-06-14 06:57:52 UTC
Permalink
[...]
This is a very exciting patch. I applied it and reran some microbenchmarks
of mine that test the TX and RX paths separately. These are the results I got:
TX path (burst = 64 packets)
1 thread - 2 ports - 4 queues per port: 39Mpps => 48Mpps
2 threads - 2 ports - 2 queues per port: 60Mpps => 60Mpps (hardware
limitation?)
To reach higher values you will need to configure the inline feature with
the txq_inline device argument, and only activate it when more than one
queue is used; this can be done with the txqs_min_inline argument.

This feature helps the NIC by reducing PCI back-pressure; in return it
consumes more CPU cycles.

You can take a look at the NIC documentation (doc/guides/nics/mlx5.rst)
updated in this patchset, which explains both the txq_inline and
txqs_min_inline device arguments.
RX path (burst = 32 packets)
1 thread - 2 ports - 4 queues per port: 38Mpps => 46Mpps
2 threads - 2 ports - 2 queues per port: 43Mpps => 50Mpps
The tests were run on the following hardware, using DPDK master with this patchset applied:
2x Intel Xeon E5-2680 v3 2.5GHz
64GB DDR4-2133
1x Mellanox ConnectX-4 EN, 40/56GbE dual-port, PCIe3.0 x8 (MCX414A-BCAT)
I haven't tested it extensively outside of these microbenchmarks, but so far it looks good.
Regards,
--
Nélio Laranjeiro
6WIND
Ferruh Yigit
2016-06-17 16:09:43 UTC
Permalink
Post by Nelio Laranjeiro
Enhance mlx5 with a data path that bypasses Verbs.
The first half of this patchset removes support for functionality completely
rewritten in the second half (scatter/gather, inline send), while the data
path is refactored without Verbs.
The PMD remains usable during the transition.
This patchset must be applied after "Miscellaneous fixes for mlx4 and mlx5".
mlx5: replace countdown with threshold for TX completions
mlx5: add debugging information about TX queues capabilities
mlx5: check remaining space while processing TX burst
mlx5: resurrect TX gather support
mlx5: work around spurious compilation errors
mlx5: remove redundant RX queue initialization code
mlx5: make RX queue reinitialization safer
mlx5: resurrect RX scatter support
mlx5: split memory registration function for better performance
mlx5: remove TX gather support
mlx5: remove RX scatter support
mlx5: remove configuration variable for maximum number of segments
mlx5: remove inline TX support
mlx5: split TX queue structure
mlx5: split RX queue structure
mlx5: update prerequisites for upcoming enhancements
mlx5: add definitions for data path without Verbs
mlx5: add support for configuration through kvargs
mlx5: add TX/RX burst function selection wrapper
mlx5: refactor RX data path
mlx5: refactor TX data path
mlx5: handle RX CQE compression
mlx5: add support for multi-packet send
mlx5: add support for inline send
I ran basic checks on the patchset:

There are various checkpatch warnings; all are at warning or check level.

Patches 8 and 13 failed to apply via git; it looks like line numbers
shifted a little. This is not a problem since they eventually apply, but
just for your information.

check-git-log is giving the following errors; they are mainly case issues in Rx/Tx:
Wrong headline lowercase:
mlx5: resurrect RX scatter support
mlx5: make RX queue reinitialization safer
mlx5: remove redundant RX queue initialization code
mlx5: resurrect TX gather support
mlx5: check remaining space while processing TX burst
mlx5: add debugging information about TX queues capabilities
mlx5: replace countdown with threshold for TX completions
mlx5: handle RX CQE compression
mlx5: refactor RX data path
mlx5: add TX/RX burst function selection wrapper
mlx5: split RX queue structure
mlx5: split TX queue structure
mlx5: remove inline TX support
mlx5: remove RX scatter support
mlx5: remove TX gather support
Headline too long:
mlx5: remove configuration variable for maximum number of segments
mlx5: split memory registration function for better performance


It compiles fine.

Regards,
ferruh
Nélio Laranjeiro
2016-06-20 07:38:54 UTC
Permalink
Post by Ferruh Yigit
Post by Nelio Laranjeiro
Enhance mlx5 with a data path that bypasses Verbs.
The first half of this patchset removes support for functionality completely
rewritten in the second half (scatter/gather, inline send), while the data
path is refactored without Verbs.
The PMD remains usable during the transition.
This patchset must be applied after "Miscellaneous fixes for mlx4 and mlx5".
mlx5: replace countdown with threshold for TX completions
mlx5: add debugging information about TX queues capabilities
mlx5: check remaining space while processing TX burst
mlx5: resurrect TX gather support
mlx5: work around spurious compilation errors
mlx5: remove redundant RX queue initialization code
mlx5: make RX queue reinitialization safer
mlx5: resurrect RX scatter support
mlx5: split memory registration function for better performance
mlx5: remove TX gather support
mlx5: remove RX scatter support
mlx5: remove configuration variable for maximum number of segments
mlx5: remove inline TX support
mlx5: split TX queue structure
mlx5: split RX queue structure
mlx5: update prerequisites for upcoming enhancements
mlx5: add definitions for data path without Verbs
mlx5: add support for configuration through kvargs
mlx5: add TX/RX burst function selection wrapper
mlx5: refactor RX data path
mlx5: refactor TX data path
mlx5: handle RX CQE compression
mlx5: add support for multi-packet send
mlx5: add support for inline send
There are various checkpatch warnings; all are at warning or check level.
Patches 8 and 13 failed to apply via git; it looks like line numbers
shifted a little. This is not a problem since they eventually apply, but
just for your information.
mlx5: resurrect RX scatter support
mlx5: make RX queue reinitialization safer
mlx5: remove redundant RX queue initialization code
mlx5: resurrect TX gather support
mlx5: check remaining space while processing TX burst
mlx5: add debugging information about TX queues capabilities
mlx5: replace countdown with threshold for TX completions
mlx5: handle RX CQE compression
mlx5: refactor RX data path
mlx5: add TX/RX burst function selection wrapper
mlx5: split RX queue structure
mlx5: split TX queue structure
mlx5: remove inline TX support
mlx5: remove RX scatter support
mlx5: remove TX gather support
mlx5: remove configuration variable for maximum number of segments
mlx5: split memory registration function for better performance
It compiles fine.
Regards,
ferruh
Hi ferruh,

In fact, it does not apply well on top of the current DPDK master branch.

Thanks.
--
Nélio Laranjeiro
6WIND
Ferruh Yigit
2016-06-20 15:03:00 UTC
Permalink
Post by Nélio Laranjeiro
Post by Ferruh Yigit
Post by Nelio Laranjeiro
Enhance mlx5 with a data path that bypasses Verbs.
The first half of this patchset removes support for functionality completely
rewritten in the second half (scatter/gather, inline send), while the data
path is refactored without Verbs.
The PMD remains usable during the transition.
This patchset must be applied after "Miscellaneous fixes for mlx4 and mlx5".
mlx5: replace countdown with threshold for TX completions
mlx5: add debugging information about TX queues capabilities
mlx5: check remaining space while processing TX burst
mlx5: resurrect TX gather support
mlx5: work around spurious compilation errors
mlx5: remove redundant RX queue initialization code
mlx5: make RX queue reinitialization safer
mlx5: resurrect RX scatter support
mlx5: split memory registration function for better performance
mlx5: remove TX gather support
mlx5: remove RX scatter support
mlx5: remove configuration variable for maximum number of segments
mlx5: remove inline TX support
mlx5: split TX queue structure
mlx5: split RX queue structure
mlx5: update prerequisites for upcoming enhancements
mlx5: add definitions for data path without Verbs
mlx5: add support for configuration through kvargs
mlx5: add TX/RX burst function selection wrapper
mlx5: refactor RX data path
mlx5: refactor TX data path
mlx5: handle RX CQE compression
mlx5: add support for multi-packet send
mlx5: add support for inline send
There are various checkpatch warnings; all are at warning or check level.
Patches 8 and 13 failed to apply via git; it looks like line numbers
shifted a little. This is not a problem since they eventually apply, but
just for your information.
mlx5: resurrect RX scatter support
mlx5: make RX queue reinitialization safer
mlx5: remove redundant RX queue initialization code
mlx5: resurrect TX gather support
mlx5: check remaining space while processing TX burst
mlx5: add debugging information about TX queues capabilities
mlx5: replace countdown with threshold for TX completions
mlx5: handle RX CQE compression
mlx5: refactor RX data path
mlx5: add TX/RX burst function selection wrapper
mlx5: split RX queue structure
mlx5: split TX queue structure
mlx5: remove inline TX support
mlx5: remove RX scatter support
mlx5: remove TX gather support
mlx5: remove configuration variable for maximum number of segments
mlx5: split memory registration function for better performance
It compiles fine.
Regards,
ferruh
Hi ferruh,
In fact, it does not apply well on top of the current DPDK master branch.
I was able to apply it on top of the rel_16_07 branch using the "patch"
binary, but if you think it doesn't apply well, is there any plan to send a
new version?

Thanks,
ferruh
Nélio Laranjeiro
2016-06-20 15:11:28 UTC
Permalink
Post by Ferruh Yigit
Post by Nélio Laranjeiro
Post by Ferruh Yigit
Post by Nelio Laranjeiro
Enhance mlx5 with a data path that bypasses Verbs.
The first half of this patchset removes support for functionality completely
rewritten in the second half (scatter/gather, inline send), while the data
path is refactored without Verbs.
The PMD remains usable during the transition.
This patchset must be applied after "Miscellaneous fixes for mlx4 and mlx5".
mlx5: replace countdown with threshold for TX completions
mlx5: add debugging information about TX queues capabilities
mlx5: check remaining space while processing TX burst
mlx5: resurrect TX gather support
mlx5: work around spurious compilation errors
mlx5: remove redundant RX queue initialization code
mlx5: make RX queue reinitialization safer
mlx5: resurrect RX scatter support
mlx5: split memory registration function for better performance
mlx5: remove TX gather support
mlx5: remove RX scatter support
mlx5: remove configuration variable for maximum number of segments
mlx5: remove inline TX support
mlx5: split TX queue structure
mlx5: split RX queue structure
mlx5: update prerequisites for upcoming enhancements
mlx5: add definitions for data path without Verbs
mlx5: add support for configuration through kvargs
mlx5: add TX/RX burst function selection wrapper
mlx5: refactor RX data path
mlx5: refactor TX data path
mlx5: handle RX CQE compression
mlx5: add support for multi-packet send
mlx5: add support for inline send
There are various checkpatch warnings; all are at warning or check level.
Patches 8 and 13 failed to apply via git; it looks like line numbers
shifted a little. This is not a problem since they eventually apply, but
just for your information.
mlx5: resurrect RX scatter support
mlx5: make RX queue reinitialization safer
mlx5: remove redundant RX queue initialization code
mlx5: resurrect TX gather support
mlx5: check remaining space while processing TX burst
mlx5: add debugging information about TX queues capabilities
mlx5: replace countdown with threshold for TX completions
mlx5: handle RX CQE compression
mlx5: refactor RX data path
mlx5: add TX/RX burst function selection wrapper
mlx5: split RX queue structure
mlx5: split TX queue structure
mlx5: remove inline TX support
mlx5: remove RX scatter support
mlx5: remove TX gather support
mlx5: remove configuration variable for maximum number of segments
mlx5: split memory registration function for better performance
It compiles fine.
Regards,
ferruh
Hi ferruh,
In fact, it does not apply well on top of the current DPDK master branch.
I was able to apply it on top of the rel_16_07 branch using the "patch"
binary, but if you think it doesn't apply well, is there any plan to send a
new version?
Thanks,
ferruh
I am finishing the v2 with some small fixes (they will be detailed in the
cover letter).

It will be sent in a few minutes, the time necessary to run the check-*
scripts on it.

Thanks,
--
Nélio Laranjeiro
6WIND
Nelio Laranjeiro
2016-06-20 16:10:12 UTC
Permalink
Enhance mlx5 with a data path that bypasses Verbs.

The first half of this patchset removes support for functionality completely
rewritten in the second half (scatter/gather, inline send), while the data
path is refactored without Verbs.

The PMD remains usable during the transition.

This patchset must be applied after "Miscellaneous fixes for mlx4 and mlx5".

Changes in v2:
- Rebased patchset on top of dpdk/master.
- Fixed CQE size on Power8.
- Fixed mbuf assertion failure in debug mode.
- Fixed missing class_id field in rte_pci_id by using RTE_PCI_DEVICE.

Adrien Mazarguil (8):
mlx5: replace countdown with threshold for Tx completions
mlx5: add debugging information about Tx queues capabilities
mlx5: check remaining space while processing Tx burst
mlx5: resurrect Tx gather support
mlx5: work around spurious compilation errors
mlx5: remove redundant Rx queue initialization code
mlx5: make Rx queue reinitialization safer
mlx5: resurrect Rx scatter support

Nelio Laranjeiro (16):
drivers: fix PCI class id support
mlx5: split memory registration function
mlx5: remove Tx gather support
mlx5: remove Rx scatter support
mlx5: remove configuration variable
mlx5: remove inline Tx support
mlx5: split Tx queue structure
mlx5: split Rx queue structure
mlx5: update prerequisites for upcoming enhancements
mlx5: add definitions for data path without Verbs
mlx5: add support for configuration through kvargs
mlx5: add Tx/Rx burst function selection wrapper
mlx5: refactor Rx data path
mlx5: refactor Tx data path
mlx5: handle Rx CQE compression
mlx5: add support for multi-packet send

Yaacov Hazan (1):
mlx5: add support for inline send

config/common_base | 2 -
doc/guides/nics/mlx5.rst | 94 +-
drivers/crypto/qat/rte_qat_cryptodev.c | 5 +-
drivers/net/mlx4/mlx4.c | 18 +-
drivers/net/mlx5/Makefile | 49 +-
drivers/net/mlx5/mlx5.c | 182 ++-
drivers/net/mlx5/mlx5.h | 10 +
drivers/net/mlx5/mlx5_defs.h | 26 +-
drivers/net/mlx5/mlx5_ethdev.c | 188 ++-
drivers/net/mlx5/mlx5_fdir.c | 20 +-
drivers/net/mlx5/mlx5_mr.c | 280 ++++
drivers/net/mlx5/mlx5_prm.h | 163 +++
drivers/net/mlx5/mlx5_rxmode.c | 8 -
drivers/net/mlx5/mlx5_rxq.c | 762 ++++-------
drivers/net/mlx5/mlx5_rxtx.c | 2212 +++++++++++++++++++-------------
drivers/net/mlx5/mlx5_rxtx.h | 176 ++-
drivers/net/mlx5/mlx5_txq.c | 368 +++---
drivers/net/mlx5/mlx5_vlan.c | 6 +-
drivers/net/nfp/nfp_net.c | 12 +-
19 files changed, 2624 insertions(+), 1957 deletions(-)
create mode 100644 drivers/net/mlx5/mlx5_mr.c
create mode 100644 drivers/net/mlx5/mlx5_prm.h
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:13 UTC
Permalink
Use the RTE_PCI_DEVICE() macro to initialize PCI device ID tables so that
the recently added class_id field is not left out.

Fixes: 701c8d80c820 ("pci: support class id probing")

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
---
drivers/crypto/qat/rte_qat_cryptodev.c | 5 +----
drivers/net/mlx4/mlx4.c | 18 ++++++------------
drivers/net/mlx5/mlx5.c | 24 ++++++++----------------
drivers/net/nfp/nfp_net.c | 12 ++++--------
4 files changed, 19 insertions(+), 40 deletions(-)

diff --git a/drivers/crypto/qat/rte_qat_cryptodev.c b/drivers/crypto/qat/rte_qat_cryptodev.c
index a7912f5..f46ec85 100644
--- a/drivers/crypto/qat/rte_qat_cryptodev.c
+++ b/drivers/crypto/qat/rte_qat_cryptodev.c
@@ -69,10 +69,7 @@ static struct rte_cryptodev_ops crypto_qat_ops = {

static struct rte_pci_id pci_id_qat_map[] = {
{
- .vendor_id = 0x8086,
- .device_id = 0x0443,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(0x8086, 0x0443),
},
{.device_id = 0},
};
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index 9e94630..6228688 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -5807,22 +5807,16 @@ error:

static const struct rte_pci_id mlx4_pci_id_map[] = {
{
- .vendor_id = PCI_VENDOR_ID_MELLANOX,
- .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX3,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX3)
},
{
- .vendor_id = PCI_VENDOR_ID_MELLANOX,
- .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO)
},
{
- .vendor_id = PCI_VENDOR_ID_MELLANOX,
- .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX3VF,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX3VF)
},
{
.vendor_id = 0
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 67a541c..350028b 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -610,28 +610,20 @@ error:

static const struct rte_pci_id mlx5_pci_id_map[] = {
{
- .vendor_id = PCI_VENDOR_ID_MELLANOX,
- .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX4,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX4)
},
{
- .vendor_id = PCI_VENDOR_ID_MELLANOX,
- .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX4VF,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX4VF)
},
{
- .vendor_id = PCI_VENDOR_ID_MELLANOX,
- .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX4LX,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX4LX)
},
{
- .vendor_id = PCI_VENDOR_ID_MELLANOX,
- .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF)
},
{
.vendor_id = 0
diff --git a/drivers/net/nfp/nfp_net.c b/drivers/net/nfp/nfp_net.c
index 5c9f350..6afd49b 100644
--- a/drivers/net/nfp/nfp_net.c
+++ b/drivers/net/nfp/nfp_net.c
@@ -2446,16 +2446,12 @@ nfp_net_init(struct rte_eth_dev *eth_dev)

static struct rte_pci_id pci_id_nfp_net_map[] = {
{
- .vendor_id = PCI_VENDOR_ID_NETRONOME,
- .device_id = PCI_DEVICE_ID_NFP6000_PF_NIC,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID,
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_NETRONOME,
+ PCI_DEVICE_ID_NFP6000_PF_NIC)
},
{
- .vendor_id = PCI_VENDOR_ID_NETRONOME,
- .device_id = PCI_DEVICE_ID_NFP6000_VF_NIC,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID,
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_NETRONOME,
+ PCI_DEVICE_ID_NFP6000_VF_NIC)
},
{
.vendor_id = 0,
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:14 UTC
Permalink
Except for the first time when memory registration occurs, the lkey is
always cached. Since memory registration is slow and performs system calls,
performance can be improved by moving that code to its own function outside
of the data path so only the lookup code is left in the original inlined
function.
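
For reference, a hedged sketch of the kind of lookup that is left inlined in
the data path (assuming the per-queue mp2mr[] cache filled by txq_mp2mr_reg()
in the diff below; not necessarily the exact remaining code):

/* Sketch: the fast path only scans the small per-queue cache and
 * falls back to the out-of-line txq_mp2mr_reg() helper on a miss. */
static inline uint32_t
txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
{
	unsigned int i;

	for (i = 0; i != RTE_DIM(txq->mp2mr); ++i) {
		if (unlikely(txq->mp2mr[i].mp == NULL))
			break; /* End of cached entries: unknown pool. */
		if (txq->mp2mr[i].mp == mp)
			return txq->mp2mr[i].lkey; /* Cache hit. */
	}
	/* Miss: register the mempool and cache its lkey. */
	return txq_mp2mr_reg(txq, mp, i);
}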

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/Makefile | 1 +
drivers/net/mlx5/mlx5_mr.c | 277 +++++++++++++++++++++++++++++++++++++++++++
drivers/net/mlx5/mlx5_rxtx.c | 209 ++------------------------------
drivers/net/mlx5/mlx5_rxtx.h | 8 +-
4 files changed, 295 insertions(+), 200 deletions(-)
create mode 100644 drivers/net/mlx5/mlx5_mr.c

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 81061fe..dcc8833 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -47,6 +47,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_vlan.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_stats.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rss.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_fdir.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mr.c

# Dependencies.
DEPDIRS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += lib/librte_ether
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
new file mode 100644
index 0000000..7c3e87f
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -0,0 +1,277 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2016 6WIND S.A.
+ * Copyright 2016 Mellanox.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-pedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-pedantic"
+#endif
+
+/* DPDK headers don't like -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-pedantic"
+#endif
+#include <rte_mempool.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-pedantic"
+#endif
+
+#include "mlx5.h"
+#include "mlx5_rxtx.h"
+
+struct mlx5_check_mempool_data {
+ int ret;
+ char *start;
+ char *end;
+};
+
+/* Called by mlx5_check_mempool() when iterating the memory chunks. */
+static void mlx5_check_mempool_cb(struct rte_mempool *mp,
+ void *opaque, struct rte_mempool_memhdr *memhdr,
+ unsigned mem_idx)
+{
+ struct mlx5_check_mempool_data *data = opaque;
+
+ (void)mp;
+ (void)mem_idx;
+
+ /* It already failed, skip the next chunks. */
+ if (data->ret != 0)
+ return;
+ /* It is the first chunk. */
+ if (data->start == NULL && data->end == NULL) {
+ data->start = memhdr->addr;
+ data->end = data->start + memhdr->len;
+ return;
+ }
+ if (data->end == memhdr->addr) {
+ data->end += memhdr->len;
+ return;
+ }
+ if (data->start == (char *)memhdr->addr + memhdr->len) {
+ data->start -= memhdr->len;
+ return;
+ }
+ /* Error, mempool is not virtually contiguous. */
+ data->ret = -1;
+}
+
+/**
+ * Check if a mempool can be used: it must be virtually contiguous.
+ *
+ * @param[in] mp
+ * Pointer to memory pool.
+ * @param[out] start
+ * Pointer to the start address of the mempool virtual memory area
+ * @param[out] end
+ * Pointer to the end address of the mempool virtual memory area
+ *
+ * @return
+ * 0 on success (mempool is virtually contiguous), -1 on error.
+ */
+static int mlx5_check_mempool(struct rte_mempool *mp, uintptr_t *start,
+ uintptr_t *end)
+{
+ struct mlx5_check_mempool_data data;
+
+ memset(&data, 0, sizeof(data));
+ rte_mempool_mem_iter(mp, mlx5_check_mempool_cb, &data);
+ *start = (uintptr_t)data.start;
+ *end = (uintptr_t)data.end;
+
+ return data.ret;
+}
+
+/**
+ * Register mempool as a memory region.
+ *
+ * @param pd
+ * Pointer to protection domain.
+ * @param mp
+ * Pointer to memory pool.
+ *
+ * @return
+ * Memory region pointer, NULL in case of error.
+ */
+struct ibv_mr *
+mlx5_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
+{
+ const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+ uintptr_t start;
+ uintptr_t end;
+ unsigned int i;
+
+ if (mlx5_check_mempool(mp, &start, &end) != 0) {
+ ERROR("mempool %p: not virtually contiguous",
+ (void *)mp);
+ return NULL;
+ }
+
+ DEBUG("mempool %p area start=%p end=%p size=%zu",
+ (void *)mp, (void *)start, (void *)end,
+ (size_t)(end - start));
+ /* Round start and end to page boundary if found in memory segments. */
+ for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
+ uintptr_t addr = (uintptr_t)ms[i].addr;
+ size_t len = ms[i].len;
+ unsigned int align = ms[i].hugepage_sz;
+
+ if ((start > addr) && (start < addr + len))
+ start = RTE_ALIGN_FLOOR(start, align);
+ if ((end > addr) && (end < addr + len))
+ end = RTE_ALIGN_CEIL(end, align);
+ }
+ DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
+ (void *)mp, (void *)start, (void *)end,
+ (size_t)(end - start));
+ return ibv_reg_mr(pd,
+ (void *)start,
+ end - start,
+ IBV_ACCESS_LOCAL_WRITE);
+}
+
+/**
+ * Register a Memory Region (MR) <-> Memory Pool (MP) association in
+ * txq->mp2mr[]. If mp2mr[] is full, remove an entry first.
+ *
+ * This function should only be called by txq_mp2mr().
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param[in] mp
+ * Memory Pool for which a Memory Region lkey must be returned.
+ * @param idx
+ * Index of the next available entry.
+ *
+ * @return
+ * mr->lkey on success, (uint32_t)-1 on failure.
+ */
+uint32_t
+txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx)
+{
+ struct ibv_mr *mr;
+
+ /* Add a new entry, register MR first. */
+ DEBUG("%p: discovered new memory pool \"%s\" (%p)",
+ (void *)txq, mp->name, (void *)mp);
+ mr = mlx5_mp2mr(txq->priv->pd, mp);
+ if (unlikely(mr == NULL)) {
+ DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
+ (void *)txq);
+ return (uint32_t)-1;
+ }
+ if (unlikely(idx == RTE_DIM(txq->mp2mr))) {
+ /* Table is full, remove oldest entry. */
+ DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
+ (void *)txq);
+ --idx;
+ claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
+ memmove(&txq->mp2mr[0], &txq->mp2mr[1],
+ (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
+ }
+ /* Store the new entry. */
+ txq->mp2mr[idx].mp = mp;
+ txq->mp2mr[idx].mr = mr;
+ txq->mp2mr[idx].lkey = mr->lkey;
+ DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
+ (void *)txq, mp->name, (void *)mp, txq->mp2mr[idx].lkey);
+ return txq->mp2mr[idx].lkey;
+}
+
+struct txq_mp2mr_mbuf_check_data {
+ int ret;
+};
+
+/**
+ * Callback function for rte_mempool_obj_iter() to check whether a given
+ * mempool object looks like a mbuf.
+ *
+ * @param[in] mp
+ * The mempool pointer
+ * @param[in] arg
+ * Context data (struct txq_mp2mr_mbuf_check_data). Contains the
+ * return value.
+ * @param[in] obj
+ * Object address.
+ * @param index
+ * Object index, unused.
+ */
+static void
+txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj,
+ uint32_t index __rte_unused)
+{
+ struct txq_mp2mr_mbuf_check_data *data = arg;
+ struct rte_mbuf *buf = obj;
+
+ /* Check whether mbuf structure fits element size and whether mempool
+ * pointer is valid. */
+ if (sizeof(*buf) > mp->elt_size || buf->pool != mp)
+ data->ret = -1;
+}
+
+/**
+ * Iterator function for rte_mempool_walk() to register existing mempools and
+ * fill the MP to MR cache of a TX queue.
+ *
+ * @param[in] mp
+ * Memory Pool to register.
+ * @param *arg
+ * Pointer to TX queue structure.
+ */
+void
+txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
+{
+ struct txq *txq = arg;
+ struct txq_mp2mr_mbuf_check_data data = {
+ .ret = 0,
+ };
+ unsigned int i;
+
+ /* Register mempool only if the first element looks like a mbuf. */
+ if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 ||
+ data.ret == -1)
+ return;
+ for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
+ if (unlikely(txq->mp2mr[i].mp == NULL)) {
+ /* Unknown MP, add a new MR for it. */
+ break;
+ }
+ if (txq->mp2mr[i].mp == mp)
+ return;
+ }
+ txq_mp2mr_reg(txq, mp, i);
+}
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index e0bcfa6..aeeddfb 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -140,121 +140,6 @@ txq_complete(struct txq *txq)
return 0;
}

-struct mlx5_check_mempool_data {
- int ret;
- char *start;
- char *end;
-};
-
-/* Called by mlx5_check_mempool() when iterating the memory chunks. */
-static void mlx5_check_mempool_cb(struct rte_mempool *mp,
- void *opaque, struct rte_mempool_memhdr *memhdr,
- unsigned mem_idx)
-{
- struct mlx5_check_mempool_data *data = opaque;
-
- (void)mp;
- (void)mem_idx;
-
- /* It already failed, skip the next chunks. */
- if (data->ret != 0)
- return;
- /* It is the first chunk. */
- if (data->start == NULL && data->end == NULL) {
- data->start = memhdr->addr;
- data->end = data->start + memhdr->len;
- return;
- }
- if (data->end == memhdr->addr) {
- data->end += memhdr->len;
- return;
- }
- if (data->start == (char *)memhdr->addr + memhdr->len) {
- data->start -= memhdr->len;
- return;
- }
- /* Error, mempool is not virtually contigous. */
- data->ret = -1;
-}
-
-/**
- * Check if a mempool can be used: it must be virtually contiguous.
- *
- * @param[in] mp
- * Pointer to memory pool.
- * @param[out] start
- * Pointer to the start address of the mempool virtual memory area
- * @param[out] end
- * Pointer to the end address of the mempool virtual memory area
- *
- * @return
- * 0 on success (mempool is virtually contiguous), -1 on error.
- */
-static int mlx5_check_mempool(struct rte_mempool *mp, uintptr_t *start,
- uintptr_t *end)
-{
- struct mlx5_check_mempool_data data;
-
- memset(&data, 0, sizeof(data));
- rte_mempool_mem_iter(mp, mlx5_check_mempool_cb, &data);
- *start = (uintptr_t)data.start;
- *end = (uintptr_t)data.end;
-
- return data.ret;
-}
-
-/* For best performance, this function should not be inlined. */
-struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, struct rte_mempool *)
- __attribute__((noinline));
-
-/**
- * Register mempool as a memory region.
- *
- * @param pd
- * Pointer to protection domain.
- * @param mp
- * Pointer to memory pool.
- *
- * @return
- * Memory region pointer, NULL in case of error.
- */
-struct ibv_mr *
-mlx5_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
-{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
- uintptr_t start;
- uintptr_t end;
- unsigned int i;
-
- if (mlx5_check_mempool(mp, &start, &end) != 0) {
- ERROR("mempool %p: not virtually contiguous",
- (void *)mp);
- return NULL;
- }
-
- DEBUG("mempool %p area start=%p end=%p size=%zu",
- (void *)mp, (void *)start, (void *)end,
- (size_t)(end - start));
- /* Round start and end to page boundary if found in memory segments. */
- for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
- uintptr_t addr = (uintptr_t)ms[i].addr;
- size_t len = ms[i].len;
- unsigned int align = ms[i].hugepage_sz;
-
- if ((start > addr) && (start < addr + len))
- start = RTE_ALIGN_FLOOR(start, align);
- if ((end > addr) && (end < addr + len))
- end = RTE_ALIGN_CEIL(end, align);
- }
- DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
- (void *)mp, (void *)start, (void *)end,
- (size_t)(end - start));
- return ibv_reg_mr(pd,
- (void *)start,
- end - start,
- IBV_ACCESS_LOCAL_WRITE);
-}
-
/**
* Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
* the cloned mbuf is allocated is returned instead.
@@ -273,6 +158,10 @@ txq_mb2mp(struct rte_mbuf *buf)
return buf->pool;
}

+static inline uint32_t
+txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+ __attribute__((always_inline));
+
/**
* Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
* Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
@@ -286,11 +175,11 @@ txq_mb2mp(struct rte_mbuf *buf)
* @return
* mr->lkey on success, (uint32_t)-1 on failure.
*/
-static uint32_t
+static inline uint32_t
txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
{
unsigned int i;
- struct ibv_mr *mr;
+ uint32_t lkey = (uint32_t)-1;

for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
if (unlikely(txq->mp2mr[i].mp == NULL)) {
@@ -300,89 +189,13 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
if (txq->mp2mr[i].mp == mp) {
assert(txq->mp2mr[i].lkey != (uint32_t)-1);
assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
- return txq->mp2mr[i].lkey;
+ lkey = txq->mp2mr[i].lkey;
+ break;
}
}
- /* Add a new entry, register MR first. */
- DEBUG("%p: discovered new memory pool \"%s\" (%p)",
- (void *)txq, mp->name, (void *)mp);
- mr = mlx5_mp2mr(txq->priv->pd, mp);
- if (unlikely(mr == NULL)) {
- DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
- (void *)txq);
- return (uint32_t)-1;
- }
- if (unlikely(i == RTE_DIM(txq->mp2mr))) {
- /* Table is full, remove oldest entry. */
- DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
- (void *)txq);
- --i;
- claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
- memmove(&txq->mp2mr[0], &txq->mp2mr[1],
- (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
- }
- /* Store the new entry. */
- txq->mp2mr[i].mp = mp;
- txq->mp2mr[i].mr = mr;
- txq->mp2mr[i].lkey = mr->lkey;
- DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
- (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
- return txq->mp2mr[i].lkey;
-}
-
-struct txq_mp2mr_mbuf_check_data {
- int ret;
-};
-
-/**
- * Callback function for rte_mempool_obj_iter() to check whether a given
- * mempool object looks like a mbuf.
- *
- * @param[in] mp
- * The mempool pointer
- * @param[in] arg
- * Context data (struct txq_mp2mr_mbuf_check_data). Contains the
- * return value.
- * @param[in] obj
- * Object address.
- * @param index
- * Object index, unused.
- */
-static void
-txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj,
- uint32_t index __rte_unused)
-{
- struct txq_mp2mr_mbuf_check_data *data = arg;
- struct rte_mbuf *buf = obj;
-
- /* Check whether mbuf structure fits element size and whether mempool
- * pointer is valid. */
- if (sizeof(*buf) > mp->elt_size || buf->pool != mp)
- data->ret = -1;
-}
-
-/**
- * Iterator function for rte_mempool_walk() to register existing mempools and
- * fill the MP to MR cache of a TX queue.
- *
- * @param[in] mp
- * Memory Pool to register.
- * @param *arg
- * Pointer to TX queue structure.
- */
-void
-txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
-{
- struct txq *txq = arg;
- struct txq_mp2mr_mbuf_check_data data = {
- .ret = 0,
- };
-
- /* Register mempool only if the first element looks like a mbuf. */
- if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 ||
- data.ret == -1)
- return;
- txq_mp2mr(txq, mp);
+ if (unlikely(lkey == (uint32_t)-1))
+ lkey = txq_mp2mr_reg(txq, mp, i);
+ return lkey;
}

/**
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 47f6299..462eddf 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -337,12 +337,16 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);

/* mlx5_rxtx.c */

-struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, struct rte_mempool *);
-void txq_mp2mr_iter(struct rte_mempool *, void *);
uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_rx_burst_sp(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);

+/* mlx5_mr.c */
+
+struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, struct rte_mempool *);
+void txq_mp2mr_iter(struct rte_mempool *, void *);
+uint32_t txq_mp2mr_reg(struct txq *, struct rte_mempool *, unsigned int);
+
#endif /* RTE_PMD_MLX5_RXTX_H_ */
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:15 UTC
This is done in preparation for bypassing Verbs entirely in the data path
as a performance improvement. TX gather cannot be maintained during the
transition and will be reimplemented later.
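
Until gather support comes back, an application that produces multi-segment
mbufs has to coalesce them before handing them to this TX path. Below is a
minimal sketch of such a fallback in a plain DPDK application context; the
helper name and the separate copy mempool are illustrative and not part of
this patch. The original chain still has to be freed by the caller once the
copy is queued, and offload flags/metadata are not carried over.

#include <stdint.h>
#include <rte_mbuf.h>
#include <rte_memcpy.h>

/* Copy a chained mbuf into one freshly allocated single-segment mbuf so
 * it can be handed to a TX path limited to max_send_sge = 1. Returns
 * NULL when the packet does not fit the data room of a single mbuf. */
static struct rte_mbuf *
linearize_pkt(struct rte_mempool *mp, struct rte_mbuf *m)
{
	uint32_t pkt_len = rte_pktmbuf_pkt_len(m);
	struct rte_mbuf *copy;
	char *dst;

	if (pkt_len > UINT16_MAX)
		return NULL;
	copy = rte_pktmbuf_alloc(mp);
	if (copy == NULL)
		return NULL;
	dst = rte_pktmbuf_append(copy, pkt_len);
	if (dst == NULL) {
		/* Does not fit the data room of a single mbuf. */
		rte_pktmbuf_free(copy);
		return NULL;
	}
	for (; m != NULL; m = m->next) {
		rte_memcpy(dst, rte_pktmbuf_mtod(m, void *), m->data_len);
		dst += m->data_len;
	}
	return copy;
}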

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_ethdev.c | 2 +-
drivers/net/mlx5/mlx5_rxtx.c | 315 ++++++++---------------------------------
drivers/net/mlx5/mlx5_rxtx.h | 17 ---
drivers/net/mlx5/mlx5_txq.c | 49 ++-----
4 files changed, 69 insertions(+), 314 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 0a881b6..280a90a 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1260,7 +1260,7 @@ mlx5_secondary_data_setup(struct priv *priv)
if (txq != NULL) {
if (txq_setup(priv->dev,
txq,
- primary_txq->elts_n * MLX5_PMD_SGE_WR_N,
+ primary_txq->elts_n,
primary_txq->socket,
NULL) == 0) {
txq->stats.idx = primary_txq->stats.idx;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index aeeddfb..4d90631 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -228,156 +228,6 @@ insert_vlan_sw(struct rte_mbuf *buf)
return 0;
}

-#if MLX5_PMD_SGE_WR_N > 1
-
-/**
- * Copy scattered mbuf contents to a single linear buffer.
- *
- * @param[out] linear
- * Linear output buffer.
- * @param[in] buf
- * Scattered input buffer.
- *
- * @return
- * Number of bytes copied to the output buffer or 0 if not large enough.
- */
-static unsigned int
-linearize_mbuf(linear_t *linear, struct rte_mbuf *buf)
-{
- unsigned int size = 0;
- unsigned int offset;
-
- do {
- unsigned int len = DATA_LEN(buf);
-
- offset = size;
- size += len;
- if (unlikely(size > sizeof(*linear)))
- return 0;
- memcpy(&(*linear)[offset],
- rte_pktmbuf_mtod(buf, uint8_t *),
- len);
- buf = NEXT(buf);
- } while (buf != NULL);
- return size;
-}
-
-/**
- * Handle scattered buffers for mlx5_tx_burst().
- *
- * @param txq
- * TX queue structure.
- * @param segs
- * Number of segments in buf.
- * @param elt
- * TX queue element to fill.
- * @param[in] buf
- * Buffer to process.
- * @param elts_head
- * Index of the linear buffer to use if necessary (normally txq->elts_head).
- * @param[out] sges
- * Array filled with SGEs on success.
- *
- * @return
- * A structure containing the processed packet size in bytes and the
- * number of SGEs. Both fields are set to (unsigned int)-1 in case of
- * failure.
- */
-static struct tx_burst_sg_ret {
- unsigned int length;
- unsigned int num;
-}
-tx_burst_sg(struct txq *txq, unsigned int segs, struct txq_elt *elt,
- struct rte_mbuf *buf, unsigned int elts_head,
- struct ibv_sge (*sges)[MLX5_PMD_SGE_WR_N])
-{
- unsigned int sent_size = 0;
- unsigned int j;
- int linearize = 0;
-
- /* When there are too many segments, extra segments are
- * linearized in the last SGE. */
- if (unlikely(segs > RTE_DIM(*sges))) {
- segs = (RTE_DIM(*sges) - 1);
- linearize = 1;
- }
- /* Update element. */
- elt->buf = buf;
- /* Register segments as SGEs. */
- for (j = 0; (j != segs); ++j) {
- struct ibv_sge *sge = &(*sges)[j];
- uint32_t lkey;
-
- /* Retrieve Memory Region key for this memory pool. */
- lkey = txq_mp2mr(txq, txq_mb2mp(buf));
- if (unlikely(lkey == (uint32_t)-1)) {
- /* MR does not exist. */
- DEBUG("%p: unable to get MP <-> MR association",
- (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
- /* Update SGE. */
- sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
- if (txq->priv->sriov)
- rte_prefetch0((volatile void *)
- (uintptr_t)sge->addr);
- sge->length = DATA_LEN(buf);
- sge->lkey = lkey;
- sent_size += sge->length;
- buf = NEXT(buf);
- }
- /* If buf is not NULL here and is not going to be linearized,
- * nb_segs is not valid. */
- assert(j == segs);
- assert((buf == NULL) || (linearize));
- /* Linearize extra segments. */
- if (linearize) {
- struct ibv_sge *sge = &(*sges)[segs];
- linear_t *linear = &(*txq->elts_linear)[elts_head];
- unsigned int size = linearize_mbuf(linear, buf);
-
- assert(segs == (RTE_DIM(*sges) - 1));
- if (size == 0) {
- /* Invalid packet. */
- DEBUG("%p: packet too large to be linearized.",
- (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
- /* If MLX5_PMD_SGE_WR_N is 1, free mbuf immediately. */
- if (RTE_DIM(*sges) == 1) {
- do {
- struct rte_mbuf *next = NEXT(buf);
-
- rte_pktmbuf_free_seg(buf);
- buf = next;
- } while (buf != NULL);
- elt->buf = NULL;
- }
- /* Update SGE. */
- sge->addr = (uintptr_t)&(*linear)[0];
- sge->length = size;
- sge->lkey = txq->mr_linear->lkey;
- sent_size += size;
- /* Include last segment. */
- segs++;
- }
- return (struct tx_burst_sg_ret){
- .length = sent_size,
- .num = segs,
- };
-stop:
- return (struct tx_burst_sg_ret){
- .length = -1,
- .num = -1,
- };
-}
-
-#endif /* MLX5_PMD_SGE_WR_N > 1 */
-
/**
* DPDK callback for TX.
*
@@ -424,14 +274,14 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
unsigned int elts_head_next =
(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
struct txq_elt *elt = &(*txq->elts)[elts_head];
- unsigned int segs = NB_SEGS(buf);
-#ifdef MLX5_PMD_SOFT_COUNTERS
- unsigned int sent_size = 0;
-#endif
uint32_t send_flags = 0;
#ifdef HAVE_VERBS_VLAN_INSERTION
int insert_vlan = 0;
#endif /* HAVE_VERBS_VLAN_INSERTION */
+ uintptr_t addr;
+ uint32_t length;
+ uint32_t lkey;
+ uintptr_t buf_next_addr;

if (i + 1 < max)
rte_prefetch0(buf_next);
@@ -464,126 +314,81 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
goto stop;
}
}
- if (likely(segs == 1)) {
- uintptr_t addr;
- uint32_t length;
- uint32_t lkey;
- uintptr_t buf_next_addr;
-
- /* Retrieve buffer information. */
- addr = rte_pktmbuf_mtod(buf, uintptr_t);
- length = DATA_LEN(buf);
- /* Update element. */
- elt->buf = buf;
- if (txq->priv->sriov)
- rte_prefetch0((volatile void *)
- (uintptr_t)addr);
- /* Prefetch next buffer data. */
- if (i + 1 < max) {
- buf_next_addr =
- rte_pktmbuf_mtod(buf_next, uintptr_t);
- rte_prefetch0((volatile void *)
- (uintptr_t)buf_next_addr);
- }
- /* Put packet into send queue. */
+ /* Retrieve buffer information. */
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ length = DATA_LEN(buf);
+ /* Update element. */
+ elt->buf = buf;
+ if (txq->priv->sriov)
+ rte_prefetch0((volatile void *)
+ (uintptr_t)addr);
+ /* Prefetch next buffer data. */
+ if (i + 1 < max) {
+ buf_next_addr =
+ rte_pktmbuf_mtod(buf_next, uintptr_t);
+ rte_prefetch0((volatile void *)
+ (uintptr_t)buf_next_addr);
+ }
+ /* Put packet into send queue. */
#if MLX5_PMD_MAX_INLINE > 0
- if (length <= txq->max_inline) {
+ if (length <= txq->max_inline) {
#ifdef HAVE_VERBS_VLAN_INSERTION
- if (insert_vlan)
- err = txq->send_pending_inline_vlan
- (txq->qp,
- (void *)addr,
- length,
- send_flags,
- &buf->vlan_tci);
- else
-#endif /* HAVE_VERBS_VLAN_INSERTION */
- err = txq->send_pending_inline
- (txq->qp,
- (void *)addr,
- length,
- send_flags);
- } else
-#endif
- {
- /* Retrieve Memory Region key for this
- * memory pool. */
- lkey = txq_mp2mr(txq, txq_mb2mp(buf));
- if (unlikely(lkey == (uint32_t)-1)) {
- /* MR does not exist. */
- DEBUG("%p: unable to get MP <-> MR"
- " association", (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
-#ifdef HAVE_VERBS_VLAN_INSERTION
- if (insert_vlan)
- err = txq->send_pending_vlan
- (txq->qp,
- addr,
- length,
- lkey,
- send_flags,
- &buf->vlan_tci);
- else
+ if (insert_vlan)
+ err = txq->send_pending_inline_vlan
+ (txq->qp,
+ (void *)addr,
+ length,
+ send_flags,
+ &buf->vlan_tci);
+ else
#endif /* HAVE_VERBS_VLAN_INSERTION */
- err = txq->send_pending
- (txq->qp,
- addr,
- length,
- lkey,
- send_flags);
- }
- if (unlikely(err))
- goto stop;
-#ifdef MLX5_PMD_SOFT_COUNTERS
- sent_size += length;
+ err = txq->send_pending_inline
+ (txq->qp,
+ (void *)addr,
+ length,
+ send_flags);
+ } else
#endif
- } else {
-#if MLX5_PMD_SGE_WR_N > 1
- struct ibv_sge sges[MLX5_PMD_SGE_WR_N];
- struct tx_burst_sg_ret ret;
-
- ret = tx_burst_sg(txq, segs, elt, buf, elts_head,
- &sges);
- if (ret.length == (unsigned int)-1)
+ {
+ /* Retrieve Memory Region key for this
+ * memory pool. */
+ lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+ if (unlikely(lkey == (uint32_t)-1)) {
+ /* MR does not exist. */
+ DEBUG("%p: unable to get MP <-> MR"
+ " association", (void *)txq);
+ /* Clean up TX element. */
+ elt->buf = NULL;
goto stop;
- /* Put SG list into send queue. */
+ }
#ifdef HAVE_VERBS_VLAN_INSERTION
if (insert_vlan)
- err = txq->send_pending_sg_list_vlan
+ err = txq->send_pending_vlan
(txq->qp,
- sges,
- ret.num,
+ addr,
+ length,
+ lkey,
send_flags,
&buf->vlan_tci);
else
#endif /* HAVE_VERBS_VLAN_INSERTION */
- err = txq->send_pending_sg_list
+ err = txq->send_pending
(txq->qp,
- sges,
- ret.num,
+ addr,
+ length,
+ lkey,
send_flags);
- if (unlikely(err))
- goto stop;
-#ifdef MLX5_PMD_SOFT_COUNTERS
- sent_size += ret.length;
-#endif
-#else /* MLX5_PMD_SGE_WR_N > 1 */
- DEBUG("%p: TX scattered buffers support not"
- " compiled in", (void *)txq);
- goto stop;
-#endif /* MLX5_PMD_SGE_WR_N > 1 */
}
- elts_head = elts_head_next;
- buf = buf_next;
+ if (unlikely(err))
+ goto stop;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent bytes counter. */
- txq->stats.obytes += sent_size;
+ txq->stats.obytes += length;
#endif
- }
stop:
+ elts_head = elts_head_next;
+ buf = buf_next;
+ }
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 462eddf..8358ccb 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -242,14 +242,6 @@ struct txq_elt {
struct rte_mbuf *buf;
};

-/* Linear buffer type. It is used when transmitting buffers with too many
- * segments that do not fit the hardware queue (see max_send_sge).
- * Extra segments are copied (linearized) in such buffers, replacing the
- * last SGE during TX.
- * The size is arbitrary but large enough to hold a jumbo frame with
- * 8 segments considering mbuf.buf_len is about 2048 bytes. */
-typedef uint8_t linear_t[16384];
-
/* TX queue descriptor. */
struct txq {
struct priv *priv; /* Back pointer to private data. */
@@ -264,12 +256,6 @@ struct txq {
int (*send_pending_inline_vlan)();
#endif
#endif
-#if MLX5_PMD_SGE_WR_N > 1
- int (*send_pending_sg_list)();
-#ifdef HAVE_VERBS_VLAN_INSERTION
- int (*send_pending_sg_list_vlan)();
-#endif
-#endif
int (*send_flush)(struct ibv_qp *qp);
struct ibv_cq *cq; /* Completion Queue. */
struct ibv_qp *qp; /* Queue Pair. */
@@ -289,9 +275,6 @@ struct txq {
uint32_t lkey; /* mr->lkey */
} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
struct mlx5_txq_stats stats; /* TX queue counters. */
- /* Elements used only for init part are here. */
- linear_t (*elts_linear)[]; /* Linearized buffers. */
- struct ibv_mr *mr_linear; /* Memory Region for linearized buffers. */
#ifdef HAVE_VERBS_VLAN_INSERTION
struct ibv_exp_qp_burst_family_v1 *if_qp; /* QP burst interface. */
#else
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index e20df21..5a248c9 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -82,26 +82,13 @@ txq_alloc_elts(struct txq *txq, unsigned int elts_n)
unsigned int i;
struct txq_elt (*elts)[elts_n] =
rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq->socket);
- linear_t (*elts_linear)[elts_n] =
- rte_calloc_socket("TXQ", 1, sizeof(*elts_linear), 0,
- txq->socket);
- struct ibv_mr *mr_linear = NULL;
int ret = 0;

- if ((elts == NULL) || (elts_linear == NULL)) {
+ if (elts == NULL) {
ERROR("%p: can't allocate packets array", (void *)txq);
ret = ENOMEM;
goto error;
}
- mr_linear =
- ibv_reg_mr(txq->priv->pd, elts_linear, sizeof(*elts_linear),
- IBV_ACCESS_LOCAL_WRITE);
- if (mr_linear == NULL) {
- ERROR("%p: unable to configure MR, ibv_reg_mr() failed",
- (void *)txq);
- ret = EINVAL;
- goto error;
- }
for (i = 0; (i != elts_n); ++i) {
struct txq_elt *elt = &(*elts)[i];

@@ -119,15 +106,9 @@ txq_alloc_elts(struct txq *txq, unsigned int elts_n)
((MLX5_PMD_TX_PER_COMP_REQ < (elts_n / 4)) ?
MLX5_PMD_TX_PER_COMP_REQ : (elts_n / 4));
txq->elts_comp_cd = txq->elts_comp_cd_init;
- txq->elts_linear = elts_linear;
- txq->mr_linear = mr_linear;
assert(ret == 0);
return 0;
error:
- if (mr_linear != NULL)
- claim_zero(ibv_dereg_mr(mr_linear));
-
- rte_free(elts_linear);
rte_free(elts);

DEBUG("%p: failed, freed everything", (void *)txq);
@@ -148,8 +129,6 @@ txq_free_elts(struct txq *txq)
unsigned int elts_head = txq->elts_head;
unsigned int elts_tail = txq->elts_tail;
struct txq_elt (*elts)[elts_n] = txq->elts;
- linear_t (*elts_linear)[elts_n] = txq->elts_linear;
- struct ibv_mr *mr_linear = txq->mr_linear;

DEBUG("%p: freeing WRs", (void *)txq);
txq->elts_n = 0;
@@ -159,12 +138,7 @@ txq_free_elts(struct txq *txq)
txq->elts_comp_cd = 0;
txq->elts_comp_cd_init = 0;
txq->elts = NULL;
- txq->elts_linear = NULL;
- txq->mr_linear = NULL;
- if (mr_linear != NULL)
- claim_zero(ibv_dereg_mr(mr_linear));

- rte_free(elts_linear);
if (elts == NULL)
return;
while (elts_tail != elts_head) {
@@ -286,12 +260,14 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
int ret = 0;

(void)conf; /* Thresholds configuration (ignored). */
- if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
- ERROR("%p: invalid number of TX descriptors (must be a"
- " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
+ if (desc == 0) {
+ ERROR("%p: invalid number of TX descriptors", (void *)dev);
+ return EINVAL;
+ }
+ if (MLX5_PMD_SGE_WR_N > 1) {
+ ERROR("%p: TX gather is not supported", (void *)dev);
return EINVAL;
}
- desc /= MLX5_PMD_SGE_WR_N;
/* MRs will be registered in mp2mr[] later. */
attr.rd = (struct ibv_exp_res_domain_init_attr){
.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
@@ -332,10 +308,7 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
priv->device_attr.max_qp_wr :
desc),
/* Max number of scatter/gather elements in a WR. */
- .max_send_sge = ((priv->device_attr.max_sge <
- MLX5_PMD_SGE_WR_N) ?
- priv->device_attr.max_sge :
- MLX5_PMD_SGE_WR_N),
+ .max_send_sge = 1,
#if MLX5_PMD_MAX_INLINE > 0
.max_inline_data = MLX5_PMD_MAX_INLINE,
#endif
@@ -440,12 +413,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
txq->send_pending_inline_vlan = txq->if_qp->send_pending_inline_vlan;
#endif
#endif
-#if MLX5_PMD_SGE_WR_N > 1
- txq->send_pending_sg_list = txq->if_qp->send_pending_sg_list;
-#ifdef HAVE_VERBS_VLAN_INSERTION
- txq->send_pending_sg_list_vlan = txq->if_qp->send_pending_sg_list_vlan;
-#endif
-#endif
txq->send_pending = txq->if_qp->send_pending;
#ifdef HAVE_VERBS_VLAN_INSERTION
txq->send_pending_vlan = txq->if_qp->send_pending_vlan;
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:16 UTC
This is done in preparation for bypassing Verbs entirely in the data path
as a performance improvement. RX scatter cannot be maintained during the
transition and will be reimplemented later.
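
In practice this means a received frame must now fit the data room of a
single mbuf, which is what the MTU check in mlx5_dev_set_mtu() below
enforces. Here is a rough application-side sanity check under the same
assumption; the function name is illustrative and the overhead constant
(L2 header, CRC and one VLAN tag) is a simplification rather than the
PMD's exact arithmetic.

#include <stdint.h>
#include <rte_mbuf.h>
#include <rte_ether.h>

/* With RX scatter gone, refuse an MTU whose resulting frame cannot fit
 * the data room of one mbuf from the RX mempool. */
static int
mtu_fits_single_mbuf(struct rte_mempool *mp, uint16_t mtu)
{
	uint32_t mb_len = rte_pktmbuf_data_room_size(mp);
	uint32_t max_frame_len = mtu + ETHER_HDR_LEN + ETHER_CRC_LEN + 4;

	return max_frame_len <= mb_len - RTE_PKTMBUF_HEADROOM;
}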

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_ethdev.c | 31 +---
drivers/net/mlx5/mlx5_rxq.c | 314 ++++++-----------------------------------
drivers/net/mlx5/mlx5_rxtx.c | 212 +---------------------------
drivers/net/mlx5/mlx5_rxtx.h | 13 +-
4 files changed, 53 insertions(+), 517 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 280a90a..ca57021 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -623,8 +623,7 @@ mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)

};

- if (dev->rx_pkt_burst == mlx5_rx_burst ||
- dev->rx_pkt_burst == mlx5_rx_burst_sp)
+ if (dev->rx_pkt_burst == mlx5_rx_burst)
return ptypes;
return NULL;
}
@@ -762,19 +761,11 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
mb_len = rte_pktmbuf_data_room_size(rxq->mp);
assert(mb_len >= RTE_PKTMBUF_HEADROOM);
sp = (max_frame_len > (mb_len - RTE_PKTMBUF_HEADROOM));
- /* Provide new values to rxq_setup(). */
- dev->data->dev_conf.rxmode.jumbo_frame = sp;
- dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len;
- ret = rxq_rehash(dev, rxq);
- if (ret) {
- /* Force SP RX if that queue requires it and abort. */
- if (rxq->sp)
- rx_func = mlx5_rx_burst_sp;
- break;
+ if (sp) {
+ ERROR("%p: RX scatter is not supported", (void *)dev);
+ ret = ENOTSUP;
+ goto out;
}
- /* Scattered burst function takes priority. */
- if (rxq->sp)
- rx_func = mlx5_rx_burst_sp;
}
/* Burst functions can now be called again. */
rte_wmb();
@@ -1103,22 +1094,12 @@ priv_set_link(struct priv *priv, int up)
{
struct rte_eth_dev *dev = priv->dev;
int err;
- unsigned int i;

if (up) {
err = priv_set_flags(priv, ~IFF_UP, IFF_UP);
if (err)
return err;
- for (i = 0; i < priv->rxqs_n; i++)
- if ((*priv->rxqs)[i]->sp)
- break;
- /* Check if an sp queue exists.
- * Note: Some old frames might be received.
- */
- if (i == priv->rxqs_n)
- dev->rx_pkt_burst = mlx5_rx_burst;
- else
- dev->rx_pkt_burst = mlx5_rx_burst_sp;
+ dev->rx_pkt_burst = mlx5_rx_burst;
dev->tx_pkt_burst = mlx5_tx_burst;
} else {
err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP);
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 0bcf55b..38ff9fd 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -634,145 +634,6 @@ priv_rehash_flows(struct priv *priv)
}

/**
- * Allocate RX queue elements with scattered packets support.
- *
- * @param rxq
- * Pointer to RX queue structure.
- * @param elts_n
- * Number of elements to allocate.
- * @param[in] pool
- * If not NULL, fetch buffers from this array instead of allocating them
- * with rte_pktmbuf_alloc().
- *
- * @return
- * 0 on success, errno value on failure.
- */
-static int
-rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n,
- struct rte_mbuf **pool)
-{
- unsigned int i;
- struct rxq_elt_sp (*elts)[elts_n] =
- rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
- rxq->socket);
- int ret = 0;
-
- if (elts == NULL) {
- ERROR("%p: can't allocate packets array", (void *)rxq);
- ret = ENOMEM;
- goto error;
- }
- /* For each WR (packet). */
- for (i = 0; (i != elts_n); ++i) {
- unsigned int j;
- struct rxq_elt_sp *elt = &(*elts)[i];
- struct ibv_sge (*sges)[RTE_DIM(elt->sges)] = &elt->sges;
-
- /* These two arrays must have the same size. */
- assert(RTE_DIM(elt->sges) == RTE_DIM(elt->bufs));
- /* For each SGE (segment). */
- for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
- struct ibv_sge *sge = &(*sges)[j];
- struct rte_mbuf *buf;
-
- if (pool != NULL) {
- buf = *(pool++);
- assert(buf != NULL);
- rte_pktmbuf_reset(buf);
- } else
- buf = rte_pktmbuf_alloc(rxq->mp);
- if (buf == NULL) {
- assert(pool == NULL);
- ERROR("%p: empty mbuf pool", (void *)rxq);
- ret = ENOMEM;
- goto error;
- }
- elt->bufs[j] = buf;
- /* Headroom is reserved by rte_pktmbuf_alloc(). */
- assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
- /* Buffer is supposed to be empty. */
- assert(rte_pktmbuf_data_len(buf) == 0);
- assert(rte_pktmbuf_pkt_len(buf) == 0);
- /* sge->addr must be able to store a pointer. */
- assert(sizeof(sge->addr) >= sizeof(uintptr_t));
- if (j == 0) {
- /* The first SGE keeps its headroom. */
- sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
- sge->length = (buf->buf_len -
- RTE_PKTMBUF_HEADROOM);
- } else {
- /* Subsequent SGEs lose theirs. */
- assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
- SET_DATA_OFF(buf, 0);
- sge->addr = (uintptr_t)buf->buf_addr;
- sge->length = buf->buf_len;
- }
- sge->lkey = rxq->mr->lkey;
- /* Redundant check for tailroom. */
- assert(sge->length == rte_pktmbuf_tailroom(buf));
- }
- }
- DEBUG("%p: allocated and configured %u WRs (%zu segments)",
- (void *)rxq, elts_n, (elts_n * RTE_DIM((*elts)[0].sges)));
- rxq->elts_n = elts_n;
- rxq->elts_head = 0;
- rxq->elts.sp = elts;
- assert(ret == 0);
- return 0;
-error:
- if (elts != NULL) {
- assert(pool == NULL);
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- unsigned int j;
- struct rxq_elt_sp *elt = &(*elts)[i];
-
- for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
- struct rte_mbuf *buf = elt->bufs[j];
-
- if (buf != NULL)
- rte_pktmbuf_free_seg(buf);
- }
- }
- rte_free(elts);
- }
- DEBUG("%p: failed, freed everything", (void *)rxq);
- assert(ret > 0);
- return ret;
-}
-
-/**
- * Free RX queue elements with scattered packets support.
- *
- * @param rxq
- * Pointer to RX queue structure.
- */
-static void
-rxq_free_elts_sp(struct rxq *rxq)
-{
- unsigned int i;
- unsigned int elts_n = rxq->elts_n;
- struct rxq_elt_sp (*elts)[elts_n] = rxq->elts.sp;
-
- DEBUG("%p: freeing WRs", (void *)rxq);
- rxq->elts_n = 0;
- rxq->elts.sp = NULL;
- if (elts == NULL)
- return;
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- unsigned int j;
- struct rxq_elt_sp *elt = &(*elts)[i];
-
- for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
- struct rte_mbuf *buf = elt->bufs[j];
-
- if (buf != NULL)
- rte_pktmbuf_free_seg(buf);
- }
- }
- rte_free(elts);
-}
-
-/**
* Allocate RX queue elements.
*
* @param rxq
@@ -838,7 +699,7 @@ rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
(void *)rxq, elts_n);
rxq->elts_n = elts_n;
rxq->elts_head = 0;
- rxq->elts.no_sp = elts;
+ rxq->elts = elts;
assert(ret == 0);
return 0;
error:
@@ -869,11 +730,11 @@ rxq_free_elts(struct rxq *rxq)
{
unsigned int i;
unsigned int elts_n = rxq->elts_n;
- struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp;
+ struct rxq_elt (*elts)[elts_n] = rxq->elts;

DEBUG("%p: freeing WRs", (void *)rxq);
rxq->elts_n = 0;
- rxq->elts.no_sp = NULL;
+ rxq->elts = NULL;
if (elts == NULL)
return;
for (i = 0; (i != RTE_DIM(*elts)); ++i) {
@@ -900,10 +761,7 @@ rxq_cleanup(struct rxq *rxq)
struct ibv_exp_release_intf_params params;

DEBUG("cleaning up %p", (void *)rxq);
- if (rxq->sp)
- rxq_free_elts_sp(rxq);
- else
- rxq_free_elts(rxq);
+ rxq_free_elts(rxq);
rxq->poll = NULL;
rxq->recv = NULL;
if (rxq->if_wq != NULL) {
@@ -973,12 +831,12 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
struct rte_mbuf **pool;
unsigned int i, k;
struct ibv_exp_wq_attr mod;
- unsigned int mb_len = rte_pktmbuf_data_room_size(rxq->mp);
+ struct rxq_elt (*elts)[tmpl.elts_n];
int err;

DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
/* Number of descriptors and mbufs currently allocated. */
- desc_n = (tmpl.elts_n * (tmpl.sp ? MLX5_PMD_SGE_WR_N : 1));
+ desc_n = tmpl.elts_n;
mbuf_n = desc_n;
/* Toggle RX checksum offload if hardware supports it. */
if (priv->hw_csum) {
@@ -989,22 +847,6 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
rxq->csum_l2tun = tmpl.csum_l2tun;
}
- /* Enable scattered packets support for this queue if necessary. */
- assert(mb_len >= RTE_PKTMBUF_HEADROOM);
- if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
- (dev->data->dev_conf.rxmode.max_rx_pkt_len >
- (mb_len - RTE_PKTMBUF_HEADROOM))) {
- tmpl.sp = 1;
- desc_n /= MLX5_PMD_SGE_WR_N;
- } else
- tmpl.sp = 0;
- DEBUG("%p: %s scattered packets support (%u WRs)",
- (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc_n);
- /* If scatter mode is the same as before, nothing to do. */
- if (tmpl.sp == rxq->sp) {
- DEBUG("%p: nothing to do", (void *)dev);
- return 0;
- }
/* From now on, any failure will render the queue unusable.
* Reinitialize WQ. */
mod = (struct ibv_exp_wq_attr){
@@ -1025,35 +867,18 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
}
/* Snatch mbufs from original queue. */
k = 0;
- if (rxq->sp) {
- struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- struct rxq_elt_sp *elt = &(*elts)[i];
- unsigned int j;
-
- for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
- assert(elt->bufs[j] != NULL);
- pool[k++] = elt->bufs[j];
- }
- }
- } else {
- struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct rte_mbuf *buf = elt->buf;
+ elts = rxq->elts;
+ for (i = 0; (i != RTE_DIM(*elts)); ++i) {
+ struct rxq_elt *elt = &(*elts)[i];
+ struct rte_mbuf *buf = elt->buf;

- pool[k++] = buf;
- }
+ pool[k++] = buf;
}
assert(k == mbuf_n);
tmpl.elts_n = 0;
- tmpl.elts.sp = NULL;
- assert((void *)&tmpl.elts.sp == (void *)&tmpl.elts.no_sp);
- err = ((tmpl.sp) ?
- rxq_alloc_elts_sp(&tmpl, desc_n, pool) :
- rxq_alloc_elts(&tmpl, desc_n, pool));
+ tmpl.elts = NULL;
+ assert((void *)&tmpl.elts == NULL);
+ err = rxq_alloc_elts(&tmpl, desc_n, pool);
if (err) {
ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
rte_free(pool);
@@ -1061,12 +886,11 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
return err;
}
assert(tmpl.elts_n == desc_n);
- assert(tmpl.elts.sp != NULL);
rte_free(pool);
/* Clean up original data. */
rxq->elts_n = 0;
- rte_free(rxq->elts.sp);
- rxq->elts.sp = NULL;
+ rte_free(rxq->elts);
+ rxq->elts = NULL;
/* Change queue state to ready. */
mod = (struct ibv_exp_wq_attr){
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
@@ -1080,28 +904,14 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
}
/* Post SGEs. */
assert(tmpl.if_wq != NULL);
- if (tmpl.sp) {
- struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- err = tmpl.if_wq->recv_sg_list
- (tmpl.wq,
- (*elts)[i].sges,
- RTE_DIM((*elts)[i].sges));
- if (err)
- break;
- }
- } else {
- struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- err = tmpl.if_wq->recv_burst(
- tmpl.wq,
- &(*elts)[i].sge,
- 1);
- if (err)
- break;
- }
+ elts = tmpl.elts;
+ for (i = 0; (i != RTE_DIM(*elts)); ++i) {
+ err = tmpl.if_wq->recv_burst(
+ tmpl.wq,
+ &(*elts)[i].sge,
+ 1);
+ if (err)
+ break;
}
if (err) {
ERROR("%p: failed to post SGEs with error %d",
@@ -1110,10 +920,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
err = EIO;
goto error;
}
- if (tmpl.sp)
- tmpl.recv = tmpl.if_wq->recv_sg_list;
- else
- tmpl.recv = tmpl.if_wq->recv_burst;
+ tmpl.recv = tmpl.if_wq->recv_burst;
error:
*rxq = tmpl;
assert(err >= 0);
@@ -1159,31 +966,26 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
} attr;
enum ibv_exp_query_intf_status status;
unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
+ struct rxq_elt (*elts)[desc];
int ret = 0;
unsigned int i;
unsigned int cq_size = desc;

(void)conf; /* Thresholds configuration (ignored). */
- if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
- ERROR("%p: invalid number of RX descriptors (must be a"
- " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
+ if (desc == 0) {
+ ERROR("%p: invalid number of RX descriptors", (void *)dev);
return EINVAL;
}
+ if (MLX5_PMD_SGE_WR_N > 1) {
+ ERROR("%p: RX scatter is not supported", (void *)dev);
+ return ENOTSUP;
+ }
/* Toggle RX checksum offload if hardware supports it. */
if (priv->hw_csum)
tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
if (priv->hw_csum_l2tun)
tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- /* Enable scattered packets support for this queue if necessary. */
- assert(mb_len >= RTE_PKTMBUF_HEADROOM);
- if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
- (dev->data->dev_conf.rxmode.max_rx_pkt_len >
- (mb_len - RTE_PKTMBUF_HEADROOM))) {
- tmpl.sp = 1;
- desc /= MLX5_PMD_SGE_WR_N;
- }
- DEBUG("%p: %s scattered packets support (%u WRs)",
- (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
+ (void)mb_len; /* I'll be back! */
/* Use the entire RX mempool as the memory region. */
tmpl.mr = mlx5_mp2mr(priv->pd, mp);
if (tmpl.mr == NULL) {
@@ -1232,10 +1034,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
priv->device_attr.max_qp_wr :
(int)cq_size),
/* Max number of scatter/gather elements in a WR. */
- .max_recv_sge = ((priv->device_attr.max_sge <
- MLX5_PMD_SGE_WR_N) ?
- priv->device_attr.max_sge :
- MLX5_PMD_SGE_WR_N),
+ .max_recv_sge = 1,
.pd = priv->pd,
.cq = tmpl.cq,
.comp_mask =
@@ -1297,10 +1096,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
(void *)dev, strerror(ret));
goto error;
}
- if (tmpl.sp)
- ret = rxq_alloc_elts_sp(&tmpl, desc, NULL);
- else
- ret = rxq_alloc_elts(&tmpl, desc, NULL);
+ ret = rxq_alloc_elts(&tmpl, desc, NULL);
if (ret) {
ERROR("%p: RXQ allocation failed: %s",
(void *)dev, strerror(ret));
@@ -1346,28 +1142,14 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
goto error;
}
/* Post SGEs. */
- if (tmpl.sp) {
- struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- ret = tmpl.if_wq->recv_sg_list
- (tmpl.wq,
- (*elts)[i].sges,
- RTE_DIM((*elts)[i].sges));
- if (ret)
- break;
- }
- } else {
- struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- ret = tmpl.if_wq->recv_burst(
- tmpl.wq,
- &(*elts)[i].sge,
- 1);
- if (ret)
- break;
- }
+ elts = tmpl.elts;
+ for (i = 0; (i != RTE_DIM(*elts)); ++i) {
+ ret = tmpl.if_wq->recv_burst(
+ tmpl.wq,
+ &(*elts)[i].sge,
+ 1);
+ if (ret)
+ break;
}
if (ret) {
ERROR("%p: failed to post SGEs with error %d",
@@ -1388,10 +1170,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
rxq->poll = rxq->if_cq->poll_length_flags;
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
- if (rxq->sp)
- rxq->recv = rxq->if_wq->recv_sg_list;
- else
- rxq->recv = rxq->if_wq->recv_burst;
+ rxq->recv = rxq->if_wq->recv_burst;
return 0;
error:
rxq_cleanup(&tmpl);
@@ -1466,10 +1245,7 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
(void *)dev, (void *)rxq);
(*priv->rxqs)[idx] = rxq;
/* Update receive callback. */
- if (rxq->sp)
- dev->rx_pkt_burst = mlx5_rx_burst_sp;
- else
- dev->rx_pkt_burst = mlx5_rx_burst;
+ dev->rx_pkt_burst = mlx5_rx_burst;
}
priv_unlock(priv);
return -ret;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 4d90631..4db72e9 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -502,216 +502,8 @@ rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
}

/**
- * DPDK callback for RX with scattered packets support.
- *
- * @param dpdk_rxq
- * Generic pointer to RX queue structure.
- * @param[out] pkts
- * Array to store received packets.
- * @param pkts_n
- * Maximum number of packets in array.
- *
- * @return
- * Number of packets successfully received (<= pkts_n).
- */
-uint16_t
-mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
- struct rxq *rxq = (struct rxq *)dpdk_rxq;
- struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
- const unsigned int elts_n = rxq->elts_n;
- unsigned int elts_head = rxq->elts_head;
- unsigned int i;
- unsigned int pkts_ret = 0;
- int ret;
-
- if (unlikely(!rxq->sp))
- return mlx5_rx_burst(dpdk_rxq, pkts, pkts_n);
- if (unlikely(elts == NULL)) /* See RTE_DEV_CMD_SET_MTU. */
- return 0;
- for (i = 0; (i != pkts_n); ++i) {
- struct rxq_elt_sp *elt = &(*elts)[elts_head];
- unsigned int len;
- unsigned int pkt_buf_len;
- struct rte_mbuf *pkt_buf = NULL; /* Buffer returned in pkts. */
- struct rte_mbuf **pkt_buf_next = &pkt_buf;
- unsigned int seg_headroom = RTE_PKTMBUF_HEADROOM;
- unsigned int j = 0;
- uint32_t flags;
- uint16_t vlan_tci;
-
- /* Sanity checks. */
- assert(elts_head < rxq->elts_n);
- assert(rxq->elts_head < rxq->elts_n);
- ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
- if (unlikely(ret < 0)) {
- struct ibv_wc wc;
- int wcs_n;
-
- DEBUG("rxq=%p, poll_length() failed (ret=%d)",
- (void *)rxq, ret);
- /* ibv_poll_cq() must be used in case of failure. */
- wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
- if (unlikely(wcs_n == 0))
- break;
- if (unlikely(wcs_n < 0)) {
- DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
- (void *)rxq, wcs_n);
- break;
- }
- assert(wcs_n == 1);
- if (unlikely(wc.status != IBV_WC_SUCCESS)) {
- /* Whatever, just repost the offending WR. */
- DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
- " completion status (%d): %s",
- (void *)rxq, wc.wr_id, wc.status,
- ibv_wc_status_str(wc.status));
-#ifdef MLX5_PMD_SOFT_COUNTERS
- /* Increment dropped packets counter. */
- ++rxq->stats.idropped;
-#endif
- goto repost;
- }
- ret = wc.byte_len;
- }
- if (ret == 0)
- break;
- assert(ret >= (rxq->crc_present << 2));
- len = ret - (rxq->crc_present << 2);
- pkt_buf_len = len;
- /*
- * Replace spent segments with new ones, concatenate and
- * return them as pkt_buf.
- */
- while (1) {
- struct ibv_sge *sge = &elt->sges[j];
- struct rte_mbuf *seg = elt->bufs[j];
- struct rte_mbuf *rep;
- unsigned int seg_tailroom;
-
- assert(seg != NULL);
- /*
- * Fetch initial bytes of packet descriptor into a
- * cacheline while allocating rep.
- */
- rte_prefetch0(seg);
- rep = rte_mbuf_raw_alloc(rxq->mp);
- if (unlikely(rep == NULL)) {
- /*
- * Unable to allocate a replacement mbuf,
- * repost WR.
- */
- DEBUG("rxq=%p: can't allocate a new mbuf",
- (void *)rxq);
- if (pkt_buf != NULL) {
- *pkt_buf_next = NULL;
- rte_pktmbuf_free(pkt_buf);
- }
- /* Increment out of memory counters. */
- ++rxq->stats.rx_nombuf;
- ++rxq->priv->dev->data->rx_mbuf_alloc_failed;
- goto repost;
- }
-#ifndef NDEBUG
- /* Poison user-modifiable fields in rep. */
- NEXT(rep) = (void *)((uintptr_t)-1);
- SET_DATA_OFF(rep, 0xdead);
- DATA_LEN(rep) = 0xd00d;
- PKT_LEN(rep) = 0xdeadd00d;
- NB_SEGS(rep) = 0x2a;
- PORT(rep) = 0x2a;
- rep->ol_flags = -1;
-#endif
- assert(rep->buf_len == seg->buf_len);
- /* Reconfigure sge to use rep instead of seg. */
- assert(sge->lkey == rxq->mr->lkey);
- sge->addr = ((uintptr_t)rep->buf_addr + seg_headroom);
- elt->bufs[j] = rep;
- ++j;
- /* Update pkt_buf if it's the first segment, or link
- * seg to the previous one and update pkt_buf_next. */
- *pkt_buf_next = seg;
- pkt_buf_next = &NEXT(seg);
- /* Update seg information. */
- seg_tailroom = (seg->buf_len - seg_headroom);
- assert(sge->length == seg_tailroom);
- SET_DATA_OFF(seg, seg_headroom);
- if (likely(len <= seg_tailroom)) {
- /* Last segment. */
- DATA_LEN(seg) = len;
- PKT_LEN(seg) = len;
- /* Sanity check. */
- assert(rte_pktmbuf_headroom(seg) ==
- seg_headroom);
- assert(rte_pktmbuf_tailroom(seg) ==
- (seg_tailroom - len));
- break;
- }
- DATA_LEN(seg) = seg_tailroom;
- PKT_LEN(seg) = seg_tailroom;
- /* Sanity check. */
- assert(rte_pktmbuf_headroom(seg) == seg_headroom);
- assert(rte_pktmbuf_tailroom(seg) == 0);
- /* Fix len and clear headroom for next segments. */
- len -= seg_tailroom;
- seg_headroom = 0;
- }
- /* Update head and tail segments. */
- *pkt_buf_next = NULL;
- assert(pkt_buf != NULL);
- assert(j != 0);
- NB_SEGS(pkt_buf) = j;
- PORT(pkt_buf) = rxq->port_id;
- PKT_LEN(pkt_buf) = pkt_buf_len;
- if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
- pkt_buf->packet_type = rxq_cq_to_pkt_type(flags);
- pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
- if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
- pkt_buf->ol_flags |= PKT_RX_VLAN_PKT |
- PKT_RX_VLAN_STRIPPED;
- pkt_buf->vlan_tci = vlan_tci;
- }
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
- }
-
- /* Return packet. */
- *(pkts++) = pkt_buf;
- ++pkts_ret;
-#ifdef MLX5_PMD_SOFT_COUNTERS
- /* Increment bytes counter. */
- rxq->stats.ibytes += pkt_buf_len;
-#endif
-repost:
- ret = rxq->recv(rxq->wq, elt->sges, RTE_DIM(elt->sges));
- if (unlikely(ret)) {
- /* Inability to repost WRs is fatal. */
- DEBUG("%p: recv_sg_list(): failed (ret=%d)",
- (void *)rxq->priv,
- ret);
- abort();
- }
- if (++elts_head >= elts_n)
- elts_head = 0;
- continue;
- }
- if (unlikely(i == 0))
- return 0;
- rxq->elts_head = elts_head;
-#ifdef MLX5_PMD_SOFT_COUNTERS
- /* Increment packets counter. */
- rxq->stats.ipackets += pkts_ret;
-#endif
- return pkts_ret;
-}
-
-/**
* DPDK callback for RX.
*
- * The following function is the same as mlx5_rx_burst_sp(), except it doesn't
- * manage scattered packets. Improves performance when MRU is lower than the
- * size of the first segment.
- *
* @param dpdk_rxq
* Generic pointer to RX queue structure.
* @param[out] pkts
@@ -726,7 +518,7 @@ uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct rxq *rxq = (struct rxq *)dpdk_rxq;
- struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
+ struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
const unsigned int elts_n = rxq->elts_n;
unsigned int elts_head = rxq->elts_head;
struct ibv_sge sges[pkts_n];
@@ -734,8 +526,6 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
unsigned int pkts_ret = 0;
int ret;

- if (unlikely(rxq->sp))
- return mlx5_rx_burst_sp(dpdk_rxq, pkts, pkts_n);
for (i = 0; (i != pkts_n); ++i) {
struct rxq_elt *elt = &(*elts)[elts_head];
unsigned int len;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 8358ccb..2e1f83b 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -81,12 +81,6 @@ struct mlx5_txq_stats {
uint64_t odropped; /**< Total of packets not sent when TX ring full. */
};

-/* RX element (scattered packets). */
-struct rxq_elt_sp {
- struct ibv_sge sges[MLX5_PMD_SGE_WR_N]; /* Scatter/Gather Elements. */
- struct rte_mbuf *bufs[MLX5_PMD_SGE_WR_N]; /* SGEs buffers. */
-};
-
/* RX element. */
struct rxq_elt {
struct ibv_sge sge; /* Scatter/Gather Element. */
@@ -112,15 +106,11 @@ struct rxq {
unsigned int port_id; /* Port ID for incoming packets. */
unsigned int elts_n; /* (*elts)[] length. */
unsigned int elts_head; /* Current index in (*elts)[]. */
- unsigned int sp:1; /* Use scattered RX elements. */
unsigned int csum:1; /* Enable checksum offloading. */
unsigned int csum_l2tun:1; /* Same for L2 tunnels. */
unsigned int vlan_strip:1; /* Enable VLAN stripping. */
unsigned int crc_present:1; /* CRC must be subtracted. */
- union {
- struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */
- struct rxq_elt (*no_sp)[]; /* RX elements. */
- } elts;
+ struct rxq_elt (*elts)[]; /* RX elements. */
unsigned int socket; /* CPU socket ID for allocations. */
struct mlx5_rxq_stats stats; /* RX queue counters. */
struct ibv_exp_res_domain *rd; /* Resource Domain. */
@@ -321,7 +311,6 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
/* mlx5_rxtx.c */

uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
-uint16_t mlx5_rx_burst_sp(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:17 UTC
Now that scatter/gather support is gone, CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N
no longer serves any purpose and can be removed.

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
config/common_base | 1 -
doc/guides/nics/mlx5.rst | 7 -------
drivers/net/mlx5/Makefile | 4 ----
drivers/net/mlx5/mlx5_defs.h | 5 -----
drivers/net/mlx5/mlx5_rxq.c | 4 ----
drivers/net/mlx5/mlx5_txq.c | 4 ----
6 files changed, 25 deletions(-)

diff --git a/config/common_base b/config/common_base
index 11ac81e..2c22a9a 100644
--- a/config/common_base
+++ b/config/common_base
@@ -207,7 +207,6 @@ CONFIG_RTE_LIBRTE_MLX4_SOFT_COUNTERS=1
#
CONFIG_RTE_LIBRTE_MLX5_PMD=n
CONFIG_RTE_LIBRTE_MLX5_DEBUG=n
-CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N=4
CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE=0
CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE=8

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index d9196d1..84c35a0 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -114,13 +114,6 @@ These options can be modified in the ``.config`` file.
adds additional run-time checks and debugging messages at the cost of
lower performance.

-- ``CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N`` (default **4**)
-
- Number of scatter/gather elements (SGEs) per work request (WR). Lowering
- this number improves performance but also limits the ability to receive
- scattered packets (packets that do not fit a single mbuf). The default
- value is a safe tradeoff.
-
- ``CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE`` (default **0**)

Amount of data to be inlined during TX operations. Improves latency.
diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index dcc8833..eca2ec3 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -85,10 +85,6 @@ else
CFLAGS += -DNDEBUG -UPEDANTIC
endif

-ifdef CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N
-CFLAGS += -DMLX5_PMD_SGE_WR_N=$(CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N)
-endif
-
ifdef CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE
CFLAGS += -DMLX5_PMD_MAX_INLINE=$(CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE)
endif
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 09207d9..da1c90e 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -54,11 +54,6 @@
/* RSS Indirection table size. */
#define RSS_INDIRECTION_TABLE_SIZE 256

-/* Maximum number of Scatter/Gather Elements per Work Request. */
-#ifndef MLX5_PMD_SGE_WR_N
-#define MLX5_PMD_SGE_WR_N 4
-#endif
-
/* Maximum size for inline data. */
#ifndef MLX5_PMD_MAX_INLINE
#define MLX5_PMD_MAX_INLINE 0
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 38ff9fd..4000624 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -976,10 +976,6 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
ERROR("%p: invalid number of RX descriptors", (void *)dev);
return EINVAL;
}
- if (MLX5_PMD_SGE_WR_N > 1) {
- ERROR("%p: RX scatter is not supported", (void *)dev);
- return ENOTSUP;
- }
/* Toggle RX checksum offload if hardware supports it. */
if (priv->hw_csum)
tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 5a248c9..59974c5 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -264,10 +264,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
ERROR("%p: invalid number of TX descriptors", (void *)dev);
return EINVAL;
}
- if (MLX5_PMD_SGE_WR_N > 1) {
- ERROR("%p: TX gather is not supported", (void *)dev);
- return EINVAL;
- }
/* MRs will be registered in mp2mr[] later. */
attr.rd = (struct ibv_exp_res_domain_init_attr){
.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:18 UTC
Inline TX will be fully managed by the PMD after Verbs is bypassed in the
data path. Remove the current code until then.
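
For reference, the idea behind the code path being removed: packets short
enough are copied into the send descriptor itself, which skips the MR lkey
lookup and the DMA read of the mbuf, at the cost of a memcpy. A simplified,
self-contained model of that decision follows; the structure layout, the
64-byte capacity and the names are made up for illustration and do not
reflect the real WQE format or the Verbs burst API.

#include <stdint.h>
#include <string.h>

struct toy_wqe {
	uint16_t inline_len;      /* 0 means "posted by reference" */
	uint64_t addr;            /* gather pointer when not inlined */
	uint32_t lkey;            /* memory key when not inlined */
	uint8_t inline_data[64];
};

/* Fill a descriptor either with the packet bytes themselves or with an
 * (address, lkey) pair, depending on the inline threshold. */
static void
fill_send_desc(struct toy_wqe *wqe, const void *pkt, uint16_t len,
	       uint16_t max_inline, uint64_t addr, uint32_t lkey)
{
	if (len <= max_inline && len <= sizeof(wqe->inline_data)) {
		memcpy(wqe->inline_data, pkt, len);
		wqe->inline_len = len;
	} else {
		wqe->inline_len = 0;
		wqe->addr = addr;
		wqe->lkey = lkey;
	}
}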

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
config/common_base | 1 -
doc/guides/nics/mlx5.rst | 10 ------
drivers/net/mlx5/Makefile | 4 ---
drivers/net/mlx5/mlx5_defs.h | 5 ---
drivers/net/mlx5/mlx5_rxtx.c | 73 +++++++++++++++-----------------------------
drivers/net/mlx5/mlx5_rxtx.h | 9 ------
drivers/net/mlx5/mlx5_txq.c | 16 ----------
7 files changed, 25 insertions(+), 93 deletions(-)

diff --git a/config/common_base b/config/common_base
index 2c22a9a..f2d34c8 100644
--- a/config/common_base
+++ b/config/common_base
@@ -207,7 +207,6 @@ CONFIG_RTE_LIBRTE_MLX4_SOFT_COUNTERS=1
#
CONFIG_RTE_LIBRTE_MLX5_PMD=n
CONFIG_RTE_LIBRTE_MLX5_DEBUG=n
-CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE=0
CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE=8

#
diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 84c35a0..77fa957 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -114,16 +114,6 @@ These options can be modified in the ``.config`` file.
adds additional run-time checks and debugging messages at the cost of
lower performance.

-- ``CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE`` (default **0**)
-
- Amount of data to be inlined during TX operations. Improves latency.
- Can improve PPS performance when PCI backpressure is detected and may be
- useful for scenarios involving heavy traffic on many queues.
-
- Since the additional software logic necessary to handle this mode can
- lower performance when there is no backpressure, it is not enabled by
- default.
-
- ``CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE`` (default **8**)

Maximum number of cached memory pools (MPs) per TX queue. Each MP from
diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index eca2ec3..406caa5 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -85,10 +85,6 @@ else
CFLAGS += -DNDEBUG -UPEDANTIC
endif

-ifdef CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE
-CFLAGS += -DMLX5_PMD_MAX_INLINE=$(CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE)
-endif
-
ifdef CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE
CFLAGS += -DMLX5_PMD_TX_MP_CACHE=$(CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE)
endif
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index da1c90e..9a19835 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -54,11 +54,6 @@
/* RSS Indirection table size. */
#define RSS_INDIRECTION_TABLE_SIZE 256

-/* Maximum size for inline data. */
-#ifndef MLX5_PMD_MAX_INLINE
-#define MLX5_PMD_MAX_INLINE 0
-#endif
-
/*
* Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP
* from which buffers are to be transmitted will have to be mapped by this
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 4db72e9..7480a33 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -329,56 +329,33 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
rte_prefetch0((volatile void *)
(uintptr_t)buf_next_addr);
}
- /* Put packet into send queue. */
-#if MLX5_PMD_MAX_INLINE > 0
- if (length <= txq->max_inline) {
-#ifdef HAVE_VERBS_VLAN_INSERTION
- if (insert_vlan)
- err = txq->send_pending_inline_vlan
- (txq->qp,
- (void *)addr,
- length,
- send_flags,
- &buf->vlan_tci);
- else
-#endif /* HAVE_VERBS_VLAN_INSERTION */
- err = txq->send_pending_inline
- (txq->qp,
- (void *)addr,
- length,
- send_flags);
- } else
-#endif
- {
- /* Retrieve Memory Region key for this
- * memory pool. */
- lkey = txq_mp2mr(txq, txq_mb2mp(buf));
- if (unlikely(lkey == (uint32_t)-1)) {
- /* MR does not exist. */
- DEBUG("%p: unable to get MP <-> MR"
- " association", (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
+ /* Retrieve Memory Region key for this memory pool. */
+ lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+ if (unlikely(lkey == (uint32_t)-1)) {
+ /* MR does not exist. */
+ DEBUG("%p: unable to get MP <-> MR"
+ " association", (void *)txq);
+ /* Clean up TX element. */
+ elt->buf = NULL;
+ goto stop;
+ }
#ifdef HAVE_VERBS_VLAN_INSERTION
- if (insert_vlan)
- err = txq->send_pending_vlan
- (txq->qp,
- addr,
- length,
- lkey,
- send_flags,
- &buf->vlan_tci);
- else
+ if (insert_vlan)
+ err = txq->send_pending_vlan
+ (txq->qp,
+ addr,
+ length,
+ lkey,
+ send_flags,
+ &buf->vlan_tci);
+ else
#endif /* HAVE_VERBS_VLAN_INSERTION */
- err = txq->send_pending
- (txq->qp,
- addr,
- length,
- lkey,
- send_flags);
- }
+ err = txq->send_pending
+ (txq->qp,
+ addr,
+ length,
+ lkey,
+ send_flags);
if (unlikely(err))
goto stop;
#ifdef MLX5_PMD_SOFT_COUNTERS
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 2e1f83b..3a353b0 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -240,19 +240,10 @@ struct txq {
#ifdef HAVE_VERBS_VLAN_INSERTION
int (*send_pending_vlan)();
#endif
-#if MLX5_PMD_MAX_INLINE > 0
- int (*send_pending_inline)();
-#ifdef HAVE_VERBS_VLAN_INSERTION
- int (*send_pending_inline_vlan)();
-#endif
-#endif
int (*send_flush)(struct ibv_qp *qp);
struct ibv_cq *cq; /* Completion Queue. */
struct ibv_qp *qp; /* Queue Pair. */
struct txq_elt (*elts)[]; /* TX elements. */
-#if MLX5_PMD_MAX_INLINE > 0
- uint32_t max_inline; /* Max inline send size <= MLX5_PMD_MAX_INLINE. */
-#endif
unsigned int elts_n; /* (*elts)[] length. */
unsigned int elts_head; /* Current index in (*elts)[]. */
unsigned int elts_tail; /* First element awaiting completion. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 59974c5..75da65b 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -173,9 +173,6 @@ txq_cleanup(struct txq *txq)
DEBUG("cleaning up %p", (void *)txq);
txq_free_elts(txq);
txq->poll_cnt = NULL;
-#if MLX5_PMD_MAX_INLINE > 0
- txq->send_pending_inline = NULL;
-#endif
txq->send_flush = NULL;
if (txq->if_qp != NULL) {
assert(txq->priv != NULL);
@@ -305,9 +302,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
desc),
/* Max number of scatter/gather elements in a WR. */
.max_send_sge = 1,
-#if MLX5_PMD_MAX_INLINE > 0
- .max_inline_data = MLX5_PMD_MAX_INLINE,
-#endif
},
.qp_type = IBV_QPT_RAW_PACKET,
/* Do *NOT* enable this, completions events are managed per
@@ -325,10 +319,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
(void *)dev, strerror(ret));
goto error;
}
-#if MLX5_PMD_MAX_INLINE > 0
- /* ibv_create_qp() updates this value. */
- tmpl.max_inline = attr.init.cap.max_inline_data;
-#endif
attr.mod = (struct ibv_exp_qp_attr){
/* Move the QP to this state. */
.qp_state = IBV_QPS_INIT,
@@ -403,12 +393,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
txq_cleanup(txq);
*txq = tmpl;
txq->poll_cnt = txq->if_cq->poll_cnt;
-#if MLX5_PMD_MAX_INLINE > 0
- txq->send_pending_inline = txq->if_qp->send_pending_inline;
-#ifdef HAVE_VERBS_VLAN_INSERTION
- txq->send_pending_inline_vlan = txq->if_qp->send_pending_inline_vlan;
-#endif
-#endif
txq->send_pending = txq->if_qp->send_pending;
#ifdef HAVE_VERBS_VLAN_INSERTION
txq->send_pending_vlan = txq->if_qp->send_pending_vlan;
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:20 UTC
Permalink
To keep the data path as efficient as possible, move the fields only useful to
the control path into a new structure, rxq_ctrl.
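
For context, the split relies on a common embedding trick: the data path
structure is the last member of the control structure, so control path code
can get back to its context from a data path pointer with container_of().
Below is a minimal standalone sketch of that layout and lookup; the fields
are simplified placeholders, not the driver's actual members.

/* Hot/cold split sketch: rxq holds only what the burst functions touch,
 * rxq_ctrl wraps it with setup/teardown state, and container_of() maps a
 * data path pointer back to its control structure. */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct rxq {                    /* data path: kept small and cache friendly */
	unsigned int elts_n;
	unsigned int elts_head;
};

struct rxq_ctrl {               /* control path: allocation/Verbs state */
	int socket;
	struct rxq rxq;         /* embedded data path structure, last member */
};

int
main(void)
{
	struct rxq_ctrl ctrl = { .socket = 0, .rxq = { .elts_n = 256 } };
	struct rxq *rxq = &ctrl.rxq;    /* what (*priv->rxqs)[i] would store */
	struct rxq_ctrl *back = container_of(rxq, struct rxq_ctrl, rxq);

	printf("recovered control structure at %p (socket %d, %u elements)\n",
	       (void *)back, back->socket, back->rxq.elts_n);
	return 0;
}
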

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5.c | 6 +-
drivers/net/mlx5/mlx5_fdir.c | 8 +-
drivers/net/mlx5/mlx5_rxq.c | 250 ++++++++++++++++++++++---------------------
drivers/net/mlx5/mlx5_rxtx.c | 1 -
drivers/net/mlx5/mlx5_rxtx.h | 13 ++-
5 files changed, 148 insertions(+), 130 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 3d30e00..27a7a30 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -122,12 +122,14 @@ mlx5_dev_close(struct rte_eth_dev *dev)
usleep(1000);
for (i = 0; (i != priv->rxqs_n); ++i) {
struct rxq *rxq = (*priv->rxqs)[i];
+ struct rxq_ctrl *rxq_ctrl;

if (rxq == NULL)
continue;
+ rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
(*priv->rxqs)[i] = NULL;
- rxq_cleanup(rxq);
- rte_free(rxq);
+ rxq_cleanup(rxq_ctrl);
+ rte_free(rxq_ctrl);
}
priv->rxqs_n = 0;
priv->rxqs = NULL;
diff --git a/drivers/net/mlx5/mlx5_fdir.c b/drivers/net/mlx5/mlx5_fdir.c
index 63e43ad..e3b97ba 100644
--- a/drivers/net/mlx5/mlx5_fdir.c
+++ b/drivers/net/mlx5/mlx5_fdir.c
@@ -424,7 +424,9 @@ create_flow:
static struct fdir_queue *
priv_get_fdir_queue(struct priv *priv, uint16_t idx)
{
- struct fdir_queue *fdir_queue = &(*priv->rxqs)[idx]->fdir_queue;
+ struct rxq_ctrl *rxq_ctrl =
+ container_of((*priv->rxqs)[idx], struct rxq_ctrl, rxq);
+ struct fdir_queue *fdir_queue = &rxq_ctrl->fdir_queue;
struct ibv_exp_rwq_ind_table *ind_table = NULL;
struct ibv_qp *qp = NULL;
struct ibv_exp_rwq_ind_table_init_attr ind_init_attr;
@@ -629,8 +631,10 @@ priv_fdir_disable(struct priv *priv)
/* Run on every RX queue to destroy related flow director QP and
* indirection table. */
for (i = 0; (i != priv->rxqs_n); i++) {
- fdir_queue = &(*priv->rxqs)[i]->fdir_queue;
+ struct rxq_ctrl *rxq_ctrl =
+ container_of((*priv->rxqs)[i], struct rxq_ctrl, rxq);

+ fdir_queue = &rxq_ctrl->fdir_queue;
if (fdir_queue->qp != NULL) {
claim_zero(ibv_destroy_qp(fdir_queue->qp));
fdir_queue->qp = NULL;
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 4000624..8d32e74 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -636,7 +636,7 @@ priv_rehash_flows(struct priv *priv)
/**
* Allocate RX queue elements.
*
- * @param rxq
+ * @param rxq_ctrl
* Pointer to RX queue structure.
* @param elts_n
* Number of elements to allocate.
@@ -648,16 +648,17 @@ priv_rehash_flows(struct priv *priv)
* 0 on success, errno value on failure.
*/
static int
-rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
+rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
+ struct rte_mbuf **pool)
{
unsigned int i;
struct rxq_elt (*elts)[elts_n] =
rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
- rxq->socket);
+ rxq_ctrl->socket);
int ret = 0;

if (elts == NULL) {
- ERROR("%p: can't allocate packets array", (void *)rxq);
+ ERROR("%p: can't allocate packets array", (void *)rxq_ctrl);
ret = ENOMEM;
goto error;
}
@@ -672,10 +673,10 @@ rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
assert(buf != NULL);
rte_pktmbuf_reset(buf);
} else
- buf = rte_pktmbuf_alloc(rxq->mp);
+ buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
if (buf == NULL) {
assert(pool == NULL);
- ERROR("%p: empty mbuf pool", (void *)rxq);
+ ERROR("%p: empty mbuf pool", (void *)rxq_ctrl);
ret = ENOMEM;
goto error;
}
@@ -691,15 +692,15 @@ rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
sge->addr = (uintptr_t)
((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
- sge->lkey = rxq->mr->lkey;
+ sge->lkey = rxq_ctrl->mr->lkey;
/* Redundant check for tailroom. */
assert(sge->length == rte_pktmbuf_tailroom(buf));
}
DEBUG("%p: allocated and configured %u single-segment WRs",
- (void *)rxq, elts_n);
- rxq->elts_n = elts_n;
- rxq->elts_head = 0;
- rxq->elts = elts;
+ (void *)rxq_ctrl, elts_n);
+ rxq_ctrl->rxq.elts_n = elts_n;
+ rxq_ctrl->rxq.elts_head = 0;
+ rxq_ctrl->rxq.elts = elts;
assert(ret == 0);
return 0;
error:
@@ -714,7 +715,7 @@ error:
}
rte_free(elts);
}
- DEBUG("%p: failed, freed everything", (void *)rxq);
+ DEBUG("%p: failed, freed everything", (void *)rxq_ctrl);
assert(ret > 0);
return ret;
}
@@ -722,19 +723,19 @@ error:
/**
* Free RX queue elements.
*
- * @param rxq
+ * @param rxq_ctrl
* Pointer to RX queue structure.
*/
static void
-rxq_free_elts(struct rxq *rxq)
+rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
{
unsigned int i;
- unsigned int elts_n = rxq->elts_n;
- struct rxq_elt (*elts)[elts_n] = rxq->elts;
+ unsigned int elts_n = rxq_ctrl->rxq.elts_n;
+ struct rxq_elt (*elts)[elts_n] = rxq_ctrl->rxq.elts;

- DEBUG("%p: freeing WRs", (void *)rxq);
- rxq->elts_n = 0;
- rxq->elts = NULL;
+ DEBUG("%p: freeing WRs", (void *)rxq_ctrl);
+ rxq_ctrl->rxq.elts_n = 0;
+ rxq_ctrl->rxq.elts = NULL;
if (elts == NULL)
return;
for (i = 0; (i != RTE_DIM(*elts)); ++i) {
@@ -752,58 +753,58 @@ rxq_free_elts(struct rxq *rxq)
*
* Destroy objects, free allocated memory and reset the structure for reuse.
*
- * @param rxq
+ * @param rxq_ctrl
* Pointer to RX queue structure.
*/
void
-rxq_cleanup(struct rxq *rxq)
+rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
{
struct ibv_exp_release_intf_params params;

- DEBUG("cleaning up %p", (void *)rxq);
- rxq_free_elts(rxq);
- rxq->poll = NULL;
- rxq->recv = NULL;
- if (rxq->if_wq != NULL) {
- assert(rxq->priv != NULL);
- assert(rxq->priv->ctx != NULL);
- assert(rxq->wq != NULL);
+ DEBUG("cleaning up %p", (void *)rxq_ctrl);
+ rxq_free_elts(rxq_ctrl);
+ rxq_ctrl->rxq.poll = NULL;
+ rxq_ctrl->rxq.recv = NULL;
+ if (rxq_ctrl->if_wq != NULL) {
+ assert(rxq_ctrl->rxq.priv != NULL);
+ assert(rxq_ctrl->rxq.priv->ctx != NULL);
+ assert(rxq_ctrl->rxq.wq != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
- rxq->if_wq,
+ claim_zero(ibv_exp_release_intf(rxq_ctrl->rxq.priv->ctx,
+ rxq_ctrl->if_wq,
&params));
}
- if (rxq->if_cq != NULL) {
- assert(rxq->priv != NULL);
- assert(rxq->priv->ctx != NULL);
- assert(rxq->cq != NULL);
+ if (rxq_ctrl->if_cq != NULL) {
+ assert(rxq_ctrl->rxq.priv != NULL);
+ assert(rxq_ctrl->rxq.priv->ctx != NULL);
+ assert(rxq_ctrl->rxq.cq != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
- rxq->if_cq,
+ claim_zero(ibv_exp_release_intf(rxq_ctrl->rxq.priv->ctx,
+ rxq_ctrl->if_cq,
&params));
}
- if (rxq->wq != NULL)
- claim_zero(ibv_exp_destroy_wq(rxq->wq));
- if (rxq->cq != NULL)
- claim_zero(ibv_destroy_cq(rxq->cq));
- if (rxq->rd != NULL) {
+ if (rxq_ctrl->rxq.wq != NULL)
+ claim_zero(ibv_exp_destroy_wq(rxq_ctrl->rxq.wq));
+ if (rxq_ctrl->rxq.cq != NULL)
+ claim_zero(ibv_destroy_cq(rxq_ctrl->rxq.cq));
+ if (rxq_ctrl->rd != NULL) {
struct ibv_exp_destroy_res_domain_attr attr = {
.comp_mask = 0,
};

- assert(rxq->priv != NULL);
- assert(rxq->priv->ctx != NULL);
- claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx,
- rxq->rd,
+ assert(rxq_ctrl->rxq.priv != NULL);
+ assert(rxq_ctrl->rxq.priv->ctx != NULL);
+ claim_zero(ibv_exp_destroy_res_domain(rxq_ctrl->rxq.priv->ctx,
+ rxq_ctrl->rd,
&attr));
}
- if (rxq->mr != NULL)
- claim_zero(ibv_dereg_mr(rxq->mr));
- memset(rxq, 0, sizeof(*rxq));
+ if (rxq_ctrl->mr != NULL)
+ claim_zero(ibv_dereg_mr(rxq_ctrl->mr));
+ memset(rxq_ctrl, 0, sizeof(*rxq_ctrl));
}

/**
@@ -815,37 +816,37 @@ rxq_cleanup(struct rxq *rxq)
*
* @param dev
* Pointer to Ethernet device structure.
- * @param rxq
+ * @param rxq_ctrl
* RX queue pointer.
*
* @return
* 0 on success, errno value on failure.
*/
int
-rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
+rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
{
- struct priv *priv = rxq->priv;
- struct rxq tmpl = *rxq;
+ struct priv *priv = rxq_ctrl->rxq.priv;
+ struct rxq_ctrl tmpl = *rxq_ctrl;
unsigned int mbuf_n;
unsigned int desc_n;
struct rte_mbuf **pool;
unsigned int i, k;
struct ibv_exp_wq_attr mod;
- struct rxq_elt (*elts)[tmpl.elts_n];
+ struct rxq_elt (*elts)[tmpl.rxq.elts_n];
int err;

- DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
+ DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq_ctrl);
/* Number of descriptors and mbufs currently allocated. */
- desc_n = tmpl.elts_n;
+ desc_n = tmpl.rxq.elts_n;
mbuf_n = desc_n;
/* Toggle RX checksum offload if hardware supports it. */
if (priv->hw_csum) {
- tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- rxq->csum = tmpl.csum;
+ tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
+ rxq_ctrl->rxq.csum = tmpl.rxq.csum;
}
if (priv->hw_csum_l2tun) {
- tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- rxq->csum_l2tun = tmpl.csum_l2tun;
+ tmpl.rxq.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
+ rxq_ctrl->rxq.csum_l2tun = tmpl.rxq.csum_l2tun;
}
/* From now on, any failure will render the queue unusable.
* Reinitialize WQ. */
@@ -853,7 +854,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RESET,
};
- err = ibv_exp_modify_wq(tmpl.wq, &mod);
+ err = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
if (err) {
ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
assert(err > 0);
@@ -867,7 +868,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
}
/* Snatch mbufs from original queue. */
k = 0;
- elts = rxq->elts;
+ elts = rxq_ctrl->rxq.elts;
for (i = 0; (i != RTE_DIM(*elts)); ++i) {
struct rxq_elt *elt = &(*elts)[i];
struct rte_mbuf *buf = elt->buf;
@@ -875,9 +876,9 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
pool[k++] = buf;
}
assert(k == mbuf_n);
- tmpl.elts_n = 0;
- tmpl.elts = NULL;
- assert((void *)&tmpl.elts == NULL);
+ tmpl.rxq.elts_n = 0;
+ tmpl.rxq.elts = NULL;
+ assert((void *)&tmpl.rxq.elts == NULL);
err = rxq_alloc_elts(&tmpl, desc_n, pool);
if (err) {
ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
@@ -885,18 +886,18 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
assert(err > 0);
return err;
}
- assert(tmpl.elts_n == desc_n);
+ assert(tmpl.rxq.elts_n == desc_n);
rte_free(pool);
/* Clean up original data. */
- rxq->elts_n = 0;
- rte_free(rxq->elts);
- rxq->elts = NULL;
+ rxq_ctrl->rxq.elts_n = 0;
+ rte_free(rxq_ctrl->rxq.elts);
+ rxq_ctrl->rxq.elts = NULL;
/* Change queue state to ready. */
mod = (struct ibv_exp_wq_attr){
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RDY,
};
- err = ibv_exp_modify_wq(tmpl.wq, &mod);
+ err = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
if (err) {
ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
(void *)dev, strerror(err));
@@ -904,10 +905,10 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
}
/* Post SGEs. */
assert(tmpl.if_wq != NULL);
- elts = tmpl.elts;
+ elts = tmpl.rxq.elts;
for (i = 0; (i != RTE_DIM(*elts)); ++i) {
err = tmpl.if_wq->recv_burst(
- tmpl.wq,
+ tmpl.rxq.wq,
&(*elts)[i].sge,
1);
if (err)
@@ -920,9 +921,9 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
err = EIO;
goto error;
}
- tmpl.recv = tmpl.if_wq->recv_burst;
+ tmpl.rxq.recv = tmpl.if_wq->recv_burst;
error:
- *rxq = tmpl;
+ *rxq_ctrl = tmpl;
assert(err >= 0);
return err;
}
@@ -932,7 +933,7 @@ error:
*
* @param dev
* Pointer to Ethernet device structure.
- * @param rxq
+ * @param rxq_ctrl
* Pointer to RX queue structure.
* @param desc
* Number of descriptors to configure in queue.
@@ -947,15 +948,17 @@ error:
* 0 on success, errno value on failure.
*/
int
-rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
+rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
unsigned int socket, const struct rte_eth_rxconf *conf,
struct rte_mempool *mp)
{
struct priv *priv = dev->data->dev_private;
- struct rxq tmpl = {
- .priv = priv,
- .mp = mp,
- .socket = socket
+ struct rxq_ctrl tmpl = {
+ .socket = socket,
+ .rxq = {
+ .priv = priv,
+ .mp = mp,
+ },
};
struct ibv_exp_wq_attr mod;
union {
@@ -978,9 +981,9 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
}
/* Toggle RX checksum offload if hardware supports it. */
if (priv->hw_csum)
- tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
+ tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
if (priv->hw_csum_l2tun)
- tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
+ tmpl.rxq.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
(void)mb_len; /* I'll be back! */
/* Use the entire RX mempool as the memory region. */
tmpl.mr = mlx5_mp2mr(priv->pd, mp);
@@ -1007,9 +1010,9 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
.res_domain = tmpl.rd,
};
- tmpl.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0,
- &attr.cq);
- if (tmpl.cq == NULL) {
+ tmpl.rxq.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0,
+ &attr.cq);
+ if (tmpl.rxq.cq == NULL) {
ret = ENOMEM;
ERROR("%p: CQ creation failure: %s",
(void *)dev, strerror(ret));
@@ -1020,8 +1023,8 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
DEBUG("priv->device_attr.max_sge is %d",
priv->device_attr.max_sge);
/* Configure VLAN stripping. */
- tmpl.vlan_strip = (priv->hw_vlan_strip &&
- !!dev->data->dev_conf.rxmode.hw_vlan_strip);
+ tmpl.rxq.vlan_strip = (priv->hw_vlan_strip &&
+ !!dev->data->dev_conf.rxmode.hw_vlan_strip);
attr.wq = (struct ibv_exp_wq_init_attr){
.wq_context = NULL, /* Could be useful in the future. */
.wq_type = IBV_EXP_WQT_RQ,
@@ -1032,7 +1035,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
/* Max number of scatter/gather elements in a WR. */
.max_recv_sge = 1,
.pd = priv->pd,
- .cq = tmpl.cq,
+ .cq = tmpl.rxq.cq,
.comp_mask =
IBV_EXP_CREATE_WQ_RES_DOMAIN |
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
@@ -1041,7 +1044,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
0,
.res_domain = tmpl.rd,
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
- .vlan_offloads = (tmpl.vlan_strip ?
+ .vlan_offloads = (tmpl.rxq.vlan_strip ?
IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
0),
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
@@ -1050,24 +1053,24 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
#ifdef HAVE_VERBS_FCS
/* By default, FCS (CRC) is stripped by hardware. */
if (dev->data->dev_conf.rxmode.hw_strip_crc) {
- tmpl.crc_present = 0;
+ tmpl.rxq.crc_present = 0;
} else if (priv->hw_fcs_strip) {
/* Ask HW/Verbs to leave CRC in place when supported. */
attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS;
attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
- tmpl.crc_present = 1;
+ tmpl.rxq.crc_present = 1;
} else {
WARN("%p: CRC stripping has been disabled but will still"
" be performed by hardware, make sure MLNX_OFED and"
" firmware are up to date",
(void *)dev);
- tmpl.crc_present = 0;
+ tmpl.rxq.crc_present = 0;
}
DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
" incoming frames to hide it",
(void *)dev,
- tmpl.crc_present ? "disabled" : "enabled",
- tmpl.crc_present << 2);
+ tmpl.rxq.crc_present ? "disabled" : "enabled",
+ tmpl.rxq.crc_present << 2);
#endif /* HAVE_VERBS_FCS */

#ifdef HAVE_VERBS_RX_END_PADDING
@@ -1075,7 +1078,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
; /* Nothing else to do. */
else if (priv->hw_padding) {
INFO("%p: enabling packet padding on queue %p",
- (void *)dev, (void *)rxq);
+ (void *)dev, (void *)rxq_ctrl);
attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING;
attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
} else
@@ -1085,8 +1088,8 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
(void *)dev);
#endif /* HAVE_VERBS_RX_END_PADDING */

- tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
- if (tmpl.wq == NULL) {
+ tmpl.rxq.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
+ if (tmpl.rxq.wq == NULL) {
ret = (errno ? errno : EINVAL);
ERROR("%p: WQ creation failure: %s",
(void *)dev, strerror(ret));
@@ -1099,15 +1102,15 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
goto error;
}
/* Save port ID. */
- tmpl.port_id = dev->data->port_id;
- DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
+ tmpl.rxq.port_id = dev->data->port_id;
+ DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
.intf_version = 1,
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
.intf = IBV_EXP_INTF_CQ,
- .obj = tmpl.cq,
+ .obj = tmpl.rxq.cq,
};
tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
if (tmpl.if_cq == NULL) {
@@ -1118,7 +1121,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
.intf = IBV_EXP_INTF_WQ,
- .obj = tmpl.wq,
+ .obj = tmpl.rxq.wq,
};
tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
if (tmpl.if_wq == NULL) {
@@ -1131,17 +1134,17 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RDY,
};
- ret = ibv_exp_modify_wq(tmpl.wq, &mod);
+ ret = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
if (ret) {
ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
(void *)dev, strerror(ret));
goto error;
}
/* Post SGEs. */
- elts = tmpl.elts;
+ elts = tmpl.rxq.elts;
for (i = 0; (i != RTE_DIM(*elts)); ++i) {
ret = tmpl.if_wq->recv_burst(
- tmpl.wq,
+ tmpl.rxq.wq,
&(*elts)[i].sge,
1);
if (ret)
@@ -1155,18 +1158,18 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
goto error;
}
/* Clean up rxq in case we're reinitializing it. */
- DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
- rxq_cleanup(rxq);
- *rxq = tmpl;
- DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
+ DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
+ rxq_cleanup(rxq_ctrl);
+ *rxq_ctrl = tmpl;
+ DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
assert(ret == 0);
/* Assign function in queue. */
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
- rxq->poll = rxq->if_cq->poll_length_flags_cvlan;
+ rxq_ctrl->rxq.poll = rxq_ctrl->if_cq->poll_length_flags_cvlan;
#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
- rxq->poll = rxq->if_cq->poll_length_flags;
+ rxq_ctrl->rxq.poll = rxq_ctrl->if_cq->poll_length_flags;
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
- rxq->recv = rxq->if_wq->recv_burst;
+ rxq_ctrl->rxq.recv = rxq_ctrl->if_wq->recv_burst;
return 0;
error:
rxq_cleanup(&tmpl);
@@ -1200,12 +1203,14 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
{
struct priv *priv = dev->data->dev_private;
struct rxq *rxq = (*priv->rxqs)[idx];
+ struct rxq_ctrl *rxq_ctrl;
int ret;

if (mlx5_is_secondary())
return -E_RTE_SECONDARY;

priv_lock(priv);
+ rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
DEBUG("%p: configuring queue %u for %u descriptors",
(void *)dev, idx, desc);
if (idx >= priv->rxqs_n) {
@@ -1222,24 +1227,25 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
return -EEXIST;
}
(*priv->rxqs)[idx] = NULL;
- rxq_cleanup(rxq);
+ rxq_cleanup(rxq_ctrl);
} else {
- rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket);
- if (rxq == NULL) {
+ rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl), 0,
+ socket);
+ if (rxq_ctrl == NULL) {
ERROR("%p: unable to allocate queue index %u",
(void *)dev, idx);
priv_unlock(priv);
return -ENOMEM;
}
}
- ret = rxq_setup(dev, rxq, desc, socket, conf, mp);
+ ret = rxq_setup(dev, rxq_ctrl, desc, socket, conf, mp);
if (ret)
- rte_free(rxq);
+ rte_free(rxq_ctrl);
else {
- rxq->stats.idx = idx;
+ rxq_ctrl->rxq.stats.idx = idx;
DEBUG("%p: adding RX queue %p to list",
- (void *)dev, (void *)rxq);
- (*priv->rxqs)[idx] = rxq;
+ (void *)dev, (void *)rxq_ctrl);
+ (*priv->rxqs)[idx] = &rxq_ctrl->rxq;
/* Update receive callback. */
dev->rx_pkt_burst = mlx5_rx_burst;
}
@@ -1257,6 +1263,7 @@ void
mlx5_rx_queue_release(void *dpdk_rxq)
{
struct rxq *rxq = (struct rxq *)dpdk_rxq;
+ struct rxq_ctrl *rxq_ctrl;
struct priv *priv;
unsigned int i;

@@ -1265,6 +1272,7 @@ mlx5_rx_queue_release(void *dpdk_rxq)

if (rxq == NULL)
return;
+ rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
priv = rxq->priv;
priv_lock(priv);
for (i = 0; (i != priv->rxqs_n); ++i)
@@ -1274,8 +1282,8 @@ mlx5_rx_queue_release(void *dpdk_rxq)
(*priv->rxqs)[i] = NULL;
break;
}
- rxq_cleanup(rxq);
- rte_free(rxq);
+ rxq_cleanup(rxq_ctrl);
+ rte_free(rxq_ctrl);
priv_unlock(priv);
}

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 7480a33..fda93a6 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -574,7 +574,6 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)

/* Reconfigure sge to use rep instead of seg. */
elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
- assert(elt->sge.lkey == rxq->mr->lkey);
elt->buf = rep;

/* Add SGE to array for repost. */
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 5baefcb..2c5e447 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -111,8 +111,11 @@ struct rxq {
unsigned int vlan_strip:1; /* Enable VLAN stripping. */
unsigned int crc_present:1; /* CRC must be subtracted. */
struct rxq_elt (*elts)[]; /* RX elements. */
- unsigned int socket; /* CPU socket ID for allocations. */
struct mlx5_rxq_stats stats; /* RX queue counters. */
+} __rte_cache_aligned;
+
+/* RX queue control descriptor. */
+struct rxq_ctrl {
struct ibv_exp_res_domain *rd; /* Resource Domain. */
struct fdir_queue fdir_queue; /* Flow director queue. */
struct ibv_mr *mr; /* Memory Region (for mp). */
@@ -122,6 +125,8 @@ struct rxq {
#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
struct ibv_exp_cq_family *if_cq; /* CQ interface. */
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+ unsigned int socket; /* CPU socket ID for allocations. */
+ struct rxq rxq; /* Data path structure. */
};

/* Hash RX queue types. */
@@ -285,9 +290,9 @@ int priv_create_hash_rxqs(struct priv *);
void priv_destroy_hash_rxqs(struct priv *);
int priv_allow_flow_type(struct priv *, enum hash_rxq_flow_type);
int priv_rehash_flows(struct priv *);
-void rxq_cleanup(struct rxq *);
-int rxq_rehash(struct rte_eth_dev *, struct rxq *);
-int rxq_setup(struct rte_eth_dev *, struct rxq *, uint16_t, unsigned int,
+void rxq_cleanup(struct rxq_ctrl *);
+int rxq_rehash(struct rte_eth_dev *, struct rxq_ctrl *);
+int rxq_setup(struct rte_eth_dev *, struct rxq_ctrl *, uint16_t, unsigned int,
const struct rte_eth_rxconf *, struct rte_mempool *);
int mlx5_rx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
const struct rte_eth_rxconf *, struct rte_mempool *);
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:21 UTC
Permalink
The latest version of Mellanox OFED exposes the hardware definitions necessary
to implement a data path that bypasses Verbs. Update the minimum version
requirement to MLNX_OFED >= 3.3 and clean up the compatibility checks for
previous releases.
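
For illustration, here is a self-contained rendering of the detection-plus-
guard mechanism this requirement bump relies on: the Makefile probes write
HAVE_* macros into the generated mlx5_autoconf.h, and the driver turns their
absence into a hard build error. The two stand-in #defines below replace the
generated header so the snippet compiles on its own.

/* Sketch of the guard style adopted by this patch.  In the real build,
 * mlx5_autoconf.h is produced by auto-config-h.sh from the Makefile probes;
 * the stand-in defines simulate a system with MLNX_OFED >= 3.3 installed. */
#include <stdio.h>

#define HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE 1          /* stand-in */
#define HAVE_VERBS_MLX5_ETH_VLAN_INLINE_HEADER_SIZE 1   /* stand-in */

#if !defined(HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE) || \
    !defined(HAVE_VERBS_MLX5_ETH_VLAN_INLINE_HEADER_SIZE)
#error Mellanox OFED >= 3.3 is required, please refer to the documentation.
#endif

int
main(void)
{
	puts("MLNX_OFED >= 3.3 headers detected, building the PMD.");
	return 0;
}
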

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
doc/guides/nics/mlx5.rst | 44 +++---------------------------------------
drivers/net/mlx5/Makefile | 39 ++++++++-----------------------------
drivers/net/mlx5/mlx5.c | 23 ----------------------
drivers/net/mlx5/mlx5.h | 5 +++++
drivers/net/mlx5/mlx5_defs.h | 9 ---------
drivers/net/mlx5/mlx5_fdir.c | 10 ----------
drivers/net/mlx5/mlx5_rxmode.c | 8 --------
drivers/net/mlx5/mlx5_rxq.c | 30 ----------------------------
drivers/net/mlx5/mlx5_rxtx.c | 4 ----
drivers/net/mlx5/mlx5_rxtx.h | 8 --------
drivers/net/mlx5/mlx5_txq.c | 2 --
drivers/net/mlx5/mlx5_vlan.c | 3 ---
12 files changed, 16 insertions(+), 169 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 77fa957..3a07928 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -125,16 +125,6 @@ These options can be modified in the ``.config`` file.
Environment variables
~~~~~~~~~~~~~~~~~~~~~

-- ``MLX5_ENABLE_CQE_COMPRESSION``
-
- A nonzero value lets ConnectX-4 return smaller completion entries to
- improve performance when PCI backpressure is detected. It is most useful
- for scenarios involving heavy traffic on many queues.
-
- Since the additional software logic necessary to handle this mode can
- lower performance when there is no backpressure, it is not enabled by
- default.
-
- ``MLX5_PMD_ENABLE_PADDING``

Enables HW packet padding in PCI bus transactions.
@@ -211,40 +201,12 @@ DPDK and must be installed separately:

Currently supported by DPDK:

-- Mellanox OFED **3.1-1.0.3**, **3.1-1.5.7.1** or **3.2-2.0.0.0** depending
- on usage.
-
- The following features are supported with version **3.1-1.5.7.1** and
- above only:
-
- - IPv6, UPDv6, TCPv6 RSS.
- - RX checksum offloads.
- - IBM POWER8.
-
- The following features are supported with version **3.2-2.0.0.0** and
- above only:
-
- - Flow director.
- - RX VLAN stripping.
- - TX VLAN insertion.
- - RX CRC stripping configuration.
+- Mellanox OFED **3.3-1.0.0.0**.

- Minimum firmware version:

- With MLNX_OFED **3.1-1.0.3**:
-
- - ConnectX-4: **12.12.1240**
- - ConnectX-4 Lx: **14.12.1100**
-
- With MLNX_OFED **3.1-1.5.7.1**:
-
- - ConnectX-4: **12.13.0144**
- - ConnectX-4 Lx: **14.13.0144**
-
- With MLNX_OFED **3.2-2.0.0.0**:
-
- - ConnectX-4: **12.14.2036**
- - ConnectX-4 Lx: **14.14.2036**
+ - ConnectX-4: **12.16.1006**
+ - ConnectX-4 Lx: **14.16.1006**

Getting Mellanox OFED
~~~~~~~~~~~~~~~~~~~~~
diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 406caa5..5888779 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -105,42 +105,19 @@ mlx5_autoconf.h.new: FORCE
mlx5_autoconf.h.new: $(RTE_SDK)/scripts/auto-config-h.sh
$Q $(RM) -f -- '$@'
$Q sh -- '$<' '$@' \
- HAVE_EXP_QUERY_DEVICE \
- infiniband/verbs.h \
- type 'struct ibv_exp_device_attr' $(AUTOCONF_OUTPUT)
- $Q sh -- '$<' '$@' \
- HAVE_FLOW_SPEC_IPV6 \
- infiniband/verbs.h \
- type 'struct ibv_exp_flow_spec_ipv6' $(AUTOCONF_OUTPUT)
- $Q sh -- '$<' '$@' \
- HAVE_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR \
- infiniband/verbs.h \
- enum IBV_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR \
- $(AUTOCONF_OUTPUT)
- $Q sh -- '$<' '$@' \
- HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS \
- infiniband/verbs.h \
- enum IBV_EXP_DEVICE_ATTR_VLAN_OFFLOADS \
- $(AUTOCONF_OUTPUT)
- $Q sh -- '$<' '$@' \
- HAVE_EXP_CQ_RX_TCP_PACKET \
+ HAVE_VERBS_VLAN_INSERTION \
infiniband/verbs.h \
- enum IBV_EXP_CQ_RX_TCP_PACKET \
+ enum IBV_EXP_RECEIVE_WQ_CVLAN_INSERTION \
$(AUTOCONF_OUTPUT)
$Q sh -- '$<' '$@' \
- HAVE_VERBS_FCS \
- infiniband/verbs.h \
- enum IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS \
+ HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE \
+ infiniband/verbs_exp.h \
+ enum IBV_EXP_CQ_COMPRESSED_CQE \
$(AUTOCONF_OUTPUT)
$Q sh -- '$<' '$@' \
- HAVE_VERBS_RX_END_PADDING \
- infiniband/verbs.h \
- enum IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING \
- $(AUTOCONF_OUTPUT)
- $Q sh -- '$<' '$@' \
- HAVE_VERBS_VLAN_INSERTION \
- infiniband/verbs.h \
- enum IBV_EXP_RECEIVE_WQ_CVLAN_INSERTION \
+ HAVE_VERBS_MLX5_ETH_VLAN_INLINE_HEADER_SIZE \
+ infiniband/mlx5_hw.h \
+ enum MLX5_ETH_VLAN_INLINE_HEADER_SIZE \
$(AUTOCONF_OUTPUT)

# Create mlx5_autoconf.h or update it in case it differs from the new one.
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 27a7a30..3f45d84 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -195,17 +195,13 @@ static const struct eth_dev_ops mlx5_dev_ops = {
.mac_addr_add = mlx5_mac_addr_add,
.mac_addr_set = mlx5_mac_addr_set,
.mtu_set = mlx5_dev_set_mtu,
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
.vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
.vlan_offload_set = mlx5_vlan_offload_set,
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
.reta_update = mlx5_dev_rss_reta_update,
.reta_query = mlx5_dev_rss_reta_query,
.rss_hash_update = mlx5_rss_hash_update,
.rss_hash_conf_get = mlx5_rss_hash_conf_get,
-#ifdef MLX5_FDIR_SUPPORT
.filter_ctrl = mlx5_dev_filter_ctrl,
-#endif /* MLX5_FDIR_SUPPORT */
};

static struct {
@@ -352,24 +348,16 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
struct ibv_pd *pd = NULL;
struct priv *priv = NULL;
struct rte_eth_dev *eth_dev;
-#ifdef HAVE_EXP_QUERY_DEVICE
struct ibv_exp_device_attr exp_device_attr;
-#endif /* HAVE_EXP_QUERY_DEVICE */
struct ether_addr mac;
uint16_t num_vfs = 0;

-#ifdef HAVE_EXP_QUERY_DEVICE
exp_device_attr.comp_mask =
IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS |
IBV_EXP_DEVICE_ATTR_RX_HASH |
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
IBV_EXP_DEVICE_ATTR_VLAN_OFFLOADS |
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
-#ifdef HAVE_VERBS_RX_END_PADDING
IBV_EXP_DEVICE_ATTR_RX_PAD_END_ALIGN |
-#endif /* HAVE_VERBS_RX_END_PADDING */
0;
-#endif /* HAVE_EXP_QUERY_DEVICE */

DEBUG("using port %u (%08" PRIx32 ")", port, test);

@@ -420,7 +408,6 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
priv->port = port;
priv->pd = pd;
priv->mtu = ETHER_MTU;
-#ifdef HAVE_EXP_QUERY_DEVICE
if (ibv_exp_query_device(ctx, &exp_device_attr)) {
ERROR("ibv_exp_query_device() failed");
goto port_error;
@@ -446,30 +433,20 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
priv->ind_table_max_size = RSS_INDIRECTION_TABLE_SIZE;
DEBUG("maximum RX indirection table size is %u",
priv->ind_table_max_size);
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
priv->hw_vlan_strip = !!(exp_device_attr.wq_vlan_offloads_cap &
IBV_EXP_RECEIVE_WQ_CVLAN_STRIP);
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
DEBUG("VLAN stripping is %ssupported",
(priv->hw_vlan_strip ? "" : "not "));

-#ifdef HAVE_VERBS_FCS
priv->hw_fcs_strip = !!(exp_device_attr.exp_device_cap_flags &
IBV_EXP_DEVICE_SCATTER_FCS);
-#endif /* HAVE_VERBS_FCS */
DEBUG("FCS stripping configuration is %ssupported",
(priv->hw_fcs_strip ? "" : "not "));

-#ifdef HAVE_VERBS_RX_END_PADDING
priv->hw_padding = !!exp_device_attr.rx_pad_end_addr_align;
-#endif /* HAVE_VERBS_RX_END_PADDING */
DEBUG("hardware RX end alignment padding is %ssupported",
(priv->hw_padding ? "" : "not "));

-#else /* HAVE_EXP_QUERY_DEVICE */
- priv->ind_table_max_size = RSS_INDIRECTION_TABLE_SIZE;
-#endif /* HAVE_EXP_QUERY_DEVICE */
-
priv_get_num_vfs(priv, &num_vfs);
priv->sriov = (num_vfs || sriov);
priv->mps = mps;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index cbcb8b9..935e1b0 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -68,6 +68,11 @@
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"

+#if !defined(HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE) || \
+ !defined(HAVE_VERBS_MLX5_ETH_VLAN_INLINE_HEADER_SIZE)
+#error Mellanox OFED >= 3.3 is required, please refer to the documentation.
+#endif
+
enum {
PCI_VENDOR_ID_MELLANOX = 0x15b3,
};
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 9a19835..8d2ec7a 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -76,13 +76,4 @@
/* Alarm timeout. */
#define MLX5_ALARM_TIMEOUT_US 100000

-/*
- * Extended flow priorities necessary to support flow director are available
- * since MLNX_OFED 3.2. Considering this version adds support for VLAN
- * offloads as well, their availability means flow director can be used.
- */
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
-#define MLX5_FDIR_SUPPORT 1
-#endif
-
#endif /* RTE_PMD_MLX5_DEFS_H_ */
diff --git a/drivers/net/mlx5/mlx5_fdir.c b/drivers/net/mlx5/mlx5_fdir.c
index e3b97ba..1850218 100644
--- a/drivers/net/mlx5/mlx5_fdir.c
+++ b/drivers/net/mlx5/mlx5_fdir.c
@@ -122,7 +122,6 @@ fdir_filter_to_flow_desc(const struct rte_eth_fdir_filter *fdir_filter,
case RTE_ETH_FLOW_NONFRAG_IPV4_OTHER:
desc->type = HASH_RXQ_IPV4;
break;
-#ifdef HAVE_FLOW_SPEC_IPV6
case RTE_ETH_FLOW_NONFRAG_IPV6_UDP:
desc->type = HASH_RXQ_UDPV6;
break;
@@ -132,7 +131,6 @@ fdir_filter_to_flow_desc(const struct rte_eth_fdir_filter *fdir_filter,
case RTE_ETH_FLOW_NONFRAG_IPV6_OTHER:
desc->type = HASH_RXQ_IPV6;
break;
-#endif /* HAVE_FLOW_SPEC_IPV6 */
default:
break;
}
@@ -147,7 +145,6 @@ fdir_filter_to_flow_desc(const struct rte_eth_fdir_filter *fdir_filter,
desc->src_ip[0] = fdir_filter->input.flow.ip4_flow.src_ip;
desc->dst_ip[0] = fdir_filter->input.flow.ip4_flow.dst_ip;
break;
-#ifdef HAVE_FLOW_SPEC_IPV6
case RTE_ETH_FLOW_NONFRAG_IPV6_UDP:
case RTE_ETH_FLOW_NONFRAG_IPV6_TCP:
desc->src_port = fdir_filter->input.flow.udp6_flow.src_port;
@@ -161,7 +158,6 @@ fdir_filter_to_flow_desc(const struct rte_eth_fdir_filter *fdir_filter,
fdir_filter->input.flow.ipv6_flow.dst_ip,
sizeof(desc->dst_ip));
break;
-#endif /* HAVE_FLOW_SPEC_IPV6 */
default:
break;
}
@@ -211,7 +207,6 @@ priv_fdir_overlap(const struct priv *priv,
(desc2->dst_ip[0] & mask->ipv4_mask.dst_ip)))
return 0;
break;
-#ifdef HAVE_FLOW_SPEC_IPV6
case HASH_RXQ_IPV6:
case HASH_RXQ_UDPV6:
case HASH_RXQ_TCPV6:
@@ -222,7 +217,6 @@ priv_fdir_overlap(const struct priv *priv,
(desc2->dst_ip[i] & mask->ipv6_mask.dst_ip[i])))
return 0;
break;
-#endif /* HAVE_FLOW_SPEC_IPV6 */
default:
break;
}
@@ -258,9 +252,7 @@ priv_fdir_flow_add(struct priv *priv,
uintptr_t spec_offset = (uintptr_t)&data->spec;
struct ibv_exp_flow_spec_eth *spec_eth;
struct ibv_exp_flow_spec_ipv4 *spec_ipv4;
-#ifdef HAVE_FLOW_SPEC_IPV6
struct ibv_exp_flow_spec_ipv6 *spec_ipv6;
-#endif /* HAVE_FLOW_SPEC_IPV6 */
struct ibv_exp_flow_spec_tcp_udp *spec_tcp_udp;
struct mlx5_fdir_filter *iter_fdir_filter;
unsigned int i;
@@ -334,7 +326,6 @@ priv_fdir_flow_add(struct priv *priv,

spec_offset += spec_ipv4->size;
break;
-#ifdef HAVE_FLOW_SPEC_IPV6
case HASH_RXQ_IPV6:
case HASH_RXQ_UDPV6:
case HASH_RXQ_TCPV6:
@@ -368,7 +359,6 @@ priv_fdir_flow_add(struct priv *priv,

spec_offset += spec_ipv6->size;
break;
-#endif /* HAVE_FLOW_SPEC_IPV6 */
default:
ERROR("invalid flow attribute type");
return EINVAL;
diff --git a/drivers/net/mlx5/mlx5_rxmode.c b/drivers/net/mlx5/mlx5_rxmode.c
index 3a55f63..51e2aca 100644
--- a/drivers/net/mlx5/mlx5_rxmode.c
+++ b/drivers/net/mlx5/mlx5_rxmode.c
@@ -67,11 +67,9 @@ static const struct special_flow_init special_flow_init[] = {
1 << HASH_RXQ_TCPV4 |
1 << HASH_RXQ_UDPV4 |
1 << HASH_RXQ_IPV4 |
-#ifdef HAVE_FLOW_SPEC_IPV6
1 << HASH_RXQ_TCPV6 |
1 << HASH_RXQ_UDPV6 |
1 << HASH_RXQ_IPV6 |
-#endif /* HAVE_FLOW_SPEC_IPV6 */
1 << HASH_RXQ_ETH |
0,
.per_vlan = 0,
@@ -82,10 +80,8 @@ static const struct special_flow_init special_flow_init[] = {
.hash_types =
1 << HASH_RXQ_UDPV4 |
1 << HASH_RXQ_IPV4 |
-#ifdef HAVE_FLOW_SPEC_IPV6
1 << HASH_RXQ_UDPV6 |
1 << HASH_RXQ_IPV6 |
-#endif /* HAVE_FLOW_SPEC_IPV6 */
1 << HASH_RXQ_ETH |
0,
.per_vlan = 0,
@@ -96,15 +92,12 @@ static const struct special_flow_init special_flow_init[] = {
.hash_types =
1 << HASH_RXQ_UDPV4 |
1 << HASH_RXQ_IPV4 |
-#ifdef HAVE_FLOW_SPEC_IPV6
1 << HASH_RXQ_UDPV6 |
1 << HASH_RXQ_IPV6 |
-#endif /* HAVE_FLOW_SPEC_IPV6 */
1 << HASH_RXQ_ETH |
0,
.per_vlan = 1,
},
-#ifdef HAVE_FLOW_SPEC_IPV6
[HASH_RXQ_FLOW_TYPE_IPV6MULTI] = {
.dst_mac_val = "\x33\x33\x00\x00\x00\x00",
.dst_mac_mask = "\xff\xff\x00\x00\x00\x00",
@@ -115,7 +108,6 @@ static const struct special_flow_init special_flow_init[] = {
0,
.per_vlan = 1,
},
-#endif /* HAVE_FLOW_SPEC_IPV6 */
};

/**
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 8d32e74..7db4ce7 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -105,7 +105,6 @@ const struct hash_rxq_init hash_rxq_init[] = {
},
.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
},
-#ifdef HAVE_FLOW_SPEC_IPV6
[HASH_RXQ_TCPV6] = {
.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
IBV_EXP_RX_HASH_DST_IPV6 |
@@ -144,7 +143,6 @@ const struct hash_rxq_init hash_rxq_init[] = {
},
.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
},
-#endif /* HAVE_FLOW_SPEC_IPV6 */
[HASH_RXQ_ETH] = {
.hash_fields = 0,
.dpdk_rss_hf = 0,
@@ -168,17 +166,11 @@ static const struct ind_table_init ind_table_init[] = {
1 << HASH_RXQ_TCPV4 |
1 << HASH_RXQ_UDPV4 |
1 << HASH_RXQ_IPV4 |
-#ifdef HAVE_FLOW_SPEC_IPV6
1 << HASH_RXQ_TCPV6 |
1 << HASH_RXQ_UDPV6 |
1 << HASH_RXQ_IPV6 |
-#endif /* HAVE_FLOW_SPEC_IPV6 */
0,
-#ifdef HAVE_FLOW_SPEC_IPV6
.hash_types_n = 6,
-#else /* HAVE_FLOW_SPEC_IPV6 */
- .hash_types_n = 3,
-#endif /* HAVE_FLOW_SPEC_IPV6 */
},
{
.max_size = 1,
@@ -243,12 +235,8 @@ priv_flow_attr(struct priv *priv, struct ibv_exp_flow_attr *flow_attr,
init = &hash_rxq_init[type];
*flow_attr = (struct ibv_exp_flow_attr){
.type = IBV_EXP_FLOW_ATTR_NORMAL,
-#ifdef MLX5_FDIR_SUPPORT
/* Priorities < 3 are reserved for flow director. */
.priority = init->flow_priority + 3,
-#else /* MLX5_FDIR_SUPPORT */
- .priority = init->flow_priority,
-#endif /* MLX5_FDIR_SUPPORT */
.num_of_specs = 0,
.port = priv->port,
.flags = 0,
@@ -589,9 +577,7 @@ priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
case HASH_RXQ_FLOW_TYPE_ALLMULTI:
return !!priv->allmulti_req;
case HASH_RXQ_FLOW_TYPE_BROADCAST:
-#ifdef HAVE_FLOW_SPEC_IPV6
case HASH_RXQ_FLOW_TYPE_IPV6MULTI:
-#endif /* HAVE_FLOW_SPEC_IPV6 */
/* If allmulti is enabled, broadcast and ipv6multi
* are unnecessary. */
return !priv->allmulti_req;
@@ -1038,19 +1024,13 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
.cq = tmpl.rxq.cq,
.comp_mask =
IBV_EXP_CREATE_WQ_RES_DOMAIN |
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
0,
.res_domain = tmpl.rd,
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
.vlan_offloads = (tmpl.rxq.vlan_strip ?
IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
0),
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
};
-
-#ifdef HAVE_VERBS_FCS
/* By default, FCS (CRC) is stripped by hardware. */
if (dev->data->dev_conf.rxmode.hw_strip_crc) {
tmpl.rxq.crc_present = 0;
@@ -1071,9 +1051,6 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
(void *)dev,
tmpl.rxq.crc_present ? "disabled" : "enabled",
tmpl.rxq.crc_present << 2);
-#endif /* HAVE_VERBS_FCS */
-
-#ifdef HAVE_VERBS_RX_END_PADDING
if (!mlx5_getenv_int("MLX5_PMD_ENABLE_PADDING"))
; /* Nothing else to do. */
else if (priv->hw_padding) {
@@ -1086,7 +1063,6 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
" supported, make sure MLNX_OFED and firmware are"
" up to date",
(void *)dev);
-#endif /* HAVE_VERBS_RX_END_PADDING */

tmpl.rxq.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
if (tmpl.rxq.wq == NULL) {
@@ -1106,9 +1082,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
.intf_version = 1,
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
.intf = IBV_EXP_INTF_CQ,
.obj = tmpl.rxq.cq,
};
@@ -1164,11 +1138,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
assert(ret == 0);
/* Assign function in queue. */
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
rxq_ctrl->rxq.poll = rxq_ctrl->if_cq->poll_length_flags_cvlan;
-#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
- rxq_ctrl->rxq.poll = rxq_ctrl->if_cq->poll_length_flags;
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
rxq_ctrl->rxq.recv = rxq_ctrl->if_wq->recv_burst;
return 0;
error:
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index fda93a6..a6b0cf5 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -452,11 +452,9 @@ rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
TRANSPOSE(~flags,
IBV_EXP_CQ_RX_IP_CSUM_OK,
PKT_RX_IP_CKSUM_BAD);
-#ifdef HAVE_EXP_CQ_RX_TCP_PACKET
/* Set L4 checksum flag only for TCP/UDP packets. */
if (flags &
(IBV_EXP_CQ_RX_TCP_PACKET | IBV_EXP_CQ_RX_UDP_PACKET))
-#endif /* HAVE_EXP_CQ_RX_TCP_PACKET */
ol_flags |=
TRANSPOSE(~flags,
IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK,
@@ -589,13 +587,11 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
seg->packet_type = rxq_cq_to_pkt_type(flags);
seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
seg->ol_flags |= PKT_RX_VLAN_PKT |
PKT_RX_VLAN_STRIPPED;
seg->vlan_tci = vlan_tci;
}
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
}
/* Return packet. */
*(pkts++) = seg;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 2c5e447..570345b 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -120,11 +120,7 @@ struct rxq_ctrl {
struct fdir_queue fdir_queue; /* Flow director queue. */
struct ibv_mr *mr; /* Memory Region (for mp). */
struct ibv_exp_wq_family *if_wq; /* WQ burst interface. */
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
struct ibv_exp_cq_family_v1 *if_cq; /* CQ interface. */
-#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
- struct ibv_exp_cq_family *if_cq; /* CQ interface. */
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
unsigned int socket; /* CPU socket ID for allocations. */
struct rxq rxq; /* Data path structure. */
};
@@ -134,11 +130,9 @@ enum hash_rxq_type {
HASH_RXQ_TCPV4,
HASH_RXQ_UDPV4,
HASH_RXQ_IPV4,
-#ifdef HAVE_FLOW_SPEC_IPV6
HASH_RXQ_TCPV6,
HASH_RXQ_UDPV6,
HASH_RXQ_IPV6,
-#endif /* HAVE_FLOW_SPEC_IPV6 */
HASH_RXQ_ETH,
};

@@ -169,9 +163,7 @@ struct hash_rxq_init {
} hdr;
struct ibv_exp_flow_spec_tcp_udp tcp_udp;
struct ibv_exp_flow_spec_ipv4 ipv4;
-#ifdef HAVE_FLOW_SPEC_IPV6
struct ibv_exp_flow_spec_ipv6 ipv6;
-#endif /* HAVE_FLOW_SPEC_IPV6 */
struct ibv_exp_flow_spec_eth eth;
} flow_spec; /* Flow specification template. */
const struct hash_rxq_init *underlayer; /* Pointer to underlayer. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 4683775..9f3a33b 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -375,13 +375,11 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
#ifdef HAVE_VERBS_VLAN_INSERTION
.intf_version = 1,
#endif
-#ifdef HAVE_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR
/* Enable multi-packet send if supported. */
.family_flags =
(priv->mps ?
IBV_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR :
0),
-#endif
};
tmpl.if_qp = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
if (tmpl.if_qp == NULL) {
diff --git a/drivers/net/mlx5/mlx5_vlan.c b/drivers/net/mlx5/mlx5_vlan.c
index ff40538..3b9b771 100644
--- a/drivers/net/mlx5/mlx5_vlan.c
+++ b/drivers/net/mlx5/mlx5_vlan.c
@@ -144,7 +144,6 @@ static void
priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on)
{
struct rxq *rxq = (*priv->rxqs)[idx];
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
struct ibv_exp_wq_attr mod;
uint16_t vlan_offloads =
(on ? IBV_EXP_RECEIVE_WQ_CVLAN_STRIP : 0) |
@@ -165,8 +164,6 @@ priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on)
return;
}

-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
-
/* Update related bits in RX queue. */
rxq->vlan_strip = !!on;
}
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:22 UTC
Permalink
These structures and macros extend those exposed by libmlx5 (in mlx5_hw.h)
to let the PMD manage work queue and completion queue elements directly.
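
To show how these definitions are meant to be combined, here is a small,
self-contained sketch of a CQE polling loop: the ownership and opcode macros
decode the single op_own byte before the rest of the 64-byte entry is
trusted. The structure is reduced to two fields, the ring is an ordinary
array standing in for hardware memory, and the MLX5_CQE_INVALID value is a
placeholder for the one provided by mlx5_hw.h, so this illustrates the
macros rather than the PMD's actual completion handling.

/* Reduced CQE polling sketch using the op_own decoding macros. */
#include <stdint.h>
#include <stdio.h>

#define MLX5_CQE_OWNER_MASK 1
#define MLX5_CQE_INVALID 15     /* placeholder, real value comes from mlx5_hw.h */

#define MLX5_CQE_OWNER(op_own) ((op_own) & MLX5_CQE_OWNER_MASK)
#define MLX5_CQE_OPCODE(op_own) (((op_own) & 0xf0) >> 4)
#define MLX5_CQE_INVALIDATE (MLX5_CQE_INVALID << 4)

struct cqe {                    /* reduced stand-in for struct mlx5_cqe64 */
	unsigned int byte_cnt;
	uint8_t op_own;
};

int
main(void)
{
	struct cqe ring[4] = {
		[0] = { .byte_cnt = 64,  .op_own = (0x2 << 4) | 1 },
		[1] = { .byte_cnt = 128, .op_own = (0x2 << 4) | 1 },
		[2] = { .op_own = MLX5_CQE_INVALIDATE }, /* not yet written by HW */
	};
	unsigned int ci = 0;    /* consumer index */
	unsigned int owner = 1; /* expected ownership bit for this lap */

	while (MLX5_CQE_OWNER(ring[ci & 3].op_own) == owner &&
	       MLX5_CQE_OPCODE(ring[ci & 3].op_own) != MLX5_CQE_INVALID) {
		printf("CQE %u: %u bytes\n", ci, ring[ci & 3].byte_cnt);
		++ci;
	}
	printf("stopped at index %u\n", ci);
	return 0;
}
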

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_prm.h | 163 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 163 insertions(+)
create mode 100644 drivers/net/mlx5/mlx5_prm.h

diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
new file mode 100644
index 0000000..5db219b
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -0,0 +1,163 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2016 6WIND S.A.
+ * Copyright 2016 Mellanox.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef RTE_PMD_MLX5_PRM_H_
+#define RTE_PMD_MLX5_PRM_H_
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-pedantic"
+#endif
+#include <infiniband/mlx5_hw.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-pedantic"
+#endif
+
+/* Get CQE owner bit. */
+#define MLX5_CQE_OWNER(op_own) ((op_own) & MLX5_CQE_OWNER_MASK)
+
+/* Get CQE format. */
+#define MLX5_CQE_FORMAT(op_own) (((op_own) & MLX5E_CQE_FORMAT_MASK) >> 2)
+
+/* Get CQE opcode. */
+#define MLX5_CQE_OPCODE(op_own) (((op_own) & 0xf0) >> 4)
+
+/* Get CQE solicited event. */
+#define MLX5_CQE_SE(op_own) (((op_own) >> 1) & 1)
+
+/* Invalidate a CQE. */
+#define MLX5_CQE_INVALIDATE (MLX5_CQE_INVALID << 4)
+
+/* CQE value to inform that VLAN is stripped. */
+#define MLX5_CQE_VLAN_STRIPPED 0x1
+
+/* Maximum number of packets a multi-packet WQE can handle. */
+#define MLX5_MPW_DSEG_MAX 5
+
+/* Room for inline data in regular work queue element. */
+#define MLX5_WQE64_INL_DATA 12
+
+/* Room for inline data in multi-packet WQE. */
+#define MLX5_MWQE64_INL_DATA 28
+
+/* Subset of struct mlx5_wqe_eth_seg. */
+struct mlx5_wqe_eth_seg_small {
+ uint32_t rsvd0;
+ uint8_t cs_flags;
+ uint8_t rsvd1;
+ uint16_t mss;
+ uint32_t rsvd2;
+ uint16_t inline_hdr_sz;
+};
+
+/* Regular WQE. */
+struct mlx5_wqe_regular {
+ union {
+ struct mlx5_wqe_ctrl_seg ctrl;
+ uint32_t data[4];
+ } ctrl;
+ struct mlx5_wqe_eth_seg eseg;
+ struct mlx5_wqe_data_seg dseg;
+} __rte_aligned(64);
+
+/* Inline WQE. */
+struct mlx5_wqe_inl {
+ union {
+ struct mlx5_wqe_ctrl_seg ctrl;
+ uint32_t data[4];
+ } ctrl;
+ struct mlx5_wqe_eth_seg eseg;
+ uint32_t byte_cnt;
+ uint8_t data[MLX5_WQE64_INL_DATA];
+} __rte_aligned(64);
+
+/* Multi-packet WQE. */
+struct mlx5_wqe_mpw {
+ union {
+ struct mlx5_wqe_ctrl_seg ctrl;
+ uint32_t data[4];
+ } ctrl;
+ struct mlx5_wqe_eth_seg_small eseg;
+ struct mlx5_wqe_data_seg dseg[2];
+} __rte_aligned(64);
+
+/* Multi-packet WQE with inline. */
+struct mlx5_wqe_mpw_inl {
+ union {
+ struct mlx5_wqe_ctrl_seg ctrl;
+ uint32_t data[4];
+ } ctrl;
+ struct mlx5_wqe_eth_seg_small eseg;
+ uint32_t byte_cnt;
+ uint8_t data[MLX5_MWQE64_INL_DATA];
+} __rte_aligned(64);
+
+/* Union of all WQE types. */
+union mlx5_wqe {
+ struct mlx5_wqe_regular wqe;
+ struct mlx5_wqe_inl inl;
+ struct mlx5_wqe_mpw mpw;
+ struct mlx5_wqe_mpw_inl mpw_inl;
+ uint8_t data[64];
+};
+
+/* MPW session status. */
+enum mlx5_mpw_state {
+ MLX5_MPW_STATE_OPENED,
+ MLX5_MPW_INL_STATE_OPENED,
+ MLX5_MPW_STATE_CLOSED,
+};
+
+/* MPW session descriptor. */
+struct mlx5_mpw {
+ enum mlx5_mpw_state state;
+ unsigned int pkts_n;
+ unsigned int len;
+ unsigned int total_len;
+ volatile union mlx5_wqe *wqe;
+ union {
+ volatile struct mlx5_wqe_data_seg *dseg[MLX5_MPW_DSEG_MAX];
+ volatile uint8_t *raw;
+ } data;
+};
+
+/* CQ element structure - should be equal to the cache line size */
+struct mlx5_cqe {
+#if (RTE_CACHE_LINE_SIZE == 128)
+ uint8_t padding[64];
+#endif
+ struct mlx5_cqe64 cqe64;
+};
+
+#endif /* RTE_PMD_MLX5_PRM_H_ */
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:19 UTC
Permalink
To keep the data path as efficient as possible, move the fields only useful to
the control path into a new structure, txq_ctrl.
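
The point of the split is easier to see with the alignment spelled out:
since the embedded data path structure is cache aligned and placed last, its
fields no longer share a cache line with control-only state such as the
Verbs object pointers. A minimal sketch follows, with
__attribute__((aligned(64))) standing in for __rte_cache_aligned and
placeholder fields throughout.

/* Hot fields live in txq, cold setup state in txq_ctrl; aligning txq to a
 * cache line and embedding it last keeps the burst path off the cold data. */
#include <stddef.h>
#include <stdio.h>

struct txq {                            /* data path only */
	unsigned int elts_head;
	unsigned int elts_tail;
} __attribute__((aligned(64)));

struct txq_ctrl {                       /* control path only */
	void *cq;                       /* stand-ins for the Verbs objects */
	void *qp;
	int socket;
	struct txq txq;                 /* starts on its own cache line */
};

int
main(void)
{
	size_t off = offsetof(struct txq_ctrl, txq);

	printf("txq offset in txq_ctrl: %zu (%s 64-byte aligned)\n",
	       off, off % 64 == 0 ? "is" : "is not");
	return 0;
}
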

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5.c | 21 +++--
drivers/net/mlx5/mlx5_ethdev.c | 27 +++---
drivers/net/mlx5/mlx5_mr.c | 39 ++++----
drivers/net/mlx5/mlx5_rxtx.h | 9 +-
drivers/net/mlx5/mlx5_txq.c | 198 +++++++++++++++++++++--------------------
5 files changed, 158 insertions(+), 136 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 350028b..3d30e00 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -98,7 +98,6 @@ static void
mlx5_dev_close(struct rte_eth_dev *dev)
{
struct priv *priv = mlx5_get_priv(dev);
- void *tmp;
unsigned int i;

priv_lock(priv);
@@ -122,12 +121,13 @@ mlx5_dev_close(struct rte_eth_dev *dev)
/* XXX race condition if mlx5_rx_burst() is still running. */
usleep(1000);
for (i = 0; (i != priv->rxqs_n); ++i) {
- tmp = (*priv->rxqs)[i];
- if (tmp == NULL)
+ struct rxq *rxq = (*priv->rxqs)[i];
+
+ if (rxq == NULL)
continue;
(*priv->rxqs)[i] = NULL;
- rxq_cleanup(tmp);
- rte_free(tmp);
+ rxq_cleanup(rxq);
+ rte_free(rxq);
}
priv->rxqs_n = 0;
priv->rxqs = NULL;
@@ -136,12 +136,15 @@ mlx5_dev_close(struct rte_eth_dev *dev)
/* XXX race condition if mlx5_tx_burst() is still running. */
usleep(1000);
for (i = 0; (i != priv->txqs_n); ++i) {
- tmp = (*priv->txqs)[i];
- if (tmp == NULL)
+ struct txq *txq = (*priv->txqs)[i];
+ struct txq_ctrl *txq_ctrl;
+
+ if (txq == NULL)
continue;
+ txq_ctrl = container_of(txq, struct txq_ctrl, txq);
(*priv->txqs)[i] = NULL;
- txq_cleanup(tmp);
- rte_free(tmp);
+ txq_cleanup(txq_ctrl);
+ rte_free(txq_ctrl);
}
priv->txqs_n = 0;
priv->txqs = NULL;
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index ca57021..3992b2c 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1232,28 +1232,31 @@ mlx5_secondary_data_setup(struct priv *priv)
/* TX queues. */
for (i = 0; i != nb_tx_queues; ++i) {
struct txq *primary_txq = (*sd->primary_priv->txqs)[i];
- struct txq *txq;
+ struct txq_ctrl *primary_txq_ctrl;
+ struct txq_ctrl *txq_ctrl;

if (primary_txq == NULL)
continue;
- txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0,
- primary_txq->socket);
- if (txq != NULL) {
+ primary_txq_ctrl = container_of(primary_txq,
+ struct txq_ctrl, txq);
+ txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl), 0,
+ primary_txq_ctrl->socket);
+ if (txq_ctrl != NULL) {
if (txq_setup(priv->dev,
- txq,
+ primary_txq_ctrl,
primary_txq->elts_n,
- primary_txq->socket,
+ primary_txq_ctrl->socket,
NULL) == 0) {
- txq->stats.idx = primary_txq->stats.idx;
- tx_queues[i] = txq;
+ txq_ctrl->txq.stats.idx = primary_txq->stats.idx;
+ tx_queues[i] = &txq_ctrl->txq;
continue;
}
- rte_free(txq);
+ rte_free(txq_ctrl);
}
while (i) {
- txq = tx_queues[--i];
- txq_cleanup(txq);
- rte_free(txq);
+ txq_ctrl = tx_queues[--i];
+ txq_cleanup(txq_ctrl);
+ rte_free(txq_ctrl);
}
goto error;
}
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index 7c3e87f..79d5568 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -183,33 +183,36 @@ mlx5_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
uint32_t
txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx)
{
+ struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq);
struct ibv_mr *mr;

/* Add a new entry, register MR first. */
DEBUG("%p: discovered new memory pool \"%s\" (%p)",
- (void *)txq, mp->name, (void *)mp);
- mr = mlx5_mp2mr(txq->priv->pd, mp);
+ (void *)txq_ctrl, mp->name, (void *)mp);
+ mr = mlx5_mp2mr(txq_ctrl->txq.priv->pd, mp);
if (unlikely(mr == NULL)) {
DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
- (void *)txq);
+ (void *)txq_ctrl);
return (uint32_t)-1;
}
- if (unlikely(idx == RTE_DIM(txq->mp2mr))) {
+ if (unlikely(idx == RTE_DIM(txq_ctrl->txq.mp2mr))) {
/* Table is full, remove oldest entry. */
DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
- (void *)txq);
+ (void *)txq_ctrl);
--idx;
- claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
- memmove(&txq->mp2mr[0], &txq->mp2mr[1],
- (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
+ claim_zero(ibv_dereg_mr(txq_ctrl->txq.mp2mr[0].mr));
+ memmove(&txq_ctrl->txq.mp2mr[0], &txq_ctrl->txq.mp2mr[1],
+ (sizeof(txq_ctrl->txq.mp2mr) -
+ sizeof(txq_ctrl->txq.mp2mr[0])));
}
/* Store the new entry. */
- txq->mp2mr[idx].mp = mp;
- txq->mp2mr[idx].mr = mr;
- txq->mp2mr[idx].lkey = mr->lkey;
+ txq_ctrl->txq.mp2mr[idx].mp = mp;
+ txq_ctrl->txq.mp2mr[idx].mr = mr;
+ txq_ctrl->txq.mp2mr[idx].lkey = mr->lkey;
DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
- (void *)txq, mp->name, (void *)mp, txq->mp2mr[idx].lkey);
- return txq->mp2mr[idx].lkey;
+ (void *)txq_ctrl, mp->name, (void *)mp,
+ txq_ctrl->txq.mp2mr[idx].lkey);
+ return txq_ctrl->txq.mp2mr[idx].lkey;
}

struct txq_mp2mr_mbuf_check_data {
@@ -255,7 +258,7 @@ txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj,
void
txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
{
- struct txq *txq = arg;
+ struct txq_ctrl *txq_ctrl = arg;
struct txq_mp2mr_mbuf_check_data data = {
.ret = 0,
};
@@ -265,13 +268,13 @@ txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 ||
data.ret == -1)
return;
- for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
- if (unlikely(txq->mp2mr[i].mp == NULL)) {
+ for (i = 0; (i != RTE_DIM(txq_ctrl->txq.mp2mr)); ++i) {
+ if (unlikely(txq_ctrl->txq.mp2mr[i].mp == NULL)) {
/* Unknown MP, add a new MR for it. */
break;
}
- if (txq->mp2mr[i].mp == mp)
+ if (txq_ctrl->txq.mp2mr[i].mp == mp)
return;
}
- txq_mp2mr_reg(txq, mp, i);
+ txq_mp2mr_reg(&txq_ctrl->txq, mp, i);
}
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 3a353b0..5baefcb 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -256,6 +256,10 @@ struct txq {
uint32_t lkey; /* mr->lkey */
} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
struct mlx5_txq_stats stats; /* TX queue counters. */
+} __rte_cache_aligned;
+
+/* TX queue control descriptor. */
+struct txq_ctrl {
#ifdef HAVE_VERBS_VLAN_INSERTION
struct ibv_exp_qp_burst_family_v1 *if_qp; /* QP burst interface. */
#else
@@ -264,6 +268,7 @@ struct txq {
struct ibv_exp_cq_family *if_cq; /* CQ interface. */
struct ibv_exp_res_domain *rd; /* Resource Domain. */
unsigned int socket; /* CPU socket ID for allocations. */
+ struct txq txq; /* Data path structure. */
};

/* mlx5_rxq.c */
@@ -291,8 +296,8 @@ uint16_t mlx5_rx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);

/* mlx5_txq.c */

-void txq_cleanup(struct txq *);
-int txq_setup(struct rte_eth_dev *, struct txq *, uint16_t, unsigned int,
+void txq_cleanup(struct txq_ctrl *);
+int txq_setup(struct rte_eth_dev *, struct txq_ctrl *, uint16_t, unsigned int,
const struct rte_eth_txconf *);
int mlx5_tx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
const struct rte_eth_txconf *);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 75da65b..4683775 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -68,7 +68,7 @@
/**
* Allocate TX queue elements.
*
- * @param txq
+ * @param txq_ctrl
* Pointer to TX queue structure.
* @param elts_n
* Number of elements to allocate.
@@ -77,15 +77,15 @@
* 0 on success, errno value on failure.
*/
static int
-txq_alloc_elts(struct txq *txq, unsigned int elts_n)
+txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
{
unsigned int i;
struct txq_elt (*elts)[elts_n] =
- rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq->socket);
+ rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq_ctrl->socket);
int ret = 0;

if (elts == NULL) {
- ERROR("%p: can't allocate packets array", (void *)txq);
+ ERROR("%p: can't allocate packets array", (void *)txq_ctrl);
ret = ENOMEM;
goto error;
}
@@ -94,24 +94,24 @@ txq_alloc_elts(struct txq *txq, unsigned int elts_n)

elt->buf = NULL;
}
- DEBUG("%p: allocated and configured %u WRs", (void *)txq, elts_n);
- txq->elts_n = elts_n;
- txq->elts = elts;
- txq->elts_head = 0;
- txq->elts_tail = 0;
- txq->elts_comp = 0;
+ DEBUG("%p: allocated and configured %u WRs", (void *)txq_ctrl, elts_n);
+ txq_ctrl->txq.elts_n = elts_n;
+ txq_ctrl->txq.elts = elts;
+ txq_ctrl->txq.elts_head = 0;
+ txq_ctrl->txq.elts_tail = 0;
+ txq_ctrl->txq.elts_comp = 0;
/* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
* at least 4 times per ring. */
- txq->elts_comp_cd_init =
+ txq_ctrl->txq.elts_comp_cd_init =
((MLX5_PMD_TX_PER_COMP_REQ < (elts_n / 4)) ?
MLX5_PMD_TX_PER_COMP_REQ : (elts_n / 4));
- txq->elts_comp_cd = txq->elts_comp_cd_init;
+ txq_ctrl->txq.elts_comp_cd = txq_ctrl->txq.elts_comp_cd_init;
assert(ret == 0);
return 0;
error:
rte_free(elts);

- DEBUG("%p: failed, freed everything", (void *)txq);
+ DEBUG("%p: failed, freed everything", (void *)txq_ctrl);
assert(ret > 0);
return ret;
}
@@ -119,25 +119,25 @@ error:
/**
* Free TX queue elements.
*
- * @param txq
+ * @param txq_ctrl
* Pointer to TX queue structure.
*/
static void
-txq_free_elts(struct txq *txq)
+txq_free_elts(struct txq_ctrl *txq_ctrl)
{
- unsigned int elts_n = txq->elts_n;
- unsigned int elts_head = txq->elts_head;
- unsigned int elts_tail = txq->elts_tail;
- struct txq_elt (*elts)[elts_n] = txq->elts;
+ unsigned int elts_n = txq_ctrl->txq.elts_n;
+ unsigned int elts_head = txq_ctrl->txq.elts_head;
+ unsigned int elts_tail = txq_ctrl->txq.elts_tail;
+ struct txq_elt (*elts)[elts_n] = txq_ctrl->txq.elts;

- DEBUG("%p: freeing WRs", (void *)txq);
- txq->elts_n = 0;
- txq->elts_head = 0;
- txq->elts_tail = 0;
- txq->elts_comp = 0;
- txq->elts_comp_cd = 0;
- txq->elts_comp_cd_init = 0;
- txq->elts = NULL;
+ DEBUG("%p: freeing WRs", (void *)txq_ctrl);
+ txq_ctrl->txq.elts_n = 0;
+ txq_ctrl->txq.elts_head = 0;
+ txq_ctrl->txq.elts_tail = 0;
+ txq_ctrl->txq.elts_comp = 0;
+ txq_ctrl->txq.elts_comp_cd = 0;
+ txq_ctrl->txq.elts_comp_cd_init = 0;
+ txq_ctrl->txq.elts = NULL;

if (elts == NULL)
return;
@@ -161,63 +161,63 @@ txq_free_elts(struct txq *txq)
*
* Destroy objects, free allocated memory and reset the structure for reuse.
*
- * @param txq
+ * @param txq_ctrl
* Pointer to TX queue structure.
*/
void
-txq_cleanup(struct txq *txq)
+txq_cleanup(struct txq_ctrl *txq_ctrl)
{
struct ibv_exp_release_intf_params params;
size_t i;

- DEBUG("cleaning up %p", (void *)txq);
- txq_free_elts(txq);
- txq->poll_cnt = NULL;
- txq->send_flush = NULL;
- if (txq->if_qp != NULL) {
- assert(txq->priv != NULL);
- assert(txq->priv->ctx != NULL);
- assert(txq->qp != NULL);
+ DEBUG("cleaning up %p", (void *)txq_ctrl);
+ txq_free_elts(txq_ctrl);
+ txq_ctrl->txq.poll_cnt = NULL;
+ txq_ctrl->txq.send_flush = NULL;
+ if (txq_ctrl->if_qp != NULL) {
+ assert(txq_ctrl->txq.priv != NULL);
+ assert(txq_ctrl->txq.priv->ctx != NULL);
+ assert(txq_ctrl->txq.qp != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(txq->priv->ctx,
- txq->if_qp,
+ claim_zero(ibv_exp_release_intf(txq_ctrl->txq.priv->ctx,
+ txq_ctrl->if_qp,
&params));
}
- if (txq->if_cq != NULL) {
- assert(txq->priv != NULL);
- assert(txq->priv->ctx != NULL);
- assert(txq->cq != NULL);
+ if (txq_ctrl->if_cq != NULL) {
+ assert(txq_ctrl->txq.priv != NULL);
+ assert(txq_ctrl->txq.priv->ctx != NULL);
+ assert(txq_ctrl->txq.cq != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(txq->priv->ctx,
- txq->if_cq,
+ claim_zero(ibv_exp_release_intf(txq_ctrl->txq.priv->ctx,
+ txq_ctrl->if_cq,
&params));
}
- if (txq->qp != NULL)
- claim_zero(ibv_destroy_qp(txq->qp));
- if (txq->cq != NULL)
- claim_zero(ibv_destroy_cq(txq->cq));
- if (txq->rd != NULL) {
+ if (txq_ctrl->txq.qp != NULL)
+ claim_zero(ibv_destroy_qp(txq_ctrl->txq.qp));
+ if (txq_ctrl->txq.cq != NULL)
+ claim_zero(ibv_destroy_cq(txq_ctrl->txq.cq));
+ if (txq_ctrl->rd != NULL) {
struct ibv_exp_destroy_res_domain_attr attr = {
.comp_mask = 0,
};

- assert(txq->priv != NULL);
- assert(txq->priv->ctx != NULL);
- claim_zero(ibv_exp_destroy_res_domain(txq->priv->ctx,
- txq->rd,
+ assert(txq_ctrl->txq.priv != NULL);
+ assert(txq_ctrl->txq.priv->ctx != NULL);
+ claim_zero(ibv_exp_destroy_res_domain(txq_ctrl->txq.priv->ctx,
+ txq_ctrl->rd,
&attr));
}
- for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
- if (txq->mp2mr[i].mp == NULL)
+ for (i = 0; (i != RTE_DIM(txq_ctrl->txq.mp2mr)); ++i) {
+ if (txq_ctrl->txq.mp2mr[i].mp == NULL)
break;
- assert(txq->mp2mr[i].mr != NULL);
- claim_zero(ibv_dereg_mr(txq->mp2mr[i].mr));
+ assert(txq_ctrl->txq.mp2mr[i].mr != NULL);
+ claim_zero(ibv_dereg_mr(txq_ctrl->txq.mp2mr[i].mr));
}
- memset(txq, 0, sizeof(*txq));
+ memset(txq_ctrl, 0, sizeof(*txq_ctrl));
}

/**
@@ -225,7 +225,7 @@ txq_cleanup(struct txq *txq)
*
* @param dev
* Pointer to Ethernet device structure.
- * @param txq
+ * @param txq_ctrl
* Pointer to TX queue structure.
* @param desc
* Number of descriptors to configure in queue.
@@ -238,13 +238,15 @@ txq_cleanup(struct txq *txq)
* 0 on success, errno value on failure.
*/
int
-txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
+txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
unsigned int socket, const struct rte_eth_txconf *conf)
{
struct priv *priv = mlx5_get_priv(dev);
- struct txq tmpl = {
- .priv = priv,
- .socket = socket
+ struct txq_ctrl tmpl = {
+ .socket = socket,
+ .txq = {
+ .priv = priv,
+ },
};
union {
struct ibv_exp_query_intf_params params;
@@ -279,8 +281,8 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
.res_domain = tmpl.rd,
};
- tmpl.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq);
- if (tmpl.cq == NULL) {
+ tmpl.txq.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq);
+ if (tmpl.txq.cq == NULL) {
ret = ENOMEM;
ERROR("%p: CQ creation failure: %s",
(void *)dev, strerror(ret));
@@ -292,9 +294,9 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
priv->device_attr.max_sge);
attr.init = (struct ibv_exp_qp_init_attr){
/* CQ to be associated with the send queue. */
- .send_cq = tmpl.cq,
+ .send_cq = tmpl.txq.cq,
/* CQ to be associated with the receive queue. */
- .recv_cq = tmpl.cq,
+ .recv_cq = tmpl.txq.cq,
.cap = {
/* Max number of outstanding WRs. */
.max_send_wr = ((priv->device_attr.max_qp_wr < desc) ?
@@ -312,8 +314,8 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
IBV_EXP_QP_INIT_ATTR_RES_DOMAIN),
};
- tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
- if (tmpl.qp == NULL) {
+ tmpl.txq.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
+ if (tmpl.txq.qp == NULL) {
ret = (errno ? errno : EINVAL);
ERROR("%p: QP creation failure: %s",
(void *)dev, strerror(ret));
@@ -325,7 +327,7 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
/* Primary port number. */
.port_num = priv->port
};
- ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod,
+ ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod,
(IBV_EXP_QP_STATE | IBV_EXP_QP_PORT));
if (ret) {
ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
@@ -341,14 +343,14 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
attr.mod = (struct ibv_exp_qp_attr){
.qp_state = IBV_QPS_RTR
};
- ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
+ ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod, IBV_EXP_QP_STATE);
if (ret) {
ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
(void *)dev, strerror(ret));
goto error;
}
attr.mod.qp_state = IBV_QPS_RTS;
- ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
+ ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod, IBV_EXP_QP_STATE);
if (ret) {
ERROR("%p: QP state to IBV_QPS_RTS failed: %s",
(void *)dev, strerror(ret));
@@ -357,7 +359,7 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
.intf = IBV_EXP_INTF_CQ,
- .obj = tmpl.cq,
+ .obj = tmpl.txq.cq,
};
tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
if (tmpl.if_cq == NULL) {
@@ -369,7 +371,7 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
.intf = IBV_EXP_INTF_QP_BURST,
- .obj = tmpl.qp,
+ .obj = tmpl.txq.qp,
#ifdef HAVE_VERBS_VLAN_INSERTION
.intf_version = 1,
#endif
@@ -389,18 +391,18 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
goto error;
}
/* Clean up txq in case we're reinitializing it. */
- DEBUG("%p: cleaning-up old txq just in case", (void *)txq);
- txq_cleanup(txq);
- *txq = tmpl;
- txq->poll_cnt = txq->if_cq->poll_cnt;
- txq->send_pending = txq->if_qp->send_pending;
+ DEBUG("%p: cleaning-up old txq just in case", (void *)txq_ctrl);
+ txq_cleanup(txq_ctrl);
+ *txq_ctrl = tmpl;
+ txq_ctrl->txq.poll_cnt = txq_ctrl->if_cq->poll_cnt;
+ txq_ctrl->txq.send_pending = txq_ctrl->if_qp->send_pending;
#ifdef HAVE_VERBS_VLAN_INSERTION
- txq->send_pending_vlan = txq->if_qp->send_pending_vlan;
+ txq_ctrl->txq.send_pending_vlan = txq_ctrl->if_qp->send_pending_vlan;
#endif
- txq->send_flush = txq->if_qp->send_flush;
- DEBUG("%p: txq updated with %p", (void *)txq, (void *)&tmpl);
+ txq_ctrl->txq.send_flush = txq_ctrl->if_qp->send_flush;
+ DEBUG("%p: txq updated with %p", (void *)txq_ctrl, (void *)&tmpl);
/* Pre-register known mempools. */
- rte_mempool_walk(txq_mp2mr_iter, txq);
+ rte_mempool_walk(txq_mp2mr_iter, txq_ctrl);
assert(ret == 0);
return 0;
error:
@@ -432,12 +434,15 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
{
struct priv *priv = dev->data->dev_private;
struct txq *txq = (*priv->txqs)[idx];
+ struct txq_ctrl *txq_ctrl;
int ret;

if (mlx5_is_secondary())
return -E_RTE_SECONDARY;

priv_lock(priv);
+ if (txq)
+ txq_ctrl = container_of(txq, struct txq_ctrl, txq);
DEBUG("%p: configuring queue %u for %u descriptors",
(void *)dev, idx, desc);
if (idx >= priv->txqs_n) {
@@ -454,24 +459,25 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
return -EEXIST;
}
(*priv->txqs)[idx] = NULL;
- txq_cleanup(txq);
+ txq_cleanup(txq_ctrl);
} else {
- txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0, socket);
- if (txq == NULL) {
+ txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl),
+ 0, socket);
+ if (txq_ctrl == NULL) {
ERROR("%p: unable to allocate queue index %u",
(void *)dev, idx);
priv_unlock(priv);
return -ENOMEM;
}
}
- ret = txq_setup(dev, txq, desc, socket, conf);
+ ret = txq_setup(dev, txq_ctrl, desc, socket, conf);
if (ret)
- rte_free(txq);
+ rte_free(txq_ctrl);
else {
- txq->stats.idx = idx;
+ txq_ctrl->txq.stats.idx = idx;
DEBUG("%p: adding TX queue %p to list",
- (void *)dev, (void *)txq);
- (*priv->txqs)[idx] = txq;
+ (void *)dev, (void *)txq_ctrl);
+ (*priv->txqs)[idx] = &txq_ctrl->txq;
/* Update send callback. */
dev->tx_pkt_burst = mlx5_tx_burst;
}
@@ -489,6 +495,7 @@ void
mlx5_tx_queue_release(void *dpdk_txq)
{
struct txq *txq = (struct txq *)dpdk_txq;
+ struct txq_ctrl *txq_ctrl;
struct priv *priv;
unsigned int i;

@@ -497,17 +504,18 @@ mlx5_tx_queue_release(void *dpdk_txq)

if (txq == NULL)
return;
+ txq_ctrl = container_of(txq, struct txq_ctrl, txq);
priv = txq->priv;
priv_lock(priv);
for (i = 0; (i != priv->txqs_n); ++i)
if ((*priv->txqs)[i] == txq) {
DEBUG("%p: removing TX queue %p from list",
- (void *)priv->dev, (void *)txq);
+ (void *)priv->dev, (void *)txq_ctrl);
(*priv->txqs)[i] = NULL;
break;
}
- txq_cleanup(txq);
- rte_free(txq);
+ txq_cleanup(txq_ctrl);
+ rte_free(txq_ctrl);
priv_unlock(priv);
}
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:24 UTC
These wrappers are meant to prevent code duplication later.
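
As a standalone illustration of the pattern (the names below are made up
for this sketch and are not the driver's), selecting the burst function
through a single wrapper keeps every later variant in one place instead
of spreading checks across call sites:

#include <stdint.h>
#include <stdio.h>

struct pkt;
typedef uint16_t (*burst_fn)(void *queue, struct pkt **pkts, uint16_t n);

static uint16_t
tx_burst_basic(void *q, struct pkt **p, uint16_t n)
{
        (void)q;
        (void)p;
        return n;
}

static uint16_t
tx_burst_multi_packet(void *q, struct pkt **p, uint16_t n)
{
        (void)q;
        (void)p;
        return n;
}

struct dev {
        burst_fn tx_pkt_burst;
        int mps; /* Multi-packet send supported. */
};

/* The only place where the data path function is chosen. */
static void
select_tx_function(struct dev *dev)
{
        dev->tx_pkt_burst = dev->mps ? tx_burst_multi_packet : tx_burst_basic;
}

int
main(void)
{
        struct dev dev = { .tx_pkt_burst = NULL, .mps = 1 };

        select_tx_function(&dev);
        printf("sent %u packets\n",
               (unsigned int)dev.tx_pkt_burst(NULL, NULL, 4));
        return 0;
}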

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5.h | 2 ++
drivers/net/mlx5/mlx5_ethdev.c | 34 ++++++++++++++++++++++++++++------
drivers/net/mlx5/mlx5_txq.c | 2 +-
3 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 935e1b0..3dca03d 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -196,6 +196,8 @@ void priv_dev_interrupt_handler_install(struct priv *, struct rte_eth_dev *);
int mlx5_set_link_down(struct rte_eth_dev *dev);
int mlx5_set_link_up(struct rte_eth_dev *dev);
struct priv *mlx5_secondary_data_setup(struct priv *priv);
+void priv_select_tx_function(struct priv *);
+void priv_select_rx_function(struct priv *);

/* mlx5_mac.c */

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 3992b2c..771d8b5 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1099,8 +1099,8 @@ priv_set_link(struct priv *priv, int up)
err = priv_set_flags(priv, ~IFF_UP, IFF_UP);
if (err)
return err;
- dev->rx_pkt_burst = mlx5_rx_burst;
- dev->tx_pkt_burst = mlx5_tx_burst;
+ priv_select_tx_function(priv);
+ priv_select_rx_function(priv);
} else {
err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP);
if (err)
@@ -1289,13 +1289,11 @@ mlx5_secondary_data_setup(struct priv *priv)
rte_mb();
priv->dev->data = &sd->data;
rte_mb();
- priv->dev->tx_pkt_burst = mlx5_tx_burst;
- priv->dev->rx_pkt_burst = removed_rx_burst;
+ priv_select_tx_function(priv);
+ priv_select_rx_function(priv);
priv_unlock(priv);
end:
/* More sanity checks. */
- assert(priv->dev->tx_pkt_burst == mlx5_tx_burst);
- assert(priv->dev->rx_pkt_burst == removed_rx_burst);
assert(priv->dev->data == &sd->data);
rte_spinlock_unlock(&sd->lock);
return priv;
@@ -1306,3 +1304,27 @@ error:
rte_spinlock_unlock(&sd->lock);
return NULL;
}
+
+/**
+ * Configure the TX function to use.
+ *
+ * @param priv
+ * Pointer to private structure.
+ */
+void
+priv_select_tx_function(struct priv *priv)
+{
+ priv->dev->tx_pkt_burst = mlx5_tx_burst;
+}
+
+/**
+ * Configure the RX function to use.
+ *
+ * @param priv
+ * Pointer to private structure.
+ */
+void
+priv_select_rx_function(struct priv *priv)
+{
+ priv->dev->rx_pkt_burst = mlx5_rx_burst;
+}
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 9f3a33b..d7cc39d 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -477,7 +477,7 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
(void *)dev, (void *)txq_ctrl);
(*priv->txqs)[idx] = &txq_ctrl->txq;
/* Update send callback. */
- dev->tx_pkt_burst = mlx5_tx_burst;
+ priv_select_tx_function(priv);
}
priv_unlock(priv);
return -ret;
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:23 UTC
The intent is to replace the remaining compile-time options and environment
variables with a common means of runtime configuration. This commit only
adds the kvargs handling code; subsequent commits will update the rest.
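
For context, a hedged sketch of what a concrete parameter would look like
once one exists; the "txq_inline" key and helper names below are
hypothetical and only illustrate the rte_kvargs calls this commit wires in:

#include <stdlib.h>
#include <errno.h>

#include <rte_kvargs.h>

/* Store the integer value of a recognized key into *opaque. */
static int
args_handler(const char *key, const char *val, void *opaque)
{
        unsigned long *out = opaque;

        (void)key;
        errno = 0;
        *out = strtoul(val, NULL, 0);
        if (errno)
                return errno;
        return 0;
}

/* Parse "txq_inline=<n>" from a devargs string, e.g. the part after the
 * comma in -w <PCI address>,txq_inline=128. */
static int
parse_devargs(const char *args, unsigned long *txq_inline)
{
        static const char *const keys[] = { "txq_inline", NULL };
        struct rte_kvargs *kvlist = rte_kvargs_parse(args, keys);
        int ret = 0;

        if (kvlist == NULL)
                return EINVAL; /* Unknown key or malformed string. */
        if (rte_kvargs_count(kvlist, "txq_inline"))
                ret = rte_kvargs_process(kvlist, "txq_inline",
                                         args_handler, txq_inline);
        rte_kvargs_free(kvlist);
        return ret;
}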

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 72 insertions(+)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 3f45d84..56b1dfc 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -37,6 +37,7 @@
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
+#include <errno.h>
#include <net/if.h>

/* Verbs header. */
@@ -57,6 +58,7 @@
#include <rte_ethdev.h>
#include <rte_pci.h>
#include <rte_common.h>
+#include <rte_kvargs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif
@@ -237,6 +239,70 @@ mlx5_dev_idx(struct rte_pci_addr *pci_addr)
return ret;
}

+/**
+ * Verify and store value for device argument.
+ *
+ * @param[in] key
+ * Key argument to verify.
+ * @param[in] val
+ * Value associated with key.
+ * @param opaque
+ * User data.
+ *
+ * @return
+ * 0 on success, errno value on failure.
+ */
+static int
+mlx5_args_check(const char *key, const char *val, void *opaque)
+{
+ struct priv *priv = opaque;
+
+ /* No parameters are expected at the moment. */
+ (void)priv;
+ (void)val;
+ WARN("%s: unknown parameter", key);
+ return EINVAL;
+}
+
+/**
+ * Parse device parameters.
+ *
+ * @param priv
+ * Pointer to private structure.
+ * @param devargs
+ * Device arguments structure.
+ *
+ * @return
+ * 0 on success, errno value on failure.
+ */
+static int
+mlx5_args(struct priv *priv, struct rte_devargs *devargs)
+{
+ static const char *params[] = {
+ NULL,
+ };
+ struct rte_kvargs *kvlist;
+ int ret = 0;
+ int i;
+
+ if (devargs == NULL)
+ return 0;
+ kvlist = rte_kvargs_parse(devargs->args, params);
+ if (kvlist == NULL)
+ return 0;
+ /* Process parameters. */
+ for (i = 0; (i != RTE_DIM(params)); ++i) {
+ if (rte_kvargs_count(kvlist, params[i])) {
+ ret = rte_kvargs_process(kvlist, params[i],
+ mlx5_args_check, priv);
+ if (ret != 0)
+ return ret;
+ }
+ }
+ rte_kvargs_free(kvlist);
+ return 0;
+}
+
static struct eth_driver mlx5_driver;

/**
@@ -408,6 +474,12 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
priv->port = port;
priv->pd = pd;
priv->mtu = ETHER_MTU;
+ err = mlx5_args(priv, pci_dev->devargs);
+ if (err) {
+ ERROR("failed to process device arguments: %s",
+ strerror(err));
+ goto port_error;
+ }
if (ibv_exp_query_device(ctx, &exp_device_attr)) {
ERROR("ibv_exp_query_device() failed");
goto port_error;
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:26 UTC
Bypass Verbs to improve Tx performance.
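
At a high level the new data path follows the usual post/doorbell
sequence. The standalone sketch below (illustrative names, simplified
descriptor type, byte order via htonl() as in the diff) shows the
ordering the driver has to respect between the WQE, the doorbell record
and the doorbell register:

#include <stdint.h>
#include <arpa/inet.h>  /* htonl() */

#include <rte_atomic.h> /* rte_wmb() */

struct sq {
        volatile uint64_t *wqes;   /* Work queue ring, read by the device. */
        volatile uint32_t *db_rec; /* Doorbell record in host memory. */
        volatile uint32_t *db_reg; /* Doorbell register (mapped PCI BAR). */
        uint16_t wqe_ci;           /* Producer index. */
        uint16_t wqe_n;            /* Ring size, power of two. */
};

static void
sq_post(struct sq *sq, uint64_t desc)
{
        /* 1. Write the descriptor into the ring. */
        sq->wqes[sq->wqe_ci & (sq->wqe_n - 1)] = desc;
        ++sq->wqe_ci;
        /* 2. Make the WQE visible before the doorbell record update. */
        rte_wmb();
        *sq->db_rec = htonl(sq->wqe_ci);
        /* 3. Make the record visible before notifying the device. */
        rte_wmb();
        *sq->db_reg = htonl(sq->wqe_ci);
}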

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Yaacov Hazan <***@mellanox.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/Makefile | 5 -
drivers/net/mlx5/mlx5_ethdev.c | 10 +-
drivers/net/mlx5/mlx5_mr.c | 4 +-
drivers/net/mlx5/mlx5_rxtx.c | 359 ++++++++++++++++++++++-------------------
drivers/net/mlx5/mlx5_rxtx.h | 52 +++---
drivers/net/mlx5/mlx5_txq.c | 216 +++++++++++++------------
6 files changed, 343 insertions(+), 303 deletions(-)

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 5888779..283d8eb 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -105,11 +105,6 @@ mlx5_autoconf.h.new: FORCE
mlx5_autoconf.h.new: $(RTE_SDK)/scripts/auto-config-h.sh
$Q $(RM) -f -- '$@'
$Q sh -- '$<' '$@' \
- HAVE_VERBS_VLAN_INSERTION \
- infiniband/verbs.h \
- enum IBV_EXP_RECEIVE_WQ_CVLAN_INSERTION \
- $(AUTOCONF_OUTPUT)
- $Q sh -- '$<' '$@' \
HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE \
infiniband/verbs_exp.h \
enum IBV_EXP_CQ_COMPRESSED_CQE \
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 8628321..4e125a7 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1242,11 +1242,11 @@ mlx5_secondary_data_setup(struct priv *priv)
txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl), 0,
primary_txq_ctrl->socket);
if (txq_ctrl != NULL) {
- if (txq_setup(priv->dev,
- primary_txq_ctrl,
- primary_txq->elts_n,
- primary_txq_ctrl->socket,
- NULL) == 0) {
+ if (txq_ctrl_setup(priv->dev,
+ primary_txq_ctrl,
+ primary_txq->elts_n,
+ primary_txq_ctrl->socket,
+ NULL) == 0) {
txq_ctrl->txq.stats.idx = primary_txq->stats.idx;
tx_queues[i] = &txq_ctrl->txq;
continue;
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index 79d5568..e5e8a04 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -189,7 +189,7 @@ txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx)
/* Add a new entry, register MR first. */
DEBUG("%p: discovered new memory pool \"%s\" (%p)",
(void *)txq_ctrl, mp->name, (void *)mp);
- mr = mlx5_mp2mr(txq_ctrl->txq.priv->pd, mp);
+ mr = mlx5_mp2mr(txq_ctrl->priv->pd, mp);
if (unlikely(mr == NULL)) {
DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
(void *)txq_ctrl);
@@ -208,7 +208,7 @@ txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx)
/* Store the new entry. */
txq_ctrl->txq.mp2mr[idx].mp = mp;
txq_ctrl->txq.mp2mr[idx].mr = mr;
- txq_ctrl->txq.mp2mr[idx].lkey = mr->lkey;
+ txq_ctrl->txq.mp2mr[idx].lkey = htonl(mr->lkey);
DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
(void *)txq_ctrl, mp->name, (void *)mp,
txq_ctrl->txq.mp2mr[idx].lkey);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 27d8852..95bf981 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -119,68 +119,52 @@ get_cqe64(volatile struct mlx5_cqe cqes[],
*
* @param txq
* Pointer to TX queue structure.
- *
- * @return
- * 0 on success, -1 on failure.
*/
-static int
+static void
txq_complete(struct txq *txq)
{
- unsigned int elts_comp = txq->elts_comp;
- unsigned int elts_tail = txq->elts_tail;
- unsigned int elts_free = txq->elts_tail;
const unsigned int elts_n = txq->elts_n;
- int wcs_n;
-
- if (unlikely(elts_comp == 0))
- return 0;
-#ifdef DEBUG_SEND
- DEBUG("%p: processing %u work requests completions",
- (void *)txq, elts_comp);
-#endif
- wcs_n = txq->poll_cnt(txq->cq, elts_comp);
- if (unlikely(wcs_n == 0))
- return 0;
- if (unlikely(wcs_n < 0)) {
- DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
- (void *)txq, wcs_n);
- return -1;
+ const unsigned int cqe_n = txq->cqe_n;
+ uint16_t elts_free = txq->elts_tail;
+ uint16_t elts_tail;
+ uint16_t cq_ci = txq->cq_ci;
+ unsigned int wqe_ci = (unsigned int)-1;
+ int ret = 0;
+
+ while (ret == 0) {
+ volatile struct mlx5_cqe64 *cqe;
+
+ cqe = get_cqe64(*txq->cqes, cqe_n, &cq_ci);
+ if (cqe == NULL)
+ break;
+ wqe_ci = ntohs(cqe->wqe_counter);
}
- elts_comp -= wcs_n;
- assert(elts_comp <= txq->elts_comp);
- /*
- * Assume WC status is successful as nothing can be done about it
- * anyway.
- */
- elts_tail += wcs_n * txq->elts_comp_cd_init;
- if (elts_tail >= elts_n)
- elts_tail -= elts_n;
-
- while (elts_free != elts_tail) {
- struct txq_elt *elt = &(*txq->elts)[elts_free];
+ if (unlikely(wqe_ci == (unsigned int)-1))
+ return;
+ /* Free buffers. */
+ elts_tail = (wqe_ci + 1) & (elts_n - 1);
+ do {
+ struct rte_mbuf *elt = (*txq->elts)[elts_free];
unsigned int elts_free_next =
- (((elts_free + 1) == elts_n) ? 0 : elts_free + 1);
- struct rte_mbuf *tmp = elt->buf;
- struct txq_elt *elt_next = &(*txq->elts)[elts_free_next];
+ (elts_free + 1) & (elts_n - 1);
+ struct rte_mbuf *elt_next = (*txq->elts)[elts_free_next];

#ifndef NDEBUG
/* Poisoning. */
- memset(elt, 0x66, sizeof(*elt));
+ memset(&(*txq->elts)[elts_free],
+ 0x66,
+ sizeof((*txq->elts)[elts_free]));
#endif
- RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
- /* Faster than rte_pktmbuf_free(). */
- do {
- struct rte_mbuf *next = NEXT(tmp);
-
- rte_pktmbuf_free_seg(tmp);
- tmp = next;
- } while (tmp != NULL);
+ RTE_MBUF_PREFETCH_TO_FREE(elt_next);
+ /* Only one segment needs to be freed. */
+ rte_pktmbuf_free_seg(elt);
elts_free = elts_free_next;
- }
-
+ } while (elts_free != elts_tail);
+ txq->cq_ci = cq_ci;
txq->elts_tail = elts_tail;
- txq->elts_comp = elts_comp;
- return 0;
+ /* Update the consumer index. */
+ rte_wmb();
+ *txq->cq_db = htonl(cq_ci);
}

/**
@@ -231,7 +215,8 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
}
if (txq->mp2mr[i].mp == mp) {
assert(txq->mp2mr[i].lkey != (uint32_t)-1);
- assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
+ assert(htonl(txq->mp2mr[i].mr->lkey) ==
+ txq->mp2mr[i].lkey);
lkey = txq->mp2mr[i].lkey;
break;
}
@@ -242,33 +227,136 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
}

/**
- * Insert VLAN using mbuf headroom space.
- *
- * @param buf
- * Buffer for VLAN insertion.
+ * Write a regular WQE.
*
- * @return
- * 0 on success, errno value on failure.
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param wqe
+ * Pointer to the WQE to fill.
+ * @param addr
+ * Buffer data address.
+ * @param length
+ * Packet length.
+ * @param lkey
+ * Memory region lkey.
*/
-static inline int
-insert_vlan_sw(struct rte_mbuf *buf)
+static inline void
+mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
+ uintptr_t addr, uint32_t length, uint32_t lkey)
{
- uintptr_t addr;
- uint32_t vlan;
- uint16_t head_room_len = rte_pktmbuf_headroom(buf);
+ wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+ wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
+ wqe->wqe.ctrl.data[3] = 0;
+ wqe->inl.eseg.rsvd0 = 0;
+ wqe->inl.eseg.rsvd1 = 0;
+ wqe->inl.eseg.mss = 0;
+ wqe->inl.eseg.rsvd2 = 0;
+ wqe->wqe.eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE);
+ /* Copy the first 16 bytes into inline header. */
+ rte_memcpy((uint8_t *)(uintptr_t)wqe->wqe.eseg.inline_hdr_start,
+ (uint8_t *)(uintptr_t)addr,
+ MLX5_ETH_INLINE_HEADER_SIZE);
+ addr += MLX5_ETH_INLINE_HEADER_SIZE;
+ length -= MLX5_ETH_INLINE_HEADER_SIZE;
+ /* Store remaining data in data segment. */
+ wqe->wqe.dseg.byte_count = htonl(length);
+ wqe->wqe.dseg.lkey = lkey;
+ wqe->wqe.dseg.addr = htonll(addr);
+ /* Increment consumer index. */
+ ++txq->wqe_ci;
+}

- if (head_room_len < 4)
- return EINVAL;
+/**
+ * Write a regular WQE with VLAN.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param wqe
+ * Pointer to the WQE to fill.
+ * @param addr
+ * Buffer data address.
+ * @param length
+ * Packet length.
+ * @param lkey
+ * Memory region lkey.
+ * @param vlan_tci
+ * VLAN field to insert in packet.
+ */
+static inline void
+mlx5_wqe_write_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
+ uintptr_t addr, uint32_t length, uint32_t lkey,
+ uint16_t vlan_tci)
+{
+ uint32_t vlan = htonl(0x81000000 | vlan_tci);
+
+ wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+ wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
+ wqe->wqe.ctrl.data[3] = 0;
+ wqe->inl.eseg.rsvd0 = 0;
+ wqe->inl.eseg.rsvd1 = 0;
+ wqe->inl.eseg.mss = 0;
+ wqe->inl.eseg.rsvd2 = 0;
+ wqe->wqe.eseg.inline_hdr_sz = htons(MLX5_ETH_VLAN_INLINE_HEADER_SIZE);
+ /*
+ * Copy 12 bytes of source & destination MAC address.
+ * Copy 4 bytes of VLAN.
+ * Copy 2 bytes of Ether type.
+ */
+ rte_memcpy((uint8_t *)(uintptr_t)wqe->wqe.eseg.inline_hdr_start,
+ (uint8_t *)(uintptr_t)addr, 12);
+ rte_memcpy((uint8_t *)((uintptr_t)wqe->wqe.eseg.inline_hdr_start + 12),
+ &vlan, sizeof(vlan));
+ rte_memcpy((uint8_t *)((uintptr_t)wqe->wqe.eseg.inline_hdr_start + 16),
+ (uint8_t *)((uintptr_t)addr + 12), 2);
+ addr += MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
+ length -= MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
+ /* Store remaining data in data segment. */
+ wqe->wqe.dseg.byte_count = htonl(length);
+ wqe->wqe.dseg.lkey = lkey;
+ wqe->wqe.dseg.addr = htonll(addr);
+ /* Increment consumer index. */
+ ++txq->wqe_ci;
+}

- addr = rte_pktmbuf_mtod(buf, uintptr_t);
- vlan = htonl(0x81000000 | buf->vlan_tci);
- memmove((void *)(addr - 4), (void *)addr, 12);
- memcpy((void *)(addr + 8), &vlan, sizeof(vlan));
+/**
+ * Ring TX queue doorbell.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ */
+static inline void
+mlx5_tx_dbrec(struct txq *txq)
+{
+ uint8_t *dst = (uint8_t *)((uintptr_t)txq->bf_reg + txq->bf_offset);
+ uint32_t data[4] = {
+ htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND),
+ htonl(txq->qp_num_8s),
+ 0,
+ 0,
+ };
+ rte_wmb();
+ *txq->qp_db = htonl(txq->wqe_ci);
+ /* Ensure ordering between DB record and BF copy. */
+ rte_wmb();
+ rte_mov16(dst, (uint8_t *)data);
+ txq->bf_offset ^= txq->bf_buf_size;
+}

- SET_DATA_OFF(buf, head_room_len - 4);
- DATA_LEN(buf) += 4;
+/**
+ * Prefetch a CQE.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param cqe_ci
+ * CQE consumer index.
+ */
+static inline void
+tx_prefetch_cqe(struct txq *txq, uint16_t ci)
+{
+ volatile struct mlx5_cqe64 *cqe;

- return 0;
+ cqe = &(*txq->cqes)[ci & (txq->cqe_n - 1)].cqe64;
+ rte_prefetch0(cqe);
}

/**
@@ -288,18 +376,21 @@ uint16_t
mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct txq *txq = (struct txq *)dpdk_txq;
- unsigned int elts_head = txq->elts_head;
+ uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
- unsigned int elts_comp_cd = txq->elts_comp_cd;
- unsigned int elts_comp = 0;
unsigned int i;
unsigned int max;
- int err;
- struct rte_mbuf *buf = pkts[0];
+ volatile union mlx5_wqe *wqe;
+ struct rte_mbuf *buf;

- assert(elts_comp_cd != 0);
+ if (unlikely(!pkts_n))
+ return 0;
+ buf = pkts[0];
/* Prefetch first packet cacheline. */
+ tx_prefetch_cqe(txq, txq->cq_ci);
+ tx_prefetch_cqe(txq, txq->cq_ci + 1);
rte_prefetch0(buf);
+ /* Start processing. */
txq_complete(txq);
max = (elts_n - (elts_head - txq->elts_tail));
if (max > elts_n)
@@ -313,101 +404,51 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (max > pkts_n)
max = pkts_n;
for (i = 0; (i != max); ++i) {
- struct rte_mbuf *buf_next = pkts[i + 1];
- unsigned int elts_head_next =
- (((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
- struct txq_elt *elt = &(*txq->elts)[elts_head];
- uint32_t send_flags = 0;
-#ifdef HAVE_VERBS_VLAN_INSERTION
- int insert_vlan = 0;
-#endif /* HAVE_VERBS_VLAN_INSERTION */
+ unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
uintptr_t addr;
uint32_t length;
uint32_t lkey;
- uintptr_t buf_next_addr;

+ wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
+ rte_prefetch0(wqe);
if (i + 1 < max)
- rte_prefetch0(buf_next);
- /* Request TX completion. */
- if (unlikely(--elts_comp_cd == 0)) {
- elts_comp_cd = txq->elts_comp_cd_init;
- ++elts_comp;
- send_flags |= IBV_EXP_QP_BURST_SIGNALED;
- }
- /* Should we enable HW CKSUM offload */
- if (buf->ol_flags &
- (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
- send_flags |= IBV_EXP_QP_BURST_IP_CSUM;
- /* HW does not support checksum offloads at arbitrary
- * offsets but automatically recognizes the packet
- * type. For inner L3/L4 checksums, only VXLAN (UDP)
- * tunnels are currently supported. */
- if (RTE_ETH_IS_TUNNEL_PKT(buf->packet_type))
- send_flags |= IBV_EXP_QP_BURST_TUNNEL;
- }
- if (buf->ol_flags & PKT_TX_VLAN_PKT) {
-#ifdef HAVE_VERBS_VLAN_INSERTION
- if (!txq->priv->mps)
- insert_vlan = 1;
- else
-#endif /* HAVE_VERBS_VLAN_INSERTION */
- {
- err = insert_vlan_sw(buf);
- if (unlikely(err))
- goto stop;
- }
- }
+ rte_prefetch0(pkts[i + 1]);
/* Retrieve buffer information. */
addr = rte_pktmbuf_mtod(buf, uintptr_t);
length = DATA_LEN(buf);
/* Update element. */
- elt->buf = buf;
- if (txq->priv->sriov)
- rte_prefetch0((volatile void *)
- (uintptr_t)addr);
+ (*txq->elts)[elts_head] = buf;
/* Prefetch next buffer data. */
- if (i + 1 < max) {
- buf_next_addr =
- rte_pktmbuf_mtod(buf_next, uintptr_t);
- rte_prefetch0((volatile void *)
- (uintptr_t)buf_next_addr);
- }
+ if (i + 1 < max)
+ rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 1],
+ volatile void *));
/* Retrieve Memory Region key for this memory pool. */
lkey = txq_mp2mr(txq, txq_mb2mp(buf));
- if (unlikely(lkey == (uint32_t)-1)) {
- /* MR does not exist. */
- DEBUG("%p: unable to get MP <-> MR"
- " association", (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
-#ifdef HAVE_VERBS_VLAN_INSERTION
- if (insert_vlan)
- err = txq->send_pending_vlan
- (txq->qp,
- addr,
- length,
- lkey,
- send_flags,
- &buf->vlan_tci);
+ if (buf->ol_flags & PKT_TX_VLAN_PKT)
+ mlx5_wqe_write_vlan(txq, wqe, addr, length, lkey,
+ buf->vlan_tci);
else
-#endif /* HAVE_VERBS_VLAN_INSERTION */
- err = txq->send_pending
- (txq->qp,
- addr,
- length,
- lkey,
- send_flags);
- if (unlikely(err))
- goto stop;
+ mlx5_wqe_write(txq, wqe, addr, length, lkey);
+ /* Request completion if needed. */
+ if (unlikely(--txq->elts_comp == 0)) {
+ wqe->wqe.ctrl.data[2] = htonl(8);
+ txq->elts_comp = txq->elts_comp_cd_init;
+ } else
+ wqe->wqe.ctrl.data[2] = 0;
+ /* Should we enable HW CKSUM offload */
+ if (buf->ol_flags &
+ (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
+ wqe->wqe.eseg.cs_flags =
+ MLX5_ETH_WQE_L3_CSUM |
+ MLX5_ETH_WQE_L4_CSUM;
+ } else
+ wqe->wqe.eseg.cs_flags = 0;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent bytes counter. */
txq->stats.obytes += length;
#endif
-stop:
elts_head = elts_head_next;
- buf = buf_next;
+ buf = pkts[i + 1];
}
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
@@ -417,16 +458,8 @@ stop:
txq->stats.opackets += i;
#endif
/* Ring QP doorbell. */
- err = txq->send_flush(txq->qp);
- if (unlikely(err)) {
- /* A nonzero value is not supposed to be returned.
- * Nothing can be done about it. */
- DEBUG("%p: send_flush() failed with error %d",
- (void *)txq, err);
- }
+ mlx5_tx_dbrec(txq);
txq->elts_head = elts_head;
- txq->elts_comp += elts_comp;
- txq->elts_comp_cd = elts_comp_cd;
return i;
}

diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 1827123..6b3bb2d 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -223,44 +223,40 @@ struct hash_rxq {
[MLX5_MAX_SPECIAL_FLOWS][MLX5_MAX_VLAN_IDS];
};

-/* TX element. */
-struct txq_elt {
- struct rte_mbuf *buf;
-};
-
/* TX queue descriptor. */
struct txq {
- struct priv *priv; /* Back pointer to private data. */
- int32_t (*poll_cnt)(struct ibv_cq *cq, uint32_t max);
- int (*send_pending)();
-#ifdef HAVE_VERBS_VLAN_INSERTION
- int (*send_pending_vlan)();
-#endif
- int (*send_flush)(struct ibv_qp *qp);
- struct ibv_cq *cq; /* Completion Queue. */
- struct ibv_qp *qp; /* Queue Pair. */
- struct txq_elt (*elts)[]; /* TX elements. */
- unsigned int elts_n; /* (*elts)[] length. */
- unsigned int elts_head; /* Current index in (*elts)[]. */
- unsigned int elts_tail; /* First element awaiting completion. */
- unsigned int elts_comp; /* Number of completion requests. */
- unsigned int elts_comp_cd; /* Countdown for next completion request. */
- unsigned int elts_comp_cd_init; /* Initial value for countdown. */
+ uint16_t elts_head; /* Current index in (*elts)[]. */
+ uint16_t elts_tail; /* First element awaiting completion. */
+ uint16_t elts_comp_cd_init; /* Initial value for countdown. */
+ uint16_t elts_comp; /* Elements before asking a completion. */
+ uint16_t elts_n; /* (*elts)[] length. */
+ uint16_t cq_ci; /* Consumer index for completion queue. */
+ uint16_t cqe_n; /* Number of CQ elements. */
+ uint16_t wqe_ci; /* Consumer index for work queue. */
+ uint16_t wqe_n; /* Number of WQ elements. */
+ uint16_t bf_offset; /* Blueflame offset. */
+ uint16_t bf_buf_size; /* Blueflame size. */
+ volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
+ volatile union mlx5_wqe (*wqes)[]; /* Work queue. */
+ volatile uint32_t *qp_db; /* Work queue doorbell. */
+ volatile uint32_t *cq_db; /* Completion queue doorbell. */
+ volatile void *bf_reg; /* Blueflame register. */
struct {
const struct rte_mempool *mp; /* Cached Memory Pool. */
struct ibv_mr *mr; /* Memory Region (for mp). */
- uint32_t lkey; /* mr->lkey */
+ uint32_t lkey; /* htonl(mr->lkey) */
} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
+ struct rte_mbuf *(*elts)[]; /* TX elements. */
struct mlx5_txq_stats stats; /* TX queue counters. */
+ uint32_t qp_num_8s; /* QP number shifted by 8. */
} __rte_cache_aligned;

/* TX queue control descriptor. */
struct txq_ctrl {
-#ifdef HAVE_VERBS_VLAN_INSERTION
- struct ibv_exp_qp_burst_family_v1 *if_qp; /* QP burst interface. */
-#else
+ struct priv *priv; /* Back pointer to private data. */
+ struct ibv_cq *cq; /* Completion Queue. */
+ struct ibv_qp *qp; /* Queue Pair. */
struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
-#endif
struct ibv_exp_cq_family *if_cq; /* CQ interface. */
struct ibv_exp_res_domain *rd; /* Resource Domain. */
unsigned int socket; /* CPU socket ID for allocations. */
@@ -294,8 +290,8 @@ uint16_t mlx5_rx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
/* mlx5_txq.c */

void txq_cleanup(struct txq_ctrl *);
-int txq_setup(struct rte_eth_dev *, struct txq_ctrl *, uint16_t, unsigned int,
- const struct rte_eth_txconf *);
+int txq_ctrl_setup(struct rte_eth_dev *, struct txq_ctrl *, uint16_t,
+ unsigned int, const struct rte_eth_txconf *);
int mlx5_tx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
const struct rte_eth_txconf *);
void mlx5_tx_queue_release(void *);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index d7cc39d..dbf9c04 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -60,6 +60,7 @@
#endif

#include "mlx5_utils.h"
+#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
@@ -72,48 +73,22 @@
* Pointer to TX queue structure.
* @param elts_n
* Number of elements to allocate.
- *
- * @return
- * 0 on success, errno value on failure.
*/
-static int
+static void
txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
{
unsigned int i;
- struct txq_elt (*elts)[elts_n] =
- rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq_ctrl->socket);
- int ret = 0;

- if (elts == NULL) {
- ERROR("%p: can't allocate packets array", (void *)txq_ctrl);
- ret = ENOMEM;
- goto error;
- }
- for (i = 0; (i != elts_n); ++i) {
- struct txq_elt *elt = &(*elts)[i];
+ for (i = 0; (i != elts_n); ++i)
+ (*txq_ctrl->txq.elts)[i] = NULL;
+ for (i = 0; (i != txq_ctrl->txq.wqe_n); ++i) {
+ volatile union mlx5_wqe *wqe = &(*txq_ctrl->txq.wqes)[i];

- elt->buf = NULL;
+ memset((void *)(uintptr_t)wqe, 0x0, sizeof(*wqe));
}
DEBUG("%p: allocated and configured %u WRs", (void *)txq_ctrl, elts_n);
- txq_ctrl->txq.elts_n = elts_n;
- txq_ctrl->txq.elts = elts;
txq_ctrl->txq.elts_head = 0;
txq_ctrl->txq.elts_tail = 0;
- txq_ctrl->txq.elts_comp = 0;
- /* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
- * at least 4 times per ring. */
- txq_ctrl->txq.elts_comp_cd_init =
- ((MLX5_PMD_TX_PER_COMP_REQ < (elts_n / 4)) ?
- MLX5_PMD_TX_PER_COMP_REQ : (elts_n / 4));
- txq_ctrl->txq.elts_comp_cd = txq_ctrl->txq.elts_comp_cd_init;
- assert(ret == 0);
- return 0;
-error:
- rte_free(elts);
-
- DEBUG("%p: failed, freed everything", (void *)txq_ctrl);
- assert(ret > 0);
- return ret;
}

/**
@@ -128,32 +103,26 @@ txq_free_elts(struct txq_ctrl *txq_ctrl)
unsigned int elts_n = txq_ctrl->txq.elts_n;
unsigned int elts_head = txq_ctrl->txq.elts_head;
unsigned int elts_tail = txq_ctrl->txq.elts_tail;
- struct txq_elt (*elts)[elts_n] = txq_ctrl->txq.elts;
+ struct rte_mbuf *(*elts)[elts_n] = txq_ctrl->txq.elts;

DEBUG("%p: freeing WRs", (void *)txq_ctrl);
- txq_ctrl->txq.elts_n = 0;
txq_ctrl->txq.elts_head = 0;
txq_ctrl->txq.elts_tail = 0;
- txq_ctrl->txq.elts_comp = 0;
- txq_ctrl->txq.elts_comp_cd = 0;
- txq_ctrl->txq.elts_comp_cd_init = 0;
- txq_ctrl->txq.elts = NULL;

- if (elts == NULL)
- return;
while (elts_tail != elts_head) {
- struct txq_elt *elt = &(*elts)[elts_tail];
+ struct rte_mbuf *elt = (*elts)[elts_tail];

- assert(elt->buf != NULL);
- rte_pktmbuf_free(elt->buf);
+ assert(elt != NULL);
+ rte_pktmbuf_free(elt);
#ifndef NDEBUG
/* Poisoning. */
- memset(elt, 0x77, sizeof(*elt));
+ memset(&(*elts)[elts_tail],
+ 0x77,
+ sizeof((*elts)[elts_tail]));
#endif
if (++elts_tail == elts_n)
elts_tail = 0;
}
- rte_free(elts);
}

/**
@@ -172,42 +141,40 @@ txq_cleanup(struct txq_ctrl *txq_ctrl)

DEBUG("cleaning up %p", (void *)txq_ctrl);
txq_free_elts(txq_ctrl);
- txq_ctrl->txq.poll_cnt = NULL;
- txq_ctrl->txq.send_flush = NULL;
if (txq_ctrl->if_qp != NULL) {
- assert(txq_ctrl->txq.priv != NULL);
- assert(txq_ctrl->txq.priv->ctx != NULL);
- assert(txq_ctrl->txq.qp != NULL);
+ assert(txq_ctrl->priv != NULL);
+ assert(txq_ctrl->priv->ctx != NULL);
+ assert(txq_ctrl->qp != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(txq_ctrl->txq.priv->ctx,
+ claim_zero(ibv_exp_release_intf(txq_ctrl->priv->ctx,
txq_ctrl->if_qp,
&params));
}
if (txq_ctrl->if_cq != NULL) {
- assert(txq_ctrl->txq.priv != NULL);
- assert(txq_ctrl->txq.priv->ctx != NULL);
- assert(txq_ctrl->txq.cq != NULL);
+ assert(txq_ctrl->priv != NULL);
+ assert(txq_ctrl->priv->ctx != NULL);
+ assert(txq_ctrl->cq != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(txq_ctrl->txq.priv->ctx,
+ claim_zero(ibv_exp_release_intf(txq_ctrl->priv->ctx,
txq_ctrl->if_cq,
&params));
}
- if (txq_ctrl->txq.qp != NULL)
- claim_zero(ibv_destroy_qp(txq_ctrl->txq.qp));
- if (txq_ctrl->txq.cq != NULL)
- claim_zero(ibv_destroy_cq(txq_ctrl->txq.cq));
+ if (txq_ctrl->qp != NULL)
+ claim_zero(ibv_destroy_qp(txq_ctrl->qp));
+ if (txq_ctrl->cq != NULL)
+ claim_zero(ibv_destroy_cq(txq_ctrl->cq));
if (txq_ctrl->rd != NULL) {
struct ibv_exp_destroy_res_domain_attr attr = {
.comp_mask = 0,
};

- assert(txq_ctrl->txq.priv != NULL);
- assert(txq_ctrl->txq.priv->ctx != NULL);
- claim_zero(ibv_exp_destroy_res_domain(txq_ctrl->txq.priv->ctx,
+ assert(txq_ctrl->priv != NULL);
+ assert(txq_ctrl->priv->ctx != NULL);
+ claim_zero(ibv_exp_destroy_res_domain(txq_ctrl->priv->ctx,
txq_ctrl->rd,
&attr));
}
@@ -221,6 +188,49 @@ txq_cleanup(struct txq_ctrl *txq_ctrl)
}

/**
+ * Initialize TX queue.
+ *
+ * @param tmpl
+ * Pointer to TX queue control template.
+ * @param txq_ctrl
+ * Pointer to TX queue control.
+ *
+ * @return
+ * 0 on success, errno value on failure.
+ */
+static inline int
+txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
+{
+ struct mlx5_qp *qp = to_mqp(tmpl->qp);
+ struct ibv_cq *ibcq = tmpl->cq;
+ struct mlx5_cq *cq = to_mxxx(cq, cq);
+
+ if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) {
+ ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
+ "it should be set to %u", RTE_CACHE_LINE_SIZE);
+ return EINVAL;
+ }
+ tmpl->txq.cqe_n = ibcq->cqe + 1;
+ tmpl->txq.qp_num_8s = qp->ctrl_seg.qp_num << 8;
+ tmpl->txq.wqes =
+ (volatile union mlx5_wqe (*)[])
+ (uintptr_t)qp->gen_data.sqstart;
+ tmpl->txq.wqe_n = qp->sq.wqe_cnt;
+ tmpl->txq.qp_db = &qp->gen_data.db[MLX5_SND_DBR];
+ tmpl->txq.bf_reg = qp->gen_data.bf->reg;
+ tmpl->txq.bf_offset = qp->gen_data.bf->offset;
+ tmpl->txq.bf_buf_size = qp->gen_data.bf->buf_size;
+ tmpl->txq.cq_db = cq->dbrec;
+ tmpl->txq.cqes =
+ (volatile struct mlx5_cqe (*)[])
+ (uintptr_t)cq->active_buf->buf;
+ tmpl->txq.elts =
+ (struct rte_mbuf *(*)[tmpl->txq.elts_n])
+ ((uintptr_t)txq_ctrl + sizeof(*txq_ctrl));
+ return 0;
+}
+
+/**
* Configure a TX queue.
*
* @param dev
@@ -238,15 +248,14 @@ txq_cleanup(struct txq_ctrl *txq_ctrl)
* 0 on success, errno value on failure.
*/
int
-txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
- unsigned int socket, const struct rte_eth_txconf *conf)
+txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
+ uint16_t desc, unsigned int socket,
+ const struct rte_eth_txconf *conf)
{
struct priv *priv = mlx5_get_priv(dev);
struct txq_ctrl tmpl = {
+ .priv = priv,
.socket = socket,
- .txq = {
- .priv = priv,
- },
};
union {
struct ibv_exp_query_intf_params params;
@@ -254,15 +263,19 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
struct ibv_exp_res_domain_init_attr rd;
struct ibv_exp_cq_init_attr cq;
struct ibv_exp_qp_attr mod;
+ struct ibv_exp_cq_attr cq_attr;
} attr;
enum ibv_exp_query_intf_status status;
int ret = 0;

(void)conf; /* Thresholds configuration (ignored). */
- if (desc == 0) {
- ERROR("%p: invalid number of TX descriptors", (void *)dev);
- return EINVAL;
- }
+ tmpl.txq.elts_n = desc;
+ /* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
+ * at least 4 times per ring. */
+ tmpl.txq.elts_comp_cd_init =
+ ((MLX5_PMD_TX_PER_COMP_REQ < (desc / 4)) ?
+ MLX5_PMD_TX_PER_COMP_REQ : (desc / 4));
+ tmpl.txq.elts_comp = tmpl.txq.elts_comp_cd_init;
/* MRs will be registered in mp2mr[] later. */
attr.rd = (struct ibv_exp_res_domain_init_attr){
.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
@@ -281,8 +294,10 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
.res_domain = tmpl.rd,
};
- tmpl.txq.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq);
- if (tmpl.txq.cq == NULL) {
+ tmpl.cq = ibv_exp_create_cq(priv->ctx,
+ (desc / tmpl.txq.elts_comp_cd_init) - 1,
+ NULL, NULL, 0, &attr.cq);
+ if (tmpl.cq == NULL) {
ret = ENOMEM;
ERROR("%p: CQ creation failure: %s",
(void *)dev, strerror(ret));
@@ -294,9 +309,9 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
priv->device_attr.max_sge);
attr.init = (struct ibv_exp_qp_init_attr){
/* CQ to be associated with the send queue. */
- .send_cq = tmpl.txq.cq,
+ .send_cq = tmpl.cq,
/* CQ to be associated with the receive queue. */
- .recv_cq = tmpl.txq.cq,
+ .recv_cq = tmpl.cq,
.cap = {
/* Max number of outstanding WRs. */
.max_send_wr = ((priv->device_attr.max_qp_wr < desc) ?
@@ -314,8 +329,8 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
IBV_EXP_QP_INIT_ATTR_RES_DOMAIN),
};
- tmpl.txq.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
- if (tmpl.txq.qp == NULL) {
+ tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
+ if (tmpl.qp == NULL) {
ret = (errno ? errno : EINVAL);
ERROR("%p: QP creation failure: %s",
(void *)dev, strerror(ret));
@@ -327,30 +342,31 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
/* Primary port number. */
.port_num = priv->port
};
- ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod,
+ ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod,
(IBV_EXP_QP_STATE | IBV_EXP_QP_PORT));
if (ret) {
ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
(void *)dev, strerror(ret));
goto error;
}
- ret = txq_alloc_elts(&tmpl, desc);
+ ret = txq_setup(&tmpl, txq_ctrl);
if (ret) {
- ERROR("%p: TXQ allocation failed: %s",
+ ERROR("%p: cannot initialize TX queue structure: %s",
(void *)dev, strerror(ret));
goto error;
}
+ txq_alloc_elts(&tmpl, desc);
attr.mod = (struct ibv_exp_qp_attr){
.qp_state = IBV_QPS_RTR
};
- ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod, IBV_EXP_QP_STATE);
+ ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
if (ret) {
ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
(void *)dev, strerror(ret));
goto error;
}
attr.mod.qp_state = IBV_QPS_RTS;
- ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod, IBV_EXP_QP_STATE);
+ ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
if (ret) {
ERROR("%p: QP state to IBV_QPS_RTS failed: %s",
(void *)dev, strerror(ret));
@@ -359,7 +375,7 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
.intf = IBV_EXP_INTF_CQ,
- .obj = tmpl.txq.cq,
+ .obj = tmpl.cq,
};
tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
if (tmpl.if_cq == NULL) {
@@ -371,10 +387,8 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
.intf = IBV_EXP_INTF_QP_BURST,
- .obj = tmpl.txq.qp,
-#ifdef HAVE_VERBS_VLAN_INSERTION
.intf_version = 1,
-#endif
+ .obj = tmpl.qp,
/* Enable multi-packet send if supported. */
.family_flags =
(priv->mps ?
@@ -392,12 +406,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
DEBUG("%p: cleaning-up old txq just in case", (void *)txq_ctrl);
txq_cleanup(txq_ctrl);
*txq_ctrl = tmpl;
- txq_ctrl->txq.poll_cnt = txq_ctrl->if_cq->poll_cnt;
- txq_ctrl->txq.send_pending = txq_ctrl->if_qp->send_pending;
-#ifdef HAVE_VERBS_VLAN_INSERTION
- txq_ctrl->txq.send_pending_vlan = txq_ctrl->if_qp->send_pending_vlan;
-#endif
- txq_ctrl->txq.send_flush = txq_ctrl->if_qp->send_flush;
DEBUG("%p: txq updated with %p", (void *)txq_ctrl, (void *)&tmpl);
/* Pre-register known mempools. */
rte_mempool_walk(txq_mp2mr_iter, txq_ctrl);
@@ -432,15 +440,19 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
{
struct priv *priv = dev->data->dev_private;
struct txq *txq = (*priv->txqs)[idx];
- struct txq_ctrl *txq_ctrl;
+ struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq);
int ret;

if (mlx5_is_secondary())
return -E_RTE_SECONDARY;

priv_lock(priv);
- if (txq)
- txq_ctrl = container_of(txq, struct txq_ctrl, txq);
+ if (!rte_is_power_of_2(desc)) {
+ desc = 1 << log2above(desc);
+ WARN("%p: increased number of descriptors in TX queue %u"
+ " to the next power of two (%d)",
+ (void *)dev, idx, desc);
+ }
DEBUG("%p: configuring queue %u for %u descriptors",
(void *)dev, idx, desc);
if (idx >= priv->txqs_n) {
@@ -459,8 +471,11 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
(*priv->txqs)[idx] = NULL;
txq_cleanup(txq_ctrl);
} else {
- txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl),
- 0, socket);
+ txq_ctrl =
+ rte_calloc_socket("TXQ", 1,
+ sizeof(*txq_ctrl) +
+ desc * sizeof(struct rte_mbuf *),
+ 0, socket);
if (txq_ctrl == NULL) {
ERROR("%p: unable to allocate queue index %u",
(void *)dev, idx);
@@ -468,7 +483,7 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
return -ENOMEM;
}
}
- ret = txq_setup(dev, txq_ctrl, desc, socket, conf);
+ ret = txq_ctrl_setup(dev, txq_ctrl, desc, socket, conf);
if (ret)
rte_free(txq_ctrl);
else {
@@ -503,7 +518,7 @@ mlx5_tx_queue_release(void *dpdk_txq)
if (txq == NULL)
return;
txq_ctrl = container_of(txq, struct txq_ctrl, txq);
- priv = txq->priv;
+ priv = txq_ctrl->priv;
priv_lock(priv);
for (i = 0; (i != priv->txqs_n); ++i)
if ((*priv->txqs)[i] == txq) {
@@ -538,7 +553,8 @@ mlx5_tx_burst_secondary_setup(void *dpdk_txq, struct rte_mbuf **pkts,
uint16_t pkts_n)
{
struct txq *txq = dpdk_txq;
- struct priv *priv = mlx5_secondary_data_setup(txq->priv);
+ struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq);
+ struct priv *priv = mlx5_secondary_data_setup(txq_ctrl->priv);
struct priv *primary_priv;
unsigned int index;
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:27 UTC
Mini (compressed) CQEs are returned by the NIC when PCI back pressure is
detected. In that case the first CQE64 contains common packet information,
followed by a number of CQE8 entries providing the rest, followed by a
matching number of empty CQE64 entries to be used by software for
decompression.

Before decompression:

    0           1          2           6         7         8
+-------+  +---------+ +-------+   +-------+ +-------+ +-------+
| CQE64 |  |  CQE64  | | CQE64 |   | CQE64 | | CQE64 | | CQE64 |
|-------|  |---------| |-------|   |-------| |-------| |-------|
| ..... |  | cqe8[0] | |       | . |       | |       | | ..... |
| ..... |  | cqe8[1] | |       | . |       | |       | | ..... |
| ..... |  | ....... | |       | . |       | |       | | ..... |
| ..... |  | cqe8[7] | |       |   |       | |       | | ..... |
+-------+  +---------+ +-------+   +-------+ +-------+ +-------+

After decompression:

    0          1     ...     8
+-------+  +-------+     +-------+
| CQE64 |  | CQE64 |     | CQE64 |
|-------|  |-------|     |-------|
| ..... |  | ..... |  .  | ..... |
| ..... |  | ..... |  .  | ..... |
| ..... |  | ..... |  .  | ..... |
| ..... |  | ..... |     | ..... |
+-------+  +-------+     +-------+

This patch does not perform the entire decompression step at once, as that
would be expensive; instead, the first CQE64 is consumed and an internal
context is maintained to interpret the following CQE8 entries directly.

Intermediate empty CQE64 entries are handed back to HW without further
processing.
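
To make the mechanism concrete, a rough sketch of such an internal context
follows; field names and the mini-CQE layout are simplified placeholders,
not the driver's exact structures:

#include <stdint.h>

struct mini_cqe8 {
        uint32_t rx_hash_result;
        uint32_t byte_cnt;
};

/* State of an ongoing decompression session, kept in the RX queue. */
struct zip_session {
        unsigned int left; /* Mini-CQEs still to consume. */
        unsigned int ai;   /* Index inside the current CQE8 array (0..7). */
        unsigned int ca;   /* CQ slot holding the current CQE8 array. */
};

/* Return the byte count of the next decompressed completion; the caller
 * combines it with the information saved from the title CQE64. */
static uint32_t
zip_next(struct zip_session *z, volatile const struct mini_cqe8 (*mc)[8])
{
        uint32_t bc = mc[z->ca][z->ai].byte_cnt;

        if (++z->ai == 8) {
                /* Move to the CQE8 array overlaid on the next CQE64 slot. */
                z->ai = 0;
                ++z->ca;
        }
        --z->left;
        return bc;
}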

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Olga Shern <***@mellanox.com>
Signed-off-by: Vasily Philipov <***@mellanox.com>
---
doc/guides/nics/mlx5.rst | 6 +
drivers/net/mlx5/mlx5.c | 25 ++++-
drivers/net/mlx5/mlx5.h | 1 +
drivers/net/mlx5/mlx5_rxq.c | 9 +-
drivers/net/mlx5/mlx5_rxtx.c | 260 ++++++++++++++++++++++++++++++++-----------
drivers/net/mlx5/mlx5_rxtx.h | 11 ++
drivers/net/mlx5/mlx5_txq.c | 5 +
7 files changed, 248 insertions(+), 69 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 3a07928..756153b 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -148,6 +148,12 @@ Run-time configuration

- **ethtool** operations on related kernel interfaces also affect the PMD.

+- ``rxq_cqe_comp_en`` parameter [int]
+
+ A nonzero value enables the compression of CQE on RX side. This feature
+ saves PCI bandwidth and improves performance at the cost of slightly
+ higher CPU usage. Enabled by default.
+
Prerequisites
-------------

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 56b1dfc..7e8c579 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -69,6 +69,9 @@
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"

+/* Device parameter to enable RX completion queue compression. */
+#define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"
+
/**
* Retrieve integer value from environment variable.
*
@@ -256,12 +259,21 @@ static int
mlx5_args_check(const char *key, const char *val, void *opaque)
{
struct priv *priv = opaque;
+ unsigned long tmp;

- /* No parameters are expected at the moment. */
- (void)priv;
- (void)val;
- WARN("%s: unknown parameter", key);
- return EINVAL;
+ errno = 0;
+ tmp = strtoul(val, NULL, 0);
+ if (errno) {
+ WARN("%s: \"%s\" is not a valid integer", key, val);
+ return errno;
+ }
+ if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0)
+ priv->cqe_comp = !!tmp;
+ else {
+ WARN("%s: unknown parameter", key);
+ return EINVAL;
+ }
+ return 0;
}

/**
@@ -279,7 +291,7 @@ static int
mlx5_args(struct priv *priv, struct rte_devargs *devargs)
{
static const char *params[] = {
- NULL,
+ MLX5_RXQ_CQE_COMP_EN,
};
struct rte_kvargs *kvlist;
int ret = 0;
@@ -474,6 +486,7 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
priv->port = port;
priv->pd = pd;
priv->mtu = ETHER_MTU;
+ priv->cqe_comp = 1; /* Enable compression by default. */
err = mlx5_args(priv, pci_dev->devargs);
if (err) {
ERROR("failed to process device arguments: %s",
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 3dca03d..8f5a6df 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -111,6 +111,7 @@ struct priv {
unsigned int hw_padding:1; /* End alignment padding is supported. */
unsigned int sriov:1; /* This is a VF or PF with VF devices. */
unsigned int mps:1; /* Whether multi-packet send is supported. */
+ unsigned int cqe_comp:1; /* Whether CQE compression is enabled. */
unsigned int pending_alarm:1; /* An alarm is pending. */
/* RX/TX queues. */
unsigned int rxqs_n; /* RX queues array size. */
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index a8f68a3..6881cdd 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -897,6 +897,7 @@ rxq_setup(struct rxq_ctrl *tmpl, struct rxq_ctrl *rxq_ctrl)
return EINVAL;
}
tmpl->rxq.rq_db = rwq->rq.db;
+ tmpl->rxq.cqe_n = ibcq->cqe + 1;
tmpl->rxq.cq_ci = 0;
tmpl->rxq.rq_ci = 0;
tmpl->rxq.cq_db = cq->dbrec;
@@ -955,6 +956,7 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
} attr;
enum ibv_exp_query_intf_status status;
unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
+ unsigned int cqe_n = desc - 1;
int ret = 0;

(void)conf; /* Thresholds configuration (ignored). */
@@ -994,7 +996,12 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
.res_domain = tmpl.rd,
};
- tmpl.cq = ibv_exp_create_cq(priv->ctx, desc - 1, NULL, NULL, 0,
+ if (priv->cqe_comp) {
+ attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS;
+ attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE;
+ cqe_n = (desc * 2) - 1; /* Double the number of CQEs. */
+ }
+ tmpl.cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, NULL, 0,
&attr.cq);
if (tmpl.cq == NULL) {
ret = ENOMEM;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 95bf981..30d413c 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -69,44 +69,85 @@
#include "mlx5_defs.h"
#include "mlx5_prm.h"

-static inline volatile struct mlx5_cqe64 *
-get_cqe64(volatile struct mlx5_cqe cqes[],
- unsigned int cqes_n, uint16_t *ci)
- __attribute__((always_inline));
+#ifndef NDEBUG
+
+/**
+ * Verify or set magic value in CQE.
+ *
+ * @param cqe
+ * Pointer to CQE.
+ *
+ * @return
+ * 0 the first time.
+ */
+static inline int
+check_cqe64_seen(volatile struct mlx5_cqe64 *cqe)
+{
+ static const uint8_t magic[] = "seen";
+ volatile uint8_t (*buf)[sizeof(cqe->rsvd40)] = &cqe->rsvd40;
+ int ret = 1;
+ unsigned int i;
+
+ for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i)
+ if (!ret || !(ret = ((*buf)[i] == magic[i])))
+ (*buf)[i] = magic[i];
+ return ret;
+}
+
+#endif /* NDEBUG */

static inline int
-rx_poll_len(struct rxq *rxq) __attribute__((always_inline));
+check_cqe64(volatile struct mlx5_cqe64 *cqe,
+ unsigned int cqes_n, const uint16_t ci)
+ __attribute__((always_inline));

-static volatile struct mlx5_cqe64 *
-get_cqe64(volatile struct mlx5_cqe cqes[],
- unsigned int cqes_n, uint16_t *ci)
+/**
+ * Check whether CQE is valid.
+ *
+ * @param cqe
+ * Pointer to CQE.
+ * @param cqes_n
+ * Size of completion queue.
+ * @param ci
+ * Consumer index.
+ *
+ * @return
+ * 0 on success, 1 on failure.
+ */
+static inline int
+check_cqe64(volatile struct mlx5_cqe64 *cqe,
+ unsigned int cqes_n, const uint16_t ci)
{
- volatile struct mlx5_cqe64 *cqe;
- uint16_t idx = *ci;
- uint8_t op_own;
-
- cqe = &cqes[idx & (cqes_n - 1)].cqe64;
- op_own = cqe->op_own;
- if (unlikely((op_own & MLX5_CQE_OWNER_MASK) == !(idx & cqes_n))) {
- return NULL;
- } else if (unlikely(op_own & 0x80)) {
- switch (op_own >> 4) {
- case MLX5_CQE_INVALID:
- return NULL; /* No CQE */
- case MLX5_CQE_REQ_ERR:
- return cqe;
- case MLX5_CQE_RESP_ERR:
- ++(*ci);
- return NULL;
- default:
- return NULL;
- }
- }
- if (cqe) {
- *ci = idx + 1;
- return cqe;
+ uint16_t idx = ci & cqes_n;
+ uint8_t op_own = cqe->op_own;
+ uint8_t op_owner = MLX5_CQE_OWNER(op_own);
+ uint8_t op_code = MLX5_CQE_OPCODE(op_own);
+
+ if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID)))
+ return 1; /* No CQE. */
+#ifndef NDEBUG
+ if ((op_code == MLX5_CQE_RESP_ERR) ||
+ (op_code == MLX5_CQE_REQ_ERR)) {
+ volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe;
+ uint8_t syndrome = err_cqe->syndrome;
+
+ if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) ||
+ (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR))
+ return 0;
+ if (!check_cqe64_seen(cqe))
+ ERROR("unexpected CQE error %u (0x%02x)"
+ " syndrome 0x%02x",
+ op_code, op_code, syndrome);
+ return 1;
+ } else if ((op_code != MLX5_CQE_RESP_SEND) &&
+ (op_code != MLX5_CQE_REQ)) {
+ if (!check_cqe64_seen(cqe))
+ ERROR("unexpected CQE opcode %u (0x%02x)",
+ op_code, op_code);
+ return 1;
}
- return NULL;
+#endif /* NDEBUG */
+ return 0;
}

/**
@@ -125,20 +166,34 @@ txq_complete(struct txq *txq)
{
const unsigned int elts_n = txq->elts_n;
const unsigned int cqe_n = txq->cqe_n;
+ const unsigned int cqe_cnt = cqe_n - 1;
uint16_t elts_free = txq->elts_tail;
uint16_t elts_tail;
uint16_t cq_ci = txq->cq_ci;
unsigned int wqe_ci = (unsigned int)-1;
- int ret = 0;

- while (ret == 0) {
- volatile struct mlx5_cqe64 *cqe;
+ do {
+ unsigned int idx = cq_ci & cqe_cnt;
+ volatile struct mlx5_cqe64 *cqe = &(*txq->cqes)[idx].cqe64;

- cqe = get_cqe64(*txq->cqes, cqe_n, &cq_ci);
- if (cqe == NULL)
+ if (check_cqe64(cqe, cqe_n, cq_ci) == 1)
break;
+#ifndef NDEBUG
+ if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) {
+ if (!check_cqe64_seen(cqe))
+ ERROR("unexpected compressed CQE, TX stopped");
+ return;
+ }
+ if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) ||
+ (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) {
+ if (!check_cqe64_seen(cqe))
+ ERROR("unexpected error CQE, TX stopped");
+ return;
+ }
+#endif /* NDEBUG */
wqe_ci = ntohs(cqe->wqe_counter);
- }
+ ++cq_ci;
+ } while (1);
if (unlikely(wqe_ci == (unsigned int)-1))
return;
/* Free buffers. */
@@ -507,6 +562,97 @@ rxq_cq_to_pkt_type(volatile struct mlx5_cqe64 *cqe)
}

/**
+ * Get size of the next packet for a given CQE. For compressed CQEs, the
+ * consumer index is updated only once all packets of the current one have
+ * been processed.
+ *
+ * @param rxq
+ * Pointer to RX queue.
+ * @param cqe
+ * CQE to process.
+ *
+ * @return
+ * Packet size in bytes (0 if there is none), -1 in case of completion
+ * with error.
+ */
+static inline int
+mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe,
+ uint16_t cqe_cnt)
+{
+ struct rxq_zip *zip = &rxq->zip;
+ uint16_t cqe_n = cqe_cnt + 1;
+ int len = 0;
+
+ /* Process compressed data in the CQE and mini arrays. */
+ if (zip->ai) {
+ volatile struct mlx5_mini_cqe8 (*mc)[8] =
+ (volatile struct mlx5_mini_cqe8 (*)[8])
+ (uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].cqe64);
+
+ len = ntohl((*mc)[zip->ai & 7].byte_cnt);
+ if ((++zip->ai & 7) == 0) {
+ /* Increment consumer index to skip the number of
+ * CQEs consumed. Hardware leaves holes in the CQ
+ * ring for software use. */
+ zip->ca = zip->na;
+ zip->na += 8;
+ }
+ if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
+ uint16_t idx = rxq->cq_ci;
+ uint16_t end = zip->cq_ci;
+
+ while (idx != end) {
+ (*rxq->cqes)[idx & cqe_cnt].cqe64.op_own =
+ MLX5_CQE_INVALIDATE;
+ ++idx;
+ }
+ rxq->cq_ci = zip->cq_ci;
+ zip->ai = 0;
+ }
+ /* No compressed data, get next CQE and verify if it is compressed. */
+ } else {
+ int ret;
+ int8_t op_own;
+
+ ret = check_cqe64(cqe, cqe_n, rxq->cq_ci);
+ if (unlikely(ret == 1))
+ return 0;
+ ++rxq->cq_ci;
+ op_own = cqe->op_own;
+ if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
+ volatile struct mlx5_mini_cqe8 (*mc)[8] =
+ (volatile struct mlx5_mini_cqe8 (*)[8])
+ (uintptr_t)&(*rxq->cqes)[rxq->cq_ci &
+ cqe_cnt].cqe64;
+
+ /* Fix endianness. */
+ zip->cqe_cnt = ntohl(cqe->byte_cnt);
+ /*
+ * Current mini array position is the one returned by
+ * check_cqe64().
+ *
+ * If completion comprises several mini arrays, as a
+ * special case the second one is located 7 CQEs after
+ * the initial CQE instead of 8 for subsequent ones.
+ */
+ zip->ca = rxq->cq_ci & cqe_cnt;
+ zip->na = zip->ca + 7;
+ /* Compute the next non compressed CQE. */
+ --rxq->cq_ci;
+ zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
+ /* Get packet size to return. */
+ len = ntohl((*mc)[0].byte_cnt);
+ zip->ai = 1;
+ } else
+ len = ntohl(cqe->byte_cnt);
+ /* Error while receiving packet. */
+ if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
+ return -1;
+ }
+ return len;
+}
+
+/**
* Translate RX completion flags to offload flags.
*
* @param[in] rxq
@@ -554,26 +700,6 @@ rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe)
}

/**
- * Get size of the next packet.
- *
- * @param rxq
- * RX queue to fetch packet from.
- *
- * @return
- * Packet size in bytes.
- */
-static inline int __attribute__((always_inline))
-rx_poll_len(struct rxq *rxq)
-{
- volatile struct mlx5_cqe64 *cqe;
-
- cqe = get_cqe64(*rxq->cqes, rxq->elts_n, &rxq->cq_ci);
- if (cqe)
- return ntohl(cqe->byte_cnt);
- return 0;
-}
-
-/**
* DPDK callback for RX.
*
* @param dpdk_rxq
@@ -595,15 +721,16 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
unsigned int rq_ci = rxq->rq_ci;
const unsigned int elts_n = rxq->elts_n;
const unsigned int wqe_cnt = elts_n - 1;
+ const unsigned int cqe_cnt = rxq->cqe_n - 1;

for (i = 0; (i != pkts_n); ++i) {
unsigned int idx = rq_ci & wqe_cnt;
+ int len;
struct rte_mbuf *rep;
struct rte_mbuf *pkt;
- unsigned int len;
volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
volatile struct mlx5_cqe64 *cqe =
- &(*rxq->cqes)[rxq->cq_ci & wqe_cnt].cqe64;
+ &(*rxq->cqes)[rxq->cq_ci & cqe_cnt].cqe64;

pkt = (*rxq->elts)[idx];
rte_prefetch0(cqe);
@@ -616,12 +743,20 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
NB_SEGS(rep) = 1;
PORT(rep) = rxq->port_id;
NEXT(rep) = NULL;
- len = rx_poll_len(rxq);
+ len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt);
if (unlikely(len == 0)) {
rte_mbuf_refcnt_set(rep, 0);
__rte_mbuf_raw_free(rep);
break;
}
+ if (unlikely(len == -1)) {
+ /* RX error, packet is likely too large. */
+ rte_mbuf_refcnt_set(rep, 0);
+ __rte_mbuf_raw_free(rep);
+ ++rxq->stats.idropped;
+ --i;
+ goto skip;
+ }
/* Fill NIC descriptor with the new buffer. The lkey and size
* of the buffers are already known, only the buffer address
* changes. */
@@ -651,6 +786,7 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
/* Return packet. */
*(pkts++) = pkt;
++pkts_ret;
+ skip:
++rq_ci;
}
if (unlikely((i == 0) && (rq_ci == rxq->rq_ci)))
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 6b3bb2d..77b0fde 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -91,6 +91,15 @@ struct fdir_queue {

struct priv;

+/* Compressed CQE context. */
+struct rxq_zip {
+ uint16_t ai; /* Array index. */
+ uint16_t ca; /* Current array index. */
+ uint16_t na; /* Next array index. */
+ uint16_t cq_ci; /* The next CQE. */
+ uint32_t cqe_cnt; /* Number of CQEs. */
+};
+
/* RX queue descriptor. */
struct rxq {
unsigned int csum:1; /* Enable checksum offloading. */
@@ -100,9 +109,11 @@ struct rxq {
uint16_t rq_ci;
uint16_t cq_ci;
uint16_t elts_n;
+ uint16_t cqe_n; /* Number of CQ elements. */
uint16_t port_id;
volatile struct mlx5_wqe_data_seg(*wqes)[];
volatile struct mlx5_cqe(*cqes)[];
+ struct rxq_zip zip; /* Compressed context. */
volatile uint32_t *rq_db;
volatile uint32_t *cq_db;
struct rte_mbuf *(*elts)[];
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index dbf9c04..ddcd6b6 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -268,6 +268,11 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
enum ibv_exp_query_intf_status status;
int ret = 0;

+ if (mlx5_getenv_int("MLX5_ENABLE_CQE_COMPRESSION")) {
+ ret = ENOTSUP;
+ ERROR("MLX5_ENABLE_CQE_COMPRESSION must never be set");
+ goto error;
+ }
(void)conf; /* Thresholds configuration (ignored). */
tmpl.txq.elts_n = desc;
/* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:28 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

Replacing the variable countdown (which depends on the number of
descriptors) with a fixed relative threshold known at compile time improves
performance by reducing the TX queue structure footprint and the amount of
code to manage completions during a burst.

Completions are now requested at most once per burst after the threshold is
reached.
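
In other words, the accounting boils down to a per-queue counter carried
across bursts and reset whenever a completion is requested. The standalone
sketch below only illustrates that behaviour; completion_needed() and
TX_COMP_THRESH are illustrative stand-ins, not driver code, and the real
logic lives in the mlx5_tx_burst() hunk of this patch.

#include <stdint.h>
#include <stdio.h>

#define TX_COMP_THRESH 32 /* Mirrors MLX5_TX_COMP_THRESH from mlx5_defs.h. */

/* Toy model: carry a counter of packets sent since the last completion
 * request and ask for a single completion once the threshold is crossed. */
static int
completion_needed(uint16_t *elts_comp, unsigned int sent)
{
	unsigned int comp = *elts_comp + sent;

	if (comp >= TX_COMP_THRESH) {
		*elts_comp = 0;
		return 1; /* Request a completion on the last WQE. */
	}
	*elts_comp = comp;
	return 0; /* Defer until a later burst. */
}

int
main(void)
{
	uint16_t elts_comp = 0;
	static const unsigned int bursts[] = { 10, 10, 10, 40, 5 };
	unsigned int i;

	for (i = 0; i != 5; ++i)
		printf("burst of %u packets -> completion %s\n", bursts[i],
		       completion_needed(&elts_comp, bursts[i]) ?
		       "requested" : "deferred");
	return 0;
}

In the data path the "requested" case additionally stores elts_head in the
otherwise unused immediate field of the WQE so that txq_complete() knows up
to which element buffers can be freed.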

Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Vasily Philipov <***@mellanox.com>
---
drivers/net/mlx5/mlx5_defs.h | 7 +++++--
drivers/net/mlx5/mlx5_rxtx.c | 42 ++++++++++++++++++++++++------------------
drivers/net/mlx5/mlx5_rxtx.h | 5 ++---
drivers/net/mlx5/mlx5_txq.c | 19 ++++++++++++-------
4 files changed, 43 insertions(+), 30 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 8d2ec7a..cc2a6f3 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -48,8 +48,11 @@
/* Maximum number of special flows. */
#define MLX5_MAX_SPECIAL_FLOWS 4

-/* Request send completion once in every 64 sends, might be less. */
-#define MLX5_PMD_TX_PER_COMP_REQ 64
+/*
+ * Request TX completion every time descriptors reach this threshold since
+ * the previous request. Must be a power of two for performance reasons.
+ */
+#define MLX5_TX_COMP_THRESH 32

/* RSS Indirection table size. */
#define RSS_INDIRECTION_TABLE_SIZE 256
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 30d413c..d56c9e9 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -154,9 +154,6 @@ check_cqe64(volatile struct mlx5_cqe64 *cqe,
* Manage TX completions.
*
* When sending a burst, mlx5_tx_burst() posts several WRs.
- * To improve performance, a completion event is only required once every
- * MLX5_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
- * for other WRs, but this information would not be used anyway.
*
* @param txq
* Pointer to TX queue structure.
@@ -170,14 +167,16 @@ txq_complete(struct txq *txq)
uint16_t elts_free = txq->elts_tail;
uint16_t elts_tail;
uint16_t cq_ci = txq->cq_ci;
- unsigned int wqe_ci = (unsigned int)-1;
+ volatile struct mlx5_cqe64 *cqe = NULL;
+ volatile union mlx5_wqe *wqe;

do {
- unsigned int idx = cq_ci & cqe_cnt;
- volatile struct mlx5_cqe64 *cqe = &(*txq->cqes)[idx].cqe64;
+ volatile struct mlx5_cqe64 *tmp;

- if (check_cqe64(cqe, cqe_n, cq_ci) == 1)
+ tmp = &(*txq->cqes)[cq_ci & cqe_cnt].cqe64;
+ if (check_cqe64(tmp, cqe_n, cq_ci))
break;
+ cqe = tmp;
#ifndef NDEBUG
if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) {
if (!check_cqe64_seen(cqe))
@@ -191,14 +190,15 @@ txq_complete(struct txq *txq)
return;
}
#endif /* NDEBUG */
- wqe_ci = ntohs(cqe->wqe_counter);
++cq_ci;
} while (1);
- if (unlikely(wqe_ci == (unsigned int)-1))
+ if (unlikely(cqe == NULL))
return;
+ wqe = &(*txq->wqes)[htons(cqe->wqe_counter) & (txq->wqe_n - 1)];
+ elts_tail = wqe->wqe.ctrl.data[3];
+ assert(elts_tail < txq->wqe_n);
/* Free buffers. */
- elts_tail = (wqe_ci + 1) & (elts_n - 1);
- do {
+ while (elts_free != elts_tail) {
struct rte_mbuf *elt = (*txq->elts)[elts_free];
unsigned int elts_free_next =
(elts_free + 1) & (elts_n - 1);
@@ -214,7 +214,7 @@ txq_complete(struct txq *txq)
/* Only one segment needs to be freed. */
rte_pktmbuf_free_seg(elt);
elts_free = elts_free_next;
- } while (elts_free != elts_tail);
+ }
txq->cq_ci = cq_ci;
txq->elts_tail = elts_tail;
/* Update the consumer index. */
@@ -435,6 +435,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
const unsigned int elts_n = txq->elts_n;
unsigned int i;
unsigned int max;
+ unsigned int comp;
volatile union mlx5_wqe *wqe;
struct rte_mbuf *buf;

@@ -484,12 +485,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
buf->vlan_tci);
else
mlx5_wqe_write(txq, wqe, addr, length, lkey);
- /* Request completion if needed. */
- if (unlikely(--txq->elts_comp == 0)) {
- wqe->wqe.ctrl.data[2] = htonl(8);
- txq->elts_comp = txq->elts_comp_cd_init;
- } else
- wqe->wqe.ctrl.data[2] = 0;
+ wqe->wqe.ctrl.data[2] = 0;
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
@@ -508,6 +504,16 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
+ /* Check whether completion threshold has been reached. */
+ comp = txq->elts_comp + i;
+ if (comp >= MLX5_TX_COMP_THRESH) {
+ /* Request completion on last WQE. */
+ wqe->wqe.ctrl.data[2] = htonl(8);
+ /* Save elts_head in unused "immediate" field of WQE. */
+ wqe->wqe.ctrl.data[3] = elts_head;
+ txq->elts_comp = 0;
+ } else
+ txq->elts_comp = comp;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent packets counter. */
txq->stats.opackets += i;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 77b0fde..f900e65 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -238,8 +238,7 @@ struct hash_rxq {
struct txq {
uint16_t elts_head; /* Current index in (*elts)[]. */
uint16_t elts_tail; /* First element awaiting completion. */
- uint16_t elts_comp_cd_init; /* Initial value for countdown. */
- uint16_t elts_comp; /* Elements before asking a completion. */
+ uint16_t elts_comp; /* Counter since last completion request. */
uint16_t elts_n; /* (*elts)[] length. */
uint16_t cq_ci; /* Consumer index for completion queue. */
uint16_t cqe_n; /* Number of CQ elements. */
@@ -247,6 +246,7 @@ struct txq {
uint16_t wqe_n; /* Number of WQ elements. */
uint16_t bf_offset; /* Blueflame offset. */
uint16_t bf_buf_size; /* Blueflame size. */
+ uint32_t qp_num_8s; /* QP number shifted by 8. */
volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
volatile union mlx5_wqe (*wqes)[]; /* Work queue. */
volatile uint32_t *qp_db; /* Work queue doorbell. */
@@ -259,7 +259,6 @@ struct txq {
} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
struct rte_mbuf *(*elts)[]; /* TX elements. */
struct mlx5_txq_stats stats; /* TX queue counters. */
- uint32_t qp_num_8s; /* QP number shifted by 8. */
} __rte_cache_aligned;

/* TX queue control descriptor. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index ddcd6b6..7b2dc7c 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -89,6 +89,7 @@ txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
DEBUG("%p: allocated and configured %u WRs", (void *)txq_ctrl, elts_n);
txq_ctrl->txq.elts_head = 0;
txq_ctrl->txq.elts_tail = 0;
+ txq_ctrl->txq.elts_comp = 0;
}

/**
@@ -108,6 +109,7 @@ txq_free_elts(struct txq_ctrl *txq_ctrl)
DEBUG("%p: freeing WRs", (void *)txq_ctrl);
txq_ctrl->txq.elts_head = 0;
txq_ctrl->txq.elts_tail = 0;
+ txq_ctrl->txq.elts_comp = 0;

while (elts_tail != elts_head) {
struct rte_mbuf *elt = (*elts)[elts_tail];
@@ -274,13 +276,8 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
goto error;
}
(void)conf; /* Thresholds configuration (ignored). */
+ assert(desc > MLX5_TX_COMP_THRESH);
tmpl.txq.elts_n = desc;
- /* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
- * at least 4 times per ring. */
- tmpl.txq.elts_comp_cd_init =
- ((MLX5_PMD_TX_PER_COMP_REQ < (desc / 4)) ?
- MLX5_PMD_TX_PER_COMP_REQ : (desc / 4));
- tmpl.txq.elts_comp = tmpl.txq.elts_comp_cd_init;
/* MRs will be registered in mp2mr[] later. */
attr.rd = (struct ibv_exp_res_domain_init_attr){
.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
@@ -300,7 +297,8 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
.res_domain = tmpl.rd,
};
tmpl.cq = ibv_exp_create_cq(priv->ctx,
- (desc / tmpl.txq.elts_comp_cd_init) - 1,
+ (((desc / MLX5_TX_COMP_THRESH) - 1) ?
+ ((desc / MLX5_TX_COMP_THRESH) - 1) : 1),
NULL, NULL, 0, &attr.cq);
if (tmpl.cq == NULL) {
ret = ENOMEM;
@@ -452,6 +450,13 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
return -E_RTE_SECONDARY;

priv_lock(priv);
+ if (desc <= MLX5_TX_COMP_THRESH) {
+ WARN("%p: number of descriptors requested for TX queue %u"
+ " must be higher than MLX5_TX_COMP_THRESH, using"
+ " %u instead of %u",
+ (void *)dev, idx, MLX5_TX_COMP_THRESH + 1, desc);
+ desc = MLX5_TX_COMP_THRESH + 1;
+ }
if (!rte_is_power_of_2(desc)) {
desc = 1 << log2above(desc);
WARN("%p: increased number of descriptors in TX queue %u"
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:25 UTC
Permalink
Bypass Verbs to improve RX performance.
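
Concretely, instead of going through the if_cq->poll_length_flags_cvlan()
and if_wq->recv_burst() Verbs interface callbacks, the PMD now reads CQEs
straight from the completion ring, recycles receive WQEs in place and rings
the doorbells itself once per burst. The fragment below is only an outline
of that structure using toy types (toy_rxq, toy_cqe); it assumes a single
ring size shared by CQ and WQ, as in this patch, and is not the driver's
real code.

#include <stdint.h>
#include <arpa/inet.h> /* ntohl()/htonl() */

/* Simplified stand-ins for the rxq fields touched below. */
struct toy_cqe { uint8_t op_own; uint32_t byte_cnt; };

struct toy_rxq {
	uint16_t cq_ci;                /* CQ consumer index. */
	uint16_t rq_ci;                /* RQ consumer index. */
	unsigned int cqe_n;            /* Ring size, power of two. */
	volatile struct toy_cqe *cqes; /* CQ ring mapped from the device. */
	volatile uint64_t *wqe_addr;   /* Buffer addresses in the WQ ring. */
	volatile uint32_t *cq_db;      /* CQ doorbell record. */
	volatile uint32_t *rq_db;      /* RQ doorbell record. */
};

#define OWNER_MASK 1u /* Stand-in for MLX5_CQE_OWNER_MASK. */
#define wmb() __asm__ volatile ("" ::: "memory") /* rte_wmb() stand-in. */

/* Poll one CQE the way get_cqe64() does: a CQE whose ownership bit matches
 * the inverted ring phase has not been written by hardware yet. On success
 * the WQE is recycled in place, only its buffer address changes. */
static int
toy_rx_poll_one(struct toy_rxq *rxq, uint64_t new_buf_addr, uint32_t *len)
{
	unsigned int idx = rxq->cq_ci & (rxq->cqe_n - 1);
	volatile struct toy_cqe *cqe = &rxq->cqes[idx];

	if ((cqe->op_own & OWNER_MASK) == !(rxq->cq_ci & rxq->cqe_n))
		return 0; /* No CQE available. */
	*len = ntohl(cqe->byte_cnt);
	++rxq->cq_ci;
	rxq->wqe_addr[rxq->rq_ci & (rxq->cqe_n - 1)] = new_buf_addr;
	++rxq->rq_ci;
	return 1;
}

/* Ring both doorbells once per burst, as at the end of mlx5_rx_burst(). */
static void
toy_rx_doorbells(struct toy_rxq *rxq)
{
	wmb();
	*rxq->cq_db = htonl(rxq->cq_ci);
	wmb();
	*rxq->rq_db = htonl(rxq->rq_ci);
}

Error handling, CRC stripping and the translation of CQE fields into
packet type and offload flags are handled in the mlx5_rx_burst() hunk
below.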

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Yaacov Hazan <***@mellanox.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Vasily Philipov <***@mellanox.com>
---
drivers/net/mlx5/mlx5_ethdev.c | 4 +-
drivers/net/mlx5/mlx5_fdir.c | 2 +-
drivers/net/mlx5/mlx5_rxq.c | 303 ++++++++++++++++++++---------------------
drivers/net/mlx5/mlx5_rxtx.c | 290 ++++++++++++++++++++-------------------
drivers/net/mlx5/mlx5_rxtx.h | 38 +++---
drivers/net/mlx5/mlx5_vlan.c | 3 +-
6 files changed, 325 insertions(+), 315 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 771d8b5..8628321 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1262,7 +1262,9 @@ mlx5_secondary_data_setup(struct priv *priv)
}
/* RX queues. */
for (i = 0; i != nb_rx_queues; ++i) {
- struct rxq *primary_rxq = (*sd->primary_priv->rxqs)[i];
+ struct rxq_ctrl *primary_rxq =
+ container_of((*sd->primary_priv->rxqs)[i],
+ struct rxq_ctrl, rxq);

if (primary_rxq == NULL)
continue;
diff --git a/drivers/net/mlx5/mlx5_fdir.c b/drivers/net/mlx5/mlx5_fdir.c
index 1850218..73eb00e 100644
--- a/drivers/net/mlx5/mlx5_fdir.c
+++ b/drivers/net/mlx5/mlx5_fdir.c
@@ -431,7 +431,7 @@ priv_get_fdir_queue(struct priv *priv, uint16_t idx)
ind_init_attr = (struct ibv_exp_rwq_ind_table_init_attr){
.pd = priv->pd,
.log_ind_tbl_size = 0,
- .ind_tbl = &((*priv->rxqs)[idx]->wq),
+ .ind_tbl = &rxq_ctrl->wq,
.comp_mask = 0,
};

diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 7db4ce7..a8f68a3 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -43,6 +43,8 @@
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <infiniband/verbs.h>
+#include <infiniband/arch.h>
+#include <infiniband/mlx5_hw.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif
@@ -373,8 +375,13 @@ priv_create_hash_rxqs(struct priv *priv)
DEBUG("indirection table extended to assume %u WQs",
priv->reta_idx_n);
}
- for (i = 0; (i != priv->reta_idx_n); ++i)
- wqs[i] = (*priv->rxqs)[(*priv->reta_idx)[i]]->wq;
+ for (i = 0; (i != priv->reta_idx_n); ++i) {
+ struct rxq_ctrl *rxq_ctrl;
+
+ rxq_ctrl = container_of((*priv->rxqs)[(*priv->reta_idx)[i]],
+ struct rxq_ctrl, rxq);
+ wqs[i] = rxq_ctrl->wq;
+ }
/* Get number of hash RX queues to configure. */
for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
hash_rxqs_n += ind_table_init[i].hash_types_n;
@@ -638,21 +645,13 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
struct rte_mbuf **pool)
{
unsigned int i;
- struct rxq_elt (*elts)[elts_n] =
- rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
- rxq_ctrl->socket);
int ret = 0;

- if (elts == NULL) {
- ERROR("%p: can't allocate packets array", (void *)rxq_ctrl);
- ret = ENOMEM;
- goto error;
- }
/* For each WR (packet). */
for (i = 0; (i != elts_n); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct ibv_sge *sge = &(*elts)[i].sge;
struct rte_mbuf *buf;
+ volatile struct mlx5_wqe_data_seg *scat =
+ &(*rxq_ctrl->rxq.wqes)[i];

if (pool != NULL) {
buf = *(pool++);
@@ -666,40 +665,36 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
ret = ENOMEM;
goto error;
}
- elt->buf = buf;
/* Headroom is reserved by rte_pktmbuf_alloc(). */
assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
/* Buffer is supposed to be empty. */
assert(rte_pktmbuf_data_len(buf) == 0);
assert(rte_pktmbuf_pkt_len(buf) == 0);
- /* sge->addr must be able to store a pointer. */
- assert(sizeof(sge->addr) >= sizeof(uintptr_t));
- /* SGE keeps its headroom. */
- sge->addr = (uintptr_t)
- ((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
- sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
- sge->lkey = rxq_ctrl->mr->lkey;
- /* Redundant check for tailroom. */
- assert(sge->length == rte_pktmbuf_tailroom(buf));
+ assert(!buf->next);
+ PORT(buf) = rxq_ctrl->rxq.port_id;
+ DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
+ PKT_LEN(buf) = DATA_LEN(buf);
+ NB_SEGS(buf) = 1;
+ /* scat->addr must be able to store a pointer. */
+ assert(sizeof(scat->addr) >= sizeof(uintptr_t));
+ *scat = (struct mlx5_wqe_data_seg){
+ .addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
+ .byte_count = htonl(DATA_LEN(buf)),
+ .lkey = htonl(rxq_ctrl->mr->lkey),
+ };
+ (*rxq_ctrl->rxq.elts)[i] = buf;
}
DEBUG("%p: allocated and configured %u single-segment WRs",
(void *)rxq_ctrl, elts_n);
- rxq_ctrl->rxq.elts_n = elts_n;
- rxq_ctrl->rxq.elts_head = 0;
- rxq_ctrl->rxq.elts = elts;
assert(ret == 0);
return 0;
error:
- if (elts != NULL) {
- assert(pool == NULL);
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct rte_mbuf *buf = elt->buf;
-
- if (buf != NULL)
- rte_pktmbuf_free_seg(buf);
- }
- rte_free(elts);
+ assert(pool == NULL);
+ elts_n = i;
+ for (i = 0; (i != elts_n); ++i) {
+ if ((*rxq_ctrl->rxq.elts)[i] != NULL)
+ rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
+ (*rxq_ctrl->rxq.elts)[i] = NULL;
}
DEBUG("%p: failed, freed everything", (void *)rxq_ctrl);
assert(ret > 0);
@@ -716,22 +711,16 @@ static void
rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
{
unsigned int i;
- unsigned int elts_n = rxq_ctrl->rxq.elts_n;
- struct rxq_elt (*elts)[elts_n] = rxq_ctrl->rxq.elts;

DEBUG("%p: freeing WRs", (void *)rxq_ctrl);
- rxq_ctrl->rxq.elts_n = 0;
- rxq_ctrl->rxq.elts = NULL;
- if (elts == NULL)
+ if (rxq_ctrl->rxq.elts == NULL)
return;
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct rte_mbuf *buf = elt->buf;

- if (buf != NULL)
- rte_pktmbuf_free_seg(buf);
+ for (i = 0; (i != rxq_ctrl->rxq.elts_n); ++i) {
+ if ((*rxq_ctrl->rxq.elts)[i] != NULL)
+ rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
+ (*rxq_ctrl->rxq.elts)[i] = NULL;
}
- rte_free(elts);
}

/**
@@ -749,42 +738,40 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)

DEBUG("cleaning up %p", (void *)rxq_ctrl);
rxq_free_elts(rxq_ctrl);
- rxq_ctrl->rxq.poll = NULL;
- rxq_ctrl->rxq.recv = NULL;
if (rxq_ctrl->if_wq != NULL) {
- assert(rxq_ctrl->rxq.priv != NULL);
- assert(rxq_ctrl->rxq.priv->ctx != NULL);
- assert(rxq_ctrl->rxq.wq != NULL);
+ assert(rxq_ctrl->priv != NULL);
+ assert(rxq_ctrl->priv->ctx != NULL);
+ assert(rxq_ctrl->wq != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(rxq_ctrl->rxq.priv->ctx,
+ claim_zero(ibv_exp_release_intf(rxq_ctrl->priv->ctx,
rxq_ctrl->if_wq,
&params));
}
if (rxq_ctrl->if_cq != NULL) {
- assert(rxq_ctrl->rxq.priv != NULL);
- assert(rxq_ctrl->rxq.priv->ctx != NULL);
- assert(rxq_ctrl->rxq.cq != NULL);
+ assert(rxq_ctrl->priv != NULL);
+ assert(rxq_ctrl->priv->ctx != NULL);
+ assert(rxq_ctrl->cq != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(rxq_ctrl->rxq.priv->ctx,
+ claim_zero(ibv_exp_release_intf(rxq_ctrl->priv->ctx,
rxq_ctrl->if_cq,
&params));
}
- if (rxq_ctrl->rxq.wq != NULL)
- claim_zero(ibv_exp_destroy_wq(rxq_ctrl->rxq.wq));
- if (rxq_ctrl->rxq.cq != NULL)
- claim_zero(ibv_destroy_cq(rxq_ctrl->rxq.cq));
+ if (rxq_ctrl->wq != NULL)
+ claim_zero(ibv_exp_destroy_wq(rxq_ctrl->wq));
+ if (rxq_ctrl->cq != NULL)
+ claim_zero(ibv_destroy_cq(rxq_ctrl->cq));
if (rxq_ctrl->rd != NULL) {
struct ibv_exp_destroy_res_domain_attr attr = {
.comp_mask = 0,
};

- assert(rxq_ctrl->rxq.priv != NULL);
- assert(rxq_ctrl->rxq.priv->ctx != NULL);
- claim_zero(ibv_exp_destroy_res_domain(rxq_ctrl->rxq.priv->ctx,
+ assert(rxq_ctrl->priv != NULL);
+ assert(rxq_ctrl->priv->ctx != NULL);
+ claim_zero(ibv_exp_destroy_res_domain(rxq_ctrl->priv->ctx,
rxq_ctrl->rd,
&attr));
}
@@ -811,14 +798,13 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
int
rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
{
- struct priv *priv = rxq_ctrl->rxq.priv;
+ struct priv *priv = rxq_ctrl->priv;
struct rxq_ctrl tmpl = *rxq_ctrl;
unsigned int mbuf_n;
unsigned int desc_n;
struct rte_mbuf **pool;
unsigned int i, k;
struct ibv_exp_wq_attr mod;
- struct rxq_elt (*elts)[tmpl.rxq.elts_n];
int err;

DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq_ctrl);
@@ -840,7 +826,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RESET,
};
- err = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
+ err = ibv_exp_modify_wq(tmpl.wq, &mod);
if (err) {
ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
assert(err > 0);
@@ -854,60 +840,33 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
}
/* Snatch mbufs from original queue. */
k = 0;
- elts = rxq_ctrl->rxq.elts;
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct rte_mbuf *buf = elt->buf;
-
- pool[k++] = buf;
- }
+ for (i = 0; (i != desc_n); ++i)
+ pool[k++] = (*rxq_ctrl->rxq.elts)[i];
assert(k == mbuf_n);
- tmpl.rxq.elts_n = 0;
- tmpl.rxq.elts = NULL;
- assert((void *)&tmpl.rxq.elts == NULL);
- err = rxq_alloc_elts(&tmpl, desc_n, pool);
- if (err) {
- ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
- rte_free(pool);
- assert(err > 0);
- return err;
- }
- assert(tmpl.rxq.elts_n == desc_n);
rte_free(pool);
- /* Clean up original data. */
- rxq_ctrl->rxq.elts_n = 0;
- rte_free(rxq_ctrl->rxq.elts);
- rxq_ctrl->rxq.elts = NULL;
/* Change queue state to ready. */
mod = (struct ibv_exp_wq_attr){
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RDY,
};
- err = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
+ err = ibv_exp_modify_wq(tmpl.wq, &mod);
if (err) {
ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
(void *)dev, strerror(err));
goto error;
}
/* Post SGEs. */
- assert(tmpl.if_wq != NULL);
- elts = tmpl.rxq.elts;
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- err = tmpl.if_wq->recv_burst(
- tmpl.rxq.wq,
- &(*elts)[i].sge,
- 1);
- if (err)
- break;
- }
+ err = rxq_alloc_elts(&tmpl, desc_n, pool);
if (err) {
- ERROR("%p: failed to post SGEs with error %d",
- (void *)dev, err);
- /* Set err because it does not contain a valid errno value. */
- err = EIO;
- goto error;
+ ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
+ rte_free(pool);
+ assert(err > 0);
+ return err;
}
- tmpl.rxq.recv = tmpl.if_wq->recv_burst;
+ /* Update doorbell counter. */
+ rxq_ctrl->rxq.rq_ci = desc_n;
+ rte_wmb();
+ *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
error:
*rxq_ctrl = tmpl;
assert(err >= 0);
@@ -915,6 +874,45 @@ error:
}

/**
+ * Initialize RX queue.
+ *
+ * @param tmpl
+ * Pointer to RX queue control template.
+ * @param rxq_ctrl
+ * Pointer to RX queue control.
+ *
+ * @return
+ * 0 on success, errno value on failure.
+ */
+static inline int
+rxq_setup(struct rxq_ctrl *tmpl, struct rxq_ctrl *rxq_ctrl)
+{
+ struct ibv_cq *ibcq = tmpl->cq;
+ struct mlx5_cq *cq = to_mxxx(cq, cq);
+ struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
+
+ if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) {
+ ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
+ "it should be set to %u", RTE_CACHE_LINE_SIZE);
+ return EINVAL;
+ }
+ tmpl->rxq.rq_db = rwq->rq.db;
+ tmpl->rxq.cq_ci = 0;
+ tmpl->rxq.rq_ci = 0;
+ tmpl->rxq.cq_db = cq->dbrec;
+ tmpl->rxq.wqes =
+ (volatile struct mlx5_wqe_data_seg (*)[])
+ (uintptr_t)rwq->rq.buff;
+ tmpl->rxq.cqes =
+ (volatile struct mlx5_cqe (*)[])
+ (uintptr_t)cq->active_buf->buf;
+ tmpl->rxq.elts =
+ (struct rte_mbuf *(*)[tmpl->rxq.elts_n])
+ ((uintptr_t)rxq_ctrl + sizeof(*rxq_ctrl));
+ return 0;
+}
+
+/**
* Configure a RX queue.
*
* @param dev
@@ -934,15 +932,16 @@ error:
* 0 on success, errno value on failure.
*/
int
-rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
- unsigned int socket, const struct rte_eth_rxconf *conf,
- struct rte_mempool *mp)
+rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
+ uint16_t desc, unsigned int socket,
+ const struct rte_eth_rxconf *conf, struct rte_mempool *mp)
{
struct priv *priv = dev->data->dev_private;
struct rxq_ctrl tmpl = {
+ .priv = priv,
.socket = socket,
.rxq = {
- .priv = priv,
+ .elts_n = desc,
.mp = mp,
},
};
@@ -952,17 +951,16 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
struct ibv_exp_cq_init_attr cq;
struct ibv_exp_res_domain_init_attr rd;
struct ibv_exp_wq_init_attr wq;
+ struct ibv_exp_cq_attr cq_attr;
} attr;
enum ibv_exp_query_intf_status status;
unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
- struct rxq_elt (*elts)[desc];
int ret = 0;
- unsigned int i;
- unsigned int cq_size = desc;

(void)conf; /* Thresholds configuration (ignored). */
if (desc == 0) {
- ERROR("%p: invalid number of RX descriptors", (void *)dev);
+ ERROR("%p: invalid number of RX descriptors (must be a"
+ " multiple of 2)", (void *)dev);
return EINVAL;
}
/* Toggle RX checksum offload if hardware supports it. */
@@ -996,9 +994,9 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
.res_domain = tmpl.rd,
};
- tmpl.rxq.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0,
- &attr.cq);
- if (tmpl.rxq.cq == NULL) {
+ tmpl.cq = ibv_exp_create_cq(priv->ctx, desc - 1, NULL, NULL, 0,
+ &attr.cq);
+ if (tmpl.cq == NULL) {
ret = ENOMEM;
ERROR("%p: CQ creation failure: %s",
(void *)dev, strerror(ret));
@@ -1015,13 +1013,13 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
.wq_context = NULL, /* Could be useful in the future. */
.wq_type = IBV_EXP_WQT_RQ,
/* Max number of outstanding WRs. */
- .max_recv_wr = ((priv->device_attr.max_qp_wr < (int)cq_size) ?
+ .max_recv_wr = ((priv->device_attr.max_qp_wr < (int)desc) ?
priv->device_attr.max_qp_wr :
- (int)cq_size),
+ (int)desc),
/* Max number of scatter/gather elements in a WR. */
.max_recv_sge = 1,
.pd = priv->pd,
- .cq = tmpl.rxq.cq,
+ .cq = tmpl.cq,
.comp_mask =
IBV_EXP_CREATE_WQ_RES_DOMAIN |
IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
@@ -1064,19 +1062,13 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
" up to date",
(void *)dev);

- tmpl.rxq.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
- if (tmpl.rxq.wq == NULL) {
+ tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
+ if (tmpl.wq == NULL) {
ret = (errno ? errno : EINVAL);
ERROR("%p: WQ creation failure: %s",
(void *)dev, strerror(ret));
goto error;
}
- ret = rxq_alloc_elts(&tmpl, desc, NULL);
- if (ret) {
- ERROR("%p: RXQ allocation failed: %s",
- (void *)dev, strerror(ret));
- goto error;
- }
/* Save port ID. */
tmpl.rxq.port_id = dev->data->port_id;
DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
@@ -1084,7 +1076,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
.intf_scope = IBV_EXP_INTF_GLOBAL,
.intf_version = 1,
.intf = IBV_EXP_INTF_CQ,
- .obj = tmpl.rxq.cq,
+ .obj = tmpl.cq,
};
tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
if (tmpl.if_cq == NULL) {
@@ -1095,7 +1087,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
.intf = IBV_EXP_INTF_WQ,
- .obj = tmpl.rxq.wq,
+ .obj = tmpl.wq,
};
tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
if (tmpl.if_wq == NULL) {
@@ -1108,38 +1100,34 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RDY,
};
- ret = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
+ ret = ibv_exp_modify_wq(tmpl.wq, &mod);
if (ret) {
ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
(void *)dev, strerror(ret));
goto error;
}
- /* Post SGEs. */
- elts = tmpl.rxq.elts;
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- ret = tmpl.if_wq->recv_burst(
- tmpl.rxq.wq,
- &(*elts)[i].sge,
- 1);
- if (ret)
- break;
+ ret = rxq_setup(&tmpl, rxq_ctrl);
+ if (ret) {
+ ERROR("%p: cannot initialize RX queue structure: %s",
+ (void *)dev, strerror(ret));
+ goto error;
}
+ ret = rxq_alloc_elts(&tmpl, desc, NULL);
if (ret) {
- ERROR("%p: failed to post SGEs with error %d",
- (void *)dev, ret);
- /* Set ret because it does not contain a valid errno value. */
- ret = EIO;
+ ERROR("%p: RXQ allocation failed: %s",
+ (void *)dev, strerror(ret));
goto error;
}
/* Clean up rxq in case we're reinitializing it. */
DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
rxq_cleanup(rxq_ctrl);
*rxq_ctrl = tmpl;
+ /* Update doorbell counter. */
+ rxq_ctrl->rxq.rq_ci = desc;
+ rte_wmb();
+ *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
assert(ret == 0);
- /* Assign function in queue. */
- rxq_ctrl->rxq.poll = rxq_ctrl->if_cq->poll_length_flags_cvlan;
- rxq_ctrl->rxq.recv = rxq_ctrl->if_wq->recv_burst;
return 0;
error:
rxq_cleanup(&tmpl);
@@ -1173,14 +1161,19 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
{
struct priv *priv = dev->data->dev_private;
struct rxq *rxq = (*priv->rxqs)[idx];
- struct rxq_ctrl *rxq_ctrl;
+ struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
int ret;

if (mlx5_is_secondary())
return -E_RTE_SECONDARY;

priv_lock(priv);
- rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
+ if (!rte_is_power_of_2(desc)) {
+ desc = 1 << log2above(desc);
+ WARN("%p: increased number of descriptors in RX queue %u"
+ " to the next power of two (%d)",
+ (void *)dev, idx, desc);
+ }
DEBUG("%p: configuring queue %u for %u descriptors",
(void *)dev, idx, desc);
if (idx >= priv->rxqs_n) {
@@ -1199,8 +1192,9 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
(*priv->rxqs)[idx] = NULL;
rxq_cleanup(rxq_ctrl);
} else {
- rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl), 0,
- socket);
+ rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl) +
+ desc * sizeof(struct rte_mbuf *),
+ 0, socket);
if (rxq_ctrl == NULL) {
ERROR("%p: unable to allocate queue index %u",
(void *)dev, idx);
@@ -1208,7 +1202,7 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
return -ENOMEM;
}
}
- ret = rxq_setup(dev, rxq_ctrl, desc, socket, conf, mp);
+ ret = rxq_ctrl_setup(dev, rxq_ctrl, desc, socket, conf, mp);
if (ret)
rte_free(rxq_ctrl);
else {
@@ -1243,12 +1237,12 @@ mlx5_rx_queue_release(void *dpdk_rxq)
if (rxq == NULL)
return;
rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
- priv = rxq->priv;
+ priv = rxq_ctrl->priv;
priv_lock(priv);
for (i = 0; (i != priv->rxqs_n); ++i)
if ((*priv->rxqs)[i] == rxq) {
DEBUG("%p: removing RX queue %p from list",
- (void *)priv->dev, (void *)rxq);
+ (void *)priv->dev, (void *)rxq_ctrl);
(*priv->rxqs)[i] = NULL;
break;
}
@@ -1278,7 +1272,8 @@ mlx5_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts,
uint16_t pkts_n)
{
struct rxq *rxq = dpdk_rxq;
- struct priv *priv = mlx5_secondary_data_setup(rxq->priv);
+ struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
+ struct priv *priv = mlx5_secondary_data_setup(rxq_ctrl->priv);
struct priv *primary_priv;
unsigned int index;

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index a6b0cf5..27d8852 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -42,6 +42,8 @@
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <infiniband/verbs.h>
+#include <infiniband/mlx5_hw.h>
+#include <infiniband/arch.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif
@@ -55,7 +57,7 @@
#include <rte_prefetch.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>
-#include <rte_memory.h>
+#include <rte_ether.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif
@@ -65,6 +67,47 @@
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
+#include "mlx5_prm.h"
+
+static inline volatile struct mlx5_cqe64 *
+get_cqe64(volatile struct mlx5_cqe cqes[],
+ unsigned int cqes_n, uint16_t *ci)
+ __attribute__((always_inline));
+
+static inline int
+rx_poll_len(struct rxq *rxq) __attribute__((always_inline));
+
+static volatile struct mlx5_cqe64 *
+get_cqe64(volatile struct mlx5_cqe cqes[],
+ unsigned int cqes_n, uint16_t *ci)
+{
+ volatile struct mlx5_cqe64 *cqe;
+ uint16_t idx = *ci;
+ uint8_t op_own;
+
+ cqe = &cqes[idx & (cqes_n - 1)].cqe64;
+ op_own = cqe->op_own;
+ if (unlikely((op_own & MLX5_CQE_OWNER_MASK) == !(idx & cqes_n))) {
+ return NULL;
+ } else if (unlikely(op_own & 0x80)) {
+ switch (op_own >> 4) {
+ case MLX5_CQE_INVALID:
+ return NULL; /* No CQE */
+ case MLX5_CQE_REQ_ERR:
+ return cqe;
+ case MLX5_CQE_RESP_ERR:
+ ++(*ci);
+ return NULL;
+ default:
+ return NULL;
+ }
+ }
+ if (cqe) {
+ *ci = idx + 1;
+ return cqe;
+ }
+ return NULL;
+}

/**
* Manage TX completions.
@@ -390,8 +433,8 @@ stop:
/**
* Translate RX completion flags to packet type.
*
- * @param flags
- * RX completion flags returned by poll_length_flags().
+ * @param[in] cqe
+ * Pointer to CQE.
*
* @note: fix mlx5_dev_supported_ptypes_get() if any change here.
*
@@ -399,11 +442,13 @@ stop:
* Packet type for struct rte_mbuf.
*/
static inline uint32_t
-rxq_cq_to_pkt_type(uint32_t flags)
+rxq_cq_to_pkt_type(volatile struct mlx5_cqe64 *cqe)
{
uint32_t pkt_type;
+ uint8_t flags = cqe->l4_hdr_type_etc;
+ uint8_t info = cqe->rsvd0[0];

- if (flags & IBV_EXP_CQ_RX_TUNNEL_PACKET)
+ if (info & IBV_EXP_CQ_RX_TUNNEL_PACKET)
pkt_type =
TRANSPOSE(flags,
IBV_EXP_CQ_RX_OUTER_IPV4_PACKET,
@@ -420,11 +465,11 @@ rxq_cq_to_pkt_type(uint32_t flags)
else
pkt_type =
TRANSPOSE(flags,
- IBV_EXP_CQ_RX_IPV4_PACKET,
- RTE_PTYPE_L3_IPV4) |
+ MLX5_CQE_L3_HDR_TYPE_IPV6,
+ RTE_PTYPE_L3_IPV6) |
TRANSPOSE(flags,
- IBV_EXP_CQ_RX_IPV6_PACKET,
- RTE_PTYPE_L3_IPV6);
+ MLX5_CQE_L3_HDR_TYPE_IPV4,
+ RTE_PTYPE_L3_IPV4);
return pkt_type;
}

@@ -433,50 +478,69 @@ rxq_cq_to_pkt_type(uint32_t flags)
*
* @param[in] rxq
* Pointer to RX queue structure.
- * @param flags
- * RX completion flags returned by poll_length_flags().
+ * @param[in] cqe
+ * Pointer to CQE.
*
* @return
* Offload flags (ol_flags) for struct rte_mbuf.
*/
static inline uint32_t
-rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
+rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe)
{
uint32_t ol_flags = 0;
+ uint8_t l3_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L3_HDR_TYPE_MASK;
+ uint8_t l4_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L4_HDR_TYPE_MASK;
+ uint8_t info = cqe->rsvd0[0];

- if (rxq->csum) {
- /* Set IP checksum flag only for IPv4/IPv6 packets. */
- if (flags &
- (IBV_EXP_CQ_RX_IPV4_PACKET | IBV_EXP_CQ_RX_IPV6_PACKET))
- ol_flags |=
- TRANSPOSE(~flags,
- IBV_EXP_CQ_RX_IP_CSUM_OK,
- PKT_RX_IP_CKSUM_BAD);
- /* Set L4 checksum flag only for TCP/UDP packets. */
- if (flags &
- (IBV_EXP_CQ_RX_TCP_PACKET | IBV_EXP_CQ_RX_UDP_PACKET))
- ol_flags |=
- TRANSPOSE(~flags,
- IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK,
- PKT_RX_L4_CKSUM_BAD);
- }
+ if ((l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV4) ||
+ (l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV6))
+ ol_flags |=
+ (!(cqe->hds_ip_ext & MLX5_CQE_L3_OK) *
+ PKT_RX_IP_CKSUM_BAD);
+ if ((l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP) ||
+ (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_EMP_ACK) ||
+ (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_ACK) ||
+ (l4_hdr == MLX5_CQE_L4_HDR_TYPE_UDP))
+ ol_flags |=
+ (!(cqe->hds_ip_ext & MLX5_CQE_L4_OK) *
+ PKT_RX_L4_CKSUM_BAD);
/*
* PKT_RX_IP_CKSUM_BAD and PKT_RX_L4_CKSUM_BAD are used in place
* of PKT_RX_EIP_CKSUM_BAD because the latter is not functional
* (its value is 0).
*/
- if ((flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
+ if ((info & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
ol_flags |=
- TRANSPOSE(~flags,
+ TRANSPOSE(~cqe->l4_hdr_type_etc,
IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK,
PKT_RX_IP_CKSUM_BAD) |
- TRANSPOSE(~flags,
+ TRANSPOSE(~cqe->l4_hdr_type_etc,
IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK,
PKT_RX_L4_CKSUM_BAD);
return ol_flags;
}

/**
+ * Get size of the next packet.
+ *
+ * @param rxq
+ * RX queue to fetch packet from.
+ *
+ * @return
+ * Packet size in bytes.
+ */
+static inline int __attribute__((always_inline))
+rx_poll_len(struct rxq *rxq)
+{
+ volatile struct mlx5_cqe64 *cqe;
+
+ cqe = get_cqe64(*rxq->cqes, rxq->elts_n, &rxq->cq_ci);
+ if (cqe)
+ return ntohl(cqe->byte_cnt);
+ return 0;
+}
+
+/**
* DPDK callback for RX.
*
* @param dpdk_rxq
@@ -492,134 +556,82 @@ rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
- struct rxq *rxq = (struct rxq *)dpdk_rxq;
- struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
- const unsigned int elts_n = rxq->elts_n;
- unsigned int elts_head = rxq->elts_head;
- struct ibv_sge sges[pkts_n];
- unsigned int i;
+ struct rxq *rxq = dpdk_rxq;
unsigned int pkts_ret = 0;
- int ret;
+ unsigned int i;
+ unsigned int rq_ci = rxq->rq_ci;
+ const unsigned int elts_n = rxq->elts_n;
+ const unsigned int wqe_cnt = elts_n - 1;

for (i = 0; (i != pkts_n); ++i) {
- struct rxq_elt *elt = &(*elts)[elts_head];
- unsigned int len;
- struct rte_mbuf *seg = elt->buf;
+ unsigned int idx = rq_ci & wqe_cnt;
struct rte_mbuf *rep;
- uint32_t flags;
- uint16_t vlan_tci;
-
- /* Sanity checks. */
- assert(seg != NULL);
- assert(elts_head < rxq->elts_n);
- assert(rxq->elts_head < rxq->elts_n);
- /*
- * Fetch initial bytes of packet descriptor into a
- * cacheline while allocating rep.
- */
- rte_mbuf_prefetch_part1(seg);
- rte_mbuf_prefetch_part2(seg);
- ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
- if (unlikely(ret < 0)) {
- struct ibv_wc wc;
- int wcs_n;
-
- DEBUG("rxq=%p, poll_length() failed (ret=%d)",
- (void *)rxq, ret);
- /* ibv_poll_cq() must be used in case of failure. */
- wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
- if (unlikely(wcs_n == 0))
- break;
- if (unlikely(wcs_n < 0)) {
- DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
- (void *)rxq, wcs_n);
- break;
- }
- assert(wcs_n == 1);
- if (unlikely(wc.status != IBV_WC_SUCCESS)) {
- /* Whatever, just repost the offending WR. */
- DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
- " completion status (%d): %s",
- (void *)rxq, wc.wr_id, wc.status,
- ibv_wc_status_str(wc.status));
-#ifdef MLX5_PMD_SOFT_COUNTERS
- /* Increment dropped packets counter. */
- ++rxq->stats.idropped;
-#endif
- /* Add SGE to array for repost. */
- sges[i] = elt->sge;
- goto repost;
- }
- ret = wc.byte_len;
- }
- if (ret == 0)
- break;
- assert(ret >= (rxq->crc_present << 2));
- len = ret - (rxq->crc_present << 2);
+ struct rte_mbuf *pkt;
+ unsigned int len;
+ volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
+ volatile struct mlx5_cqe64 *cqe =
+ &(*rxq->cqes)[rxq->cq_ci & wqe_cnt].cqe64;
+
+ pkt = (*rxq->elts)[idx];
+ rte_prefetch0(cqe);
rep = rte_mbuf_raw_alloc(rxq->mp);
if (unlikely(rep == NULL)) {
- /*
- * Unable to allocate a replacement mbuf,
- * repost WR.
- */
- DEBUG("rxq=%p: can't allocate a new mbuf",
- (void *)rxq);
- /* Increment out of memory counters. */
++rxq->stats.rx_nombuf;
- ++rxq->priv->dev->data->rx_mbuf_alloc_failed;
- goto repost;
+ break;
}
-
- /* Reconfigure sge to use rep instead of seg. */
- elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
- elt->buf = rep;
-
- /* Add SGE to array for repost. */
- sges[i] = elt->sge;
-
- /* Update seg information. */
- SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM);
- NB_SEGS(seg) = 1;
- PORT(seg) = rxq->port_id;
- NEXT(seg) = NULL;
- PKT_LEN(seg) = len;
- DATA_LEN(seg) = len;
- if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
- seg->packet_type = rxq_cq_to_pkt_type(flags);
- seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
- if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
- seg->ol_flags |= PKT_RX_VLAN_PKT |
- PKT_RX_VLAN_STRIPPED;
- seg->vlan_tci = vlan_tci;
+ SET_DATA_OFF(rep, RTE_PKTMBUF_HEADROOM);
+ NB_SEGS(rep) = 1;
+ PORT(rep) = rxq->port_id;
+ NEXT(rep) = NULL;
+ len = rx_poll_len(rxq);
+ if (unlikely(len == 0)) {
+ rte_mbuf_refcnt_set(rep, 0);
+ __rte_mbuf_raw_free(rep);
+ break;
+ }
+ /* Fill NIC descriptor with the new buffer. The lkey and size
+ * of the buffers are already known, only the buffer address
+ * changes. */
+ wqe->addr = htonll((uintptr_t)rep->buf_addr +
+ RTE_PKTMBUF_HEADROOM);
+ (*rxq->elts)[idx] = rep;
+ /* Update pkt information. */
+ if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip |
+ rxq->crc_present) {
+ if (rxq->csum) {
+ pkt->packet_type = rxq_cq_to_pkt_type(cqe);
+ pkt->ol_flags = rxq_cq_to_ol_flags(rxq, cqe);
+ }
+ if (cqe->l4_hdr_type_etc & MLX5_CQE_VLAN_STRIPPED) {
+ pkt->ol_flags |= PKT_RX_VLAN_PKT;
+ pkt->vlan_tci = ntohs(cqe->vlan_info);
}
+ if (rxq->crc_present)
+ len -= ETHER_CRC_LEN;
}
- /* Return packet. */
- *(pkts++) = seg;
- ++pkts_ret;
+ PKT_LEN(pkt) = len;
+ DATA_LEN(pkt) = len;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment bytes counter. */
rxq->stats.ibytes += len;
#endif
-repost:
- if (++elts_head >= elts_n)
- elts_head = 0;
- continue;
+ /* Return packet. */
+ *(pkts++) = pkt;
+ ++pkts_ret;
+ ++rq_ci;
}
- if (unlikely(i == 0))
+ if (unlikely((i == 0) && (rq_ci == rxq->rq_ci)))
return 0;
/* Repost WRs. */
#ifdef DEBUG_RECV
DEBUG("%p: reposting %u WRs", (void *)rxq, i);
#endif
- ret = rxq->recv(rxq->wq, sges, i);
- if (unlikely(ret)) {
- /* Inability to repost WRs is fatal. */
- DEBUG("%p: recv_burst(): failed (ret=%d)",
- (void *)rxq->priv,
- ret);
- abort();
- }
- rxq->elts_head = elts_head;
+ /* Update the consumer index. */
+ rxq->rq_ci = rq_ci;
+ rte_wmb();
+ *rxq->cq_db = htonl(rxq->cq_ci);
+ rte_wmb();
+ *rxq->rq_db = htonl(rxq->rq_ci);
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment packets counter. */
rxq->stats.ipackets += pkts_ret;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 570345b..1827123 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -43,6 +43,7 @@
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <infiniband/verbs.h>
+#include <infiniband/mlx5_hw.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif
@@ -61,6 +62,7 @@
#include "mlx5.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
+#include "mlx5_prm.h"

struct mlx5_rxq_stats {
unsigned int idx; /**< Mapping index. */
@@ -81,12 +83,6 @@ struct mlx5_txq_stats {
uint64_t odropped; /**< Total of packets not sent when TX ring full. */
};

-/* RX element. */
-struct rxq_elt {
- struct ibv_sge sge; /* Scatter/Gather Element. */
- struct rte_mbuf *buf; /* SGE buffer. */
-};
-
/* Flow director queue structure. */
struct fdir_queue {
struct ibv_qp *qp; /* Associated RX QP. */
@@ -97,25 +93,28 @@ struct priv;

/* RX queue descriptor. */
struct rxq {
- struct priv *priv; /* Back pointer to private data. */
- struct rte_mempool *mp; /* Memory Pool for allocations. */
- struct ibv_cq *cq; /* Completion Queue. */
- struct ibv_exp_wq *wq; /* Work Queue. */
- int32_t (*poll)(); /* Verbs poll function. */
- int32_t (*recv)(); /* Verbs receive function. */
- unsigned int port_id; /* Port ID for incoming packets. */
- unsigned int elts_n; /* (*elts)[] length. */
- unsigned int elts_head; /* Current index in (*elts)[]. */
unsigned int csum:1; /* Enable checksum offloading. */
unsigned int csum_l2tun:1; /* Same for L2 tunnels. */
unsigned int vlan_strip:1; /* Enable VLAN stripping. */
unsigned int crc_present:1; /* CRC must be subtracted. */
- struct rxq_elt (*elts)[]; /* RX elements. */
- struct mlx5_rxq_stats stats; /* RX queue counters. */
+ uint16_t rq_ci;
+ uint16_t cq_ci;
+ uint16_t elts_n;
+ uint16_t port_id;
+ volatile struct mlx5_wqe_data_seg(*wqes)[];
+ volatile struct mlx5_cqe(*cqes)[];
+ volatile uint32_t *rq_db;
+ volatile uint32_t *cq_db;
+ struct rte_mbuf *(*elts)[];
+ struct rte_mempool *mp;
+ struct mlx5_rxq_stats stats;
} __rte_cache_aligned;

/* RX queue control descriptor. */
struct rxq_ctrl {
+ struct priv *priv; /* Back pointer to private data. */
+ struct ibv_cq *cq; /* Completion Queue. */
+ struct ibv_exp_wq *wq; /* Work Queue. */
struct ibv_exp_res_domain *rd; /* Resource Domain. */
struct fdir_queue fdir_queue; /* Flow director queue. */
struct ibv_mr *mr; /* Memory Region (for mp). */
@@ -284,8 +283,9 @@ int priv_allow_flow_type(struct priv *, enum hash_rxq_flow_type);
int priv_rehash_flows(struct priv *);
void rxq_cleanup(struct rxq_ctrl *);
int rxq_rehash(struct rte_eth_dev *, struct rxq_ctrl *);
-int rxq_setup(struct rte_eth_dev *, struct rxq_ctrl *, uint16_t, unsigned int,
- const struct rte_eth_rxconf *, struct rte_mempool *);
+int rxq_ctrl_setup(struct rte_eth_dev *, struct rxq_ctrl *, uint16_t,
+ unsigned int, const struct rte_eth_rxconf *,
+ struct rte_mempool *);
int mlx5_rx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
const struct rte_eth_rxconf *, struct rte_mempool *);
void mlx5_rx_queue_release(void *);
diff --git a/drivers/net/mlx5/mlx5_vlan.c b/drivers/net/mlx5/mlx5_vlan.c
index 3b9b771..4719e69 100644
--- a/drivers/net/mlx5/mlx5_vlan.c
+++ b/drivers/net/mlx5/mlx5_vlan.c
@@ -144,6 +144,7 @@ static void
priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on)
{
struct rxq *rxq = (*priv->rxqs)[idx];
+ struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
struct ibv_exp_wq_attr mod;
uint16_t vlan_offloads =
(on ? IBV_EXP_RECEIVE_WQ_CVLAN_STRIP : 0) |
@@ -157,7 +158,7 @@ priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on)
.vlan_offloads = vlan_offloads,
};

- err = ibv_exp_modify_wq(rxq->wq, &mod);
+ err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
if (err) {
ERROR("%p: failed to modified stripping mode: %s",
(void *)priv, strerror(err));
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:31 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_txq.c | 5 +++++
1 file changed, 5 insertions(+)

diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 4f17fb0..bae9f3d 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -343,6 +343,11 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
(void *)dev, strerror(ret));
goto error;
}
+ DEBUG("TX queue capabilities: max_send_wr=%u, max_send_sge=%u,"
+ " max_inline_data=%u",
+ attr.init.cap.max_send_wr,
+ attr.init.cap.max_send_sge,
+ attr.init.cap.max_inline_data);
attr.mod = (struct ibv_exp_qp_attr){
/* Move the QP to this state. */
.qp_state = IBV_QPS_INIT,
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:30 UTC
Permalink
This feature enables the TX burst function to emit up to 5 packets using
only two WQEs on devices that support it. Saves PCI bandwidth and improves
performance.

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Olga Shern <***@mellanox.com>
---
doc/guides/nics/mlx5.rst | 10 ++
drivers/net/mlx5/mlx5.c | 14 +-
drivers/net/mlx5/mlx5_ethdev.c | 15 +-
drivers/net/mlx5/mlx5_rxtx.c | 400 +++++++++++++++++++++++++++++++++++++++++
drivers/net/mlx5/mlx5_rxtx.h | 2 +
drivers/net/mlx5/mlx5_txq.c | 2 +-
6 files changed, 439 insertions(+), 4 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 9ada221..063c4a5 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -171,6 +171,16 @@ Run-time configuration

This option should be used in combination with ``txq_inline`` above.

+- ``txq_mpw_en`` parameter [int]
+
+ A nonzero value enables multi-packet send. This feature allows the TX
+ burst function to pack up to five packets in two descriptors in order to
+ save PCI bandwidth and improve performance at the cost of slightly
+ higher CPU usage.
+
+ It is currently only supported on the ConnectX-4 Lx family of adapters.
+ Enabled by default.
+
Prerequisites
-------------

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 8c8c5e4..b85030a 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -79,6 +79,9 @@
* enabling inline send. */
#define MLX5_TXQS_MIN_INLINE "txqs_min_inline"

+/* Device parameter to enable multi-packet send WQEs. */
+#define MLX5_TXQ_MPW_EN "txq_mpw_en"
+
/**
* Retrieve integer value from environment variable.
*
@@ -280,6 +283,8 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
priv->txq_inline = tmp;
else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0)
priv->txqs_inline = tmp;
+ else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0)
+ priv->mps = !!tmp;
else {
WARN("%s: unknown parameter", key);
return EINVAL;
@@ -305,6 +310,7 @@ mlx5_args(struct priv *priv, struct rte_devargs *devargs)
MLX5_RXQ_CQE_COMP_EN,
MLX5_TXQ_INLINE,
MLX5_TXQS_MIN_INLINE,
+ MLX5_TXQ_MPW_EN,
};
struct rte_kvargs *kvlist;
int ret = 0;
@@ -499,6 +505,7 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
priv->port = port;
priv->pd = pd;
priv->mtu = ETHER_MTU;
+ priv->mps = mps; /* Enable MPW by default if supported. */
priv->cqe_comp = 1; /* Enable compression by default. */
err = mlx5_args(priv, pci_dev->devargs);
if (err) {
@@ -547,7 +554,12 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)

priv_get_num_vfs(priv, &num_vfs);
priv->sriov = (num_vfs || sriov);
- priv->mps = mps;
+ if (priv->mps && !mps) {
+ ERROR("multi-packet send not supported on this device"
+ " (" MLX5_TXQ_MPW_EN ")");
+ err = ENOTSUP;
+ goto port_error;
+ }
/* Allocate and register default RSS hash keys. */
priv->rss_conf = rte_calloc(__func__, hash_rxq_init_n,
sizeof((*priv->rss_conf)[0]), 0);
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index a2bdc56..69bfe03 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -584,7 +584,8 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
DEV_RX_OFFLOAD_UDP_CKSUM |
DEV_RX_OFFLOAD_TCP_CKSUM) :
0);
- info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT;
+ if (!priv->mps)
+ info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT;
if (priv->hw_csum)
info->tx_offload_capa |=
(DEV_TX_OFFLOAD_IPV4_CKSUM |
@@ -1317,7 +1318,17 @@ void
priv_select_tx_function(struct priv *priv)
{
priv->dev->tx_pkt_burst = mlx5_tx_burst;
- if (priv->txq_inline && (priv->txqs_n >= priv->txqs_inline)) {
+ /* Display warning for unsupported configurations. */
+ if (priv->sriov && priv->mps)
+ WARN("multi-packet send WQE cannot be used on a SR-IOV setup");
+ /* Select appropriate TX function. */
+ if ((priv->sriov == 0) && priv->mps && priv->txq_inline) {
+ priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw_inline;
+ DEBUG("selected MPW inline TX function");
+ } else if ((priv->sriov == 0) && priv->mps) {
+ priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw;
+ DEBUG("selected MPW TX function");
+ } else if (priv->txq_inline && (priv->txqs_n >= priv->txqs_inline)) {
priv->dev->tx_pkt_burst = mlx5_tx_burst_inline;
DEBUG("selected inline TX function (%u >= %u queues)",
priv->txqs_n, priv->txqs_inline);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 43fe532..2ee504d 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -796,6 +796,406 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
}

/**
+ * Open a MPW session.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param mpw
+ * Pointer to MPW session structure.
+ * @param length
+ * Packet length.
+ */
+static inline void
+mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
+{
+ uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
+ volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
+ (volatile struct mlx5_wqe_data_seg (*)[])
+ (uintptr_t)&(*txq->wqes)[(idx + 1) & (txq->wqe_n - 1)];
+
+ mpw->state = MLX5_MPW_STATE_OPENED;
+ mpw->pkts_n = 0;
+ mpw->len = length;
+ mpw->total_len = 0;
+ mpw->wqe = &(*txq->wqes)[idx];
+ mpw->wqe->mpw.eseg.mss = htons(length);
+ mpw->wqe->mpw.eseg.inline_hdr_sz = 0;
+ mpw->wqe->mpw.eseg.rsvd0 = 0;
+ mpw->wqe->mpw.eseg.rsvd1 = 0;
+ mpw->wqe->mpw.eseg.rsvd2 = 0;
+ mpw->wqe->mpw.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
+ (txq->wqe_ci << 8) |
+ MLX5_OPCODE_LSO_MPW);
+ mpw->wqe->mpw.ctrl.data[2] = 0;
+ mpw->wqe->mpw.ctrl.data[3] = 0;
+ mpw->data.dseg[0] = &mpw->wqe->mpw.dseg[0];
+ mpw->data.dseg[1] = &mpw->wqe->mpw.dseg[1];
+ mpw->data.dseg[2] = &(*dseg)[0];
+ mpw->data.dseg[3] = &(*dseg)[1];
+ mpw->data.dseg[4] = &(*dseg)[2];
+}
+
+/**
+ * Close a MPW session.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param mpw
+ * Pointer to MPW session structure.
+ */
+static inline void
+mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
+{
+ unsigned int num = mpw->pkts_n;
+
+ /* Store size in multiple of 16 bytes. Control and Ethernet segments
+ * count as 2. */
+ mpw->wqe->mpw.ctrl.data[1] = htonl(txq->qp_num_8s | (2 + num));
+ mpw->state = MLX5_MPW_STATE_CLOSED;
+ if (num < 3)
+ ++txq->wqe_ci;
+ else
+ txq->wqe_ci += 2;
+ tx_prefetch_wqe(txq, txq->wqe_ci);
+ tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+}
+
+/**
+ * DPDK callback for TX with MPW support.
+ *
+ * @param dpdk_txq
+ * Generic pointer to TX queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ *
+ * @return
+ * Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ struct txq *txq = (struct txq *)dpdk_txq;
+ uint16_t elts_head = txq->elts_head;
+ const unsigned int elts_n = txq->elts_n;
+ unsigned int i;
+ unsigned int max;
+ unsigned int comp;
+ struct mlx5_mpw mpw = {
+ .state = MLX5_MPW_STATE_CLOSED,
+ };
+
+ /* Prefetch first packet cacheline. */
+ tx_prefetch_cqe(txq, txq->cq_ci);
+ tx_prefetch_wqe(txq, txq->wqe_ci);
+ tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+ /* Start processing. */
+ txq_complete(txq);
+ max = (elts_n - (elts_head - txq->elts_tail));
+ if (max > elts_n)
+ max -= elts_n;
+ assert(max >= 1);
+ assert(max <= elts_n);
+ /* Always leave one free entry in the ring. */
+ --max;
+ if (max == 0)
+ return 0;
+ if (max > pkts_n)
+ max = pkts_n;
+ for (i = 0; (i != max); ++i) {
+ struct rte_mbuf *buf = pkts[i];
+ volatile struct mlx5_wqe_data_seg *dseg;
+ unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+ uintptr_t addr;
+ uint32_t length;
+ uint32_t cs_flags = 0;
+
+ /* Should we enable HW CKSUM offload */
+ if (buf->ol_flags &
+ (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
+ cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
+ /* Retrieve buffer information. */
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ length = DATA_LEN(buf);
+ /* Update element. */
+ (*txq->elts)[elts_head] = buf;
+ /* Start new session if packet differs. */
+ if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
+ ((mpw.len != length) ||
+ (mpw.wqe->mpw.eseg.cs_flags != cs_flags)))
+ mlx5_mpw_close(txq, &mpw);
+ if (mpw.state == MLX5_MPW_STATE_CLOSED) {
+ mlx5_mpw_new(txq, &mpw, length);
+ mpw.wqe->mpw.eseg.cs_flags = cs_flags;
+ }
+ dseg = mpw.data.dseg[mpw.pkts_n];
+ *dseg = (struct mlx5_wqe_data_seg){
+ .byte_count = htonl(length),
+ .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+ .addr = htonll(addr),
+ };
+ ++mpw.pkts_n;
+ if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
+ mlx5_mpw_close(txq, &mpw);
+ elts_head = elts_head_next;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment sent bytes counter. */
+ txq->stats.obytes += length;
+#endif
+ }
+ /* Take a shortcut if nothing must be sent. */
+ if (unlikely(i == 0))
+ return 0;
+ /* Check whether completion threshold has been reached. */
+ comp = txq->elts_comp + i;
+ if (comp >= MLX5_TX_COMP_THRESH) {
+ volatile union mlx5_wqe *wqe = mpw.wqe;
+
+ /* Request completion on last WQE. */
+ wqe->mpw.ctrl.data[2] = htonl(8);
+ /* Save elts_head in unused "immediate" field of WQE. */
+ wqe->mpw.ctrl.data[3] = elts_head;
+ txq->elts_comp = 0;
+ } else
+ txq->elts_comp = comp;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment sent packets counter. */
+ txq->stats.opackets += i;
+#endif
+ /* Ring QP doorbell. */
+ if (mpw.state == MLX5_MPW_STATE_OPENED)
+ mlx5_mpw_close(txq, &mpw);
+ mlx5_tx_dbrec(txq);
+ txq->elts_head = elts_head;
+ return i;
+}
+
+/**
+ * Open a MPW inline session.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param mpw
+ * Pointer to MPW session structure.
+ * @param length
+ * Packet length.
+ */
+static inline void
+mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
+{
+ uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
+
+ mpw->state = MLX5_MPW_INL_STATE_OPENED;
+ mpw->pkts_n = 0;
+ mpw->len = length;
+ mpw->total_len = 0;
+ mpw->wqe = &(*txq->wqes)[idx];
+ mpw->wqe->mpw_inl.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
+ (txq->wqe_ci << 8) |
+ MLX5_OPCODE_LSO_MPW);
+ mpw->wqe->mpw_inl.ctrl.data[2] = 0;
+ mpw->wqe->mpw_inl.ctrl.data[3] = 0;
+ mpw->wqe->mpw_inl.eseg.mss = htons(length);
+ mpw->wqe->mpw_inl.eseg.inline_hdr_sz = 0;
+ mpw->wqe->mpw_inl.eseg.cs_flags = 0;
+ mpw->wqe->mpw_inl.eseg.rsvd0 = 0;
+ mpw->wqe->mpw_inl.eseg.rsvd1 = 0;
+ mpw->wqe->mpw_inl.eseg.rsvd2 = 0;
+ mpw->data.raw = &mpw->wqe->mpw_inl.data[0];
+}
+
+/**
+ * Close a MPW inline session.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param mpw
+ * Pointer to MPW session structure.
+ */
+static inline void
+mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw)
+{
+ unsigned int size;
+
+ size = sizeof(*mpw->wqe) - MLX5_MWQE64_INL_DATA + mpw->total_len;
+ /* Store size in multiple of 16 bytes. Control and Ethernet segments
+ * count as 2. */
+ mpw->wqe->mpw_inl.ctrl.data[1] =
+ htonl(txq->qp_num_8s | ((size + 15) / 16));
+ mpw->state = MLX5_MPW_STATE_CLOSED;
+ mpw->wqe->mpw_inl.byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
+ txq->wqe_ci += (size + (sizeof(*mpw->wqe) - 1)) / sizeof(*mpw->wqe);
+}
+
+/**
+ * DPDK callback for TX with MPW inline support.
+ *
+ * @param dpdk_txq
+ * Generic pointer to TX queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ *
+ * @return
+ * Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
+ uint16_t pkts_n)
+{
+ struct txq *txq = (struct txq *)dpdk_txq;
+ uint16_t elts_head = txq->elts_head;
+ const unsigned int elts_n = txq->elts_n;
+ unsigned int i;
+ unsigned int max;
+ unsigned int comp;
+ unsigned int inline_room = txq->max_inline;
+ struct mlx5_mpw mpw = {
+ .state = MLX5_MPW_STATE_CLOSED,
+ };
+
+ /* Prefetch first packet cacheline. */
+ tx_prefetch_cqe(txq, txq->cq_ci);
+ tx_prefetch_wqe(txq, txq->wqe_ci);
+ tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+ /* Start processing. */
+ txq_complete(txq);
+ max = (elts_n - (elts_head - txq->elts_tail));
+ if (max > elts_n)
+ max -= elts_n;
+ assert(max >= 1);
+ assert(max <= elts_n);
+ /* Always leave one free entry in the ring. */
+ --max;
+ if (max == 0)
+ return 0;
+ if (max > pkts_n)
+ max = pkts_n;
+ for (i = 0; (i != max); ++i) {
+ struct rte_mbuf *buf = pkts[i];
+ unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+ uintptr_t addr;
+ uint32_t length;
+ uint32_t cs_flags = 0;
+
+ /* Should we enable HW CKSUM offload */
+ if (buf->ol_flags &
+ (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
+ cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
+ /* Retrieve buffer information. */
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ length = DATA_LEN(buf);
+ /* Update element. */
+ (*txq->elts)[elts_head] = buf;
+ /* Start new session if packet differs. */
+ if (mpw.state == MLX5_MPW_STATE_OPENED) {
+ if ((mpw.len != length) ||
+ (mpw.wqe->mpw.eseg.cs_flags != cs_flags))
+ mlx5_mpw_close(txq, &mpw);
+ } else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
+ if ((mpw.len != length) ||
+ (length > inline_room) ||
+ (mpw.wqe->mpw_inl.eseg.cs_flags != cs_flags)) {
+ mlx5_mpw_inline_close(txq, &mpw);
+ inline_room = txq->max_inline;
+ }
+ }
+ if (mpw.state == MLX5_MPW_STATE_CLOSED) {
+ if (length > inline_room) {
+ mlx5_mpw_new(txq, &mpw, length);
+ mpw.wqe->mpw.eseg.cs_flags = cs_flags;
+ } else {
+ mlx5_mpw_inline_new(txq, &mpw, length);
+ mpw.wqe->mpw_inl.eseg.cs_flags = cs_flags;
+ }
+ }
+ if (mpw.state == MLX5_MPW_STATE_OPENED) {
+ volatile struct mlx5_wqe_data_seg *dseg;
+
+ assert(inline_room == txq->max_inline);
+ dseg = mpw.data.dseg[mpw.pkts_n];
+ *dseg = (struct mlx5_wqe_data_seg){
+ .byte_count = htonl(length),
+ .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+ .addr = htonll(addr),
+ };
+ ++mpw.pkts_n;
+ if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
+ mlx5_mpw_close(txq, &mpw);
+ } else {
+ unsigned int max;
+
+ assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
+ assert(length <= inline_room);
+ /* Maximum number of bytes before wrapping. */
+ max = ((uintptr_t)&(*txq->wqes)[txq->wqe_n] -
+ (uintptr_t)mpw.data.raw);
+ if (length > max) {
+ rte_memcpy((void *)(uintptr_t)mpw.data.raw,
+ (void *)addr,
+ max);
+ mpw.data.raw =
+ (volatile void *)&(*txq->wqes)[0];
+ rte_memcpy((void *)(uintptr_t)mpw.data.raw,
+ (void *)(addr + max),
+ length - max);
+ mpw.data.raw += length - max;
+ } else {
+ rte_memcpy((void *)(uintptr_t)mpw.data.raw,
+ (void *)addr,
+ length);
+ mpw.data.raw += length;
+ }
+ if ((uintptr_t)mpw.data.raw ==
+ (uintptr_t)&(*txq->wqes)[txq->wqe_n])
+ mpw.data.raw =
+ (volatile void *)&(*txq->wqes)[0];
+ ++mpw.pkts_n;
+ if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
+ mlx5_mpw_inline_close(txq, &mpw);
+ inline_room = txq->max_inline;
+ } else
+ inline_room -= length;
+ }
+ mpw.total_len += length;
+ elts_head = elts_head_next;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment sent bytes counter. */
+ txq->stats.obytes += length;
+#endif
+ }
+ /* Take a shortcut if nothing must be sent. */
+ if (unlikely(i == 0))
+ return 0;
+ /* Check whether completion threshold has been reached. */
+ comp = txq->elts_comp + i;
+ if (comp >= MLX5_TX_COMP_THRESH) {
+ volatile union mlx5_wqe *wqe = mpw.wqe;
+
+ /* Request completion on last WQE. */
+ wqe->mpw_inl.ctrl.data[2] = htonl(8);
+ /* Save elts_head in unused "immediate" field of WQE. */
+ wqe->mpw_inl.ctrl.data[3] = elts_head;
+ txq->elts_comp = 0;
+ } else
+ txq->elts_comp = comp;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment sent packets counter. */
+ txq->stats.opackets += i;
+#endif
+ /* Ring QP doorbell. */
+ if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
+ mlx5_mpw_inline_close(txq, &mpw);
+ else if (mpw.state == MLX5_MPW_STATE_OPENED)
+ mlx5_mpw_close(txq, &mpw);
+ mlx5_tx_dbrec(txq);
+ txq->elts_head = elts_head;
+ return i;
+}
+
+/**
* Translate RX completion flags to packet type.
*
* @param[in] cqe
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 3c83148..41605f9 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -312,6 +312,8 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);

uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_tx_burst_inline(void *, struct rte_mbuf **, uint16_t);
+uint16_t mlx5_tx_burst_mpw(void *, struct rte_mbuf **, uint16_t);
+uint16_t mlx5_tx_burst_mpw_inline(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 6a4a96e..4f17fb0 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -398,7 +398,7 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
.obj = tmpl.qp,
/* Enable multi-packet send if supported. */
.family_flags =
- (priv->mps ?
+ ((priv->mps && !priv->sriov) ?
IBV_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR :
0),
};
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:29 UTC
Permalink
From: Yaacov Hazan <***@mellanox.com>

Implement the send inline feature, which copies packet data directly into
WQEs for improved latency. The maximum packet size and the minimum number of
TX queues required to qualify for inline send are user-configurable.

This feature is effective when HW causes a performance bottleneck.
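
As a rough illustration (not part of this patch), the per-packet decision
being added boils down to the self-contained sketch below. The types and
names (fake_wqe, inline_threshold) are simplified placeholders; the real
code operates on the PMD's WQE layout, uses txq->max_inline (set from the
txq_inline parameter) as the threshold, and obtains the lkey via
txq_mp2mr().

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Simplified stand-in for a send WQE with a small inline data area. */
struct fake_wqe {
    uint8_t inline_data[48]; /* packet bytes copied into the WQE */
    uint64_t addr;           /* otherwise the NIC fetches data by DMA */
    uint32_t lkey;
    uint32_t byte_count;
};

/* Inline small packets, reference large ones by address and lkey. */
static void
post_packet(struct fake_wqe *wqe, const uint8_t *pkt, uint32_t len,
            uint32_t lkey, uint32_t inline_threshold)
{
    if (len <= inline_threshold && len <= sizeof(wqe->inline_data)) {
        memcpy(wqe->inline_data, pkt, len);
    } else {
        wqe->addr = (uintptr_t)pkt;
        wqe->lkey = lkey;
    }
    wqe->byte_count = len;
}

int
main(void)
{
    struct fake_wqe wqe = { .byte_count = 0 };
    uint8_t pkt[64] = { 0 };

    post_packet(&wqe, pkt, sizeof(pkt), 0x1234, 128);
    printf("posted %u bytes\n", (unsigned int)wqe.byte_count);
    return 0;
}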

Signed-off-by: Yaacov Hazan <***@mellanox.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Nelio Laranjeiro <***@6wind.com>
---
doc/guides/nics/mlx5.rst | 17 +++
drivers/net/mlx5/mlx5.c | 13 ++
drivers/net/mlx5/mlx5.h | 2 +
drivers/net/mlx5/mlx5_ethdev.c | 5 +
drivers/net/mlx5/mlx5_rxtx.c | 271 +++++++++++++++++++++++++++++++++++++++++
drivers/net/mlx5/mlx5_rxtx.h | 2 +
drivers/net/mlx5/mlx5_txq.c | 4 +
7 files changed, 314 insertions(+)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 756153b..9ada221 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -154,6 +154,23 @@ Run-time configuration
allows to save PCI bandwidth and improve performance at the cost of a
slightly higher CPU usage. Enabled by default.

+- ``txq_inline`` parameter [int]
+
+ Amount of data to be inlined during TX operations. Improves latency.
+ Can improve PPS performance when PCI back pressure is detected and may be
+ useful for scenarios involving heavy traffic on many queues.
+
+ It is not enabled by default (set to 0) since the additional software
+ logic necessary to handle this mode can lower performance when back
+ pressure is not expected.
+
+- ``txqs_min_inline`` parameter [int]
+
+ Enable inline send only when the number of TX queues is greater than or
+ equal to this value.
+
+ This option should be used in combination with ``txq_inline`` above.
+
Prerequisites
-------------

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 7e8c579..8c8c5e4 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -72,6 +72,13 @@
/* Device parameter to enable RX completion queue compression. */
#define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"

+/* Device parameter to configure inline send. */
+#define MLX5_TXQ_INLINE "txq_inline"
+
+/* Device parameter to configure the number of TX queues threshold for
+ * enabling inline send. */
+#define MLX5_TXQS_MIN_INLINE "txqs_min_inline"
+
/**
* Retrieve integer value from environment variable.
*
@@ -269,6 +276,10 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
}
if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0)
priv->cqe_comp = !!tmp;
+ else if (strcmp(MLX5_TXQ_INLINE, key) == 0)
+ priv->txq_inline = tmp;
+ else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0)
+ priv->txqs_inline = tmp;
else {
WARN("%s: unknown parameter", key);
return EINVAL;
@@ -292,6 +303,8 @@ mlx5_args(struct priv *priv, struct rte_devargs *devargs)
{
static const char *params[] = {
MLX5_RXQ_CQE_COMP_EN,
+ MLX5_TXQ_INLINE,
+ MLX5_TXQS_MIN_INLINE,
};
struct rte_kvargs *kvlist;
int ret = 0;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 8f5a6df..3a86609 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -113,6 +113,8 @@ struct priv {
unsigned int mps:1; /* Whether multi-packet send is supported. */
unsigned int cqe_comp:1; /* Whether CQE compression is enabled. */
unsigned int pending_alarm:1; /* An alarm is pending. */
+ unsigned int txq_inline; /* Maximum packet size for inlining. */
+ unsigned int txqs_inline; /* Queue number threshold for inlining. */
/* RX/TX queues. */
unsigned int rxqs_n; /* RX queues array size. */
unsigned int txqs_n; /* TX queues array size. */
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 4e125a7..a2bdc56 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1317,6 +1317,11 @@ void
priv_select_tx_function(struct priv *priv)
{
priv->dev->tx_pkt_burst = mlx5_tx_burst;
+ if (priv->txq_inline && (priv->txqs_n >= priv->txqs_inline)) {
+ priv->dev->tx_pkt_burst = mlx5_tx_burst_inline;
+ DEBUG("selected inline TX function (%u >= %u queues)",
+ priv->txqs_n, priv->txqs_inline);
+ }
}

/**
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index d56c9e9..43fe532 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -374,6 +374,139 @@ mlx5_wqe_write_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
}

/**
+ * Write an inline WQE.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param wqe
+ * Pointer to the WQE to fill.
+ * @param addr
+ * Buffer data address.
+ * @param length
+ * Packet length.
+ * @param lkey
+ * Memory region lkey.
+ */
+static inline void
+mlx5_wqe_write_inline(struct txq *txq, volatile union mlx5_wqe *wqe,
+ uintptr_t addr, uint32_t length)
+{
+ uint32_t size;
+ uint16_t wqe_cnt = txq->wqe_n - 1;
+ uint16_t wqe_ci = txq->wqe_ci + 1;
+
+ /* Copy the first 16 bytes into inline header. */
+ rte_memcpy((void *)(uintptr_t)wqe->inl.eseg.inline_hdr_start,
+ (void *)(uintptr_t)addr,
+ MLX5_ETH_INLINE_HEADER_SIZE);
+ addr += MLX5_ETH_INLINE_HEADER_SIZE;
+ length -= MLX5_ETH_INLINE_HEADER_SIZE;
+ size = 3 + ((4 + length + 15) / 16);
+ wqe->inl.byte_cnt = htonl(length | MLX5_INLINE_SEG);
+ rte_memcpy((void *)(uintptr_t)&wqe->inl.data[0],
+ (void *)addr, MLX5_WQE64_INL_DATA);
+ addr += MLX5_WQE64_INL_DATA;
+ length -= MLX5_WQE64_INL_DATA;
+ while (length) {
+ volatile union mlx5_wqe *wqe_next =
+ &(*txq->wqes)[wqe_ci & wqe_cnt];
+ uint32_t copy_bytes = (length > sizeof(*wqe)) ?
+ sizeof(*wqe) :
+ length;
+
+ rte_mov64((uint8_t *)(uintptr_t)&wqe_next->data[0],
+ (uint8_t *)addr);
+ addr += copy_bytes;
+ length -= copy_bytes;
+ ++wqe_ci;
+ }
+ assert(size < 64);
+ wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+ wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size);
+ wqe->inl.ctrl.data[3] = 0;
+ wqe->inl.eseg.rsvd0 = 0;
+ wqe->inl.eseg.rsvd1 = 0;
+ wqe->inl.eseg.mss = 0;
+ wqe->inl.eseg.rsvd2 = 0;
+ wqe->inl.eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE);
+ /* Increment consumer index. */
+ txq->wqe_ci = wqe_ci;
+}
+
+/**
+ * Write an inline WQE with VLAN.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param wqe
+ * Pointer to the WQE to fill.
+ * @param addr
+ * Buffer data address.
+ * @param length
+ * Packet length.
+ * @param lkey
+ * Memory region lkey.
+ * @param vlan_tci
+ * VLAN field to insert in packet.
+ */
+static inline void
+mlx5_wqe_write_inline_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
+ uintptr_t addr, uint32_t length, uint16_t vlan_tci)
+{
+ uint32_t size;
+ uint32_t wqe_cnt = txq->wqe_n - 1;
+ uint16_t wqe_ci = txq->wqe_ci + 1;
+ uint32_t vlan = htonl(0x81000000 | vlan_tci);
+
+ /*
+ * Copy 12 bytes of source & destination MAC address.
+ * Copy 4 bytes of VLAN.
+ * Copy 2 bytes of Ether type.
+ */
+ rte_memcpy((uint8_t *)(uintptr_t)wqe->inl.eseg.inline_hdr_start,
+ (uint8_t *)addr, 12);
+ rte_memcpy((uint8_t *)(uintptr_t)wqe->inl.eseg.inline_hdr_start + 12,
+ &vlan, sizeof(vlan));
+ rte_memcpy((uint8_t *)(uintptr_t)wqe->inl.eseg.inline_hdr_start + 16,
+ ((uint8_t *)addr + 12), 2);
+ addr += MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
+ length -= MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
+ size = (sizeof(wqe->inl.ctrl.ctrl) +
+ sizeof(wqe->inl.eseg) +
+ sizeof(wqe->inl.byte_cnt) +
+ length + 15) / 16;
+ wqe->inl.byte_cnt = htonl(length | MLX5_INLINE_SEG);
+ rte_memcpy((void *)(uintptr_t)&wqe->inl.data[0],
+ (void *)addr, MLX5_WQE64_INL_DATA);
+ addr += MLX5_WQE64_INL_DATA;
+ length -= MLX5_WQE64_INL_DATA;
+ while (length) {
+ volatile union mlx5_wqe *wqe_next =
+ &(*txq->wqes)[wqe_ci & wqe_cnt];
+ uint32_t copy_bytes = (length > sizeof(*wqe)) ?
+ sizeof(*wqe) :
+ length;
+
+ rte_mov64((uint8_t *)(uintptr_t)&wqe_next->data[0],
+ (uint8_t *)addr);
+ addr += copy_bytes;
+ length -= copy_bytes;
+ ++wqe_ci;
+ }
+ assert(size < 64);
+ wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+ wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size);
+ wqe->inl.ctrl.data[3] = 0;
+ wqe->inl.eseg.rsvd0 = 0;
+ wqe->inl.eseg.rsvd1 = 0;
+ wqe->inl.eseg.mss = 0;
+ wqe->inl.eseg.rsvd2 = 0;
+ wqe->inl.eseg.inline_hdr_sz = htons(MLX5_ETH_VLAN_INLINE_HEADER_SIZE);
+ /* Increment consumer index. */
+ txq->wqe_ci = wqe_ci;
+}
+
+/**
* Ring TX queue doorbell.
*
* @param txq
@@ -415,6 +548,23 @@ tx_prefetch_cqe(struct txq *txq, uint16_t ci)
}

/**
+ * Prefetch a WQE.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param wqe_ci
+ * WQE consumer index.
+ */
+static inline void
+tx_prefetch_wqe(struct txq *txq, uint16_t ci)
+{
+ volatile union mlx5_wqe *wqe;
+
+ wqe = &(*txq->wqes)[ci & (txq->wqe_n - 1)];
+ rte_prefetch0(wqe);
+}
+
+/**
* DPDK callback for TX.
*
* @param dpdk_txq
@@ -525,6 +675,127 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
}

/**
+ * DPDK callback for TX with inline support.
+ *
+ * @param dpdk_txq
+ * Generic pointer to TX queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ *
+ * @return
+ * Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ struct txq *txq = (struct txq *)dpdk_txq;
+ uint16_t elts_head = txq->elts_head;
+ const unsigned int elts_n = txq->elts_n;
+ unsigned int i;
+ unsigned int max;
+ unsigned int comp;
+ volatile union mlx5_wqe *wqe;
+ struct rte_mbuf *buf;
+ unsigned int max_inline = txq->max_inline;
+
+ if (unlikely(!pkts_n))
+ return 0;
+ buf = pkts[0];
+ /* Prefetch first packet cacheline. */
+ tx_prefetch_cqe(txq, txq->cq_ci);
+ tx_prefetch_cqe(txq, txq->cq_ci + 1);
+ rte_prefetch0(buf);
+ /* Start processing. */
+ txq_complete(txq);
+ max = (elts_n - (elts_head - txq->elts_tail));
+ if (max > elts_n)
+ max -= elts_n;
+ assert(max >= 1);
+ assert(max <= elts_n);
+ /* Always leave one free entry in the ring. */
+ --max;
+ if (max == 0)
+ return 0;
+ if (max > pkts_n)
+ max = pkts_n;
+ for (i = 0; (i != max); ++i) {
+ unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+ uintptr_t addr;
+ uint32_t length;
+ uint32_t lkey;
+
+ wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
+ tx_prefetch_wqe(txq, txq->wqe_ci);
+ tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+ if (i + 1 < max)
+ rte_prefetch0(pkts[i + 1]);
+ /* Should we enable HW CKSUM offload */
+ if (buf->ol_flags &
+ (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
+ wqe->inl.eseg.cs_flags =
+ MLX5_ETH_WQE_L3_CSUM |
+ MLX5_ETH_WQE_L4_CSUM;
+ } else
+ wqe->inl.eseg.cs_flags = 0;
+ /* Retrieve buffer information. */
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ length = DATA_LEN(buf);
+ /* Update element. */
+ (*txq->elts)[elts_head] = buf;
+ /* Prefetch next buffer data. */
+ if (i + 1 < max)
+ rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 1],
+ volatile void *));
+ if (length <= max_inline) {
+ if (buf->ol_flags & PKT_TX_VLAN_PKT)
+ mlx5_wqe_write_inline_vlan(txq, wqe,
+ addr, length,
+ buf->vlan_tci);
+ else
+ mlx5_wqe_write_inline(txq, wqe, addr, length);
+ } else {
+ /* Retrieve Memory Region key for this memory pool. */
+ lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+ if (buf->ol_flags & PKT_TX_VLAN_PKT)
+ mlx5_wqe_write_vlan(txq, wqe, addr, length,
+ lkey, buf->vlan_tci);
+ else
+ mlx5_wqe_write(txq, wqe, addr, length, lkey);
+ }
+ wqe->inl.ctrl.data[2] = 0;
+ elts_head = elts_head_next;
+ buf = pkts[i + 1];
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment sent bytes counter. */
+ txq->stats.obytes += length;
+#endif
+ }
+ /* Take a shortcut if nothing must be sent. */
+ if (unlikely(i == 0))
+ return 0;
+ /* Check whether completion threshold has been reached. */
+ comp = txq->elts_comp + i;
+ if (comp >= MLX5_TX_COMP_THRESH) {
+ /* Request completion on last WQE. */
+ wqe->inl.ctrl.data[2] = htonl(8);
+ /* Save elts_head in unused "immediate" field of WQE. */
+ wqe->inl.ctrl.data[3] = elts_head;
+ txq->elts_comp = 0;
+ } else
+ txq->elts_comp = comp;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment sent packets counter. */
+ txq->stats.opackets += i;
+#endif
+ /* Ring QP doorbell. */
+ mlx5_tx_dbrec(txq);
+ txq->elts_head = elts_head;
+ return i;
+}
+
+/**
* Translate RX completion flags to packet type.
*
* @param[in] cqe
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index f900e65..3c83148 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -246,6 +246,7 @@ struct txq {
uint16_t wqe_n; /* Number of WQ elements. */
uint16_t bf_offset; /* Blueflame offset. */
uint16_t bf_buf_size; /* Blueflame size. */
+ uint16_t max_inline; /* Maximum size to inline in a WQE. */
uint32_t qp_num_8s; /* QP number shifted by 8. */
volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
volatile union mlx5_wqe (*wqes)[]; /* Work queue. */
@@ -310,6 +311,7 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
/* mlx5_rxtx.c */

uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
+uint16_t mlx5_tx_burst_inline(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 7b2dc7c..6a4a96e 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -332,6 +332,10 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
IBV_EXP_QP_INIT_ATTR_RES_DOMAIN),
};
+ if (priv->txq_inline && priv->txqs_n >= priv->txqs_inline) {
+ tmpl.txq.max_inline = priv->txq_inline;
+ attr.init.cap.max_inline_data = tmpl.txq.max_inline;
+ }
tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
if (tmpl.qp == NULL) {
ret = (errno ? errno : EINVAL);
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:32 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

The space necessary to store segmented packets cannot be known in advance
and must be verified for each of them.
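
To make the new control flow concrete, here is a minimal, self-contained
sketch of the loop structure this patch introduces; elts_free and segs are
placeholders, not the driver's fields. Ring space is checked per packet
(variable once gather support returns), always keeping one entry unused,
instead of clamping the burst size up front.

#include <stdio.h>

/* Consume up to pkts_n packets, each needing segs[i] ring entries.
 * Returns the number of packets that actually fit. */
static unsigned int
burst(unsigned int elts_free, const unsigned int *segs, unsigned int pkts_n)
{
    unsigned int i = 0;

    if (!pkts_n)
        return 0;
    do {
        unsigned int segs_n = segs[i];

        /* Space cannot be known in advance: check it now and
         * keep one ring entry unused. */
        if (elts_free < segs_n + 1)
            break;
        elts_free -= segs_n;
        ++i;
    } while (--pkts_n);
    return i;
}

int
main(void)
{
    const unsigned int segs[] = { 1, 3, 2, 1 };

    printf("%u packets fit\n", burst(5, segs, 4));
    return 0;
}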

Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_rxtx.c | 136 ++++++++++++++++++++++---------------------
1 file changed, 70 insertions(+), 66 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 2ee504d..7097713 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -583,50 +583,49 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
struct txq *txq = (struct txq *)dpdk_txq;
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
- unsigned int i;
+ unsigned int i = 0;
unsigned int max;
unsigned int comp;
volatile union mlx5_wqe *wqe;
- struct rte_mbuf *buf;

if (unlikely(!pkts_n))
return 0;
- buf = pkts[0];
/* Prefetch first packet cacheline. */
tx_prefetch_cqe(txq, txq->cq_ci);
tx_prefetch_cqe(txq, txq->cq_ci + 1);
- rte_prefetch0(buf);
+ rte_prefetch0(*pkts);
/* Start processing. */
txq_complete(txq);
max = (elts_n - (elts_head - txq->elts_tail));
if (max > elts_n)
max -= elts_n;
- assert(max >= 1);
- assert(max <= elts_n);
- /* Always leave one free entry in the ring. */
- --max;
- if (max == 0)
- return 0;
- if (max > pkts_n)
- max = pkts_n;
- for (i = 0; (i != max); ++i) {
- unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+ do {
+ struct rte_mbuf *buf;
+ unsigned int elts_head_next;
uintptr_t addr;
uint32_t length;
uint32_t lkey;

+ /* Make sure there is enough room to store this packet and
+ * that one ring entry remains unused. */
+ if (max < 1 + 1)
+ break;
+ --max;
+ --pkts_n;
+ buf = *(pkts++);
+ elts_head_next = (elts_head + 1) & (elts_n - 1);
wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
rte_prefetch0(wqe);
- if (i + 1 < max)
- rte_prefetch0(pkts[i + 1]);
+ if (pkts_n)
+ rte_prefetch0(*pkts);
/* Retrieve buffer information. */
addr = rte_pktmbuf_mtod(buf, uintptr_t);
length = DATA_LEN(buf);
/* Update element. */
(*txq->elts)[elts_head] = buf;
/* Prefetch next buffer data. */
- if (i + 1 < max)
- rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 1],
+ if (pkts_n)
+ rte_prefetch0(rte_pktmbuf_mtod(*pkts,
volatile void *));
/* Retrieve Memory Region key for this memory pool. */
lkey = txq_mp2mr(txq, txq_mb2mp(buf));
@@ -649,8 +648,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
txq->stats.obytes += length;
#endif
elts_head = elts_head_next;
- buf = pkts[i + 1];
- }
+ ++i;
+ } while (pkts_n);
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
@@ -693,44 +692,43 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
struct txq *txq = (struct txq *)dpdk_txq;
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
- unsigned int i;
+ unsigned int i = 0;
unsigned int max;
unsigned int comp;
volatile union mlx5_wqe *wqe;
- struct rte_mbuf *buf;
unsigned int max_inline = txq->max_inline;

if (unlikely(!pkts_n))
return 0;
- buf = pkts[0];
/* Prefetch first packet cacheline. */
tx_prefetch_cqe(txq, txq->cq_ci);
tx_prefetch_cqe(txq, txq->cq_ci + 1);
- rte_prefetch0(buf);
+ rte_prefetch0(*pkts);
/* Start processing. */
txq_complete(txq);
max = (elts_n - (elts_head - txq->elts_tail));
if (max > elts_n)
max -= elts_n;
- assert(max >= 1);
- assert(max <= elts_n);
- /* Always leave one free entry in the ring. */
- --max;
- if (max == 0)
- return 0;
- if (max > pkts_n)
- max = pkts_n;
- for (i = 0; (i != max); ++i) {
- unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+ do {
+ struct rte_mbuf *buf;
+ unsigned int elts_head_next;
uintptr_t addr;
uint32_t length;
uint32_t lkey;

+ /* Make sure there is enough room to store this packet and
+ * that one ring entry remains unused. */
+ if (max < 1 + 1)
+ break;
+ --max;
+ --pkts_n;
+ buf = *(pkts++);
+ elts_head_next = (elts_head + 1) & (elts_n - 1);
wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
tx_prefetch_wqe(txq, txq->wqe_ci);
tx_prefetch_wqe(txq, txq->wqe_ci + 1);
- if (i + 1 < max)
- rte_prefetch0(pkts[i + 1]);
+ if (pkts_n)
+ rte_prefetch0(*pkts);
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
@@ -745,8 +743,8 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
/* Update element. */
(*txq->elts)[elts_head] = buf;
/* Prefetch next buffer data. */
- if (i + 1 < max)
- rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 1],
+ if (pkts_n)
+ rte_prefetch0(rte_pktmbuf_mtod(*pkts,
volatile void *));
if (length <= max_inline) {
if (buf->ol_flags & PKT_TX_VLAN_PKT)
@@ -766,12 +764,12 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
}
wqe->inl.ctrl.data[2] = 0;
elts_head = elts_head_next;
- buf = pkts[i + 1];
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent bytes counter. */
txq->stats.obytes += length;
#endif
- }
+ ++i;
+ } while (pkts_n);
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
@@ -879,13 +877,15 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
struct txq *txq = (struct txq *)dpdk_txq;
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
- unsigned int i;
+ unsigned int i = 0;
unsigned int max;
unsigned int comp;
struct mlx5_mpw mpw = {
.state = MLX5_MPW_STATE_CLOSED,
};

+ if (unlikely(!pkts_n))
+ return 0;
/* Prefetch first packet cacheline. */
tx_prefetch_cqe(txq, txq->cq_ci);
tx_prefetch_wqe(txq, txq->wqe_ci);
@@ -895,22 +895,22 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
max = (elts_n - (elts_head - txq->elts_tail));
if (max > elts_n)
max -= elts_n;
- assert(max >= 1);
- assert(max <= elts_n);
- /* Always leave one free entry in the ring. */
- --max;
- if (max == 0)
- return 0;
- if (max > pkts_n)
- max = pkts_n;
- for (i = 0; (i != max); ++i) {
- struct rte_mbuf *buf = pkts[i];
+ do {
+ struct rte_mbuf *buf;
volatile struct mlx5_wqe_data_seg *dseg;
- unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+ unsigned int elts_head_next;
uintptr_t addr;
uint32_t length;
uint32_t cs_flags = 0;

+ /* Make sure there is enough room to store this packet and
+ * that one ring entry remains unused. */
+ if (max < 1 + 1)
+ break;
+ --max;
+ --pkts_n;
+ buf = *(pkts++);
+ elts_head_next = (elts_head + 1) & (elts_n - 1);
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
@@ -943,7 +943,8 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
/* Increment sent bytes counter. */
txq->stats.obytes += length;
#endif
- }
+ ++i;
+ } while (pkts_n);
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
@@ -1048,7 +1049,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
struct txq *txq = (struct txq *)dpdk_txq;
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
- unsigned int i;
+ unsigned int i = 0;
unsigned int max;
unsigned int comp;
unsigned int inline_room = txq->max_inline;
@@ -1056,6 +1057,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
.state = MLX5_MPW_STATE_CLOSED,
};

+ if (unlikely(!pkts_n))
+ return 0;
/* Prefetch first packet cacheline. */
tx_prefetch_cqe(txq, txq->cq_ci);
tx_prefetch_wqe(txq, txq->wqe_ci);
@@ -1065,21 +1068,21 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
max = (elts_n - (elts_head - txq->elts_tail));
if (max > elts_n)
max -= elts_n;
- assert(max >= 1);
- assert(max <= elts_n);
- /* Always leave one free entry in the ring. */
- --max;
- if (max == 0)
- return 0;
- if (max > pkts_n)
- max = pkts_n;
- for (i = 0; (i != max); ++i) {
- struct rte_mbuf *buf = pkts[i];
- unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+ do {
+ struct rte_mbuf *buf;
+ unsigned int elts_head_next;
uintptr_t addr;
uint32_t length;
uint32_t cs_flags = 0;

+ /* Make sure there is enough room to store this packet and
+ * that one ring entry remains unused. */
+ if (max < 1 + 1)
+ break;
+ --max;
+ --pkts_n;
+ buf = *(pkts++);
+ elts_head_next = (elts_head + 1) & (elts_n - 1);
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
@@ -1165,7 +1168,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
/* Increment sent bytes counter. */
txq->stats.obytes += length;
#endif
- }
+ ++i;
+ } while (pkts_n);
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:33 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

Compared to its previous incarnation, the software limit on the number of
mbuf segments is gone (previously MLX5_PMD_SGE_WR_N, set to 4 by default),
hence there is no longer any need for the linearization code and related
buffers that permanently consumed a non-negligible amount of memory to
handle oversized mbufs.

The resulting code is both lighter and faster.
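
For reference, the gather logic amounts to writing one 16-byte data segment
per mbuf segment instead of linearizing the chain into a bounce buffer. The
self-contained sketch below models that with simplified seg/dseg stand-ins;
the real code additionally spills data segments onto the next WQE slot when
the current one is full and fetches the lkey through txq_mp2mr().

#include <stdint.h>
#include <stdio.h>

struct seg {                    /* stand-in for a chained rte_mbuf segment */
    uint32_t len;
    struct seg *next;
};

struct dseg {                   /* stand-in for mlx5_wqe_data_seg */
    uint32_t byte_count;
    uint32_t lkey;
    uint64_t addr;
};

/* Describe every segment of a chained buffer to the hardware. */
static unsigned int
write_gather(struct dseg *dsegs, const struct seg *buf)
{
    unsigned int ds = 0;

    for (; buf != NULL; buf = buf->next, ++ds) {
        dsegs[ds].byte_count = buf->len;
        dsegs[ds].lkey = 0;                  /* real code: txq_mp2mr() */
        dsegs[ds].addr = (uintptr_t)buf;     /* real code: segment data address */
    }
    return ds;
}

int
main(void)
{
    struct dseg dsegs[8] = { { 0, 0, 0 } };
    struct seg s3 = { 64, NULL }, s2 = { 128, &s3 }, s1 = { 256, &s2 };

    printf("wrote %u data segments\n", write_gather(dsegs, &s1));
    return 0;
}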

Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Nelio Laranjeiro <***@6wind.com>
---
drivers/net/mlx5/mlx5_rxtx.c | 231 +++++++++++++++++++++++++++++++++----------
drivers/net/mlx5/mlx5_txq.c | 6 +-
2 files changed, 182 insertions(+), 55 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 7097713..db784c0 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -301,6 +301,7 @@ mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
{
wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
+ wqe->wqe.ctrl.data[2] = 0;
wqe->wqe.ctrl.data[3] = 0;
wqe->inl.eseg.rsvd0 = 0;
wqe->inl.eseg.rsvd1 = 0;
@@ -346,6 +347,7 @@ mlx5_wqe_write_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,

wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
+ wqe->wqe.ctrl.data[2] = 0;
wqe->wqe.ctrl.data[3] = 0;
wqe->inl.eseg.rsvd0 = 0;
wqe->inl.eseg.rsvd1 = 0;
@@ -423,6 +425,7 @@ mlx5_wqe_write_inline(struct txq *txq, volatile union mlx5_wqe *wqe,
assert(size < 64);
wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size);
+ wqe->inl.ctrl.data[2] = 0;
wqe->inl.ctrl.data[3] = 0;
wqe->inl.eseg.rsvd0 = 0;
wqe->inl.eseg.rsvd1 = 0;
@@ -496,6 +499,7 @@ mlx5_wqe_write_inline_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
assert(size < 64);
wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size);
+ wqe->inl.ctrl.data[2] = 0;
wqe->inl.ctrl.data[3] = 0;
wqe->inl.eseg.rsvd0 = 0;
wqe->inl.eseg.rsvd1 = 0;
@@ -584,6 +588,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
unsigned int i = 0;
+ unsigned int j = 0;
unsigned int max;
unsigned int comp;
volatile union mlx5_wqe *wqe;
@@ -600,21 +605,25 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (max > elts_n)
max -= elts_n;
do {
- struct rte_mbuf *buf;
+ struct rte_mbuf *buf = *(pkts++);
unsigned int elts_head_next;
uintptr_t addr;
uint32_t length;
uint32_t lkey;
+ unsigned int segs_n = buf->nb_segs;
+ volatile struct mlx5_wqe_data_seg *dseg;
+ unsigned int ds = sizeof(*wqe) / 16;

/* Make sure there is enough room to store this packet and
* that one ring entry remains unused. */
- if (max < 1 + 1)
+ assert(segs_n);
+ if (max < segs_n + 1)
break;
- --max;
+ max -= segs_n;
--pkts_n;
- buf = *(pkts++);
elts_head_next = (elts_head + 1) & (elts_n - 1);
wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
+ dseg = &wqe->wqe.dseg;
rte_prefetch0(wqe);
if (pkts_n)
rte_prefetch0(*pkts);
@@ -634,7 +643,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
buf->vlan_tci);
else
mlx5_wqe_write(txq, wqe, addr, length, lkey);
- wqe->wqe.ctrl.data[2] = 0;
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
@@ -643,6 +651,35 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
MLX5_ETH_WQE_L4_CSUM;
} else
wqe->wqe.eseg.cs_flags = 0;
+ while (--segs_n) {
+ /* Spill on next WQE when the current one does not have
+ * enough room left. Size of WQE must a be a multiple
+ * of data segment size. */
+ assert(!(sizeof(*wqe) % sizeof(*dseg)));
+ if (!(ds % (sizeof(*wqe) / 16)))
+ dseg = (volatile void *)
+ &(*txq->wqes)[txq->wqe_ci++ &
+ (txq->wqe_n - 1)];
+ else
+ ++dseg;
+ ++ds;
+ buf = buf->next;
+ assert(buf);
+ /* Store segment information. */
+ dseg->byte_count = htonl(DATA_LEN(buf));
+ dseg->lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+ dseg->addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
+ (*txq->elts)[elts_head_next] = buf;
+ elts_head_next = (elts_head_next + 1) & (elts_n - 1);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ length += DATA_LEN(buf);
+#endif
+ ++j;
+ }
+ /* Update DS field in WQE. */
+ wqe->wqe.ctrl.data[1] &= htonl(0xffffffc0);
+ wqe->wqe.ctrl.data[1] |= htonl(ds & 0x3f);
+ elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent bytes counter. */
txq->stats.obytes += length;
@@ -654,7 +691,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (unlikely(i == 0))
return 0;
/* Check whether completion threshold has been reached. */
- comp = txq->elts_comp + i;
+ comp = txq->elts_comp + i + j;
if (comp >= MLX5_TX_COMP_THRESH) {
/* Request completion on last WQE. */
wqe->wqe.ctrl.data[2] = htonl(8);
@@ -693,6 +730,7 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
unsigned int i = 0;
+ unsigned int j = 0;
unsigned int max;
unsigned int comp;
volatile union mlx5_wqe *wqe;
@@ -710,21 +748,25 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (max > elts_n)
max -= elts_n;
do {
- struct rte_mbuf *buf;
+ struct rte_mbuf *buf = *(pkts++);
unsigned int elts_head_next;
uintptr_t addr;
uint32_t length;
uint32_t lkey;
+ unsigned int segs_n = buf->nb_segs;
+ volatile struct mlx5_wqe_data_seg *dseg;
+ unsigned int ds = sizeof(*wqe) / 16;

/* Make sure there is enough room to store this packet and
* that one ring entry remains unused. */
- if (max < 1 + 1)
+ assert(segs_n);
+ if (max < segs_n + 1)
break;
- --max;
+ max -= segs_n;
--pkts_n;
- buf = *(pkts++);
elts_head_next = (elts_head + 1) & (elts_n - 1);
wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
+ dseg = &wqe->wqe.dseg;
tx_prefetch_wqe(txq, txq->wqe_ci);
tx_prefetch_wqe(txq, txq->wqe_ci + 1);
if (pkts_n)
@@ -746,13 +788,14 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (pkts_n)
rte_prefetch0(rte_pktmbuf_mtod(*pkts,
volatile void *));
- if (length <= max_inline) {
+ if ((length <= max_inline) && (segs_n == 1)) {
if (buf->ol_flags & PKT_TX_VLAN_PKT)
mlx5_wqe_write_inline_vlan(txq, wqe,
addr, length,
buf->vlan_tci);
else
mlx5_wqe_write_inline(txq, wqe, addr, length);
+ goto skip_segs;
} else {
/* Retrieve Memory Region key for this memory pool. */
lkey = txq_mp2mr(txq, txq_mb2mp(buf));
@@ -762,7 +805,35 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
else
mlx5_wqe_write(txq, wqe, addr, length, lkey);
}
- wqe->inl.ctrl.data[2] = 0;
+ while (--segs_n) {
+ /* Spill on next WQE when the current one does not have
+ * enough room left. Size of WQE must a be a multiple
+ * of data segment size. */
+ assert(!(sizeof(*wqe) % sizeof(*dseg)));
+ if (!(ds % (sizeof(*wqe) / 16)))
+ dseg = (volatile void *)
+ &(*txq->wqes)[txq->wqe_ci++ &
+ (txq->wqe_n - 1)];
+ else
+ ++dseg;
+ ++ds;
+ buf = buf->next;
+ assert(buf);
+ /* Store segment information. */
+ dseg->byte_count = htonl(DATA_LEN(buf));
+ dseg->lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+ dseg->addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
+ (*txq->elts)[elts_head_next] = buf;
+ elts_head_next = (elts_head_next + 1) & (elts_n - 1);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ length += DATA_LEN(buf);
+#endif
+ ++j;
+ }
+ /* Update DS field in WQE. */
+ wqe->inl.ctrl.data[1] &= htonl(0xffffffc0);
+ wqe->inl.ctrl.data[1] |= htonl(ds & 0x3f);
+ skip_segs:
elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent bytes counter. */
@@ -774,7 +845,7 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (unlikely(i == 0))
return 0;
/* Check whether completion threshold has been reached. */
- comp = txq->elts_comp + i;
+ comp = txq->elts_comp + i + j;
if (comp >= MLX5_TX_COMP_THRESH) {
/* Request completion on last WQE. */
wqe->inl.ctrl.data[2] = htonl(8);
@@ -878,6 +949,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
unsigned int i = 0;
+ unsigned int j = 0;
unsigned int max;
unsigned int comp;
struct mlx5_mpw mpw = {
@@ -896,46 +968,67 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (max > elts_n)
max -= elts_n;
do {
- struct rte_mbuf *buf;
- volatile struct mlx5_wqe_data_seg *dseg;
+ struct rte_mbuf *buf = *(pkts++);
unsigned int elts_head_next;
- uintptr_t addr;
uint32_t length;
+ unsigned int segs_n = buf->nb_segs;
uint32_t cs_flags = 0;

/* Make sure there is enough room to store this packet and
* that one ring entry remains unused. */
- if (max < 1 + 1)
+ assert(segs_n);
+ if (max < segs_n + 1)
+ break;
+ /* Do not bother with large packets MPW cannot handle. */
+ if (segs_n > MLX5_MPW_DSEG_MAX)
break;
- --max;
+ max -= segs_n;
--pkts_n;
- buf = *(pkts++);
- elts_head_next = (elts_head + 1) & (elts_n - 1);
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
- /* Retrieve buffer information. */
- addr = rte_pktmbuf_mtod(buf, uintptr_t);
- length = DATA_LEN(buf);
- /* Update element. */
- (*txq->elts)[elts_head] = buf;
+ /* Retrieve packet information. */
+ length = PKT_LEN(buf);
+ assert(length);
/* Start new session if packet differs. */
if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
((mpw.len != length) ||
+ (segs_n != 1) ||
(mpw.wqe->mpw.eseg.cs_flags != cs_flags)))
mlx5_mpw_close(txq, &mpw);
if (mpw.state == MLX5_MPW_STATE_CLOSED) {
mlx5_mpw_new(txq, &mpw, length);
mpw.wqe->mpw.eseg.cs_flags = cs_flags;
}
- dseg = mpw.data.dseg[mpw.pkts_n];
- *dseg = (struct mlx5_wqe_data_seg){
- .byte_count = htonl(length),
- .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
- .addr = htonll(addr),
- };
- ++mpw.pkts_n;
+ /* Multi-segment packets must be alone in their MPW. */
+ assert((segs_n == 1) || (mpw.pkts_n == 0));
+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
+ length = 0;
+#endif
+ do {
+ volatile struct mlx5_wqe_data_seg *dseg;
+ uintptr_t addr;
+
+ elts_head_next = (elts_head + 1) & (elts_n - 1);
+ assert(buf);
+ (*txq->elts)[elts_head] = buf;
+ dseg = mpw.data.dseg[mpw.pkts_n];
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ *dseg = (struct mlx5_wqe_data_seg){
+ .byte_count = htonl(DATA_LEN(buf)),
+ .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+ .addr = htonll(addr),
+ };
+ elts_head = elts_head_next;
+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
+ length += DATA_LEN(buf);
+#endif
+ buf = buf->next;
+ ++mpw.pkts_n;
+ ++j;
+ } while (--segs_n);
+ assert(length == mpw.len);
if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
mlx5_mpw_close(txq, &mpw);
elts_head = elts_head_next;
@@ -949,7 +1042,8 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (unlikely(i == 0))
return 0;
/* Check whether completion threshold has been reached. */
- comp = txq->elts_comp + i;
+ /* "j" includes both packets and segments. */
+ comp = txq->elts_comp + j;
if (comp >= MLX5_TX_COMP_THRESH) {
volatile union mlx5_wqe *wqe = mpw.wqe;

@@ -1050,6 +1144,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
unsigned int i = 0;
+ unsigned int j = 0;
unsigned int max;
unsigned int comp;
unsigned int inline_room = txq->max_inline;
@@ -1069,36 +1164,38 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
if (max > elts_n)
max -= elts_n;
do {
- struct rte_mbuf *buf;
+ struct rte_mbuf *buf = *(pkts++);
unsigned int elts_head_next;
uintptr_t addr;
uint32_t length;
+ unsigned int segs_n = buf->nb_segs;
uint32_t cs_flags = 0;

/* Make sure there is enough room to store this packet and
* that one ring entry remains unused. */
- if (max < 1 + 1)
+ assert(segs_n);
+ if (max < segs_n + 1)
break;
- --max;
+ /* Do not bother with large packets MPW cannot handle. */
+ if (segs_n > MLX5_MPW_DSEG_MAX)
+ break;
+ max -= segs_n;
--pkts_n;
- buf = *(pkts++);
- elts_head_next = (elts_head + 1) & (elts_n - 1);
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
- /* Retrieve buffer information. */
- addr = rte_pktmbuf_mtod(buf, uintptr_t);
- length = DATA_LEN(buf);
- /* Update element. */
- (*txq->elts)[elts_head] = buf;
+ /* Retrieve packet information. */
+ length = PKT_LEN(buf);
/* Start new session if packet differs. */
if (mpw.state == MLX5_MPW_STATE_OPENED) {
if ((mpw.len != length) ||
+ (segs_n != 1) ||
(mpw.wqe->mpw.eseg.cs_flags != cs_flags))
mlx5_mpw_close(txq, &mpw);
} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
if ((mpw.len != length) ||
+ (segs_n != 1) ||
(length > inline_room) ||
(mpw.wqe->mpw_inl.eseg.cs_flags != cs_flags)) {
mlx5_mpw_inline_close(txq, &mpw);
@@ -1106,7 +1203,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
}
}
if (mpw.state == MLX5_MPW_STATE_CLOSED) {
- if (length > inline_room) {
+ if ((segs_n != 1) ||
+ (length > inline_room)) {
mlx5_mpw_new(txq, &mpw, length);
mpw.wqe->mpw.eseg.cs_flags = cs_flags;
} else {
@@ -1114,17 +1212,36 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
mpw.wqe->mpw_inl.eseg.cs_flags = cs_flags;
}
}
+ /* Multi-segment packets must be alone in their MPW. */
+ assert((segs_n == 1) || (mpw.pkts_n == 0));
if (mpw.state == MLX5_MPW_STATE_OPENED) {
- volatile struct mlx5_wqe_data_seg *dseg;
-
assert(inline_room == txq->max_inline);
- dseg = mpw.data.dseg[mpw.pkts_n];
- *dseg = (struct mlx5_wqe_data_seg){
- .byte_count = htonl(length),
- .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
- .addr = htonll(addr),
- };
- ++mpw.pkts_n;
+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
+ length = 0;
+#endif
+ do {
+ volatile struct mlx5_wqe_data_seg *dseg;
+
+ elts_head_next =
+ (elts_head + 1) & (elts_n - 1);
+ assert(buf);
+ (*txq->elts)[elts_head] = buf;
+ dseg = mpw.data.dseg[mpw.pkts_n];
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ *dseg = (struct mlx5_wqe_data_seg){
+ .byte_count = htonl(DATA_LEN(buf)),
+ .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+ .addr = htonll(addr),
+ };
+ elts_head = elts_head_next;
+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
+ length += DATA_LEN(buf);
+#endif
+ buf = buf->next;
+ ++mpw.pkts_n;
+ ++j;
+ } while (--segs_n);
+ assert(length == mpw.len);
if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
mlx5_mpw_close(txq, &mpw);
} else {
@@ -1132,6 +1249,10 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,

assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
assert(length <= inline_room);
+ assert(length == DATA_LEN(buf));
+ elts_head_next = (elts_head + 1) & (elts_n - 1);
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ (*txq->elts)[elts_head] = buf;
/* Maximum number of bytes before wrapping. */
max = ((uintptr_t)&(*txq->wqes)[txq->wqe_n] -
(uintptr_t)mpw.data.raw);
@@ -1156,6 +1277,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
mpw.data.raw =
(volatile void *)&(*txq->wqes)[0];
++mpw.pkts_n;
+ ++j;
if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
mlx5_mpw_inline_close(txq, &mpw);
inline_room = txq->max_inline;
@@ -1174,7 +1296,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
if (unlikely(i == 0))
return 0;
/* Check whether completion threshold has been reached. */
- comp = txq->elts_comp + i;
+ /* "j" includes both packets and segments. */
+ comp = txq->elts_comp + j;
if (comp >= MLX5_TX_COMP_THRESH) {
volatile union mlx5_wqe *wqe = mpw.wqe;

diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index bae9f3d..f1c17e0 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -320,7 +320,11 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
.max_send_wr = ((priv->device_attr.max_qp_wr < desc) ?
priv->device_attr.max_qp_wr :
desc),
- /* Max number of scatter/gather elements in a WR. */
+ /* Max number of scatter/gather elements in a WR,
+ * must be 1 to prevent libmlx5 from trying to allocate
+ * too much memory. TX gather is not impacted by the
+ * priv->device_attr.max_sge limit and will still work
+ * properly. */
.max_send_sge = 1,
},
.qp_type = IBV_QPT_RAW_PACKET,
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:34 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

Since commit "mlx5: resurrect Tx gather support", older GCC versions (such
as 4.8.5) may complain about the following:

mlx5_rxtx.c: In function `mlx5_tx_burst':
mlx5_rxtx.c:705:25: error: `wqe' may be used uninitialized in this
function [-Werror=maybe-uninitialized]

mlx5_rxtx.c: In function `mlx5_tx_burst_inline':
mlx5_rxtx.c:864:25: error: `wqe' may be used uninitialized in this
function [-Werror=maybe-uninitialized]

In both cases, this code cannot be reached when wqe is not initialized.

Considering older GCC versions are still widely used, work around this
issue by initializing wqe preemptively, even if it should not be necessary.
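
The false positive follows a pattern similar to this self-contained sketch
(a hypothetical example, not the driver's code): the pointer is assigned
only inside a loop that is provably entered before every dereference, which
GCC 4.8 cannot see, so a harmless NULL initialization silences
-Wmaybe-uninitialized.

#include <stddef.h>
#include <stdio.h>

static int
last_value(const int *values, unsigned int n)
{
    const int *last = NULL;     /* preemptive init, as in this patch */
    unsigned int i = 0;

    while (i < n) {             /* assigns 'last' on every iteration */
        last = &values[i];
        ++i;
    }
    if (i == 0)                 /* dereference below is unreachable */
        return 0;               /* when the loop never ran */
    return *last;               /* older GCC cannot prove 'last' is set */
}

int
main(void)
{
    int v[] = { 1, 2, 3 };

    printf("%d\n", last_value(v, 3));
    return 0;
}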

Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_rxtx.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index db784c0..2fc57dc 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -591,7 +591,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
unsigned int j = 0;
unsigned int max;
unsigned int comp;
- volatile union mlx5_wqe *wqe;
+ volatile union mlx5_wqe *wqe = NULL;

if (unlikely(!pkts_n))
return 0;
@@ -733,7 +733,7 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
unsigned int j = 0;
unsigned int max;
unsigned int comp;
- volatile union mlx5_wqe *wqe;
+ volatile union mlx5_wqe *wqe = NULL;
unsigned int max_inline = txq->max_inline;

if (unlikely(!pkts_n))
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:35 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

Toggling RX checksum offloads is already done at initialization time. This
code does not belong in rxq_rehash().

Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Nelio Laranjeiro <***@6wind.com>
---
drivers/net/mlx5/mlx5_rxq.c | 10 ----------
1 file changed, 10 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 6881cdd..707296c 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -798,7 +798,6 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
int
rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
{
- struct priv *priv = rxq_ctrl->priv;
struct rxq_ctrl tmpl = *rxq_ctrl;
unsigned int mbuf_n;
unsigned int desc_n;
@@ -811,15 +810,6 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
/* Number of descriptors and mbufs currently allocated. */
desc_n = tmpl.rxq.elts_n;
mbuf_n = desc_n;
- /* Toggle RX checksum offload if hardware supports it. */
- if (priv->hw_csum) {
- tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- rxq_ctrl->rxq.csum = tmpl.rxq.csum;
- }
- if (priv->hw_csum_l2tun) {
- tmpl.rxq.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- rxq_ctrl->rxq.csum_l2tun = tmpl.rxq.csum_l2tun;
- }
/* From now on, any failure will render the queue unusable.
* Reinitialize WQ. */
mod = (struct ibv_exp_wq_attr){
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:36 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

The primary purpose of the rxq_rehash() function is to stop and restart
reception on a queue after re-posting buffers. This may fail if the array
that temporarily stores existing buffers for reuse cannot be allocated.

Update rxq_rehash() to work on the target queue directly (not through a
template copy) and avoid this allocation.

rxq_alloc_elts() is modified accordingly to take buffers from an existing
queue directly and update their refcount.

Unlike rxq_rehash(), rxq_setup() must work on a temporary structure but
should not allocate new mbufs from the pool while reinitializing an
existing queue. This is achieved by using the refcount-aware
rxq_alloc_elts() before overwriting queue data.
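
The reference-count handling can be modelled with the self-contained sketch
below (plain counters instead of rte_mbuf; names are illustrative):
re-posting takes an extra reference on every buffer, so the subsequent
per-buffer free only drops the count back to one and the queue keeps sole
ownership without going through the pool.

#include <stdio.h>

struct buf {
    int refcnt;         /* stand-in for the rte_mbuf reference count */
};

/* Re-post existing buffers: take an extra reference on each one,
 * as rte_pktmbuf_refcnt_update(buf, 1) does in rxq_alloc_elts(). */
static void
repost(struct buf *elts, unsigned int n)
{
    unsigned int i;

    for (i = 0; i != n; ++i)
        elts[i].refcnt += 1;
}

/* Drop the original reference, as rte_pktmbuf_free_seg() does in
 * rxq_rehash(); buffers stay allocated and owned by the queue. */
static void
drop_original_ref(struct buf *elts, unsigned int n)
{
    unsigned int i;

    for (i = 0; i != n; ++i)
        elts[i].refcnt -= 1;
}

int
main(void)
{
    struct buf elts[4] = { { 1 }, { 1 }, { 1 }, { 1 } };

    repost(elts, 4);                    /* refcnt: 1 -> 2 */
    drop_original_ref(elts, 4);         /* refcnt: 2 -> 1 */
    printf("refcnt after rehash: %d\n", elts[0].refcnt);
    return 0;
}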

Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Vasily Philipov <***@mellanox.com>
---
drivers/net/mlx5/mlx5_rxq.c | 83 ++++++++++++++++++++++-----------------------
1 file changed, 41 insertions(+), 42 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 707296c..0a3225e 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -642,7 +642,7 @@ priv_rehash_flows(struct priv *priv)
*/
static int
rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
- struct rte_mbuf **pool)
+ struct rte_mbuf *(*pool)[])
{
unsigned int i;
int ret = 0;
@@ -654,9 +654,10 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
&(*rxq_ctrl->rxq.wqes)[i];

if (pool != NULL) {
- buf = *(pool++);
+ buf = (*pool)[i];
assert(buf != NULL);
rte_pktmbuf_reset(buf);
+ rte_pktmbuf_refcnt_update(buf, 1);
} else
buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
if (buf == NULL) {
@@ -781,7 +782,7 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
}

/**
- * Reconfigure a RX queue with new parameters.
+ * Reconfigure RX queue buffers.
*
* rxq_rehash() does not allocate mbufs, which, if not done from the right
* thread (such as a control thread), may corrupt the pool.
@@ -798,67 +799,48 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
int
rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
{
- struct rxq_ctrl tmpl = *rxq_ctrl;
- unsigned int mbuf_n;
- unsigned int desc_n;
- struct rte_mbuf **pool;
- unsigned int i, k;
+ unsigned int elts_n = rxq_ctrl->rxq.elts_n;
+ unsigned int i;
struct ibv_exp_wq_attr mod;
int err;

DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq_ctrl);
- /* Number of descriptors and mbufs currently allocated. */
- desc_n = tmpl.rxq.elts_n;
- mbuf_n = desc_n;
/* From now on, any failure will render the queue unusable.
* Reinitialize WQ. */
mod = (struct ibv_exp_wq_attr){
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RESET,
};
- err = ibv_exp_modify_wq(tmpl.wq, &mod);
+ err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
if (err) {
ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
assert(err > 0);
return err;
}
- /* Allocate pool. */
- pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0);
- if (pool == NULL) {
- ERROR("%p: cannot allocate memory", (void *)dev);
- return ENOBUFS;
- }
/* Snatch mbufs from original queue. */
- k = 0;
- for (i = 0; (i != desc_n); ++i)
- pool[k++] = (*rxq_ctrl->rxq.elts)[i];
- assert(k == mbuf_n);
- rte_free(pool);
+ claim_zero(rxq_alloc_elts(rxq_ctrl, elts_n, rxq_ctrl->rxq.elts));
+ for (i = 0; i != elts_n; ++i) {
+ struct rte_mbuf *buf = (*rxq_ctrl->rxq.elts)[i];
+
+ assert(rte_mbuf_refcnt_read(buf) == 2);
+ rte_pktmbuf_free_seg(buf);
+ }
/* Change queue state to ready. */
mod = (struct ibv_exp_wq_attr){
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RDY,
};
- err = ibv_exp_modify_wq(tmpl.wq, &mod);
+ err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
if (err) {
ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
(void *)dev, strerror(err));
goto error;
}
- /* Post SGEs. */
- err = rxq_alloc_elts(&tmpl, desc_n, pool);
- if (err) {
- ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
- rte_free(pool);
- assert(err > 0);
- return err;
- }
/* Update doorbell counter. */
- rxq_ctrl->rxq.rq_ci = desc_n;
+ rxq_ctrl->rxq.rq_ci = elts_n;
rte_wmb();
*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
error:
- *rxq_ctrl = tmpl;
assert(err >= 0);
return err;
}
@@ -868,24 +850,26 @@ error:
*
* @param tmpl
* Pointer to RX queue control template.
- * @param rxq_ctrl
- * Pointer to RX queue control.
*
* @return
* 0 on success, errno value on failure.
*/
static inline int
-rxq_setup(struct rxq_ctrl *tmpl, struct rxq_ctrl *rxq_ctrl)
+rxq_setup(struct rxq_ctrl *tmpl)
{
struct ibv_cq *ibcq = tmpl->cq;
struct mlx5_cq *cq = to_mxxx(cq, cq);
struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
+ struct rte_mbuf *(*elts)[tmpl->rxq.elts_n] =
+ rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, tmpl->socket);

if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) {
ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
"it should be set to %u", RTE_CACHE_LINE_SIZE);
return EINVAL;
}
+ if (elts == NULL)
+ return ENOMEM;
tmpl->rxq.rq_db = rwq->rq.db;
tmpl->rxq.cqe_n = ibcq->cqe + 1;
tmpl->rxq.cq_ci = 0;
@@ -897,9 +881,7 @@ rxq_setup(struct rxq_ctrl *tmpl, struct rxq_ctrl *rxq_ctrl)
tmpl->rxq.cqes =
(volatile struct mlx5_cqe (*)[])
(uintptr_t)cq->active_buf->buf;
- tmpl->rxq.elts =
- (struct rte_mbuf *(*)[tmpl->rxq.elts_n])
- ((uintptr_t)rxq_ctrl + sizeof(*rxq_ctrl));
+ tmpl->rxq.elts = elts;
return 0;
}

@@ -947,6 +929,7 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
enum ibv_exp_query_intf_status status;
unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
unsigned int cqe_n = desc - 1;
+ struct rte_mbuf *(*elts)[desc] = NULL;
int ret = 0;

(void)conf; /* Thresholds configuration (ignored). */
@@ -1103,13 +1086,19 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
(void *)dev, strerror(ret));
goto error;
}
- ret = rxq_setup(&tmpl, rxq_ctrl);
+ ret = rxq_setup(&tmpl);
if (ret) {
ERROR("%p: cannot initialize RX queue structure: %s",
(void *)dev, strerror(ret));
goto error;
}
- ret = rxq_alloc_elts(&tmpl, desc, NULL);
+ /* Reuse buffers from original queue if possible. */
+ if (rxq_ctrl->rxq.elts_n) {
+ assert(rxq_ctrl->rxq.elts_n == desc);
+ assert(rxq_ctrl->rxq.elts != tmpl.rxq.elts);
+ ret = rxq_alloc_elts(&tmpl, desc, rxq_ctrl->rxq.elts);
+ } else
+ ret = rxq_alloc_elts(&tmpl, desc, NULL);
if (ret) {
ERROR("%p: RXQ allocation failed: %s",
(void *)dev, strerror(ret));
@@ -1118,6 +1107,14 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
/* Clean up rxq in case we're reinitializing it. */
DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
rxq_cleanup(rxq_ctrl);
+ /* Move mbuf pointers to dedicated storage area in RX queue. */
+ elts = (void *)(rxq_ctrl + 1);
+ rte_memcpy(elts, tmpl.rxq.elts, sizeof(*elts));
+#ifndef NDEBUG
+ memset(tmpl.rxq.elts, 0x55, sizeof(*elts));
+#endif
+ rte_free(tmpl.rxq.elts);
+ tmpl.rxq.elts = elts;
*rxq_ctrl = tmpl;
/* Update doorbell counter. */
rxq_ctrl->rxq.rq_ci = desc;
@@ -1127,7 +1124,9 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
assert(ret == 0);
return 0;
error:
+ elts = tmpl.rxq.elts;
rxq_cleanup(&tmpl);
+ rte_free(elts);
assert(ret > 0);
return ret;
}
--
2.1.4
Nelio Laranjeiro
2016-06-20 16:10:37 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

This commit brings back Rx scatter support, together with the related
handling in the MTU update function. The maximum number of segments per
packet is no longer a fixed value (previously MLX5_PMD_SGE_WR_N, set to 4
by default): that approach caused performance issues when fewer segments
were actually needed and limited the maximum packet size that could be
received with the default mbuf size (at most 8576 bytes).

These limitations are now lifted as the number of SGEs is derived from the
MTU (which implies MRU) at queue initialization and during MTU update.
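
As an illustration (the MTU and mbuf sizes below are assumptions, not values
taken from the patch), the derivation boils down to the following;
log2above() is re-implemented here so the sketch stands alone:

    #include <rte_config.h> /* RTE_PKTMBUF_HEADROOM */
    #include <rte_ether.h>  /* ETHER_HDR_LEN, ETHER_CRC_LEN */

    /* Round up to the next power-of-two exponent, like the driver helper. */
    static unsigned int
    log2above(unsigned int v)
    {
            unsigned int l;
            unsigned int r;

            for (l = 0, r = 0; (v >> 1); ++l, v >>= 1)
                    r |= (v & 1);
            return l + r;
    }

    /* Worked example: a 9000-byte MTU with 2048-byte mbuf data rooms and the
     * default 128-byte headroom gives size = 128 + 14 + 9000 + 4 = 9146 bytes,
     * i.e. 5 buffers, rounded up to 8 SGEs (sges_n = 3). */
    static unsigned int
    sges_for_mtu(unsigned int mtu, unsigned int mb_len)
    {
            unsigned int size = RTE_PKTMBUF_HEADROOM + ETHER_HDR_LEN + mtu +
                                ETHER_CRC_LEN;

            return log2above((size / mb_len) + !!(size % mb_len));
    }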

Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Vasily Philipov <***@mellanox.com>
Signed-off-by: Nelio Laranjeiro <***@6wind.com>
---
drivers/net/mlx5/mlx5_ethdev.c | 84 +++++++++++++++++++++----
drivers/net/mlx5/mlx5_rxq.c | 73 +++++++++++++++++-----
drivers/net/mlx5/mlx5_rxtx.c | 139 ++++++++++++++++++++++++-----------------
drivers/net/mlx5/mlx5_rxtx.h | 1 +
4 files changed, 215 insertions(+), 82 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 69bfe03..757f8e4 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -725,6 +725,9 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
unsigned int i;
uint16_t (*rx_func)(void *, struct rte_mbuf **, uint16_t) =
mlx5_rx_burst;
+ unsigned int max_frame_len;
+ int rehash;
+ int restart = priv->started;

if (mlx5_is_secondary())
return -E_RTE_SECONDARY;
@@ -738,7 +741,6 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
goto out;
} else
DEBUG("adapter port %u MTU set to %u", priv->port, mtu);
- priv->mtu = mtu;
/* Temporarily replace RX handler with a fake one, assuming it has not
* been copied elsewhere. */
dev->rx_pkt_burst = removed_rx_burst;
@@ -746,28 +748,88 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
* removed_rx_burst() instead. */
rte_wmb();
usleep(1000);
+ /* MTU does not include header and CRC. */
+ max_frame_len = ETHER_HDR_LEN + mtu + ETHER_CRC_LEN;
+ /* Check if at least one queue is going to need a SGE update. */
+ for (i = 0; i != priv->rxqs_n; ++i) {
+ struct rxq *rxq = (*priv->rxqs)[i];
+ unsigned int mb_len;
+ unsigned int size = RTE_PKTMBUF_HEADROOM + max_frame_len;
+ unsigned int sges_n;
+
+ if (rxq == NULL)
+ continue;
+ mb_len = rte_pktmbuf_data_room_size(rxq->mp);
+ assert(mb_len >= RTE_PKTMBUF_HEADROOM);
+ /* Determine the number of SGEs needed for a full packet
+ * and round it to the next power of two. */
+ sges_n = log2above((size / mb_len) + !!(size % mb_len));
+ if (sges_n != rxq->sges_n)
+ break;
+ }
+ /* If all queues have the right number of SGEs, a simple rehash
+ * of their buffers is enough, otherwise SGE information can only
+ * be updated in a queue by recreating it. All resources that depend
+ * on queues (flows, indirection tables) must be recreated as well in
+ * that case. */
+ rehash = (i == priv->rxqs_n);
+ if (!rehash) {
+ /* Clean up everything as with mlx5_dev_stop(). */
+ priv_special_flow_disable_all(priv);
+ priv_mac_addrs_disable(priv);
+ priv_destroy_hash_rxqs(priv);
+ priv_fdir_disable(priv);
+ priv_dev_interrupt_handler_uninstall(priv, dev);
+ }
+recover:
/* Reconfigure each RX queue. */
for (i = 0; (i != priv->rxqs_n); ++i) {
struct rxq *rxq = (*priv->rxqs)[i];
- unsigned int mb_len;
- unsigned int max_frame_len;
+ struct rxq_ctrl *rxq_ctrl =
+ container_of(rxq, struct rxq_ctrl, rxq);
int sp;
+ unsigned int mb_len;
+ unsigned int tmp;

if (rxq == NULL)
continue;
- /* Calculate new maximum frame length according to MTU and
- * toggle scattered support (sp) if necessary. */
- max_frame_len = (priv->mtu + ETHER_HDR_LEN +
- (ETHER_MAX_VLAN_FRAME_LEN - ETHER_MAX_LEN));
mb_len = rte_pktmbuf_data_room_size(rxq->mp);
assert(mb_len >= RTE_PKTMBUF_HEADROOM);
+ /* Toggle scattered support (sp) if necessary. */
sp = (max_frame_len > (mb_len - RTE_PKTMBUF_HEADROOM));
- if (sp) {
- ERROR("%p: RX scatter is not supported", (void *)dev);
- ret = ENOTSUP;
- goto out;
+ /* Provide new values to rxq_setup(). */
+ dev->data->dev_conf.rxmode.jumbo_frame = sp;
+ dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len;
+ if (rehash)
+ ret = rxq_rehash(dev, rxq_ctrl);
+ else
+ ret = rxq_ctrl_setup(dev, rxq_ctrl, rxq->elts_n,
+ rxq_ctrl->socket, NULL, rxq->mp);
+ if (!ret)
+ continue;
+ /* Attempt to roll back in case of error. */
+ tmp = (mb_len << rxq->sges_n) - RTE_PKTMBUF_HEADROOM;
+ if (max_frame_len != tmp) {
+ max_frame_len = tmp;
+ goto recover;
}
+ /* Double fault, disable RX. */
+ break;
}
+ /* Use a safe RX burst function in case of error, otherwise mimic
+ * mlx5_dev_start(). */
+ if (ret) {
+ ERROR("unable to reconfigure RX queues, RX disabled");
+ rx_func = removed_rx_burst;
+ } else if (restart &&
+ !rehash &&
+ !priv_create_hash_rxqs(priv) &&
+ !priv_rehash_flows(priv)) {
+ if (dev->data->dev_conf.fdir_conf.mode == RTE_FDIR_MODE_NONE)
+ priv_fdir_enable(priv);
+ priv_dev_interrupt_handler_install(priv, dev);
+ }
+ priv->mtu = mtu;
/* Burst functions can now be called again. */
rte_wmb();
dev->rx_pkt_burst = rx_func;
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 0a3225e..38e3caa 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -644,10 +644,11 @@ static int
rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
struct rte_mbuf *(*pool)[])
{
+ const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
unsigned int i;
int ret = 0;

- /* For each WR (packet). */
+ /* Iterate on segments. */
for (i = 0; (i != elts_n); ++i) {
struct rte_mbuf *buf;
volatile struct mlx5_wqe_data_seg *scat =
@@ -672,6 +673,9 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
assert(rte_pktmbuf_data_len(buf) == 0);
assert(rte_pktmbuf_pkt_len(buf) == 0);
assert(!buf->next);
+ /* Only the first segment keeps headroom. */
+ if (i % sges_n)
+ SET_DATA_OFF(buf, 0);
PORT(buf) = rxq_ctrl->rxq.port_id;
DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
PKT_LEN(buf) = DATA_LEN(buf);
@@ -685,8 +689,8 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
};
(*rxq_ctrl->rxq.elts)[i] = buf;
}
- DEBUG("%p: allocated and configured %u single-segment WRs",
- (void *)rxq_ctrl, elts_n);
+ DEBUG("%p: allocated and configured %u segments (max %u packets)",
+ (void *)rxq_ctrl, elts_n, elts_n / (1 << rxq_ctrl->rxq.sges_n));
assert(ret == 0);
return 0;
error:
@@ -804,7 +808,9 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
struct ibv_exp_wq_attr mod;
int err;

- DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq_ctrl);
+ DEBUG("%p: rehashing queue %p with %u SGE(s) per packet",
+ (void *)dev, (void *)rxq_ctrl, 1 << rxq_ctrl->rxq.sges_n);
+ assert(!(elts_n % (1 << rxq_ctrl->rxq.sges_n)));
/* From now on, any failure will render the queue unusable.
* Reinitialize WQ. */
mod = (struct ibv_exp_wq_attr){
@@ -837,7 +843,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
goto error;
}
/* Update doorbell counter. */
- rxq_ctrl->rxq.rq_ci = elts_n;
+ rxq_ctrl->rxq.rq_ci = elts_n >> rxq_ctrl->rxq.sges_n;
rte_wmb();
*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
error:
@@ -933,9 +939,40 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
int ret = 0;

(void)conf; /* Thresholds configuration (ignored). */
- if (desc == 0) {
- ERROR("%p: invalid number of RX descriptors (must be a"
- " multiple of 2)", (void *)dev);
+ /* Enable scattered packets support for this queue if necessary. */
+ assert(mb_len >= RTE_PKTMBUF_HEADROOM);
+ if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
+ (dev->data->dev_conf.rxmode.max_rx_pkt_len >
+ (mb_len - RTE_PKTMBUF_HEADROOM))) {
+ unsigned int size =
+ RTE_PKTMBUF_HEADROOM +
+ dev->data->dev_conf.rxmode.max_rx_pkt_len;
+ unsigned int sges_n;
+
+ /* Determine the number of SGEs needed for a full packet
+ * and round it to the next power of two. */
+ sges_n = log2above((size / mb_len) + !!(size % mb_len));
+ tmpl.rxq.sges_n = sges_n;
+ /* Make sure rxq.sges_n did not overflow. */
+ size = mb_len * (1 << tmpl.rxq.sges_n);
+ size -= RTE_PKTMBUF_HEADROOM;
+ if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
+ ERROR("%p: too many SGEs (%u) needed to handle"
+ " requested maximum packet size %u",
+ (void *)dev,
+ 1 << sges_n,
+ dev->data->dev_conf.rxmode.max_rx_pkt_len);
+ return EOVERFLOW;
+ }
+ }
+ DEBUG("%p: maximum number of segments per packet: %u",
+ (void *)dev, 1 << tmpl.rxq.sges_n);
+ if (desc % (1 << tmpl.rxq.sges_n)) {
+ ERROR("%p: number of RX queue descriptors (%u) is not a"
+ " multiple of SGEs per packet (%u)",
+ (void *)dev,
+ desc,
+ 1 << tmpl.rxq.sges_n);
return EINVAL;
}
/* Toggle RX checksum offload if hardware supports it. */
@@ -943,7 +980,6 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
if (priv->hw_csum_l2tun)
tmpl.rxq.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- (void)mb_len; /* I'll be back! */
/* Use the entire RX mempool as the memory region. */
tmpl.mr = mlx5_mp2mr(priv->pd, mp);
if (tmpl.mr == NULL) {
@@ -993,11 +1029,9 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
.wq_context = NULL, /* Could be useful in the future. */
.wq_type = IBV_EXP_WQT_RQ,
/* Max number of outstanding WRs. */
- .max_recv_wr = ((priv->device_attr.max_qp_wr < (int)desc) ?
- priv->device_attr.max_qp_wr :
- (int)desc),
+ .max_recv_wr = desc >> tmpl.rxq.sges_n,
/* Max number of scatter/gather elements in a WR. */
- .max_recv_sge = 1,
+ .max_recv_sge = 1 << tmpl.rxq.sges_n,
.pd = priv->pd,
.cq = tmpl.cq,
.comp_mask =
@@ -1049,6 +1083,17 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
(void *)dev, strerror(ret));
goto error;
}
+ /* Make sure number of WRs*SGEs match expectations since a queue
+ * cannot allocate more than "desc" buffers. */
+ if (((int)attr.wq.max_recv_wr != (desc >> tmpl.rxq.sges_n)) ||
+ ((int)attr.wq.max_recv_sge != (1 << tmpl.rxq.sges_n))) {
+ ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
+ (void *)dev,
+ (desc >> tmpl.rxq.sges_n), (1 << tmpl.rxq.sges_n),
+ attr.wq.max_recv_wr, attr.wq.max_recv_sge);
+ ret = EINVAL;
+ goto error;
+ }
/* Save port ID. */
tmpl.rxq.port_id = dev->data->port_id;
DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
@@ -1117,7 +1162,7 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
tmpl.rxq.elts = elts;
*rxq_ctrl = tmpl;
/* Update doorbell counter. */
- rxq_ctrl->rxq.rq_ci = desc;
+ rxq_ctrl->rxq.rq_ci = desc >> rxq_ctrl->rxq.sges_n;
rte_wmb();
*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 2fc57dc..71ecdcd 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1520,96 +1520,121 @@ uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct rxq *rxq = dpdk_rxq;
- unsigned int pkts_ret = 0;
- unsigned int i;
- unsigned int rq_ci = rxq->rq_ci;
- const unsigned int elts_n = rxq->elts_n;
- const unsigned int wqe_cnt = elts_n - 1;
+ const unsigned int wqe_cnt = rxq->elts_n - 1;
const unsigned int cqe_cnt = rxq->cqe_n - 1;
+ const unsigned int sges_n = rxq->sges_n;
+ struct rte_mbuf *pkt = NULL;
+ struct rte_mbuf *seg = NULL;
+ volatile struct mlx5_cqe64 *cqe =
+ &(*rxq->cqes)[rxq->cq_ci & cqe_cnt].cqe64;
+ unsigned int i = 0;
+ unsigned int rq_ci = rxq->rq_ci << sges_n;
+ int len;

- for (i = 0; (i != pkts_n); ++i) {
+ while (pkts_n) {
unsigned int idx = rq_ci & wqe_cnt;
- int len;
- struct rte_mbuf *rep;
- struct rte_mbuf *pkt;
volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
- volatile struct mlx5_cqe64 *cqe =
- &(*rxq->cqes)[rxq->cq_ci & cqe_cnt].cqe64;
+ struct rte_mbuf *rep = (*rxq->elts)[idx];

- pkt = (*rxq->elts)[idx];
+ if (pkt)
+ NEXT(seg) = rep;
+ seg = rep;
+ rte_prefetch0(seg);
rte_prefetch0(cqe);
+ rte_prefetch0(wqe);
rep = rte_mbuf_raw_alloc(rxq->mp);
if (unlikely(rep == NULL)) {
+ while (pkt) {
+ seg = NEXT(pkt);
+ rte_mbuf_refcnt_set(pkt, 0);
+ __rte_mbuf_raw_free(pkt);
+ pkt = seg;
+ }
++rxq->stats.rx_nombuf;
break;
}
- SET_DATA_OFF(rep, RTE_PKTMBUF_HEADROOM);
- NB_SEGS(rep) = 1;
- PORT(rep) = rxq->port_id;
- NEXT(rep) = NULL;
- len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt);
- if (unlikely(len == 0)) {
- rte_mbuf_refcnt_set(rep, 0);
- __rte_mbuf_raw_free(rep);
- break;
- }
- if (unlikely(len == -1)) {
- /* RX error, packet is likely too large. */
- rte_mbuf_refcnt_set(rep, 0);
- __rte_mbuf_raw_free(rep);
- ++rxq->stats.idropped;
- --i;
- goto skip;
+ if (!pkt) {
+ cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt].cqe64;
+ len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt);
+ if (len == 0) {
+ rte_mbuf_refcnt_set(rep, 0);
+ __rte_mbuf_raw_free(rep);
+ break;
+ }
+ if (unlikely(len == -1)) {
+ /* RX error, packet is likely too large. */
+ rte_mbuf_refcnt_set(rep, 0);
+ __rte_mbuf_raw_free(rep);
+ ++rxq->stats.idropped;
+ goto skip;
+ }
+ pkt = seg;
+ assert(len >= (rxq->crc_present << 2));
+ /* Update packet information. */
+ if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip |
+ rxq->crc_present) {
+ if (rxq->csum) {
+ pkt->packet_type =
+ rxq_cq_to_pkt_type(cqe);
+ pkt->ol_flags =
+ rxq_cq_to_ol_flags(rxq, cqe);
+ }
+ if (cqe->l4_hdr_type_etc &
+ MLX5_CQE_VLAN_STRIPPED) {
+ pkt->ol_flags |= PKT_RX_VLAN_PKT;
+ pkt->vlan_tci = ntohs(cqe->vlan_info);
+ }
+ if (rxq->crc_present)
+ len -= ETHER_CRC_LEN;
+ }
+ PKT_LEN(pkt) = len;
}
+ DATA_LEN(rep) = DATA_LEN(seg);
+ PKT_LEN(rep) = PKT_LEN(seg);
+ SET_DATA_OFF(rep, DATA_OFF(seg));
+ NB_SEGS(rep) = NB_SEGS(seg);
+ PORT(rep) = PORT(seg);
+ NEXT(rep) = NULL;
+ (*rxq->elts)[idx] = rep;
/* Fill NIC descriptor with the new buffer. The lkey and size
* of the buffers are already known, only the buffer address
* changes. */
- wqe->addr = htonll((uintptr_t)rep->buf_addr +
- RTE_PKTMBUF_HEADROOM);
- (*rxq->elts)[idx] = rep;
- /* Update pkt information. */
- if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip |
- rxq->crc_present) {
- if (rxq->csum) {
- pkt->packet_type = rxq_cq_to_pkt_type(cqe);
- pkt->ol_flags = rxq_cq_to_ol_flags(rxq, cqe);
- }
- if (cqe->l4_hdr_type_etc & MLX5_CQE_VLAN_STRIPPED) {
- pkt->ol_flags |= PKT_RX_VLAN_PKT;
- pkt->vlan_tci = ntohs(cqe->vlan_info);
- }
- if (rxq->crc_present)
- len -= ETHER_CRC_LEN;
+ wqe->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t));
+ if (len > DATA_LEN(seg)) {
+ len -= DATA_LEN(seg);
+ ++NB_SEGS(pkt);
+ ++rq_ci;
+ continue;
}
- PKT_LEN(pkt) = len;
- DATA_LEN(pkt) = len;
+ DATA_LEN(seg) = len;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment bytes counter. */
- rxq->stats.ibytes += len;
+ rxq->stats.ibytes += PKT_LEN(pkt);
#endif
/* Return packet. */
*(pkts++) = pkt;
- ++pkts_ret;
+ pkt = NULL;
+ --pkts_n;
+ ++i;
skip:
+ /* Align consumer index to the next stride. */
+ rq_ci >>= sges_n;
++rq_ci;
+ rq_ci <<= sges_n;
}
- if (unlikely((i == 0) && (rq_ci == rxq->rq_ci)))
+ if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
return 0;
- /* Repost WRs. */
-#ifdef DEBUG_RECV
- DEBUG("%p: reposting %u WRs", (void *)rxq, i);
-#endif
/* Update the consumer index. */
- rxq->rq_ci = rq_ci;
+ rxq->rq_ci = rq_ci >> sges_n;
rte_wmb();
*rxq->cq_db = htonl(rxq->cq_ci);
rte_wmb();
*rxq->rq_db = htonl(rxq->rq_ci);
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment packets counter. */
- rxq->stats.ipackets += pkts_ret;
+ rxq->stats.ipackets += i;
#endif
- return pkts_ret;
+ return i;
}

/**
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 41605f9..f6e2cba 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -106,6 +106,7 @@ struct rxq {
unsigned int csum_l2tun:1; /* Same for L2 tunnels. */
unsigned int vlan_strip:1; /* Enable VLAN stripping. */
unsigned int crc_present:1; /* CRC must be subtracted. */
+ unsigned int sges_n:2; /* Log 2 of SGEs (max buffers per packet). */
uint16_t rq_ci;
uint16_t cq_ci;
uint16_t elts_n;
--
2.1.4
Ferruh Yigit
2016-06-20 17:01:49 UTC
Permalink
Hi Nelio,
Post by Nelio Laranjeiro
Enhance mlx5 with a data path that bypasses Verbs.
The first half of this patchset removes support for functionality completely
rewritten in the second half (scatter/gather, inline send), while the data
path is refactored without Verbs.
The PMD remains usable during the transition.
This patchset must be applied after "Miscellaneous fixes for mlx4 and mlx5".
- Rebased patchset on top of dpdk/master.
This is a driver patch, and it should be rebased on top of
dpdk-next-net/rel_16_07.

I tried to apply it to a few branches but all failed; am I missing something?


The error log for applying to dpdk-next-net/rel_16_07:

Applying patch #14086 using 'git am'
Description: [dpdk-dev,v2,01/25] drivers: fix PCI class id support
Applying: drivers: fix PCI class id support

Applying patch #14087 using 'git am'
Description: [dpdk-dev,v2,02/25] mlx5: split memory registration function
Applying: mlx5: split memory registration function

Applying patch #14088 using 'git am'
Description: [dpdk-dev,v2,03/25] mlx5: remove Tx gather support
Applying: mlx5: remove Tx gather support

Applying patch #14089 using 'git am'
Description: [dpdk-dev,v2,04/25] mlx5: remove Rx scatter support
Applying: mlx5: remove Rx scatter support
error: patch failed: drivers/net/mlx5/mlx5_rxtx.c:502
error: drivers/net/mlx5/mlx5_rxtx.c: patch does not apply
Patch failed at 0001 mlx5: remove Rx scatter support
The copy of the patch that failed is found in:
/tmp/dpdk-b/.git/rebase-apply/patch
Post by Nelio Laranjeiro
- Fixed CQE size on Power8.
- Fixed mbuf assertion failure in debug mode.
- Fixed missing class_id field in rte_pci_id by using RTE_PCI_DEVICE.
Nélio Laranjeiro
2016-06-21 06:44:46 UTC
Permalink
Hi Ferruh,
Post by Ferruh Yigit
Hi Nelio,
Post by Nelio Laranjeiro
Enhance mlx5 with a data path that bypasses Verbs.
The first half of this patchset removes support for functionality completely
rewritten in the second half (scatter/gather, inline send), while the data
path is refactored without Verbs.
The PMD remains usable during the transition.
This patchset must be applied after "Miscellaneous fixes for mlx4 and mlx5".
- Rebased patchset on top of dpdk/master.
This is a driver patch, and it should be rebased on top of
dpdk-next-net/rel_16_07.
I just applied it on this branch; in fact some of the patches fail to
apply.
Post by Ferruh Yigit
I tried to apply it to a few branches but all failed; am I missing something?
No, I missed something: I did not notice it should be rebased on top of
rel_16_07.
Post by Ferruh Yigit
Applying patch #14086 using 'git am'
Description: [dpdk-dev,v2,01/25] drivers: fix PCI class id support
Applying: drivers: fix PCI class id support
Applying patch #14087 using 'git am'
Description: [dpdk-dev,v2,02/25] mlx5: split memory registration function
Applying: mlx5: split memory registration function
Applying patch #14088 using 'git am'
Description: [dpdk-dev,v2,03/25] mlx5: remove Tx gather support
Applying: mlx5: remove Tx gather support
Applying patch #14089 using 'git am'
Description: [dpdk-dev,v2,04/25] mlx5: remove Rx scatter support
Applying: mlx5: remove Rx scatter support
error: patch failed: drivers/net/mlx5/mlx5_rxtx.c:502
error: drivers/net/mlx5/mlx5_rxtx.c: patch does not apply
Patch failed at 0001 mlx5: remove Rx scatter support
/tmp/dpdk-b/.git/rebase-apply/patch
I will prepare a v3 on top of this branch.
--
Nélio Laranjeiro
6WIND
Nelio Laranjeiro
2016-06-21 07:23:13 UTC
Permalink
Enhance mlx5 with a data path that bypasses Verbs.

The first half of this patchset removes support for functionality completely
rewritten in the second half (scatter/gather, inline send), while the data
path is refactored without Verbs.

The PMD remains usable during the transition.

This patchset must be applied after "Miscellaneous fixes for mlx4 and mlx5".

Changes in v3:
- Rebased patchset on top of next-net/rel_16_07.

Changes in v2:
- Rebased patchset on top of dpdk/master.
- Fixed CQE size on Power8.
- Fixed mbuf assertion failure in debug mode.
- Fixed missing class_id field in rte_pci_id by using RTE_PCI_DEVICE.

Adrien Mazarguil (8):
mlx5: replace countdown with threshold for Tx completions
mlx5: add debugging information about Tx queues capabilities
mlx5: check remaining space while processing Tx burst
mlx5: resurrect Tx gather support
mlx5: work around spurious compilation errors
mlx5: remove redundant Rx queue initialization code
mlx5: make Rx queue reinitialization safer
mlx5: resurrect Rx scatter support

Nelio Laranjeiro (16):
drivers: fix PCI class id support
mlx5: split memory registration function
mlx5: remove Tx gather support
mlx5: remove Rx scatter support
mlx5: remove configuration variable
mlx5: remove inline Tx support
mlx5: split Tx queue structure
mlx5: split Rx queue structure
mlx5: update prerequisites for upcoming enhancements
mlx5: add definitions for data path without Verbs
mlx5: add support for configuration through kvargs
mlx5: add Tx/Rx burst function selection wrapper
mlx5: refactor Rx data path
mlx5: refactor Tx data path
mlx5: handle Rx CQE compression
mlx5: add support for multi-packet send

Yaacov Hazan (1):
mlx5: add support for inline send

config/common_base | 2 -
doc/guides/nics/mlx5.rst | 94 +-
drivers/crypto/qat/rte_qat_cryptodev.c | 5 +-
drivers/net/mlx4/mlx4.c | 18 +-
drivers/net/mlx5/Makefile | 49 +-
drivers/net/mlx5/mlx5.c | 182 ++-
drivers/net/mlx5/mlx5.h | 10 +
drivers/net/mlx5/mlx5_defs.h | 26 +-
drivers/net/mlx5/mlx5_ethdev.c | 188 ++-
drivers/net/mlx5/mlx5_fdir.c | 20 +-
drivers/net/mlx5/mlx5_mr.c | 280 ++++
drivers/net/mlx5/mlx5_prm.h | 163 +++
drivers/net/mlx5/mlx5_rxmode.c | 8 -
drivers/net/mlx5/mlx5_rxq.c | 762 ++++-------
drivers/net/mlx5/mlx5_rxtx.c | 2210 +++++++++++++++++++-------------
drivers/net/mlx5/mlx5_rxtx.h | 176 ++-
drivers/net/mlx5/mlx5_txq.c | 368 +++---
drivers/net/mlx5/mlx5_vlan.c | 6 +-
drivers/net/nfp/nfp_net.c | 12 +-
19 files changed, 2624 insertions(+), 1955 deletions(-)
create mode 100644 drivers/net/mlx5/mlx5_mr.c
create mode 100644 drivers/net/mlx5/mlx5_prm.h
--
2.1.4
Nelio Laranjeiro
2016-06-21 07:23:15 UTC
Permalink
Except for the first time when memory registration occurs, the lkey is
always cached. Since memory registration is slow and performs system calls,
performance can be improved by moving that code to its own function outside
of the data path so only the lookup code is left in the original inlined
function.
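
The split can be summarized by the following minimal sketch (structure and
function names are simplified for illustration, they are not the driver's):
the inlined fast path only scans the per-queue cache, while registration is
deferred to a non-inlined helper that runs only on a cache miss.

    #include <stddef.h>
    #include <stdint.h>

    #define MR_CACHE_SIZE 8 /* arbitrary size for the sketch */

    struct mr_cache_entry {
            const void *mp;  /* memory pool this entry maps */
            uint32_t lkey;   /* cached lkey */
    };

    /* Slow path placeholder: the real helper registers the pool with
     * ibv_reg_mr() and evicts the oldest entry when the table is full. */
    static uint32_t
    register_mp_slow(struct mr_cache_entry *cache, unsigned int idx,
                     const void *mp)
    {
            if (idx == MR_CACHE_SIZE)
                    --idx; /* table full: the real code evicts the oldest
                            * entry; the sketch simply reuses the last slot. */
            cache[idx].mp = mp;
            cache[idx].lkey = 42; /* stand-in for mr->lkey */
            return cache[idx].lkey;
    }

    /* Fast path: inlined lookup, registration only on a cache miss. */
    static inline uint32_t
    lookup_lkey(struct mr_cache_entry *cache, const void *mp)
    {
            unsigned int i;
            uint32_t lkey = (uint32_t)-1;

            for (i = 0; i != MR_CACHE_SIZE; ++i) {
                    if (cache[i].mp == NULL)
                            break; /* unknown pool, free slot available */
                    if (cache[i].mp == mp) {
                            lkey = cache[i].lkey;
                            break;
                    }
            }
            if (lkey == (uint32_t)-1)
                    lkey = register_mp_slow(cache, i, mp);
            return lkey;
    }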

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/Makefile | 1 +
drivers/net/mlx5/mlx5_mr.c | 277 +++++++++++++++++++++++++++++++++++++++++++
drivers/net/mlx5/mlx5_rxtx.c | 209 ++------------------------------
drivers/net/mlx5/mlx5_rxtx.h | 8 +-
4 files changed, 295 insertions(+), 200 deletions(-)
create mode 100644 drivers/net/mlx5/mlx5_mr.c

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 82558aa..999ada5 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -47,6 +47,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_vlan.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_stats.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rss.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_fdir.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mr.c

# Dependencies.
DEPDIRS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += lib/librte_ether
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
new file mode 100644
index 0000000..7c3e87f
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -0,0 +1,277 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2016 6WIND S.A.
+ * Copyright 2016 Mellanox.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-pedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-pedantic"
+#endif
+
+/* DPDK headers don't like -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-pedantic"
+#endif
+#include <rte_mempool.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-pedantic"
+#endif
+
+#include "mlx5.h"
+#include "mlx5_rxtx.h"
+
+struct mlx5_check_mempool_data {
+ int ret;
+ char *start;
+ char *end;
+};
+
+/* Called by mlx5_check_mempool() when iterating the memory chunks. */
+static void mlx5_check_mempool_cb(struct rte_mempool *mp,
+ void *opaque, struct rte_mempool_memhdr *memhdr,
+ unsigned mem_idx)
+{
+ struct mlx5_check_mempool_data *data = opaque;
+
+ (void)mp;
+ (void)mem_idx;
+
+ /* It already failed, skip the next chunks. */
+ if (data->ret != 0)
+ return;
+ /* It is the first chunk. */
+ if (data->start == NULL && data->end == NULL) {
+ data->start = memhdr->addr;
+ data->end = data->start + memhdr->len;
+ return;
+ }
+ if (data->end == memhdr->addr) {
+ data->end += memhdr->len;
+ return;
+ }
+ if (data->start == (char *)memhdr->addr + memhdr->len) {
+ data->start -= memhdr->len;
+ return;
+ }
+ /* Error, mempool is not virtually contiguous. */
+ data->ret = -1;
+}
+
+/**
+ * Check if a mempool can be used: it must be virtually contiguous.
+ *
+ * @param[in] mp
+ * Pointer to memory pool.
+ * @param[out] start
+ * Pointer to the start address of the mempool virtual memory area
+ * @param[out] end
+ * Pointer to the end address of the mempool virtual memory area
+ *
+ * @return
+ * 0 on success (mempool is virtually contiguous), -1 on error.
+ */
+static int mlx5_check_mempool(struct rte_mempool *mp, uintptr_t *start,
+ uintptr_t *end)
+{
+ struct mlx5_check_mempool_data data;
+
+ memset(&data, 0, sizeof(data));
+ rte_mempool_mem_iter(mp, mlx5_check_mempool_cb, &data);
+ *start = (uintptr_t)data.start;
+ *end = (uintptr_t)data.end;
+
+ return data.ret;
+}
+
+/**
+ * Register mempool as a memory region.
+ *
+ * @param pd
+ * Pointer to protection domain.
+ * @param mp
+ * Pointer to memory pool.
+ *
+ * @return
+ * Memory region pointer, NULL in case of error.
+ */
+struct ibv_mr *
+mlx5_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
+{
+ const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+ uintptr_t start;
+ uintptr_t end;
+ unsigned int i;
+
+ if (mlx5_check_mempool(mp, &start, &end) != 0) {
+ ERROR("mempool %p: not virtually contiguous",
+ (void *)mp);
+ return NULL;
+ }
+
+ DEBUG("mempool %p area start=%p end=%p size=%zu",
+ (void *)mp, (void *)start, (void *)end,
+ (size_t)(end - start));
+ /* Round start and end to page boundary if found in memory segments. */
+ for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
+ uintptr_t addr = (uintptr_t)ms[i].addr;
+ size_t len = ms[i].len;
+ unsigned int align = ms[i].hugepage_sz;
+
+ if ((start > addr) && (start < addr + len))
+ start = RTE_ALIGN_FLOOR(start, align);
+ if ((end > addr) && (end < addr + len))
+ end = RTE_ALIGN_CEIL(end, align);
+ }
+ DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
+ (void *)mp, (void *)start, (void *)end,
+ (size_t)(end - start));
+ return ibv_reg_mr(pd,
+ (void *)start,
+ end - start,
+ IBV_ACCESS_LOCAL_WRITE);
+}
+
+/**
+ * Register a Memory Region (MR) <-> Memory Pool (MP) association in
+ * txq->mp2mr[]. If mp2mr[] is full, remove an entry first.
+ *
+ * This function should only be called by txq_mp2mr().
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param[in] mp
+ * Memory Pool for which a Memory Region lkey must be returned.
+ * @param idx
+ * Index of the next available entry.
+ *
+ * @return
+ * mr->lkey on success, (uint32_t)-1 on failure.
+ */
+uint32_t
+txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx)
+{
+ struct ibv_mr *mr;
+
+ /* Add a new entry, register MR first. */
+ DEBUG("%p: discovered new memory pool \"%s\" (%p)",
+ (void *)txq, mp->name, (void *)mp);
+ mr = mlx5_mp2mr(txq->priv->pd, mp);
+ if (unlikely(mr == NULL)) {
+ DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
+ (void *)txq);
+ return (uint32_t)-1;
+ }
+ if (unlikely(idx == RTE_DIM(txq->mp2mr))) {
+ /* Table is full, remove oldest entry. */
+ DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
+ (void *)txq);
+ --idx;
+ claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
+ memmove(&txq->mp2mr[0], &txq->mp2mr[1],
+ (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
+ }
+ /* Store the new entry. */
+ txq->mp2mr[idx].mp = mp;
+ txq->mp2mr[idx].mr = mr;
+ txq->mp2mr[idx].lkey = mr->lkey;
+ DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
+ (void *)txq, mp->name, (void *)mp, txq->mp2mr[idx].lkey);
+ return txq->mp2mr[idx].lkey;
+}
+
+struct txq_mp2mr_mbuf_check_data {
+ int ret;
+};
+
+/**
+ * Callback function for rte_mempool_obj_iter() to check whether a given
+ * mempool object looks like a mbuf.
+ *
+ * @param[in] mp
+ * The mempool pointer
+ * @param[in] arg
+ * Context data (struct txq_mp2mr_mbuf_check_data). Contains the
+ * return value.
+ * @param[in] obj
+ * Object address.
+ * @param index
+ * Object index, unused.
+ */
+static void
+txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj,
+ uint32_t index __rte_unused)
+{
+ struct txq_mp2mr_mbuf_check_data *data = arg;
+ struct rte_mbuf *buf = obj;
+
+ /* Check whether mbuf structure fits element size and whether mempool
+ * pointer is valid. */
+ if (sizeof(*buf) > mp->elt_size || buf->pool != mp)
+ data->ret = -1;
+}
+
+/**
+ * Iterator function for rte_mempool_walk() to register existing mempools and
+ * fill the MP to MR cache of a TX queue.
+ *
+ * @param[in] mp
+ * Memory Pool to register.
+ * @param *arg
+ * Pointer to TX queue structure.
+ */
+void
+txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
+{
+ struct txq *txq = arg;
+ struct txq_mp2mr_mbuf_check_data data = {
+ .ret = 0,
+ };
+ unsigned int i;
+
+ /* Register mempool only if the first element looks like a mbuf. */
+ if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 ||
+ data.ret == -1)
+ return;
+ for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
+ if (unlikely(txq->mp2mr[i].mp == NULL)) {
+ /* Unknown MP, add a new MR for it. */
+ break;
+ }
+ if (txq->mp2mr[i].mp == mp)
+ return;
+ }
+ txq_mp2mr_reg(txq, mp, i);
+}
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 9cb1dfa..616cf7a 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -140,121 +140,6 @@ txq_complete(struct txq *txq)
return 0;
}

-struct mlx5_check_mempool_data {
- int ret;
- char *start;
- char *end;
-};
-
-/* Called by mlx5_check_mempool() when iterating the memory chunks. */
-static void mlx5_check_mempool_cb(struct rte_mempool *mp,
- void *opaque, struct rte_mempool_memhdr *memhdr,
- unsigned mem_idx)
-{
- struct mlx5_check_mempool_data *data = opaque;
-
- (void)mp;
- (void)mem_idx;
-
- /* It already failed, skip the next chunks. */
- if (data->ret != 0)
- return;
- /* It is the first chunk. */
- if (data->start == NULL && data->end == NULL) {
- data->start = memhdr->addr;
- data->end = data->start + memhdr->len;
- return;
- }
- if (data->end == memhdr->addr) {
- data->end += memhdr->len;
- return;
- }
- if (data->start == (char *)memhdr->addr + memhdr->len) {
- data->start -= memhdr->len;
- return;
- }
- /* Error, mempool is not virtually contigous. */
- data->ret = -1;
-}
-
-/**
- * Check if a mempool can be used: it must be virtually contiguous.
- *
- * @param[in] mp
- * Pointer to memory pool.
- * @param[out] start
- * Pointer to the start address of the mempool virtual memory area
- * @param[out] end
- * Pointer to the end address of the mempool virtual memory area
- *
- * @return
- * 0 on success (mempool is virtually contiguous), -1 on error.
- */
-static int mlx5_check_mempool(struct rte_mempool *mp, uintptr_t *start,
- uintptr_t *end)
-{
- struct mlx5_check_mempool_data data;
-
- memset(&data, 0, sizeof(data));
- rte_mempool_mem_iter(mp, mlx5_check_mempool_cb, &data);
- *start = (uintptr_t)data.start;
- *end = (uintptr_t)data.end;
-
- return data.ret;
-}
-
-/* For best performance, this function should not be inlined. */
-struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, struct rte_mempool *)
- __attribute__((noinline));
-
-/**
- * Register mempool as a memory region.
- *
- * @param pd
- * Pointer to protection domain.
- * @param mp
- * Pointer to memory pool.
- *
- * @return
- * Memory region pointer, NULL in case of error.
- */
-struct ibv_mr *
-mlx5_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
-{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
- uintptr_t start;
- uintptr_t end;
- unsigned int i;
-
- if (mlx5_check_mempool(mp, &start, &end) != 0) {
- ERROR("mempool %p: not virtually contiguous",
- (void *)mp);
- return NULL;
- }
-
- DEBUG("mempool %p area start=%p end=%p size=%zu",
- (void *)mp, (void *)start, (void *)end,
- (size_t)(end - start));
- /* Round start and end to page boundary if found in memory segments. */
- for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
- uintptr_t addr = (uintptr_t)ms[i].addr;
- size_t len = ms[i].len;
- unsigned int align = ms[i].hugepage_sz;
-
- if ((start > addr) && (start < addr + len))
- start = RTE_ALIGN_FLOOR(start, align);
- if ((end > addr) && (end < addr + len))
- end = RTE_ALIGN_CEIL(end, align);
- }
- DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
- (void *)mp, (void *)start, (void *)end,
- (size_t)(end - start));
- return ibv_reg_mr(pd,
- (void *)start,
- end - start,
- IBV_ACCESS_LOCAL_WRITE);
-}
-
/**
* Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
* the cloned mbuf is allocated is returned instead.
@@ -273,6 +158,10 @@ txq_mb2mp(struct rte_mbuf *buf)
return buf->pool;
}

+static inline uint32_t
+txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+ __attribute__((always_inline));
+
/**
* Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
* Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
@@ -286,11 +175,11 @@ txq_mb2mp(struct rte_mbuf *buf)
* @return
* mr->lkey on success, (uint32_t)-1 on failure.
*/
-static uint32_t
+static inline uint32_t
txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
{
unsigned int i;
- struct ibv_mr *mr;
+ uint32_t lkey = (uint32_t)-1;

for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
if (unlikely(txq->mp2mr[i].mp == NULL)) {
@@ -300,89 +189,13 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
if (txq->mp2mr[i].mp == mp) {
assert(txq->mp2mr[i].lkey != (uint32_t)-1);
assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
- return txq->mp2mr[i].lkey;
+ lkey = txq->mp2mr[i].lkey;
+ break;
}
}
- /* Add a new entry, register MR first. */
- DEBUG("%p: discovered new memory pool \"%s\" (%p)",
- (void *)txq, mp->name, (void *)mp);
- mr = mlx5_mp2mr(txq->priv->pd, mp);
- if (unlikely(mr == NULL)) {
- DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
- (void *)txq);
- return (uint32_t)-1;
- }
- if (unlikely(i == RTE_DIM(txq->mp2mr))) {
- /* Table is full, remove oldest entry. */
- DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
- (void *)txq);
- --i;
- claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
- memmove(&txq->mp2mr[0], &txq->mp2mr[1],
- (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
- }
- /* Store the new entry. */
- txq->mp2mr[i].mp = mp;
- txq->mp2mr[i].mr = mr;
- txq->mp2mr[i].lkey = mr->lkey;
- DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
- (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
- return txq->mp2mr[i].lkey;
-}
-
-struct txq_mp2mr_mbuf_check_data {
- int ret;
-};
-
-/**
- * Callback function for rte_mempool_obj_iter() to check whether a given
- * mempool object looks like a mbuf.
- *
- * @param[in] mp
- * The mempool pointer
- * @param[in] arg
- * Context data (struct txq_mp2mr_mbuf_check_data). Contains the
- * return value.
- * @param[in] obj
- * Object address.
- * @param index
- * Object index, unused.
- */
-static void
-txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj,
- uint32_t index __rte_unused)
-{
- struct txq_mp2mr_mbuf_check_data *data = arg;
- struct rte_mbuf *buf = obj;
-
- /* Check whether mbuf structure fits element size and whether mempool
- * pointer is valid. */
- if (sizeof(*buf) > mp->elt_size || buf->pool != mp)
- data->ret = -1;
-}
-
-/**
- * Iterator function for rte_mempool_walk() to register existing mempools and
- * fill the MP to MR cache of a TX queue.
- *
- * @param[in] mp
- * Memory Pool to register.
- * @param *arg
- * Pointer to TX queue structure.
- */
-void
-txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
-{
- struct txq *txq = arg;
- struct txq_mp2mr_mbuf_check_data data = {
- .ret = 0,
- };
-
- /* Register mempool only if the first element looks like a mbuf. */
- if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 ||
- data.ret == -1)
- return;
- txq_mp2mr(txq, mp);
+ if (unlikely(lkey == (uint32_t)-1))
+ lkey = txq_mp2mr_reg(txq, mp, i);
+ return lkey;
}

/**
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 47f6299..462eddf 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -337,12 +337,16 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);

/* mlx5_rxtx.c */

-struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, struct rte_mempool *);
-void txq_mp2mr_iter(struct rte_mempool *, void *);
uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_rx_burst_sp(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);

+/* mlx5_mr.c */
+
+struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, struct rte_mempool *);
+void txq_mp2mr_iter(struct rte_mempool *, void *);
+uint32_t txq_mp2mr_reg(struct txq *, struct rte_mempool *, unsigned int);
+
#endif /* RTE_PMD_MLX5_RXTX_H_ */
--
2.1.4
Nelio Laranjeiro
2016-06-21 07:23:14 UTC
Permalink
Several drivers initialize their rte_pci_id tables field by field and
therefore leave the recently added class_id member unset. Switch them to
the RTE_PCI_DEVICE() macro so that every field, including class_id, is
filled in.

Fixes: 701c8d80c820 ("pci: support class id probing")

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
---
drivers/crypto/qat/rte_qat_cryptodev.c | 5 +----
drivers/net/mlx4/mlx4.c | 18 ++++++------------
drivers/net/mlx5/mlx5.c | 24 ++++++++----------------
drivers/net/nfp/nfp_net.c | 12 ++++--------
4 files changed, 19 insertions(+), 40 deletions(-)

diff --git a/drivers/crypto/qat/rte_qat_cryptodev.c b/drivers/crypto/qat/rte_qat_cryptodev.c
index a7912f5..f46ec85 100644
--- a/drivers/crypto/qat/rte_qat_cryptodev.c
+++ b/drivers/crypto/qat/rte_qat_cryptodev.c
@@ -69,10 +69,7 @@ static struct rte_cryptodev_ops crypto_qat_ops = {

static struct rte_pci_id pci_id_qat_map[] = {
{
- .vendor_id = 0x8086,
- .device_id = 0x0443,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(0x8086, 0x0443),
},
{.device_id = 0},
};
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index 9e94630..6228688 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -5807,22 +5807,16 @@ error:

static const struct rte_pci_id mlx4_pci_id_map[] = {
{
- .vendor_id = PCI_VENDOR_ID_MELLANOX,
- .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX3,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX3)
},
{
- .vendor_id = PCI_VENDOR_ID_MELLANOX,
- .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO)
},
{
- .vendor_id = PCI_VENDOR_ID_MELLANOX,
- .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX3VF,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX3VF)
},
{
.vendor_id = 0
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 67a541c..350028b 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -610,28 +610,20 @@ error:

static const struct rte_pci_id mlx5_pci_id_map[] = {
{
- .vendor_id = PCI_VENDOR_ID_MELLANOX,
- .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX4,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX4)
},
{
- .vendor_id = PCI_VENDOR_ID_MELLANOX,
- .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX4VF,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX4VF)
},
{
- .vendor_id = PCI_VENDOR_ID_MELLANOX,
- .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX4LX,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX4LX)
},
{
- .vendor_id = PCI_VENDOR_ID_MELLANOX,
- .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF)
},
{
.vendor_id = 0
diff --git a/drivers/net/nfp/nfp_net.c b/drivers/net/nfp/nfp_net.c
index ea5a2a3..dd0c559 100644
--- a/drivers/net/nfp/nfp_net.c
+++ b/drivers/net/nfp/nfp_net.c
@@ -2446,16 +2446,12 @@ nfp_net_init(struct rte_eth_dev *eth_dev)

static struct rte_pci_id pci_id_nfp_net_map[] = {
{
- .vendor_id = PCI_VENDOR_ID_NETRONOME,
- .device_id = PCI_DEVICE_ID_NFP6000_PF_NIC,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID,
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_NETRONOME,
+ PCI_DEVICE_ID_NFP6000_PF_NIC)
},
{
- .vendor_id = PCI_VENDOR_ID_NETRONOME,
- .device_id = PCI_DEVICE_ID_NFP6000_VF_NIC,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID,
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_NETRONOME,
+ PCI_DEVICE_ID_NFP6000_VF_NIC)
},
{
.vendor_id = 0,
--
2.1.4
Nelio Laranjeiro
2016-06-21 07:23:16 UTC
Permalink
This is done to prepare for bypassing Verbs entirely in the data path
as a performance improvement. TX gather cannot be maintained during the
transition and will be reimplemented later.

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_ethdev.c | 2 +-
drivers/net/mlx5/mlx5_rxtx.c | 315 ++++++++---------------------------------
drivers/net/mlx5/mlx5_rxtx.h | 17 ---
drivers/net/mlx5/mlx5_txq.c | 49 ++-----
4 files changed, 69 insertions(+), 314 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 0a881b6..280a90a 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1260,7 +1260,7 @@ mlx5_secondary_data_setup(struct priv *priv)
if (txq != NULL) {
if (txq_setup(priv->dev,
txq,
- primary_txq->elts_n * MLX5_PMD_SGE_WR_N,
+ primary_txq->elts_n,
primary_txq->socket,
NULL) == 0) {
txq->stats.idx = primary_txq->stats.idx;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 616cf7a..6e184c3 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -228,156 +228,6 @@ insert_vlan_sw(struct rte_mbuf *buf)
return 0;
}

-#if MLX5_PMD_SGE_WR_N > 1
-
-/**
- * Copy scattered mbuf contents to a single linear buffer.
- *
- * @param[out] linear
- * Linear output buffer.
- * @param[in] buf
- * Scattered input buffer.
- *
- * @return
- * Number of bytes copied to the output buffer or 0 if not large enough.
- */
-static unsigned int
-linearize_mbuf(linear_t *linear, struct rte_mbuf *buf)
-{
- unsigned int size = 0;
- unsigned int offset;
-
- do {
- unsigned int len = DATA_LEN(buf);
-
- offset = size;
- size += len;
- if (unlikely(size > sizeof(*linear)))
- return 0;
- memcpy(&(*linear)[offset],
- rte_pktmbuf_mtod(buf, uint8_t *),
- len);
- buf = NEXT(buf);
- } while (buf != NULL);
- return size;
-}
-
-/**
- * Handle scattered buffers for mlx5_tx_burst().
- *
- * @param txq
- * TX queue structure.
- * @param segs
- * Number of segments in buf.
- * @param elt
- * TX queue element to fill.
- * @param[in] buf
- * Buffer to process.
- * @param elts_head
- * Index of the linear buffer to use if necessary (normally txq->elts_head).
- * @param[out] sges
- * Array filled with SGEs on success.
- *
- * @return
- * A structure containing the processed packet size in bytes and the
- * number of SGEs. Both fields are set to (unsigned int)-1 in case of
- * failure.
- */
-static struct tx_burst_sg_ret {
- unsigned int length;
- unsigned int num;
-}
-tx_burst_sg(struct txq *txq, unsigned int segs, struct txq_elt *elt,
- struct rte_mbuf *buf, unsigned int elts_head,
- struct ibv_sge (*sges)[MLX5_PMD_SGE_WR_N])
-{
- unsigned int sent_size = 0;
- unsigned int j;
- int linearize = 0;
-
- /* When there are too many segments, extra segments are
- * linearized in the last SGE. */
- if (unlikely(segs > RTE_DIM(*sges))) {
- segs = (RTE_DIM(*sges) - 1);
- linearize = 1;
- }
- /* Update element. */
- elt->buf = buf;
- /* Register segments as SGEs. */
- for (j = 0; (j != segs); ++j) {
- struct ibv_sge *sge = &(*sges)[j];
- uint32_t lkey;
-
- /* Retrieve Memory Region key for this memory pool. */
- lkey = txq_mp2mr(txq, txq_mb2mp(buf));
- if (unlikely(lkey == (uint32_t)-1)) {
- /* MR does not exist. */
- DEBUG("%p: unable to get MP <-> MR association",
- (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
- /* Update SGE. */
- sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
- if (txq->priv->sriov)
- rte_prefetch0((volatile void *)
- (uintptr_t)sge->addr);
- sge->length = DATA_LEN(buf);
- sge->lkey = lkey;
- sent_size += sge->length;
- buf = NEXT(buf);
- }
- /* If buf is not NULL here and is not going to be linearized,
- * nb_segs is not valid. */
- assert(j == segs);
- assert((buf == NULL) || (linearize));
- /* Linearize extra segments. */
- if (linearize) {
- struct ibv_sge *sge = &(*sges)[segs];
- linear_t *linear = &(*txq->elts_linear)[elts_head];
- unsigned int size = linearize_mbuf(linear, buf);
-
- assert(segs == (RTE_DIM(*sges) - 1));
- if (size == 0) {
- /* Invalid packet. */
- DEBUG("%p: packet too large to be linearized.",
- (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
- /* If MLX5_PMD_SGE_WR_N is 1, free mbuf immediately. */
- if (RTE_DIM(*sges) == 1) {
- do {
- struct rte_mbuf *next = NEXT(buf);
-
- rte_pktmbuf_free_seg(buf);
- buf = next;
- } while (buf != NULL);
- elt->buf = NULL;
- }
- /* Update SGE. */
- sge->addr = (uintptr_t)&(*linear)[0];
- sge->length = size;
- sge->lkey = txq->mr_linear->lkey;
- sent_size += size;
- /* Include last segment. */
- segs++;
- }
- return (struct tx_burst_sg_ret){
- .length = sent_size,
- .num = segs,
- };
-stop:
- return (struct tx_burst_sg_ret){
- .length = -1,
- .num = -1,
- };
-}
-
-#endif /* MLX5_PMD_SGE_WR_N > 1 */
-
/**
* DPDK callback for TX.
*
@@ -424,14 +274,14 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
unsigned int elts_head_next =
(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
struct txq_elt *elt = &(*txq->elts)[elts_head];
- unsigned int segs = NB_SEGS(buf);
-#ifdef MLX5_PMD_SOFT_COUNTERS
- unsigned int sent_size = 0;
-#endif
uint32_t send_flags = 0;
#ifdef HAVE_VERBS_VLAN_INSERTION
int insert_vlan = 0;
#endif /* HAVE_VERBS_VLAN_INSERTION */
+ uintptr_t addr;
+ uint32_t length;
+ uint32_t lkey;
+ uintptr_t buf_next_addr;

if (i + 1 < max)
rte_prefetch0(buf_next);
@@ -464,126 +314,81 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
goto stop;
}
}
- if (likely(segs == 1)) {
- uintptr_t addr;
- uint32_t length;
- uint32_t lkey;
- uintptr_t buf_next_addr;
-
- /* Retrieve buffer information. */
- addr = rte_pktmbuf_mtod(buf, uintptr_t);
- length = DATA_LEN(buf);
- /* Update element. */
- elt->buf = buf;
- if (txq->priv->sriov)
- rte_prefetch0((volatile void *)
- (uintptr_t)addr);
- /* Prefetch next buffer data. */
- if (i + 1 < max) {
- buf_next_addr =
- rte_pktmbuf_mtod(buf_next, uintptr_t);
- rte_prefetch0((volatile void *)
- (uintptr_t)buf_next_addr);
- }
- /* Put packet into send queue. */
+ /* Retrieve buffer information. */
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ length = DATA_LEN(buf);
+ /* Update element. */
+ elt->buf = buf;
+ if (txq->priv->sriov)
+ rte_prefetch0((volatile void *)
+ (uintptr_t)addr);
+ /* Prefetch next buffer data. */
+ if (i + 1 < max) {
+ buf_next_addr =
+ rte_pktmbuf_mtod(buf_next, uintptr_t);
+ rte_prefetch0((volatile void *)
+ (uintptr_t)buf_next_addr);
+ }
+ /* Put packet into send queue. */
#if MLX5_PMD_MAX_INLINE > 0
- if (length <= txq->max_inline) {
+ if (length <= txq->max_inline) {
#ifdef HAVE_VERBS_VLAN_INSERTION
- if (insert_vlan)
- err = txq->send_pending_inline_vlan
- (txq->qp,
- (void *)addr,
- length,
- send_flags,
- &buf->vlan_tci);
- else
-#endif /* HAVE_VERBS_VLAN_INSERTION */
- err = txq->send_pending_inline
- (txq->qp,
- (void *)addr,
- length,
- send_flags);
- } else
-#endif
- {
- /* Retrieve Memory Region key for this
- * memory pool. */
- lkey = txq_mp2mr(txq, txq_mb2mp(buf));
- if (unlikely(lkey == (uint32_t)-1)) {
- /* MR does not exist. */
- DEBUG("%p: unable to get MP <-> MR"
- " association", (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
-#ifdef HAVE_VERBS_VLAN_INSERTION
- if (insert_vlan)
- err = txq->send_pending_vlan
- (txq->qp,
- addr,
- length,
- lkey,
- send_flags,
- &buf->vlan_tci);
- else
+ if (insert_vlan)
+ err = txq->send_pending_inline_vlan
+ (txq->qp,
+ (void *)addr,
+ length,
+ send_flags,
+ &buf->vlan_tci);
+ else
#endif /* HAVE_VERBS_VLAN_INSERTION */
- err = txq->send_pending
- (txq->qp,
- addr,
- length,
- lkey,
- send_flags);
- }
- if (unlikely(err))
- goto stop;
-#ifdef MLX5_PMD_SOFT_COUNTERS
- sent_size += length;
+ err = txq->send_pending_inline
+ (txq->qp,
+ (void *)addr,
+ length,
+ send_flags);
+ } else
#endif
- } else {
-#if MLX5_PMD_SGE_WR_N > 1
- struct ibv_sge sges[MLX5_PMD_SGE_WR_N];
- struct tx_burst_sg_ret ret;
-
- ret = tx_burst_sg(txq, segs, elt, buf, elts_head,
- &sges);
- if (ret.length == (unsigned int)-1)
+ {
+ /* Retrieve Memory Region key for this
+ * memory pool. */
+ lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+ if (unlikely(lkey == (uint32_t)-1)) {
+ /* MR does not exist. */
+ DEBUG("%p: unable to get MP <-> MR"
+ " association", (void *)txq);
+ /* Clean up TX element. */
+ elt->buf = NULL;
goto stop;
- /* Put SG list into send queue. */
+ }
#ifdef HAVE_VERBS_VLAN_INSERTION
if (insert_vlan)
- err = txq->send_pending_sg_list_vlan
+ err = txq->send_pending_vlan
(txq->qp,
- sges,
- ret.num,
+ addr,
+ length,
+ lkey,
send_flags,
&buf->vlan_tci);
else
#endif /* HAVE_VERBS_VLAN_INSERTION */
- err = txq->send_pending_sg_list
+ err = txq->send_pending
(txq->qp,
- sges,
- ret.num,
+ addr,
+ length,
+ lkey,
send_flags);
- if (unlikely(err))
- goto stop;
-#ifdef MLX5_PMD_SOFT_COUNTERS
- sent_size += ret.length;
-#endif
-#else /* MLX5_PMD_SGE_WR_N > 1 */
- DEBUG("%p: TX scattered buffers support not"
- " compiled in", (void *)txq);
- goto stop;
-#endif /* MLX5_PMD_SGE_WR_N > 1 */
}
- elts_head = elts_head_next;
- buf = buf_next;
+ if (unlikely(err))
+ goto stop;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent bytes counter. */
- txq->stats.obytes += sent_size;
+ txq->stats.obytes += length;
#endif
- }
stop:
+ elts_head = elts_head_next;
+ buf = buf_next;
+ }
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 462eddf..8358ccb 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -242,14 +242,6 @@ struct txq_elt {
struct rte_mbuf *buf;
};

-/* Linear buffer type. It is used when transmitting buffers with too many
- * segments that do not fit the hardware queue (see max_send_sge).
- * Extra segments are copied (linearized) in such buffers, replacing the
- * last SGE during TX.
- * The size is arbitrary but large enough to hold a jumbo frame with
- * 8 segments considering mbuf.buf_len is about 2048 bytes. */
-typedef uint8_t linear_t[16384];
-
/* TX queue descriptor. */
struct txq {
struct priv *priv; /* Back pointer to private data. */
@@ -264,12 +256,6 @@ struct txq {
int (*send_pending_inline_vlan)();
#endif
#endif
-#if MLX5_PMD_SGE_WR_N > 1
- int (*send_pending_sg_list)();
-#ifdef HAVE_VERBS_VLAN_INSERTION
- int (*send_pending_sg_list_vlan)();
-#endif
-#endif
int (*send_flush)(struct ibv_qp *qp);
struct ibv_cq *cq; /* Completion Queue. */
struct ibv_qp *qp; /* Queue Pair. */
@@ -289,9 +275,6 @@ struct txq {
uint32_t lkey; /* mr->lkey */
} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
struct mlx5_txq_stats stats; /* TX queue counters. */
- /* Elements used only for init part are here. */
- linear_t (*elts_linear)[]; /* Linearized buffers. */
- struct ibv_mr *mr_linear; /* Memory Region for linearized buffers. */
#ifdef HAVE_VERBS_VLAN_INSERTION
struct ibv_exp_qp_burst_family_v1 *if_qp; /* QP burst interface. */
#else
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index e20df21..5a248c9 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -82,26 +82,13 @@ txq_alloc_elts(struct txq *txq, unsigned int elts_n)
unsigned int i;
struct txq_elt (*elts)[elts_n] =
rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq->socket);
- linear_t (*elts_linear)[elts_n] =
- rte_calloc_socket("TXQ", 1, sizeof(*elts_linear), 0,
- txq->socket);
- struct ibv_mr *mr_linear = NULL;
int ret = 0;

- if ((elts == NULL) || (elts_linear == NULL)) {
+ if (elts == NULL) {
ERROR("%p: can't allocate packets array", (void *)txq);
ret = ENOMEM;
goto error;
}
- mr_linear =
- ibv_reg_mr(txq->priv->pd, elts_linear, sizeof(*elts_linear),
- IBV_ACCESS_LOCAL_WRITE);
- if (mr_linear == NULL) {
- ERROR("%p: unable to configure MR, ibv_reg_mr() failed",
- (void *)txq);
- ret = EINVAL;
- goto error;
- }
for (i = 0; (i != elts_n); ++i) {
struct txq_elt *elt = &(*elts)[i];

@@ -119,15 +106,9 @@ txq_alloc_elts(struct txq *txq, unsigned int elts_n)
((MLX5_PMD_TX_PER_COMP_REQ < (elts_n / 4)) ?
MLX5_PMD_TX_PER_COMP_REQ : (elts_n / 4));
txq->elts_comp_cd = txq->elts_comp_cd_init;
- txq->elts_linear = elts_linear;
- txq->mr_linear = mr_linear;
assert(ret == 0);
return 0;
error:
- if (mr_linear != NULL)
- claim_zero(ibv_dereg_mr(mr_linear));
-
- rte_free(elts_linear);
rte_free(elts);

DEBUG("%p: failed, freed everything", (void *)txq);
@@ -148,8 +129,6 @@ txq_free_elts(struct txq *txq)
unsigned int elts_head = txq->elts_head;
unsigned int elts_tail = txq->elts_tail;
struct txq_elt (*elts)[elts_n] = txq->elts;
- linear_t (*elts_linear)[elts_n] = txq->elts_linear;
- struct ibv_mr *mr_linear = txq->mr_linear;

DEBUG("%p: freeing WRs", (void *)txq);
txq->elts_n = 0;
@@ -159,12 +138,7 @@ txq_free_elts(struct txq *txq)
txq->elts_comp_cd = 0;
txq->elts_comp_cd_init = 0;
txq->elts = NULL;
- txq->elts_linear = NULL;
- txq->mr_linear = NULL;
- if (mr_linear != NULL)
- claim_zero(ibv_dereg_mr(mr_linear));

- rte_free(elts_linear);
if (elts == NULL)
return;
while (elts_tail != elts_head) {
@@ -286,12 +260,14 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
int ret = 0;

(void)conf; /* Thresholds configuration (ignored). */
- if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
- ERROR("%p: invalid number of TX descriptors (must be a"
- " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
+ if (desc == 0) {
+ ERROR("%p: invalid number of TX descriptors", (void *)dev);
+ return EINVAL;
+ }
+ if (MLX5_PMD_SGE_WR_N > 1) {
+ ERROR("%p: TX gather is not supported", (void *)dev);
return EINVAL;
}
- desc /= MLX5_PMD_SGE_WR_N;
/* MRs will be registered in mp2mr[] later. */
attr.rd = (struct ibv_exp_res_domain_init_attr){
.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
@@ -332,10 +308,7 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
priv->device_attr.max_qp_wr :
desc),
/* Max number of scatter/gather elements in a WR. */
- .max_send_sge = ((priv->device_attr.max_sge <
- MLX5_PMD_SGE_WR_N) ?
- priv->device_attr.max_sge :
- MLX5_PMD_SGE_WR_N),
+ .max_send_sge = 1,
#if MLX5_PMD_MAX_INLINE > 0
.max_inline_data = MLX5_PMD_MAX_INLINE,
#endif
@@ -440,12 +413,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
txq->send_pending_inline_vlan = txq->if_qp->send_pending_inline_vlan;
#endif
#endif
-#if MLX5_PMD_SGE_WR_N > 1
- txq->send_pending_sg_list = txq->if_qp->send_pending_sg_list;
-#ifdef HAVE_VERBS_VLAN_INSERTION
- txq->send_pending_sg_list_vlan = txq->if_qp->send_pending_sg_list_vlan;
-#endif
-#endif
txq->send_pending = txq->if_qp->send_pending;
#ifdef HAVE_VERBS_VLAN_INSERTION
txq->send_pending_vlan = txq->if_qp->send_pending_vlan;
--
2.1.4
Nelio Laranjeiro
2016-06-21 07:23:17 UTC
Permalink
This is done in preparation for bypassing Verbs entirely in the data path
as a performance improvement. RX scatter cannot be maintained during the
transition and will be reimplemented later.
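For context, once scatter support is gone each RX element maps exactly one
mbuf to one scatter/gather entry. A minimal sketch of that mapping, assuming
the rxq_elt layout from this patch and the driver headers in scope (the
helper name rxq_elt_fill_sge is made up for illustration, it is not part of
the patch):

	/* Illustration only: bind one mbuf to the single SGE of an RX element. */
	static void
	rxq_elt_fill_sge(struct rxq_elt *elt, struct rte_mbuf *buf, uint32_t lkey)
	{
		elt->buf = buf;
		/* Data starts after the headroom reserved by rte_pktmbuf_alloc(). */
		elt->sge.addr = rte_pktmbuf_mtod(buf, uintptr_t);
		elt->sge.length = rte_pktmbuf_tailroom(buf);
		elt->sge.lkey = lkey;
	}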

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_ethdev.c | 31 +---
drivers/net/mlx5/mlx5_rxq.c | 314 ++++++-----------------------------------
drivers/net/mlx5/mlx5_rxtx.c | 211 +--------------------------
drivers/net/mlx5/mlx5_rxtx.h | 13 +-
4 files changed, 53 insertions(+), 516 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 280a90a..ca57021 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -623,8 +623,7 @@ mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)

};

- if (dev->rx_pkt_burst == mlx5_rx_burst ||
- dev->rx_pkt_burst == mlx5_rx_burst_sp)
+ if (dev->rx_pkt_burst == mlx5_rx_burst)
return ptypes;
return NULL;
}
@@ -762,19 +761,11 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
mb_len = rte_pktmbuf_data_room_size(rxq->mp);
assert(mb_len >= RTE_PKTMBUF_HEADROOM);
sp = (max_frame_len > (mb_len - RTE_PKTMBUF_HEADROOM));
- /* Provide new values to rxq_setup(). */
- dev->data->dev_conf.rxmode.jumbo_frame = sp;
- dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len;
- ret = rxq_rehash(dev, rxq);
- if (ret) {
- /* Force SP RX if that queue requires it and abort. */
- if (rxq->sp)
- rx_func = mlx5_rx_burst_sp;
- break;
+ if (sp) {
+ ERROR("%p: RX scatter is not supported", (void *)dev);
+ ret = ENOTSUP;
+ goto out;
}
- /* Scattered burst function takes priority. */
- if (rxq->sp)
- rx_func = mlx5_rx_burst_sp;
}
/* Burst functions can now be called again. */
rte_wmb();
@@ -1103,22 +1094,12 @@ priv_set_link(struct priv *priv, int up)
{
struct rte_eth_dev *dev = priv->dev;
int err;
- unsigned int i;

if (up) {
err = priv_set_flags(priv, ~IFF_UP, IFF_UP);
if (err)
return err;
- for (i = 0; i < priv->rxqs_n; i++)
- if ((*priv->rxqs)[i]->sp)
- break;
- /* Check if an sp queue exists.
- * Note: Some old frames might be received.
- */
- if (i == priv->rxqs_n)
- dev->rx_pkt_burst = mlx5_rx_burst;
- else
- dev->rx_pkt_burst = mlx5_rx_burst_sp;
+ dev->rx_pkt_burst = mlx5_rx_burst;
dev->tx_pkt_burst = mlx5_tx_burst;
} else {
err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP);
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 0bcf55b..38ff9fd 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -634,145 +634,6 @@ priv_rehash_flows(struct priv *priv)
}

/**
- * Allocate RX queue elements with scattered packets support.
- *
- * @param rxq
- * Pointer to RX queue structure.
- * @param elts_n
- * Number of elements to allocate.
- * @param[in] pool
- * If not NULL, fetch buffers from this array instead of allocating them
- * with rte_pktmbuf_alloc().
- *
- * @return
- * 0 on success, errno value on failure.
- */
-static int
-rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n,
- struct rte_mbuf **pool)
-{
- unsigned int i;
- struct rxq_elt_sp (*elts)[elts_n] =
- rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
- rxq->socket);
- int ret = 0;
-
- if (elts == NULL) {
- ERROR("%p: can't allocate packets array", (void *)rxq);
- ret = ENOMEM;
- goto error;
- }
- /* For each WR (packet). */
- for (i = 0; (i != elts_n); ++i) {
- unsigned int j;
- struct rxq_elt_sp *elt = &(*elts)[i];
- struct ibv_sge (*sges)[RTE_DIM(elt->sges)] = &elt->sges;
-
- /* These two arrays must have the same size. */
- assert(RTE_DIM(elt->sges) == RTE_DIM(elt->bufs));
- /* For each SGE (segment). */
- for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
- struct ibv_sge *sge = &(*sges)[j];
- struct rte_mbuf *buf;
-
- if (pool != NULL) {
- buf = *(pool++);
- assert(buf != NULL);
- rte_pktmbuf_reset(buf);
- } else
- buf = rte_pktmbuf_alloc(rxq->mp);
- if (buf == NULL) {
- assert(pool == NULL);
- ERROR("%p: empty mbuf pool", (void *)rxq);
- ret = ENOMEM;
- goto error;
- }
- elt->bufs[j] = buf;
- /* Headroom is reserved by rte_pktmbuf_alloc(). */
- assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
- /* Buffer is supposed to be empty. */
- assert(rte_pktmbuf_data_len(buf) == 0);
- assert(rte_pktmbuf_pkt_len(buf) == 0);
- /* sge->addr must be able to store a pointer. */
- assert(sizeof(sge->addr) >= sizeof(uintptr_t));
- if (j == 0) {
- /* The first SGE keeps its headroom. */
- sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
- sge->length = (buf->buf_len -
- RTE_PKTMBUF_HEADROOM);
- } else {
- /* Subsequent SGEs lose theirs. */
- assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
- SET_DATA_OFF(buf, 0);
- sge->addr = (uintptr_t)buf->buf_addr;
- sge->length = buf->buf_len;
- }
- sge->lkey = rxq->mr->lkey;
- /* Redundant check for tailroom. */
- assert(sge->length == rte_pktmbuf_tailroom(buf));
- }
- }
- DEBUG("%p: allocated and configured %u WRs (%zu segments)",
- (void *)rxq, elts_n, (elts_n * RTE_DIM((*elts)[0].sges)));
- rxq->elts_n = elts_n;
- rxq->elts_head = 0;
- rxq->elts.sp = elts;
- assert(ret == 0);
- return 0;
-error:
- if (elts != NULL) {
- assert(pool == NULL);
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- unsigned int j;
- struct rxq_elt_sp *elt = &(*elts)[i];
-
- for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
- struct rte_mbuf *buf = elt->bufs[j];
-
- if (buf != NULL)
- rte_pktmbuf_free_seg(buf);
- }
- }
- rte_free(elts);
- }
- DEBUG("%p: failed, freed everything", (void *)rxq);
- assert(ret > 0);
- return ret;
-}
-
-/**
- * Free RX queue elements with scattered packets support.
- *
- * @param rxq
- * Pointer to RX queue structure.
- */
-static void
-rxq_free_elts_sp(struct rxq *rxq)
-{
- unsigned int i;
- unsigned int elts_n = rxq->elts_n;
- struct rxq_elt_sp (*elts)[elts_n] = rxq->elts.sp;
-
- DEBUG("%p: freeing WRs", (void *)rxq);
- rxq->elts_n = 0;
- rxq->elts.sp = NULL;
- if (elts == NULL)
- return;
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- unsigned int j;
- struct rxq_elt_sp *elt = &(*elts)[i];
-
- for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
- struct rte_mbuf *buf = elt->bufs[j];
-
- if (buf != NULL)
- rte_pktmbuf_free_seg(buf);
- }
- }
- rte_free(elts);
-}
-
-/**
* Allocate RX queue elements.
*
* @param rxq
@@ -838,7 +699,7 @@ rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
(void *)rxq, elts_n);
rxq->elts_n = elts_n;
rxq->elts_head = 0;
- rxq->elts.no_sp = elts;
+ rxq->elts = elts;
assert(ret == 0);
return 0;
error:
@@ -869,11 +730,11 @@ rxq_free_elts(struct rxq *rxq)
{
unsigned int i;
unsigned int elts_n = rxq->elts_n;
- struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp;
+ struct rxq_elt (*elts)[elts_n] = rxq->elts;

DEBUG("%p: freeing WRs", (void *)rxq);
rxq->elts_n = 0;
- rxq->elts.no_sp = NULL;
+ rxq->elts = NULL;
if (elts == NULL)
return;
for (i = 0; (i != RTE_DIM(*elts)); ++i) {
@@ -900,10 +761,7 @@ rxq_cleanup(struct rxq *rxq)
struct ibv_exp_release_intf_params params;

DEBUG("cleaning up %p", (void *)rxq);
- if (rxq->sp)
- rxq_free_elts_sp(rxq);
- else
- rxq_free_elts(rxq);
+ rxq_free_elts(rxq);
rxq->poll = NULL;
rxq->recv = NULL;
if (rxq->if_wq != NULL) {
@@ -973,12 +831,12 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
struct rte_mbuf **pool;
unsigned int i, k;
struct ibv_exp_wq_attr mod;
- unsigned int mb_len = rte_pktmbuf_data_room_size(rxq->mp);
+ struct rxq_elt (*elts)[tmpl.elts_n];
int err;

DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
/* Number of descriptors and mbufs currently allocated. */
- desc_n = (tmpl.elts_n * (tmpl.sp ? MLX5_PMD_SGE_WR_N : 1));
+ desc_n = tmpl.elts_n;
mbuf_n = desc_n;
/* Toggle RX checksum offload if hardware supports it. */
if (priv->hw_csum) {
@@ -989,22 +847,6 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
rxq->csum_l2tun = tmpl.csum_l2tun;
}
- /* Enable scattered packets support for this queue if necessary. */
- assert(mb_len >= RTE_PKTMBUF_HEADROOM);
- if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
- (dev->data->dev_conf.rxmode.max_rx_pkt_len >
- (mb_len - RTE_PKTMBUF_HEADROOM))) {
- tmpl.sp = 1;
- desc_n /= MLX5_PMD_SGE_WR_N;
- } else
- tmpl.sp = 0;
- DEBUG("%p: %s scattered packets support (%u WRs)",
- (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc_n);
- /* If scatter mode is the same as before, nothing to do. */
- if (tmpl.sp == rxq->sp) {
- DEBUG("%p: nothing to do", (void *)dev);
- return 0;
- }
/* From now on, any failure will render the queue unusable.
* Reinitialize WQ. */
mod = (struct ibv_exp_wq_attr){
@@ -1025,35 +867,18 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
}
/* Snatch mbufs from original queue. */
k = 0;
- if (rxq->sp) {
- struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- struct rxq_elt_sp *elt = &(*elts)[i];
- unsigned int j;
-
- for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
- assert(elt->bufs[j] != NULL);
- pool[k++] = elt->bufs[j];
- }
- }
- } else {
- struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct rte_mbuf *buf = elt->buf;
+ elts = rxq->elts;
+ for (i = 0; (i != RTE_DIM(*elts)); ++i) {
+ struct rxq_elt *elt = &(*elts)[i];
+ struct rte_mbuf *buf = elt->buf;

- pool[k++] = buf;
- }
+ pool[k++] = buf;
}
assert(k == mbuf_n);
tmpl.elts_n = 0;
- tmpl.elts.sp = NULL;
- assert((void *)&tmpl.elts.sp == (void *)&tmpl.elts.no_sp);
- err = ((tmpl.sp) ?
- rxq_alloc_elts_sp(&tmpl, desc_n, pool) :
- rxq_alloc_elts(&tmpl, desc_n, pool));
+ tmpl.elts = NULL;
+ assert(tmpl.elts == NULL);
+ err = rxq_alloc_elts(&tmpl, desc_n, pool);
if (err) {
ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
rte_free(pool);
@@ -1061,12 +886,11 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
return err;
}
assert(tmpl.elts_n == desc_n);
- assert(tmpl.elts.sp != NULL);
rte_free(pool);
/* Clean up original data. */
rxq->elts_n = 0;
- rte_free(rxq->elts.sp);
- rxq->elts.sp = NULL;
+ rte_free(rxq->elts);
+ rxq->elts = NULL;
/* Change queue state to ready. */
mod = (struct ibv_exp_wq_attr){
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
@@ -1080,28 +904,14 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
}
/* Post SGEs. */
assert(tmpl.if_wq != NULL);
- if (tmpl.sp) {
- struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- err = tmpl.if_wq->recv_sg_list
- (tmpl.wq,
- (*elts)[i].sges,
- RTE_DIM((*elts)[i].sges));
- if (err)
- break;
- }
- } else {
- struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- err = tmpl.if_wq->recv_burst(
- tmpl.wq,
- &(*elts)[i].sge,
- 1);
- if (err)
- break;
- }
+ elts = tmpl.elts;
+ for (i = 0; (i != RTE_DIM(*elts)); ++i) {
+ err = tmpl.if_wq->recv_burst(
+ tmpl.wq,
+ &(*elts)[i].sge,
+ 1);
+ if (err)
+ break;
}
if (err) {
ERROR("%p: failed to post SGEs with error %d",
@@ -1110,10 +920,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
err = EIO;
goto error;
}
- if (tmpl.sp)
- tmpl.recv = tmpl.if_wq->recv_sg_list;
- else
- tmpl.recv = tmpl.if_wq->recv_burst;
+ tmpl.recv = tmpl.if_wq->recv_burst;
error:
*rxq = tmpl;
assert(err >= 0);
@@ -1159,31 +966,26 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
} attr;
enum ibv_exp_query_intf_status status;
unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
+ struct rxq_elt (*elts)[desc];
int ret = 0;
unsigned int i;
unsigned int cq_size = desc;

(void)conf; /* Thresholds configuration (ignored). */
- if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
- ERROR("%p: invalid number of RX descriptors (must be a"
- " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
+ if (desc == 0) {
+ ERROR("%p: invalid number of RX descriptors", (void *)dev);
return EINVAL;
}
+ if (MLX5_PMD_SGE_WR_N > 1) {
+ ERROR("%p: RX scatter is not supported", (void *)dev);
+ return ENOTSUP;
+ }
/* Toggle RX checksum offload if hardware supports it. */
if (priv->hw_csum)
tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
if (priv->hw_csum_l2tun)
tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- /* Enable scattered packets support for this queue if necessary. */
- assert(mb_len >= RTE_PKTMBUF_HEADROOM);
- if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
- (dev->data->dev_conf.rxmode.max_rx_pkt_len >
- (mb_len - RTE_PKTMBUF_HEADROOM))) {
- tmpl.sp = 1;
- desc /= MLX5_PMD_SGE_WR_N;
- }
- DEBUG("%p: %s scattered packets support (%u WRs)",
- (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
+ (void)mb_len; /* I'll be back! */
/* Use the entire RX mempool as the memory region. */
tmpl.mr = mlx5_mp2mr(priv->pd, mp);
if (tmpl.mr == NULL) {
@@ -1232,10 +1034,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
priv->device_attr.max_qp_wr :
(int)cq_size),
/* Max number of scatter/gather elements in a WR. */
- .max_recv_sge = ((priv->device_attr.max_sge <
- MLX5_PMD_SGE_WR_N) ?
- priv->device_attr.max_sge :
- MLX5_PMD_SGE_WR_N),
+ .max_recv_sge = 1,
.pd = priv->pd,
.cq = tmpl.cq,
.comp_mask =
@@ -1297,10 +1096,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
(void *)dev, strerror(ret));
goto error;
}
- if (tmpl.sp)
- ret = rxq_alloc_elts_sp(&tmpl, desc, NULL);
- else
- ret = rxq_alloc_elts(&tmpl, desc, NULL);
+ ret = rxq_alloc_elts(&tmpl, desc, NULL);
if (ret) {
ERROR("%p: RXQ allocation failed: %s",
(void *)dev, strerror(ret));
@@ -1346,28 +1142,14 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
goto error;
}
/* Post SGEs. */
- if (tmpl.sp) {
- struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- ret = tmpl.if_wq->recv_sg_list
- (tmpl.wq,
- (*elts)[i].sges,
- RTE_DIM((*elts)[i].sges));
- if (ret)
- break;
- }
- } else {
- struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- ret = tmpl.if_wq->recv_burst(
- tmpl.wq,
- &(*elts)[i].sge,
- 1);
- if (ret)
- break;
- }
+ elts = tmpl.elts;
+ for (i = 0; (i != RTE_DIM(*elts)); ++i) {
+ ret = tmpl.if_wq->recv_burst(
+ tmpl.wq,
+ &(*elts)[i].sge,
+ 1);
+ if (ret)
+ break;
}
if (ret) {
ERROR("%p: failed to post SGEs with error %d",
@@ -1388,10 +1170,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
rxq->poll = rxq->if_cq->poll_length_flags;
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
- if (rxq->sp)
- rxq->recv = rxq->if_wq->recv_sg_list;
- else
- rxq->recv = rxq->if_wq->recv_burst;
+ rxq->recv = rxq->if_wq->recv_burst;
return 0;
error:
rxq_cleanup(&tmpl);
@@ -1466,10 +1245,7 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
(void *)dev, (void *)rxq);
(*priv->rxqs)[idx] = rxq;
/* Update receive callback. */
- if (rxq->sp)
- dev->rx_pkt_burst = mlx5_rx_burst_sp;
- else
- dev->rx_pkt_burst = mlx5_rx_burst;
+ dev->rx_pkt_burst = mlx5_rx_burst;
}
priv_unlock(priv);
return -ret;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 6e184c3..07d95eb 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -502,215 +502,8 @@ rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
}

/**
- * DPDK callback for RX with scattered packets support.
- *
- * @param dpdk_rxq
- * Generic pointer to RX queue structure.
- * @param[out] pkts
- * Array to store received packets.
- * @param pkts_n
- * Maximum number of packets in array.
- *
- * @return
- * Number of packets successfully received (<= pkts_n).
- */
-uint16_t
-mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
- struct rxq *rxq = (struct rxq *)dpdk_rxq;
- struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
- const unsigned int elts_n = rxq->elts_n;
- unsigned int elts_head = rxq->elts_head;
- unsigned int i;
- unsigned int pkts_ret = 0;
- int ret;
-
- if (unlikely(!rxq->sp))
- return mlx5_rx_burst(dpdk_rxq, pkts, pkts_n);
- if (unlikely(elts == NULL)) /* See RTE_DEV_CMD_SET_MTU. */
- return 0;
- for (i = 0; (i != pkts_n); ++i) {
- struct rxq_elt_sp *elt = &(*elts)[elts_head];
- unsigned int len;
- unsigned int pkt_buf_len;
- struct rte_mbuf *pkt_buf = NULL; /* Buffer returned in pkts. */
- struct rte_mbuf **pkt_buf_next = &pkt_buf;
- unsigned int seg_headroom = RTE_PKTMBUF_HEADROOM;
- unsigned int j = 0;
- uint32_t flags;
- uint16_t vlan_tci;
-
- /* Sanity checks. */
- assert(elts_head < rxq->elts_n);
- assert(rxq->elts_head < rxq->elts_n);
- ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
- if (unlikely(ret < 0)) {
- struct ibv_wc wc;
- int wcs_n;
-
- DEBUG("rxq=%p, poll_length() failed (ret=%d)",
- (void *)rxq, ret);
- /* ibv_poll_cq() must be used in case of failure. */
- wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
- if (unlikely(wcs_n == 0))
- break;
- if (unlikely(wcs_n < 0)) {
- DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
- (void *)rxq, wcs_n);
- break;
- }
- assert(wcs_n == 1);
- if (unlikely(wc.status != IBV_WC_SUCCESS)) {
- /* Whatever, just repost the offending WR. */
- DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
- " completion status (%d): %s",
- (void *)rxq, wc.wr_id, wc.status,
- ibv_wc_status_str(wc.status));
-#ifdef MLX5_PMD_SOFT_COUNTERS
- /* Increment dropped packets counter. */
- ++rxq->stats.idropped;
-#endif
- goto repost;
- }
- ret = wc.byte_len;
- }
- if (ret == 0)
- break;
- assert(ret >= (rxq->crc_present << 2));
- len = ret - (rxq->crc_present << 2);
- pkt_buf_len = len;
- /*
- * Replace spent segments with new ones, concatenate and
- * return them as pkt_buf.
- */
- while (1) {
- struct ibv_sge *sge = &elt->sges[j];
- struct rte_mbuf *seg = elt->bufs[j];
- struct rte_mbuf *rep;
- unsigned int seg_tailroom;
-
- assert(seg != NULL);
- /*
- * Fetch initial bytes of packet descriptor into a
- * cacheline while allocating rep.
- */
- rte_prefetch0(seg);
- rep = rte_mbuf_raw_alloc(rxq->mp);
- if (unlikely(rep == NULL)) {
- /*
- * Unable to allocate a replacement mbuf,
- * repost WR.
- */
- DEBUG("rxq=%p: can't allocate a new mbuf",
- (void *)rxq);
- if (pkt_buf != NULL) {
- *pkt_buf_next = NULL;
- rte_pktmbuf_free(pkt_buf);
- }
- /* Increment out of memory counters. */
- ++rxq->stats.rx_nombuf;
- ++rxq->priv->dev->data->rx_mbuf_alloc_failed;
- goto repost;
- }
-#ifndef NDEBUG
- /* Poison user-modifiable fields in rep. */
- NEXT(rep) = (void *)((uintptr_t)-1);
- SET_DATA_OFF(rep, 0xdead);
- DATA_LEN(rep) = 0xd00d;
- PKT_LEN(rep) = 0xdeadd00d;
- NB_SEGS(rep) = 0x2a;
- PORT(rep) = 0x2a;
- rep->ol_flags = -1;
-#endif
- assert(rep->buf_len == seg->buf_len);
- /* Reconfigure sge to use rep instead of seg. */
- assert(sge->lkey == rxq->mr->lkey);
- sge->addr = ((uintptr_t)rep->buf_addr + seg_headroom);
- elt->bufs[j] = rep;
- ++j;
- /* Update pkt_buf if it's the first segment, or link
- * seg to the previous one and update pkt_buf_next. */
- *pkt_buf_next = seg;
- pkt_buf_next = &NEXT(seg);
- /* Update seg information. */
- seg_tailroom = (seg->buf_len - seg_headroom);
- assert(sge->length == seg_tailroom);
- SET_DATA_OFF(seg, seg_headroom);
- if (likely(len <= seg_tailroom)) {
- /* Last segment. */
- DATA_LEN(seg) = len;
- PKT_LEN(seg) = len;
- /* Sanity check. */
- assert(rte_pktmbuf_headroom(seg) ==
- seg_headroom);
- assert(rte_pktmbuf_tailroom(seg) ==
- (seg_tailroom - len));
- break;
- }
- DATA_LEN(seg) = seg_tailroom;
- PKT_LEN(seg) = seg_tailroom;
- /* Sanity check. */
- assert(rte_pktmbuf_headroom(seg) == seg_headroom);
- assert(rte_pktmbuf_tailroom(seg) == 0);
- /* Fix len and clear headroom for next segments. */
- len -= seg_tailroom;
- seg_headroom = 0;
- }
- /* Update head and tail segments. */
- *pkt_buf_next = NULL;
- assert(pkt_buf != NULL);
- assert(j != 0);
- NB_SEGS(pkt_buf) = j;
- PORT(pkt_buf) = rxq->port_id;
- PKT_LEN(pkt_buf) = pkt_buf_len;
- if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
- pkt_buf->packet_type = rxq_cq_to_pkt_type(flags);
- pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
- if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
- pkt_buf->ol_flags |= PKT_RX_VLAN_PKT;
- pkt_buf->vlan_tci = vlan_tci;
- }
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
- }
-
- /* Return packet. */
- *(pkts++) = pkt_buf;
- ++pkts_ret;
-#ifdef MLX5_PMD_SOFT_COUNTERS
- /* Increment bytes counter. */
- rxq->stats.ibytes += pkt_buf_len;
-#endif
-repost:
- ret = rxq->recv(rxq->wq, elt->sges, RTE_DIM(elt->sges));
- if (unlikely(ret)) {
- /* Inability to repost WRs is fatal. */
- DEBUG("%p: recv_sg_list(): failed (ret=%d)",
- (void *)rxq->priv,
- ret);
- abort();
- }
- if (++elts_head >= elts_n)
- elts_head = 0;
- continue;
- }
- if (unlikely(i == 0))
- return 0;
- rxq->elts_head = elts_head;
-#ifdef MLX5_PMD_SOFT_COUNTERS
- /* Increment packets counter. */
- rxq->stats.ipackets += pkts_ret;
-#endif
- return pkts_ret;
-}
-
-/**
* DPDK callback for RX.
*
- * The following function is the same as mlx5_rx_burst_sp(), except it doesn't
- * manage scattered packets. Improves performance when MRU is lower than the
- * size of the first segment.
- *
* @param dpdk_rxq
* Generic pointer to RX queue structure.
* @param[out] pkts
@@ -725,7 +518,7 @@ uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct rxq *rxq = (struct rxq *)dpdk_rxq;
- struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
+ struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
const unsigned int elts_n = rxq->elts_n;
unsigned int elts_head = rxq->elts_head;
struct ibv_sge sges[pkts_n];
@@ -733,8 +526,6 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
unsigned int pkts_ret = 0;
int ret;

- if (unlikely(rxq->sp))
- return mlx5_rx_burst_sp(dpdk_rxq, pkts, pkts_n);
for (i = 0; (i != pkts_n); ++i) {
struct rxq_elt *elt = &(*elts)[elts_head];
unsigned int len;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 8358ccb..2e1f83b 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -81,12 +81,6 @@ struct mlx5_txq_stats {
uint64_t odropped; /**< Total of packets not sent when TX ring full. */
};

-/* RX element (scattered packets). */
-struct rxq_elt_sp {
- struct ibv_sge sges[MLX5_PMD_SGE_WR_N]; /* Scatter/Gather Elements. */
- struct rte_mbuf *bufs[MLX5_PMD_SGE_WR_N]; /* SGEs buffers. */
-};
-
/* RX element. */
struct rxq_elt {
struct ibv_sge sge; /* Scatter/Gather Element. */
@@ -112,15 +106,11 @@ struct rxq {
unsigned int port_id; /* Port ID for incoming packets. */
unsigned int elts_n; /* (*elts)[] length. */
unsigned int elts_head; /* Current index in (*elts)[]. */
- unsigned int sp:1; /* Use scattered RX elements. */
unsigned int csum:1; /* Enable checksum offloading. */
unsigned int csum_l2tun:1; /* Same for L2 tunnels. */
unsigned int vlan_strip:1; /* Enable VLAN stripping. */
unsigned int crc_present:1; /* CRC must be subtracted. */
- union {
- struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */
- struct rxq_elt (*no_sp)[]; /* RX elements. */
- } elts;
+ struct rxq_elt (*elts)[]; /* RX elements. */
unsigned int socket; /* CPU socket ID for allocations. */
struct mlx5_rxq_stats stats; /* RX queue counters. */
struct ibv_exp_res_domain *rd; /* Resource Domain. */
@@ -321,7 +311,6 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
/* mlx5_rxtx.c */

uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
-uint16_t mlx5_rx_burst_sp(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
--
2.1.4
Nelio Laranjeiro
2016-06-21 07:23:18 UTC
Permalink
Now that scatter/gather support is gone, CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N
no longer serves any purpose and can be removed.

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
config/common_base | 1 -
doc/guides/nics/mlx5.rst | 7 -------
drivers/net/mlx5/Makefile | 4 ----
drivers/net/mlx5/mlx5_defs.h | 5 -----
drivers/net/mlx5/mlx5_rxq.c | 4 ----
drivers/net/mlx5/mlx5_txq.c | 4 ----
6 files changed, 25 deletions(-)

diff --git a/config/common_base b/config/common_base
index ead5984..39e6333 100644
--- a/config/common_base
+++ b/config/common_base
@@ -207,7 +207,6 @@ CONFIG_RTE_LIBRTE_MLX4_SOFT_COUNTERS=1
#
CONFIG_RTE_LIBRTE_MLX5_PMD=n
CONFIG_RTE_LIBRTE_MLX5_DEBUG=n
-CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N=4
CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE=0
CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE=8

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index d9196d1..84c35a0 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -114,13 +114,6 @@ These options can be modified in the ``.config`` file.
adds additional run-time checks and debugging messages at the cost of
lower performance.

-- ``CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N`` (default **4**)
-
- Number of scatter/gather elements (SGEs) per work request (WR). Lowering
- this number improves performance but also limits the ability to receive
- scattered packets (packets that do not fit a single mbuf). The default
- value is a safe tradeoff.
-
- ``CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE`` (default **0**)

Amount of data to be inlined during TX operations. Improves latency.
diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 999ada5..656a6e1 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -86,10 +86,6 @@ else
CFLAGS += -DNDEBUG -UPEDANTIC
endif

-ifdef CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N
-CFLAGS += -DMLX5_PMD_SGE_WR_N=$(CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N)
-endif
-
ifdef CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE
CFLAGS += -DMLX5_PMD_MAX_INLINE=$(CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE)
endif
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 09207d9..da1c90e 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -54,11 +54,6 @@
/* RSS Indirection table size. */
#define RSS_INDIRECTION_TABLE_SIZE 256

-/* Maximum number of Scatter/Gather Elements per Work Request. */
-#ifndef MLX5_PMD_SGE_WR_N
-#define MLX5_PMD_SGE_WR_N 4
-#endif
-
/* Maximum size for inline data. */
#ifndef MLX5_PMD_MAX_INLINE
#define MLX5_PMD_MAX_INLINE 0
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 38ff9fd..4000624 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -976,10 +976,6 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
ERROR("%p: invalid number of RX descriptors", (void *)dev);
return EINVAL;
}
- if (MLX5_PMD_SGE_WR_N > 1) {
- ERROR("%p: RX scatter is not supported", (void *)dev);
- return ENOTSUP;
- }
/* Toggle RX checksum offload if hardware supports it. */
if (priv->hw_csum)
tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 5a248c9..59974c5 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -264,10 +264,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
ERROR("%p: invalid number of TX descriptors", (void *)dev);
return EINVAL;
}
- if (MLX5_PMD_SGE_WR_N > 1) {
- ERROR("%p: TX gather is not supported", (void *)dev);
- return EINVAL;
- }
/* MRs will be registered in mp2mr[] later. */
attr.rd = (struct ibv_exp_res_domain_init_attr){
.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
--
2.1.4
Nelio Laranjeiro
2016-06-21 07:23:19 UTC
Permalink
Inline TX will be fully managed by the PMD after Verbs is bypassed in the
data path. Remove the current code until then.
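With inlining gone, every packet takes the same path in mlx5_tx_burst():
look up the cached Memory Region key for the mbuf's mempool, then post the
buffer through the Verbs burst interface. A condensed sketch of that
remaining per-packet step, with error handling trimmed and the wrapper name
txq_post_one invented for illustration (driver internals from mlx5_rxtx.h
assumed in scope):

	/* Illustration only: the non-inline TX path left after this patch. */
	static int
	txq_post_one(struct txq *txq, struct rte_mbuf *buf, uint32_t send_flags)
	{
		uintptr_t addr = rte_pktmbuf_mtod(buf, uintptr_t);
		uint32_t length = DATA_LEN(buf);
		/* lkey is cached per mempool; registration happens outside
		 * the data path (txq_mp2mr_reg()). */
		uint32_t lkey = txq_mp2mr(txq, txq_mb2mp(buf));

		if (unlikely(lkey == (uint32_t)-1))
			return EINVAL; /* No MP <-> MR association. */
		return txq->send_pending(txq->qp, addr, length, lkey,
					 send_flags);
	}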

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
config/common_base | 1 -
doc/guides/nics/mlx5.rst | 10 ------
drivers/net/mlx5/Makefile | 4 ---
drivers/net/mlx5/mlx5_defs.h | 5 ---
drivers/net/mlx5/mlx5_rxtx.c | 73 +++++++++++++++-----------------------------
drivers/net/mlx5/mlx5_rxtx.h | 9 ------
drivers/net/mlx5/mlx5_txq.c | 16 ----------
7 files changed, 25 insertions(+), 93 deletions(-)

diff --git a/config/common_base b/config/common_base
index 39e6333..5fbac47 100644
--- a/config/common_base
+++ b/config/common_base
@@ -207,7 +207,6 @@ CONFIG_RTE_LIBRTE_MLX4_SOFT_COUNTERS=1
#
CONFIG_RTE_LIBRTE_MLX5_PMD=n
CONFIG_RTE_LIBRTE_MLX5_DEBUG=n
-CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE=0
CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE=8

#
diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 84c35a0..77fa957 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -114,16 +114,6 @@ These options can be modified in the ``.config`` file.
adds additional run-time checks and debugging messages at the cost of
lower performance.

-- ``CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE`` (default **0**)
-
- Amount of data to be inlined during TX operations. Improves latency.
- Can improve PPS performance when PCI backpressure is detected and may be
- useful for scenarios involving heavy traffic on many queues.
-
- Since the additional software logic necessary to handle this mode can
- lower performance when there is no backpressure, it is not enabled by
- default.
-
- ``CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE`` (default **8**)

Maximum number of cached memory pools (MPs) per TX queue. Each MP from
diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 656a6e1..289c85e 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -86,10 +86,6 @@ else
CFLAGS += -DNDEBUG -UPEDANTIC
endif

-ifdef CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE
-CFLAGS += -DMLX5_PMD_MAX_INLINE=$(CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE)
-endif
-
ifdef CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE
CFLAGS += -DMLX5_PMD_TX_MP_CACHE=$(CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE)
endif
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index da1c90e..9a19835 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -54,11 +54,6 @@
/* RSS Indirection table size. */
#define RSS_INDIRECTION_TABLE_SIZE 256

-/* Maximum size for inline data. */
-#ifndef MLX5_PMD_MAX_INLINE
-#define MLX5_PMD_MAX_INLINE 0
-#endif
-
/*
* Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP
* from which buffers are to be transmitted will have to be mapped by this
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 07d95eb..4ba88ea 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -329,56 +329,33 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
rte_prefetch0((volatile void *)
(uintptr_t)buf_next_addr);
}
- /* Put packet into send queue. */
-#if MLX5_PMD_MAX_INLINE > 0
- if (length <= txq->max_inline) {
-#ifdef HAVE_VERBS_VLAN_INSERTION
- if (insert_vlan)
- err = txq->send_pending_inline_vlan
- (txq->qp,
- (void *)addr,
- length,
- send_flags,
- &buf->vlan_tci);
- else
-#endif /* HAVE_VERBS_VLAN_INSERTION */
- err = txq->send_pending_inline
- (txq->qp,
- (void *)addr,
- length,
- send_flags);
- } else
-#endif
- {
- /* Retrieve Memory Region key for this
- * memory pool. */
- lkey = txq_mp2mr(txq, txq_mb2mp(buf));
- if (unlikely(lkey == (uint32_t)-1)) {
- /* MR does not exist. */
- DEBUG("%p: unable to get MP <-> MR"
- " association", (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
+ /* Retrieve Memory Region key for this memory pool. */
+ lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+ if (unlikely(lkey == (uint32_t)-1)) {
+ /* MR does not exist. */
+ DEBUG("%p: unable to get MP <-> MR"
+ " association", (void *)txq);
+ /* Clean up TX element. */
+ elt->buf = NULL;
+ goto stop;
+ }
#ifdef HAVE_VERBS_VLAN_INSERTION
- if (insert_vlan)
- err = txq->send_pending_vlan
- (txq->qp,
- addr,
- length,
- lkey,
- send_flags,
- &buf->vlan_tci);
- else
+ if (insert_vlan)
+ err = txq->send_pending_vlan
+ (txq->qp,
+ addr,
+ length,
+ lkey,
+ send_flags,
+ &buf->vlan_tci);
+ else
#endif /* HAVE_VERBS_VLAN_INSERTION */
- err = txq->send_pending
- (txq->qp,
- addr,
- length,
- lkey,
- send_flags);
- }
+ err = txq->send_pending
+ (txq->qp,
+ addr,
+ length,
+ lkey,
+ send_flags);
if (unlikely(err))
goto stop;
#ifdef MLX5_PMD_SOFT_COUNTERS
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 2e1f83b..3a353b0 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -240,19 +240,10 @@ struct txq {
#ifdef HAVE_VERBS_VLAN_INSERTION
int (*send_pending_vlan)();
#endif
-#if MLX5_PMD_MAX_INLINE > 0
- int (*send_pending_inline)();
-#ifdef HAVE_VERBS_VLAN_INSERTION
- int (*send_pending_inline_vlan)();
-#endif
-#endif
int (*send_flush)(struct ibv_qp *qp);
struct ibv_cq *cq; /* Completion Queue. */
struct ibv_qp *qp; /* Queue Pair. */
struct txq_elt (*elts)[]; /* TX elements. */
-#if MLX5_PMD_MAX_INLINE > 0
- uint32_t max_inline; /* Max inline send size <= MLX5_PMD_MAX_INLINE. */
-#endif
unsigned int elts_n; /* (*elts)[] length. */
unsigned int elts_head; /* Current index in (*elts)[]. */
unsigned int elts_tail; /* First element awaiting completion. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 59974c5..75da65b 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -173,9 +173,6 @@ txq_cleanup(struct txq *txq)
DEBUG("cleaning up %p", (void *)txq);
txq_free_elts(txq);
txq->poll_cnt = NULL;
-#if MLX5_PMD_MAX_INLINE > 0
- txq->send_pending_inline = NULL;
-#endif
txq->send_flush = NULL;
if (txq->if_qp != NULL) {
assert(txq->priv != NULL);
@@ -305,9 +302,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
desc),
/* Max number of scatter/gather elements in a WR. */
.max_send_sge = 1,
-#if MLX5_PMD_MAX_INLINE > 0
- .max_inline_data = MLX5_PMD_MAX_INLINE,
-#endif
},
.qp_type = IBV_QPT_RAW_PACKET,
/* Do *NOT* enable this, completions events are managed per
@@ -325,10 +319,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
(void *)dev, strerror(ret));
goto error;
}
-#if MLX5_PMD_MAX_INLINE > 0
- /* ibv_create_qp() updates this value. */
- tmpl.max_inline = attr.init.cap.max_inline_data;
-#endif
attr.mod = (struct ibv_exp_qp_attr){
/* Move the QP to this state. */
.qp_state = IBV_QPS_INIT,
@@ -403,12 +393,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
txq_cleanup(txq);
*txq = tmpl;
txq->poll_cnt = txq->if_cq->poll_cnt;
-#if MLX5_PMD_MAX_INLINE > 0
- txq->send_pending_inline = txq->if_qp->send_pending_inline;
-#ifdef HAVE_VERBS_VLAN_INSERTION
- txq->send_pending_inline_vlan = txq->if_qp->send_pending_inline_vlan;
-#endif
-#endif
txq->send_pending = txq->if_qp->send_pending;
#ifdef HAVE_VERBS_VLAN_INSERTION
txq->send_pending_vlan = txq->if_qp->send_pending_vlan;
--
2.1.4
Nelio Laranjeiro
2016-06-21 07:23:20 UTC
Permalink
To keep the data path as efficient as possible, move fields used only by the
control path into a new structure, txq_ctrl.
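The split works by embedding the hot structure inside the cold one: the data
path only ever sees a struct txq pointer, while control code recovers the
enclosing descriptor with container_of(). A minimal sketch of the pattern,
with field lists truncated and the helper name txq_to_ctrl invented for
illustration:

	#include <rte_common.h> /* container_of() */

	struct txq {
		unsigned int elts_n; /* ... hot, data-path fields only. */
	};

	struct txq_ctrl {
		unsigned int socket; /* ... cold, control-path fields. */
		struct txq txq;      /* Embedded data-path structure. */
	};

	static inline struct txq_ctrl *
	txq_to_ctrl(struct txq *txq)
	{
		/* Recover the control descriptor from the data-path pointer
		 * handed to the burst functions. */
		return container_of(txq, struct txq_ctrl, txq);
	}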

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5.c | 21 +++--
drivers/net/mlx5/mlx5_ethdev.c | 27 +++---
drivers/net/mlx5/mlx5_mr.c | 39 ++++----
drivers/net/mlx5/mlx5_rxtx.h | 9 +-
drivers/net/mlx5/mlx5_txq.c | 198 +++++++++++++++++++++--------------------
5 files changed, 158 insertions(+), 136 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 350028b..3d30e00 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -98,7 +98,6 @@ static void
mlx5_dev_close(struct rte_eth_dev *dev)
{
struct priv *priv = mlx5_get_priv(dev);
- void *tmp;
unsigned int i;

priv_lock(priv);
@@ -122,12 +121,13 @@ mlx5_dev_close(struct rte_eth_dev *dev)
/* XXX race condition if mlx5_rx_burst() is still running. */
usleep(1000);
for (i = 0; (i != priv->rxqs_n); ++i) {
- tmp = (*priv->rxqs)[i];
- if (tmp == NULL)
+ struct rxq *rxq = (*priv->rxqs)[i];
+
+ if (rxq == NULL)
continue;
(*priv->rxqs)[i] = NULL;
- rxq_cleanup(tmp);
- rte_free(tmp);
+ rxq_cleanup(rxq);
+ rte_free(rxq);
}
priv->rxqs_n = 0;
priv->rxqs = NULL;
@@ -136,12 +136,15 @@ mlx5_dev_close(struct rte_eth_dev *dev)
/* XXX race condition if mlx5_tx_burst() is still running. */
usleep(1000);
for (i = 0; (i != priv->txqs_n); ++i) {
- tmp = (*priv->txqs)[i];
- if (tmp == NULL)
+ struct txq *txq = (*priv->txqs)[i];
+ struct txq_ctrl *txq_ctrl;
+
+ if (txq == NULL)
continue;
+ txq_ctrl = container_of(txq, struct txq_ctrl, txq);
(*priv->txqs)[i] = NULL;
- txq_cleanup(tmp);
- rte_free(tmp);
+ txq_cleanup(txq_ctrl);
+ rte_free(txq_ctrl);
}
priv->txqs_n = 0;
priv->txqs = NULL;
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index ca57021..3992b2c 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1232,28 +1232,31 @@ mlx5_secondary_data_setup(struct priv *priv)
/* TX queues. */
for (i = 0; i != nb_tx_queues; ++i) {
struct txq *primary_txq = (*sd->primary_priv->txqs)[i];
- struct txq *txq;
+ struct txq_ctrl *primary_txq_ctrl;
+ struct txq_ctrl *txq_ctrl;

if (primary_txq == NULL)
continue;
- txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0,
- primary_txq->socket);
- if (txq != NULL) {
+ primary_txq_ctrl = container_of(primary_txq,
+ struct txq_ctrl, txq);
+ txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl), 0,
+ primary_txq_ctrl->socket);
+ if (txq_ctrl != NULL) {
if (txq_setup(priv->dev,
- txq,
+ primary_txq_ctrl,
primary_txq->elts_n,
- primary_txq->socket,
+ primary_txq_ctrl->socket,
NULL) == 0) {
- txq->stats.idx = primary_txq->stats.idx;
- tx_queues[i] = txq;
+ txq_ctrl->txq.stats.idx = primary_txq->stats.idx;
+ tx_queues[i] = &txq_ctrl->txq;
continue;
}
- rte_free(txq);
+ rte_free(txq_ctrl);
}
while (i) {
- txq = tx_queues[--i];
- txq_cleanup(txq);
- rte_free(txq);
+ txq_ctrl = tx_queues[--i];
+ txq_cleanup(txq_ctrl);
+ rte_free(txq_ctrl);
}
goto error;
}
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index 7c3e87f..79d5568 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -183,33 +183,36 @@ mlx5_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
uint32_t
txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx)
{
+ struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq);
struct ibv_mr *mr;

/* Add a new entry, register MR first. */
DEBUG("%p: discovered new memory pool \"%s\" (%p)",
- (void *)txq, mp->name, (void *)mp);
- mr = mlx5_mp2mr(txq->priv->pd, mp);
+ (void *)txq_ctrl, mp->name, (void *)mp);
+ mr = mlx5_mp2mr(txq_ctrl->txq.priv->pd, mp);
if (unlikely(mr == NULL)) {
DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
- (void *)txq);
+ (void *)txq_ctrl);
return (uint32_t)-1;
}
- if (unlikely(idx == RTE_DIM(txq->mp2mr))) {
+ if (unlikely(idx == RTE_DIM(txq_ctrl->txq.mp2mr))) {
/* Table is full, remove oldest entry. */
DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
- (void *)txq);
+ (void *)txq_ctrl);
--idx;
- claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
- memmove(&txq->mp2mr[0], &txq->mp2mr[1],
- (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
+ claim_zero(ibv_dereg_mr(txq_ctrl->txq.mp2mr[0].mr));
+ memmove(&txq_ctrl->txq.mp2mr[0], &txq_ctrl->txq.mp2mr[1],
+ (sizeof(txq_ctrl->txq.mp2mr) -
+ sizeof(txq_ctrl->txq.mp2mr[0])));
}
/* Store the new entry. */
- txq->mp2mr[idx].mp = mp;
- txq->mp2mr[idx].mr = mr;
- txq->mp2mr[idx].lkey = mr->lkey;
+ txq_ctrl->txq.mp2mr[idx].mp = mp;
+ txq_ctrl->txq.mp2mr[idx].mr = mr;
+ txq_ctrl->txq.mp2mr[idx].lkey = mr->lkey;
DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
- (void *)txq, mp->name, (void *)mp, txq->mp2mr[idx].lkey);
- return txq->mp2mr[idx].lkey;
+ (void *)txq_ctrl, mp->name, (void *)mp,
+ txq_ctrl->txq.mp2mr[idx].lkey);
+ return txq_ctrl->txq.mp2mr[idx].lkey;
}

struct txq_mp2mr_mbuf_check_data {
@@ -255,7 +258,7 @@ txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj,
void
txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
{
- struct txq *txq = arg;
+ struct txq_ctrl *txq_ctrl = arg;
struct txq_mp2mr_mbuf_check_data data = {
.ret = 0,
};
@@ -265,13 +268,13 @@ txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 ||
data.ret == -1)
return;
- for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
- if (unlikely(txq->mp2mr[i].mp == NULL)) {
+ for (i = 0; (i != RTE_DIM(txq_ctrl->txq.mp2mr)); ++i) {
+ if (unlikely(txq_ctrl->txq.mp2mr[i].mp == NULL)) {
/* Unknown MP, add a new MR for it. */
break;
}
- if (txq->mp2mr[i].mp == mp)
+ if (txq_ctrl->txq.mp2mr[i].mp == mp)
return;
}
- txq_mp2mr_reg(txq, mp, i);
+ txq_mp2mr_reg(&txq_ctrl->txq, mp, i);
}
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 3a353b0..5baefcb 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -256,6 +256,10 @@ struct txq {
uint32_t lkey; /* mr->lkey */
} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
struct mlx5_txq_stats stats; /* TX queue counters. */
+} __rte_cache_aligned;
+
+/* TX queue control descriptor. */
+struct txq_ctrl {
#ifdef HAVE_VERBS_VLAN_INSERTION
struct ibv_exp_qp_burst_family_v1 *if_qp; /* QP burst interface. */
#else
@@ -264,6 +268,7 @@ struct txq {
struct ibv_exp_cq_family *if_cq; /* CQ interface. */
struct ibv_exp_res_domain *rd; /* Resource Domain. */
unsigned int socket; /* CPU socket ID for allocations. */
+ struct txq txq; /* Data path structure. */
};

/* mlx5_rxq.c */
@@ -291,8 +296,8 @@ uint16_t mlx5_rx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);

/* mlx5_txq.c */

-void txq_cleanup(struct txq *);
-int txq_setup(struct rte_eth_dev *, struct txq *, uint16_t, unsigned int,
+void txq_cleanup(struct txq_ctrl *);
+int txq_setup(struct rte_eth_dev *, struct txq_ctrl *, uint16_t, unsigned int,
const struct rte_eth_txconf *);
int mlx5_tx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
const struct rte_eth_txconf *);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 75da65b..4683775 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -68,7 +68,7 @@
/**
* Allocate TX queue elements.
*
- * @param txq
+ * @param txq_ctrl
* Pointer to TX queue structure.
* @param elts_n
* Number of elements to allocate.
@@ -77,15 +77,15 @@
* 0 on success, errno value on failure.
*/
static int
-txq_alloc_elts(struct txq *txq, unsigned int elts_n)
+txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
{
unsigned int i;
struct txq_elt (*elts)[elts_n] =
- rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq->socket);
+ rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq_ctrl->socket);
int ret = 0;

if (elts == NULL) {
- ERROR("%p: can't allocate packets array", (void *)txq);
+ ERROR("%p: can't allocate packets array", (void *)txq_ctrl);
ret = ENOMEM;
goto error;
}
@@ -94,24 +94,24 @@ txq_alloc_elts(struct txq *txq, unsigned int elts_n)

elt->buf = NULL;
}
- DEBUG("%p: allocated and configured %u WRs", (void *)txq, elts_n);
- txq->elts_n = elts_n;
- txq->elts = elts;
- txq->elts_head = 0;
- txq->elts_tail = 0;
- txq->elts_comp = 0;
+ DEBUG("%p: allocated and configured %u WRs", (void *)txq_ctrl, elts_n);
+ txq_ctrl->txq.elts_n = elts_n;
+ txq_ctrl->txq.elts = elts;
+ txq_ctrl->txq.elts_head = 0;
+ txq_ctrl->txq.elts_tail = 0;
+ txq_ctrl->txq.elts_comp = 0;
/* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
* at least 4 times per ring. */
- txq->elts_comp_cd_init =
+ txq_ctrl->txq.elts_comp_cd_init =
((MLX5_PMD_TX_PER_COMP_REQ < (elts_n / 4)) ?
MLX5_PMD_TX_PER_COMP_REQ : (elts_n / 4));
- txq->elts_comp_cd = txq->elts_comp_cd_init;
+ txq_ctrl->txq.elts_comp_cd = txq_ctrl->txq.elts_comp_cd_init;
assert(ret == 0);
return 0;
error:
rte_free(elts);

- DEBUG("%p: failed, freed everything", (void *)txq);
+ DEBUG("%p: failed, freed everything", (void *)txq_ctrl);
assert(ret > 0);
return ret;
}
@@ -119,25 +119,25 @@ error:
/**
* Free TX queue elements.
*
- * @param txq
+ * @param txq_ctrl
* Pointer to TX queue structure.
*/
static void
-txq_free_elts(struct txq *txq)
+txq_free_elts(struct txq_ctrl *txq_ctrl)
{
- unsigned int elts_n = txq->elts_n;
- unsigned int elts_head = txq->elts_head;
- unsigned int elts_tail = txq->elts_tail;
- struct txq_elt (*elts)[elts_n] = txq->elts;
+ unsigned int elts_n = txq_ctrl->txq.elts_n;
+ unsigned int elts_head = txq_ctrl->txq.elts_head;
+ unsigned int elts_tail = txq_ctrl->txq.elts_tail;
+ struct txq_elt (*elts)[elts_n] = txq_ctrl->txq.elts;

- DEBUG("%p: freeing WRs", (void *)txq);
- txq->elts_n = 0;
- txq->elts_head = 0;
- txq->elts_tail = 0;
- txq->elts_comp = 0;
- txq->elts_comp_cd = 0;
- txq->elts_comp_cd_init = 0;
- txq->elts = NULL;
+ DEBUG("%p: freeing WRs", (void *)txq_ctrl);
+ txq_ctrl->txq.elts_n = 0;
+ txq_ctrl->txq.elts_head = 0;
+ txq_ctrl->txq.elts_tail = 0;
+ txq_ctrl->txq.elts_comp = 0;
+ txq_ctrl->txq.elts_comp_cd = 0;
+ txq_ctrl->txq.elts_comp_cd_init = 0;
+ txq_ctrl->txq.elts = NULL;

if (elts == NULL)
return;
@@ -161,63 +161,63 @@ txq_free_elts(struct txq *txq)
*
* Destroy objects, free allocated memory and reset the structure for reuse.
*
- * @param txq
+ * @param txq_ctrl
* Pointer to TX queue structure.
*/
void
-txq_cleanup(struct txq *txq)
+txq_cleanup(struct txq_ctrl *txq_ctrl)
{
struct ibv_exp_release_intf_params params;
size_t i;

- DEBUG("cleaning up %p", (void *)txq);
- txq_free_elts(txq);
- txq->poll_cnt = NULL;
- txq->send_flush = NULL;
- if (txq->if_qp != NULL) {
- assert(txq->priv != NULL);
- assert(txq->priv->ctx != NULL);
- assert(txq->qp != NULL);
+ DEBUG("cleaning up %p", (void *)txq_ctrl);
+ txq_free_elts(txq_ctrl);
+ txq_ctrl->txq.poll_cnt = NULL;
+ txq_ctrl->txq.send_flush = NULL;
+ if (txq_ctrl->if_qp != NULL) {
+ assert(txq_ctrl->txq.priv != NULL);
+ assert(txq_ctrl->txq.priv->ctx != NULL);
+ assert(txq_ctrl->txq.qp != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(txq->priv->ctx,
- txq->if_qp,
+ claim_zero(ibv_exp_release_intf(txq_ctrl->txq.priv->ctx,
+ txq_ctrl->if_qp,
&params));
}
- if (txq->if_cq != NULL) {
- assert(txq->priv != NULL);
- assert(txq->priv->ctx != NULL);
- assert(txq->cq != NULL);
+ if (txq_ctrl->if_cq != NULL) {
+ assert(txq_ctrl->txq.priv != NULL);
+ assert(txq_ctrl->txq.priv->ctx != NULL);
+ assert(txq_ctrl->txq.cq != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(txq->priv->ctx,
- txq->if_cq,
+ claim_zero(ibv_exp_release_intf(txq_ctrl->txq.priv->ctx,
+ txq_ctrl->if_cq,
&params));
}
- if (txq->qp != NULL)
- claim_zero(ibv_destroy_qp(txq->qp));
- if (txq->cq != NULL)
- claim_zero(ibv_destroy_cq(txq->cq));
- if (txq->rd != NULL) {
+ if (txq_ctrl->txq.qp != NULL)
+ claim_zero(ibv_destroy_qp(txq_ctrl->txq.qp));
+ if (txq_ctrl->txq.cq != NULL)
+ claim_zero(ibv_destroy_cq(txq_ctrl->txq.cq));
+ if (txq_ctrl->rd != NULL) {
struct ibv_exp_destroy_res_domain_attr attr = {
.comp_mask = 0,
};

- assert(txq->priv != NULL);
- assert(txq->priv->ctx != NULL);
- claim_zero(ibv_exp_destroy_res_domain(txq->priv->ctx,
- txq->rd,
+ assert(txq_ctrl->txq.priv != NULL);
+ assert(txq_ctrl->txq.priv->ctx != NULL);
+ claim_zero(ibv_exp_destroy_res_domain(txq_ctrl->txq.priv->ctx,
+ txq_ctrl->rd,
&attr));
}
- for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
- if (txq->mp2mr[i].mp == NULL)
+ for (i = 0; (i != RTE_DIM(txq_ctrl->txq.mp2mr)); ++i) {
+ if (txq_ctrl->txq.mp2mr[i].mp == NULL)
break;
- assert(txq->mp2mr[i].mr != NULL);
- claim_zero(ibv_dereg_mr(txq->mp2mr[i].mr));
+ assert(txq_ctrl->txq.mp2mr[i].mr != NULL);
+ claim_zero(ibv_dereg_mr(txq_ctrl->txq.mp2mr[i].mr));
}
- memset(txq, 0, sizeof(*txq));
+ memset(txq_ctrl, 0, sizeof(*txq_ctrl));
}

/**
@@ -225,7 +225,7 @@ txq_cleanup(struct txq *txq)
*
* @param dev
* Pointer to Ethernet device structure.
- * @param txq
+ * @param txq_ctrl
* Pointer to TX queue structure.
* @param desc
* Number of descriptors to configure in queue.
@@ -238,13 +238,15 @@ txq_cleanup(struct txq *txq)
* 0 on success, errno value on failure.
*/
int
-txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
+txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
unsigned int socket, const struct rte_eth_txconf *conf)
{
struct priv *priv = mlx5_get_priv(dev);
- struct txq tmpl = {
- .priv = priv,
- .socket = socket
+ struct txq_ctrl tmpl = {
+ .socket = socket,
+ .txq = {
+ .priv = priv,
+ },
};
union {
struct ibv_exp_query_intf_params params;
@@ -279,8 +281,8 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
.res_domain = tmpl.rd,
};
- tmpl.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq);
- if (tmpl.cq == NULL) {
+ tmpl.txq.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq);
+ if (tmpl.txq.cq == NULL) {
ret = ENOMEM;
ERROR("%p: CQ creation failure: %s",
(void *)dev, strerror(ret));
@@ -292,9 +294,9 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
priv->device_attr.max_sge);
attr.init = (struct ibv_exp_qp_init_attr){
/* CQ to be associated with the send queue. */
- .send_cq = tmpl.cq,
+ .send_cq = tmpl.txq.cq,
/* CQ to be associated with the receive queue. */
- .recv_cq = tmpl.cq,
+ .recv_cq = tmpl.txq.cq,
.cap = {
/* Max number of outstanding WRs. */
.max_send_wr = ((priv->device_attr.max_qp_wr < desc) ?
@@ -312,8 +314,8 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
IBV_EXP_QP_INIT_ATTR_RES_DOMAIN),
};
- tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
- if (tmpl.qp == NULL) {
+ tmpl.txq.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
+ if (tmpl.txq.qp == NULL) {
ret = (errno ? errno : EINVAL);
ERROR("%p: QP creation failure: %s",
(void *)dev, strerror(ret));
@@ -325,7 +327,7 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
/* Primary port number. */
.port_num = priv->port
};
- ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod,
+ ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod,
(IBV_EXP_QP_STATE | IBV_EXP_QP_PORT));
if (ret) {
ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
@@ -341,14 +343,14 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
attr.mod = (struct ibv_exp_qp_attr){
.qp_state = IBV_QPS_RTR
};
- ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
+ ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod, IBV_EXP_QP_STATE);
if (ret) {
ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
(void *)dev, strerror(ret));
goto error;
}
attr.mod.qp_state = IBV_QPS_RTS;
- ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
+ ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod, IBV_EXP_QP_STATE);
if (ret) {
ERROR("%p: QP state to IBV_QPS_RTS failed: %s",
(void *)dev, strerror(ret));
@@ -357,7 +359,7 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
.intf = IBV_EXP_INTF_CQ,
- .obj = tmpl.cq,
+ .obj = tmpl.txq.cq,
};
tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
if (tmpl.if_cq == NULL) {
@@ -369,7 +371,7 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
.intf = IBV_EXP_INTF_QP_BURST,
- .obj = tmpl.qp,
+ .obj = tmpl.txq.qp,
#ifdef HAVE_VERBS_VLAN_INSERTION
.intf_version = 1,
#endif
@@ -389,18 +391,18 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
goto error;
}
/* Clean up txq in case we're reinitializing it. */
- DEBUG("%p: cleaning-up old txq just in case", (void *)txq);
- txq_cleanup(txq);
- *txq = tmpl;
- txq->poll_cnt = txq->if_cq->poll_cnt;
- txq->send_pending = txq->if_qp->send_pending;
+ DEBUG("%p: cleaning-up old txq just in case", (void *)txq_ctrl);
+ txq_cleanup(txq_ctrl);
+ *txq_ctrl = tmpl;
+ txq_ctrl->txq.poll_cnt = txq_ctrl->if_cq->poll_cnt;
+ txq_ctrl->txq.send_pending = txq_ctrl->if_qp->send_pending;
#ifdef HAVE_VERBS_VLAN_INSERTION
- txq->send_pending_vlan = txq->if_qp->send_pending_vlan;
+ txq_ctrl->txq.send_pending_vlan = txq_ctrl->if_qp->send_pending_vlan;
#endif
- txq->send_flush = txq->if_qp->send_flush;
- DEBUG("%p: txq updated with %p", (void *)txq, (void *)&tmpl);
+ txq_ctrl->txq.send_flush = txq_ctrl->if_qp->send_flush;
+ DEBUG("%p: txq updated with %p", (void *)txq_ctrl, (void *)&tmpl);
/* Pre-register known mempools. */
- rte_mempool_walk(txq_mp2mr_iter, txq);
+ rte_mempool_walk(txq_mp2mr_iter, txq_ctrl);
assert(ret == 0);
return 0;
error:
@@ -432,12 +434,15 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
{
struct priv *priv = dev->data->dev_private;
struct txq *txq = (*priv->txqs)[idx];
+ struct txq_ctrl *txq_ctrl;
int ret;

if (mlx5_is_secondary())
return -E_RTE_SECONDARY;

priv_lock(priv);
+ if (txq)
+ txq_ctrl = container_of(txq, struct txq_ctrl, txq);
DEBUG("%p: configuring queue %u for %u descriptors",
(void *)dev, idx, desc);
if (idx >= priv->txqs_n) {
@@ -454,24 +459,25 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
return -EEXIST;
}
(*priv->txqs)[idx] = NULL;
- txq_cleanup(txq);
+ txq_cleanup(txq_ctrl);
} else {
- txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0, socket);
- if (txq == NULL) {
+ txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl),
+ 0, socket);
+ if (txq_ctrl == NULL) {
ERROR("%p: unable to allocate queue index %u",
(void *)dev, idx);
priv_unlock(priv);
return -ENOMEM;
}
}
- ret = txq_setup(dev, txq, desc, socket, conf);
+ ret = txq_setup(dev, txq_ctrl, desc, socket, conf);
if (ret)
- rte_free(txq);
+ rte_free(txq_ctrl);
else {
- txq->stats.idx = idx;
+ txq_ctrl->txq.stats.idx = idx;
DEBUG("%p: adding TX queue %p to list",
- (void *)dev, (void *)txq);
- (*priv->txqs)[idx] = txq;
+ (void *)dev, (void *)txq_ctrl);
+ (*priv->txqs)[idx] = &txq_ctrl->txq;
/* Update send callback. */
dev->tx_pkt_burst = mlx5_tx_burst;
}
@@ -489,6 +495,7 @@ void
mlx5_tx_queue_release(void *dpdk_txq)
{
struct txq *txq = (struct txq *)dpdk_txq;
+ struct txq_ctrl *txq_ctrl;
struct priv *priv;
unsigned int i;

@@ -497,17 +504,18 @@ mlx5_tx_queue_release(void *dpdk_txq)

if (txq == NULL)
return;
+ txq_ctrl = container_of(txq, struct txq_ctrl, txq);
priv = txq->priv;
priv_lock(priv);
for (i = 0; (i != priv->txqs_n); ++i)
if ((*priv->txqs)[i] == txq) {
DEBUG("%p: removing TX queue %p from list",
- (void *)priv->dev, (void *)txq);
+ (void *)priv->dev, (void *)txq_ctrl);
(*priv->txqs)[i] = NULL;
break;
}
- txq_cleanup(txq);
- rte_free(txq);
+ txq_cleanup(txq_ctrl);
+ rte_free(txq_ctrl);
priv_unlock(priv);
}
--
2.1.4
Nelio Laranjeiro
2016-06-21 07:23:21 UTC
Permalink
To keep the data path as efficient as possible, move fields only useful to
the control path into a new structure, rxq_ctrl.
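
For illustration, a minimal, self-contained sketch of the container_of()
pattern this split relies on to recover the control structure from the
embedded data-path structure (structure contents are trimmed, the
rxq_to_ctrl() helper name is invented here, and the PMD uses the
container_of() helper from rte_common.h rather than this local macro):

#include <stddef.h>

/* Local stand-in for the container_of() helper from rte_common.h. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct rxq {
	int dummy;	/* data-path fields trimmed for brevity */
};

struct rxq_ctrl {
	int socket;	/* control-path fields trimmed for brevity */
	struct rxq rxq;	/* embedded data-path structure */
};

/* Recover the enclosing control structure from a data-path pointer. */
static inline struct rxq_ctrl *
rxq_to_ctrl(struct rxq *rxq)
{
	return container_of(rxq, struct rxq_ctrl, rxq);
}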

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5.c | 6 +-
drivers/net/mlx5/mlx5_fdir.c | 8 +-
drivers/net/mlx5/mlx5_rxq.c | 250 ++++++++++++++++++++++---------------------
drivers/net/mlx5/mlx5_rxtx.c | 1 -
drivers/net/mlx5/mlx5_rxtx.h | 13 ++-
5 files changed, 148 insertions(+), 130 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 3d30e00..27a7a30 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -122,12 +122,14 @@ mlx5_dev_close(struct rte_eth_dev *dev)
usleep(1000);
for (i = 0; (i != priv->rxqs_n); ++i) {
struct rxq *rxq = (*priv->rxqs)[i];
+ struct rxq_ctrl *rxq_ctrl;

if (rxq == NULL)
continue;
+ rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
(*priv->rxqs)[i] = NULL;
- rxq_cleanup(rxq);
- rte_free(rxq);
+ rxq_cleanup(rxq_ctrl);
+ rte_free(rxq_ctrl);
}
priv->rxqs_n = 0;
priv->rxqs = NULL;
diff --git a/drivers/net/mlx5/mlx5_fdir.c b/drivers/net/mlx5/mlx5_fdir.c
index 63e43ad..e3b97ba 100644
--- a/drivers/net/mlx5/mlx5_fdir.c
+++ b/drivers/net/mlx5/mlx5_fdir.c
@@ -424,7 +424,9 @@ create_flow:
static struct fdir_queue *
priv_get_fdir_queue(struct priv *priv, uint16_t idx)
{
- struct fdir_queue *fdir_queue = &(*priv->rxqs)[idx]->fdir_queue;
+ struct rxq_ctrl *rxq_ctrl =
+ container_of((*priv->rxqs)[idx], struct rxq_ctrl, rxq);
+ struct fdir_queue *fdir_queue = &rxq_ctrl->fdir_queue;
struct ibv_exp_rwq_ind_table *ind_table = NULL;
struct ibv_qp *qp = NULL;
struct ibv_exp_rwq_ind_table_init_attr ind_init_attr;
@@ -629,8 +631,10 @@ priv_fdir_disable(struct priv *priv)
/* Run on every RX queue to destroy related flow director QP and
* indirection table. */
for (i = 0; (i != priv->rxqs_n); i++) {
- fdir_queue = &(*priv->rxqs)[i]->fdir_queue;
+ struct rxq_ctrl *rxq_ctrl =
+ container_of((*priv->rxqs)[i], struct rxq_ctrl, rxq);

+ fdir_queue = &rxq_ctrl->fdir_queue;
if (fdir_queue->qp != NULL) {
claim_zero(ibv_destroy_qp(fdir_queue->qp));
fdir_queue->qp = NULL;
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 4000624..8d32e74 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -636,7 +636,7 @@ priv_rehash_flows(struct priv *priv)
/**
* Allocate RX queue elements.
*
- * @param rxq
+ * @param rxq_ctrl
* Pointer to RX queue structure.
* @param elts_n
* Number of elements to allocate.
@@ -648,16 +648,17 @@ priv_rehash_flows(struct priv *priv)
* 0 on success, errno value on failure.
*/
static int
-rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
+rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
+ struct rte_mbuf **pool)
{
unsigned int i;
struct rxq_elt (*elts)[elts_n] =
rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
- rxq->socket);
+ rxq_ctrl->socket);
int ret = 0;

if (elts == NULL) {
- ERROR("%p: can't allocate packets array", (void *)rxq);
+ ERROR("%p: can't allocate packets array", (void *)rxq_ctrl);
ret = ENOMEM;
goto error;
}
@@ -672,10 +673,10 @@ rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
assert(buf != NULL);
rte_pktmbuf_reset(buf);
} else
- buf = rte_pktmbuf_alloc(rxq->mp);
+ buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
if (buf == NULL) {
assert(pool == NULL);
- ERROR("%p: empty mbuf pool", (void *)rxq);
+ ERROR("%p: empty mbuf pool", (void *)rxq_ctrl);
ret = ENOMEM;
goto error;
}
@@ -691,15 +692,15 @@ rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
sge->addr = (uintptr_t)
((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
- sge->lkey = rxq->mr->lkey;
+ sge->lkey = rxq_ctrl->mr->lkey;
/* Redundant check for tailroom. */
assert(sge->length == rte_pktmbuf_tailroom(buf));
}
DEBUG("%p: allocated and configured %u single-segment WRs",
- (void *)rxq, elts_n);
- rxq->elts_n = elts_n;
- rxq->elts_head = 0;
- rxq->elts = elts;
+ (void *)rxq_ctrl, elts_n);
+ rxq_ctrl->rxq.elts_n = elts_n;
+ rxq_ctrl->rxq.elts_head = 0;
+ rxq_ctrl->rxq.elts = elts;
assert(ret == 0);
return 0;
error:
@@ -714,7 +715,7 @@ error:
}
rte_free(elts);
}
- DEBUG("%p: failed, freed everything", (void *)rxq);
+ DEBUG("%p: failed, freed everything", (void *)rxq_ctrl);
assert(ret > 0);
return ret;
}
@@ -722,19 +723,19 @@ error:
/**
* Free RX queue elements.
*
- * @param rxq
+ * @param rxq_ctrl
* Pointer to RX queue structure.
*/
static void
-rxq_free_elts(struct rxq *rxq)
+rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
{
unsigned int i;
- unsigned int elts_n = rxq->elts_n;
- struct rxq_elt (*elts)[elts_n] = rxq->elts;
+ unsigned int elts_n = rxq_ctrl->rxq.elts_n;
+ struct rxq_elt (*elts)[elts_n] = rxq_ctrl->rxq.elts;

- DEBUG("%p: freeing WRs", (void *)rxq);
- rxq->elts_n = 0;
- rxq->elts = NULL;
+ DEBUG("%p: freeing WRs", (void *)rxq_ctrl);
+ rxq_ctrl->rxq.elts_n = 0;
+ rxq_ctrl->rxq.elts = NULL;
if (elts == NULL)
return;
for (i = 0; (i != RTE_DIM(*elts)); ++i) {
@@ -752,58 +753,58 @@ rxq_free_elts(struct rxq *rxq)
*
* Destroy objects, free allocated memory and reset the structure for reuse.
*
- * @param rxq
+ * @param rxq_ctrl
* Pointer to RX queue structure.
*/
void
-rxq_cleanup(struct rxq *rxq)
+rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
{
struct ibv_exp_release_intf_params params;

- DEBUG("cleaning up %p", (void *)rxq);
- rxq_free_elts(rxq);
- rxq->poll = NULL;
- rxq->recv = NULL;
- if (rxq->if_wq != NULL) {
- assert(rxq->priv != NULL);
- assert(rxq->priv->ctx != NULL);
- assert(rxq->wq != NULL);
+ DEBUG("cleaning up %p", (void *)rxq_ctrl);
+ rxq_free_elts(rxq_ctrl);
+ rxq_ctrl->rxq.poll = NULL;
+ rxq_ctrl->rxq.recv = NULL;
+ if (rxq_ctrl->if_wq != NULL) {
+ assert(rxq_ctrl->rxq.priv != NULL);
+ assert(rxq_ctrl->rxq.priv->ctx != NULL);
+ assert(rxq_ctrl->rxq.wq != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
- rxq->if_wq,
+ claim_zero(ibv_exp_release_intf(rxq_ctrl->rxq.priv->ctx,
+ rxq_ctrl->if_wq,
&params));
}
- if (rxq->if_cq != NULL) {
- assert(rxq->priv != NULL);
- assert(rxq->priv->ctx != NULL);
- assert(rxq->cq != NULL);
+ if (rxq_ctrl->if_cq != NULL) {
+ assert(rxq_ctrl->rxq.priv != NULL);
+ assert(rxq_ctrl->rxq.priv->ctx != NULL);
+ assert(rxq_ctrl->rxq.cq != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
- rxq->if_cq,
+ claim_zero(ibv_exp_release_intf(rxq_ctrl->rxq.priv->ctx,
+ rxq_ctrl->if_cq,
&params));
}
- if (rxq->wq != NULL)
- claim_zero(ibv_exp_destroy_wq(rxq->wq));
- if (rxq->cq != NULL)
- claim_zero(ibv_destroy_cq(rxq->cq));
- if (rxq->rd != NULL) {
+ if (rxq_ctrl->rxq.wq != NULL)
+ claim_zero(ibv_exp_destroy_wq(rxq_ctrl->rxq.wq));
+ if (rxq_ctrl->rxq.cq != NULL)
+ claim_zero(ibv_destroy_cq(rxq_ctrl->rxq.cq));
+ if (rxq_ctrl->rd != NULL) {
struct ibv_exp_destroy_res_domain_attr attr = {
.comp_mask = 0,
};

- assert(rxq->priv != NULL);
- assert(rxq->priv->ctx != NULL);
- claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx,
- rxq->rd,
+ assert(rxq_ctrl->rxq.priv != NULL);
+ assert(rxq_ctrl->rxq.priv->ctx != NULL);
+ claim_zero(ibv_exp_destroy_res_domain(rxq_ctrl->rxq.priv->ctx,
+ rxq_ctrl->rd,
&attr));
}
- if (rxq->mr != NULL)
- claim_zero(ibv_dereg_mr(rxq->mr));
- memset(rxq, 0, sizeof(*rxq));
+ if (rxq_ctrl->mr != NULL)
+ claim_zero(ibv_dereg_mr(rxq_ctrl->mr));
+ memset(rxq_ctrl, 0, sizeof(*rxq_ctrl));
}

/**
@@ -815,37 +816,37 @@ rxq_cleanup(struct rxq *rxq)
*
* @param dev
* Pointer to Ethernet device structure.
- * @param rxq
+ * @param rxq_ctrl
* RX queue pointer.
*
* @return
* 0 on success, errno value on failure.
*/
int
-rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
+rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
{
- struct priv *priv = rxq->priv;
- struct rxq tmpl = *rxq;
+ struct priv *priv = rxq_ctrl->rxq.priv;
+ struct rxq_ctrl tmpl = *rxq_ctrl;
unsigned int mbuf_n;
unsigned int desc_n;
struct rte_mbuf **pool;
unsigned int i, k;
struct ibv_exp_wq_attr mod;
- struct rxq_elt (*elts)[tmpl.elts_n];
+ struct rxq_elt (*elts)[tmpl.rxq.elts_n];
int err;

- DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
+ DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq_ctrl);
/* Number of descriptors and mbufs currently allocated. */
- desc_n = tmpl.elts_n;
+ desc_n = tmpl.rxq.elts_n;
mbuf_n = desc_n;
/* Toggle RX checksum offload if hardware supports it. */
if (priv->hw_csum) {
- tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- rxq->csum = tmpl.csum;
+ tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
+ rxq_ctrl->rxq.csum = tmpl.rxq.csum;
}
if (priv->hw_csum_l2tun) {
- tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- rxq->csum_l2tun = tmpl.csum_l2tun;
+ tmpl.rxq.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
+ rxq_ctrl->rxq.csum_l2tun = tmpl.rxq.csum_l2tun;
}
/* From now on, any failure will render the queue unusable.
* Reinitialize WQ. */
@@ -853,7 +854,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RESET,
};
- err = ibv_exp_modify_wq(tmpl.wq, &mod);
+ err = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
if (err) {
ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
assert(err > 0);
@@ -867,7 +868,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
}
/* Snatch mbufs from original queue. */
k = 0;
- elts = rxq->elts;
+ elts = rxq_ctrl->rxq.elts;
for (i = 0; (i != RTE_DIM(*elts)); ++i) {
struct rxq_elt *elt = &(*elts)[i];
struct rte_mbuf *buf = elt->buf;
@@ -875,9 +876,9 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
pool[k++] = buf;
}
assert(k == mbuf_n);
- tmpl.elts_n = 0;
- tmpl.elts = NULL;
- assert((void *)&tmpl.elts == NULL);
+ tmpl.rxq.elts_n = 0;
+ tmpl.rxq.elts = NULL;
+ assert((void *)&tmpl.rxq.elts == NULL);
err = rxq_alloc_elts(&tmpl, desc_n, pool);
if (err) {
ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
@@ -885,18 +886,18 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
assert(err > 0);
return err;
}
- assert(tmpl.elts_n == desc_n);
+ assert(tmpl.rxq.elts_n == desc_n);
rte_free(pool);
/* Clean up original data. */
- rxq->elts_n = 0;
- rte_free(rxq->elts);
- rxq->elts = NULL;
+ rxq_ctrl->rxq.elts_n = 0;
+ rte_free(rxq_ctrl->rxq.elts);
+ rxq_ctrl->rxq.elts = NULL;
/* Change queue state to ready. */
mod = (struct ibv_exp_wq_attr){
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RDY,
};
- err = ibv_exp_modify_wq(tmpl.wq, &mod);
+ err = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
if (err) {
ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
(void *)dev, strerror(err));
@@ -904,10 +905,10 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
}
/* Post SGEs. */
assert(tmpl.if_wq != NULL);
- elts = tmpl.elts;
+ elts = tmpl.rxq.elts;
for (i = 0; (i != RTE_DIM(*elts)); ++i) {
err = tmpl.if_wq->recv_burst(
- tmpl.wq,
+ tmpl.rxq.wq,
&(*elts)[i].sge,
1);
if (err)
@@ -920,9 +921,9 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
err = EIO;
goto error;
}
- tmpl.recv = tmpl.if_wq->recv_burst;
+ tmpl.rxq.recv = tmpl.if_wq->recv_burst;
error:
- *rxq = tmpl;
+ *rxq_ctrl = tmpl;
assert(err >= 0);
return err;
}
@@ -932,7 +933,7 @@ error:
*
* @param dev
* Pointer to Ethernet device structure.
- * @param rxq
+ * @param rxq_ctrl
* Pointer to RX queue structure.
* @param desc
* Number of descriptors to configure in queue.
@@ -947,15 +948,17 @@ error:
* 0 on success, errno value on failure.
*/
int
-rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
+rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
unsigned int socket, const struct rte_eth_rxconf *conf,
struct rte_mempool *mp)
{
struct priv *priv = dev->data->dev_private;
- struct rxq tmpl = {
- .priv = priv,
- .mp = mp,
- .socket = socket
+ struct rxq_ctrl tmpl = {
+ .socket = socket,
+ .rxq = {
+ .priv = priv,
+ .mp = mp,
+ },
};
struct ibv_exp_wq_attr mod;
union {
@@ -978,9 +981,9 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
}
/* Toggle RX checksum offload if hardware supports it. */
if (priv->hw_csum)
- tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
+ tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
if (priv->hw_csum_l2tun)
- tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
+ tmpl.rxq.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
(void)mb_len; /* I'll be back! */
/* Use the entire RX mempool as the memory region. */
tmpl.mr = mlx5_mp2mr(priv->pd, mp);
@@ -1007,9 +1010,9 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
.res_domain = tmpl.rd,
};
- tmpl.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0,
- &attr.cq);
- if (tmpl.cq == NULL) {
+ tmpl.rxq.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0,
+ &attr.cq);
+ if (tmpl.rxq.cq == NULL) {
ret = ENOMEM;
ERROR("%p: CQ creation failure: %s",
(void *)dev, strerror(ret));
@@ -1020,8 +1023,8 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
DEBUG("priv->device_attr.max_sge is %d",
priv->device_attr.max_sge);
/* Configure VLAN stripping. */
- tmpl.vlan_strip = (priv->hw_vlan_strip &&
- !!dev->data->dev_conf.rxmode.hw_vlan_strip);
+ tmpl.rxq.vlan_strip = (priv->hw_vlan_strip &&
+ !!dev->data->dev_conf.rxmode.hw_vlan_strip);
attr.wq = (struct ibv_exp_wq_init_attr){
.wq_context = NULL, /* Could be useful in the future. */
.wq_type = IBV_EXP_WQT_RQ,
@@ -1032,7 +1035,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
/* Max number of scatter/gather elements in a WR. */
.max_recv_sge = 1,
.pd = priv->pd,
- .cq = tmpl.cq,
+ .cq = tmpl.rxq.cq,
.comp_mask =
IBV_EXP_CREATE_WQ_RES_DOMAIN |
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
@@ -1041,7 +1044,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
0,
.res_domain = tmpl.rd,
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
- .vlan_offloads = (tmpl.vlan_strip ?
+ .vlan_offloads = (tmpl.rxq.vlan_strip ?
IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
0),
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
@@ -1050,24 +1053,24 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
#ifdef HAVE_VERBS_FCS
/* By default, FCS (CRC) is stripped by hardware. */
if (dev->data->dev_conf.rxmode.hw_strip_crc) {
- tmpl.crc_present = 0;
+ tmpl.rxq.crc_present = 0;
} else if (priv->hw_fcs_strip) {
/* Ask HW/Verbs to leave CRC in place when supported. */
attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS;
attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
- tmpl.crc_present = 1;
+ tmpl.rxq.crc_present = 1;
} else {
WARN("%p: CRC stripping has been disabled but will still"
" be performed by hardware, make sure MLNX_OFED and"
" firmware are up to date",
(void *)dev);
- tmpl.crc_present = 0;
+ tmpl.rxq.crc_present = 0;
}
DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
" incoming frames to hide it",
(void *)dev,
- tmpl.crc_present ? "disabled" : "enabled",
- tmpl.crc_present << 2);
+ tmpl.rxq.crc_present ? "disabled" : "enabled",
+ tmpl.rxq.crc_present << 2);
#endif /* HAVE_VERBS_FCS */

#ifdef HAVE_VERBS_RX_END_PADDING
@@ -1075,7 +1078,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
; /* Nothing else to do. */
else if (priv->hw_padding) {
INFO("%p: enabling packet padding on queue %p",
- (void *)dev, (void *)rxq);
+ (void *)dev, (void *)rxq_ctrl);
attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING;
attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
} else
@@ -1085,8 +1088,8 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
(void *)dev);
#endif /* HAVE_VERBS_RX_END_PADDING */

- tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
- if (tmpl.wq == NULL) {
+ tmpl.rxq.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
+ if (tmpl.rxq.wq == NULL) {
ret = (errno ? errno : EINVAL);
ERROR("%p: WQ creation failure: %s",
(void *)dev, strerror(ret));
@@ -1099,15 +1102,15 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
goto error;
}
/* Save port ID. */
- tmpl.port_id = dev->data->port_id;
- DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
+ tmpl.rxq.port_id = dev->data->port_id;
+ DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
.intf_version = 1,
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
.intf = IBV_EXP_INTF_CQ,
- .obj = tmpl.cq,
+ .obj = tmpl.rxq.cq,
};
tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
if (tmpl.if_cq == NULL) {
@@ -1118,7 +1121,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
.intf = IBV_EXP_INTF_WQ,
- .obj = tmpl.wq,
+ .obj = tmpl.rxq.wq,
};
tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
if (tmpl.if_wq == NULL) {
@@ -1131,17 +1134,17 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RDY,
};
- ret = ibv_exp_modify_wq(tmpl.wq, &mod);
+ ret = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
if (ret) {
ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
(void *)dev, strerror(ret));
goto error;
}
/* Post SGEs. */
- elts = tmpl.elts;
+ elts = tmpl.rxq.elts;
for (i = 0; (i != RTE_DIM(*elts)); ++i) {
ret = tmpl.if_wq->recv_burst(
- tmpl.wq,
+ tmpl.rxq.wq,
&(*elts)[i].sge,
1);
if (ret)
@@ -1155,18 +1158,18 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
goto error;
}
/* Clean up rxq in case we're reinitializing it. */
- DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
- rxq_cleanup(rxq);
- *rxq = tmpl;
- DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
+ DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
+ rxq_cleanup(rxq_ctrl);
+ *rxq_ctrl = tmpl;
+ DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
assert(ret == 0);
/* Assign function in queue. */
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
- rxq->poll = rxq->if_cq->poll_length_flags_cvlan;
+ rxq_ctrl->rxq.poll = rxq_ctrl->if_cq->poll_length_flags_cvlan;
#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
- rxq->poll = rxq->if_cq->poll_length_flags;
+ rxq_ctrl->rxq.poll = rxq_ctrl->if_cq->poll_length_flags;
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
- rxq->recv = rxq->if_wq->recv_burst;
+ rxq_ctrl->rxq.recv = rxq_ctrl->if_wq->recv_burst;
return 0;
error:
rxq_cleanup(&tmpl);
@@ -1200,12 +1203,14 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
{
struct priv *priv = dev->data->dev_private;
struct rxq *rxq = (*priv->rxqs)[idx];
+ struct rxq_ctrl *rxq_ctrl;
int ret;

if (mlx5_is_secondary())
return -E_RTE_SECONDARY;

priv_lock(priv);
+ rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
DEBUG("%p: configuring queue %u for %u descriptors",
(void *)dev, idx, desc);
if (idx >= priv->rxqs_n) {
@@ -1222,24 +1227,25 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
return -EEXIST;
}
(*priv->rxqs)[idx] = NULL;
- rxq_cleanup(rxq);
+ rxq_cleanup(rxq_ctrl);
} else {
- rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket);
- if (rxq == NULL) {
+ rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl), 0,
+ socket);
+ if (rxq_ctrl == NULL) {
ERROR("%p: unable to allocate queue index %u",
(void *)dev, idx);
priv_unlock(priv);
return -ENOMEM;
}
}
- ret = rxq_setup(dev, rxq, desc, socket, conf, mp);
+ ret = rxq_setup(dev, rxq_ctrl, desc, socket, conf, mp);
if (ret)
- rte_free(rxq);
+ rte_free(rxq_ctrl);
else {
- rxq->stats.idx = idx;
+ rxq_ctrl->rxq.stats.idx = idx;
DEBUG("%p: adding RX queue %p to list",
- (void *)dev, (void *)rxq);
- (*priv->rxqs)[idx] = rxq;
+ (void *)dev, (void *)rxq_ctrl);
+ (*priv->rxqs)[idx] = &rxq_ctrl->rxq;
/* Update receive callback. */
dev->rx_pkt_burst = mlx5_rx_burst;
}
@@ -1257,6 +1263,7 @@ void
mlx5_rx_queue_release(void *dpdk_rxq)
{
struct rxq *rxq = (struct rxq *)dpdk_rxq;
+ struct rxq_ctrl *rxq_ctrl;
struct priv *priv;
unsigned int i;

@@ -1265,6 +1272,7 @@ mlx5_rx_queue_release(void *dpdk_rxq)

if (rxq == NULL)
return;
+ rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
priv = rxq->priv;
priv_lock(priv);
for (i = 0; (i != priv->rxqs_n); ++i)
@@ -1274,8 +1282,8 @@ mlx5_rx_queue_release(void *dpdk_rxq)
(*priv->rxqs)[i] = NULL;
break;
}
- rxq_cleanup(rxq);
- rte_free(rxq);
+ rxq_cleanup(rxq_ctrl);
+ rte_free(rxq_ctrl);
priv_unlock(priv);
}

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 4ba88ea..f0b42e9 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -574,7 +574,6 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)

/* Reconfigure sge to use rep instead of seg. */
elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
- assert(elt->sge.lkey == rxq->mr->lkey);
elt->buf = rep;

/* Add SGE to array for repost. */
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 5baefcb..2c5e447 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -111,8 +111,11 @@ struct rxq {
unsigned int vlan_strip:1; /* Enable VLAN stripping. */
unsigned int crc_present:1; /* CRC must be subtracted. */
struct rxq_elt (*elts)[]; /* RX elements. */
- unsigned int socket; /* CPU socket ID for allocations. */
struct mlx5_rxq_stats stats; /* RX queue counters. */
+} __rte_cache_aligned;
+
+/* RX queue control descriptor. */
+struct rxq_ctrl {
struct ibv_exp_res_domain *rd; /* Resource Domain. */
struct fdir_queue fdir_queue; /* Flow director queue. */
struct ibv_mr *mr; /* Memory Region (for mp). */
@@ -122,6 +125,8 @@ struct rxq {
#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
struct ibv_exp_cq_family *if_cq; /* CQ interface. */
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+ unsigned int socket; /* CPU socket ID for allocations. */
+ struct rxq rxq; /* Data path structure. */
};

/* Hash RX queue types. */
@@ -285,9 +290,9 @@ int priv_create_hash_rxqs(struct priv *);
void priv_destroy_hash_rxqs(struct priv *);
int priv_allow_flow_type(struct priv *, enum hash_rxq_flow_type);
int priv_rehash_flows(struct priv *);
-void rxq_cleanup(struct rxq *);
-int rxq_rehash(struct rte_eth_dev *, struct rxq *);
-int rxq_setup(struct rte_eth_dev *, struct rxq *, uint16_t, unsigned int,
+void rxq_cleanup(struct rxq_ctrl *);
+int rxq_rehash(struct rte_eth_dev *, struct rxq_ctrl *);
+int rxq_setup(struct rte_eth_dev *, struct rxq_ctrl *, uint16_t, unsigned int,
const struct rte_eth_rxconf *, struct rte_mempool *);
int mlx5_rx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
const struct rte_eth_rxconf *, struct rte_mempool *);
--
2.1.4
Nelio Laranjeiro
2016-06-21 07:23:22 UTC
Permalink
The latest version of Mellanox OFED exposes the hardware definitions
necessary to implement a data path that bypasses Verbs. Update the minimum
version requirement to MLNX_OFED >= 3.3 and clean up compatibility checks
for previous releases.
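
As a hedged illustration (the exact generated contents depend on the
installed OFED headers), the build system generates mlx5_autoconf.h from the
checks listed in the Makefile hunk below, and mlx5.h turns the absence of
the new symbols into a hard build error:

/* mlx5_autoconf.h, generated by auto-config-h.sh when the installed
 * MLNX_OFED headers provide the corresponding definitions: */
#define HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE 1
#define HAVE_VERBS_MLX5_ETH_VLAN_INLINE_HEADER_SIZE 1

/* mlx5.h (see the diff below) then rejects older releases outright: */
#if !defined(HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE) || \
	!defined(HAVE_VERBS_MLX5_ETH_VLAN_INLINE_HEADER_SIZE)
#error Mellanox OFED >= 3.3 is required, please refer to the documentation.
#endif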

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
doc/guides/nics/mlx5.rst | 44 +++---------------------------------------
drivers/net/mlx5/Makefile | 39 ++++++++-----------------------------
drivers/net/mlx5/mlx5.c | 23 ----------------------
drivers/net/mlx5/mlx5.h | 5 +++++
drivers/net/mlx5/mlx5_defs.h | 9 ---------
drivers/net/mlx5/mlx5_fdir.c | 10 ----------
drivers/net/mlx5/mlx5_rxmode.c | 8 --------
drivers/net/mlx5/mlx5_rxq.c | 30 ----------------------------
drivers/net/mlx5/mlx5_rxtx.c | 4 ----
drivers/net/mlx5/mlx5_rxtx.h | 8 --------
drivers/net/mlx5/mlx5_txq.c | 2 --
drivers/net/mlx5/mlx5_vlan.c | 3 ---
12 files changed, 16 insertions(+), 169 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 77fa957..3a07928 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -125,16 +125,6 @@ These options can be modified in the ``.config`` file.
Environment variables
~~~~~~~~~~~~~~~~~~~~~

-- ``MLX5_ENABLE_CQE_COMPRESSION``
-
- A nonzero value lets ConnectX-4 return smaller completion entries to
- improve performance when PCI backpressure is detected. It is most useful
- for scenarios involving heavy traffic on many queues.
-
- Since the additional software logic necessary to handle this mode can
- lower performance when there is no backpressure, it is not enabled by
- default.
-
- ``MLX5_PMD_ENABLE_PADDING``

Enables HW packet padding in PCI bus transactions.
@@ -211,40 +201,12 @@ DPDK and must be installed separately:

Currently supported by DPDK:

-- Mellanox OFED **3.1-1.0.3**, **3.1-1.5.7.1** or **3.2-2.0.0.0** depending
- on usage.
-
- The following features are supported with version **3.1-1.5.7.1** and
- above only:
-
- - IPv6, UPDv6, TCPv6 RSS.
- - RX checksum offloads.
- - IBM POWER8.
-
- The following features are supported with version **3.2-2.0.0.0** and
- above only:
-
- - Flow director.
- - RX VLAN stripping.
- - TX VLAN insertion.
- - RX CRC stripping configuration.
+- Mellanox OFED **3.3-1.0.0.0**.

- Minimum firmware version:

- With MLNX_OFED **3.1-1.0.3**:
-
- - ConnectX-4: **12.12.1240**
- - ConnectX-4 Lx: **14.12.1100**
-
- With MLNX_OFED **3.1-1.5.7.1**:
-
- - ConnectX-4: **12.13.0144**
- - ConnectX-4 Lx: **14.13.0144**
-
- With MLNX_OFED **3.2-2.0.0.0**:
-
- - ConnectX-4: **12.14.2036**
- - ConnectX-4 Lx: **14.14.2036**
+ - ConnectX-4: **12.16.1006**
+ - ConnectX-4 Lx: **14.16.1006**

Getting Mellanox OFED
~~~~~~~~~~~~~~~~~~~~~
diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 289c85e..dc99797 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -106,42 +106,19 @@ mlx5_autoconf.h.new: FORCE
mlx5_autoconf.h.new: $(RTE_SDK)/scripts/auto-config-h.sh
$Q $(RM) -f -- '$@'
$Q sh -- '$<' '$@' \
- HAVE_EXP_QUERY_DEVICE \
- infiniband/verbs.h \
- type 'struct ibv_exp_device_attr' $(AUTOCONF_OUTPUT)
- $Q sh -- '$<' '$@' \
- HAVE_FLOW_SPEC_IPV6 \
- infiniband/verbs.h \
- type 'struct ibv_exp_flow_spec_ipv6' $(AUTOCONF_OUTPUT)
- $Q sh -- '$<' '$@' \
- HAVE_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR \
- infiniband/verbs.h \
- enum IBV_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR \
- $(AUTOCONF_OUTPUT)
- $Q sh -- '$<' '$@' \
- HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS \
- infiniband/verbs.h \
- enum IBV_EXP_DEVICE_ATTR_VLAN_OFFLOADS \
- $(AUTOCONF_OUTPUT)
- $Q sh -- '$<' '$@' \
- HAVE_EXP_CQ_RX_TCP_PACKET \
+ HAVE_VERBS_VLAN_INSERTION \
infiniband/verbs.h \
- enum IBV_EXP_CQ_RX_TCP_PACKET \
+ enum IBV_EXP_RECEIVE_WQ_CVLAN_INSERTION \
$(AUTOCONF_OUTPUT)
$Q sh -- '$<' '$@' \
- HAVE_VERBS_FCS \
- infiniband/verbs.h \
- enum IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS \
+ HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE \
+ infiniband/verbs_exp.h \
+ enum IBV_EXP_CQ_COMPRESSED_CQE \
$(AUTOCONF_OUTPUT)
$Q sh -- '$<' '$@' \
- HAVE_VERBS_RX_END_PADDING \
- infiniband/verbs.h \
- enum IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING \
- $(AUTOCONF_OUTPUT)
- $Q sh -- '$<' '$@' \
- HAVE_VERBS_VLAN_INSERTION \
- infiniband/verbs.h \
- enum IBV_EXP_RECEIVE_WQ_CVLAN_INSERTION \
+ HAVE_VERBS_MLX5_ETH_VLAN_INLINE_HEADER_SIZE \
+ infiniband/mlx5_hw.h \
+ enum MLX5_ETH_VLAN_INLINE_HEADER_SIZE \
$(AUTOCONF_OUTPUT)

# Create mlx5_autoconf.h or update it in case it differs from the new one.
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 27a7a30..3f45d84 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -195,17 +195,13 @@ static const struct eth_dev_ops mlx5_dev_ops = {
.mac_addr_add = mlx5_mac_addr_add,
.mac_addr_set = mlx5_mac_addr_set,
.mtu_set = mlx5_dev_set_mtu,
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
.vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
.vlan_offload_set = mlx5_vlan_offload_set,
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
.reta_update = mlx5_dev_rss_reta_update,
.reta_query = mlx5_dev_rss_reta_query,
.rss_hash_update = mlx5_rss_hash_update,
.rss_hash_conf_get = mlx5_rss_hash_conf_get,
-#ifdef MLX5_FDIR_SUPPORT
.filter_ctrl = mlx5_dev_filter_ctrl,
-#endif /* MLX5_FDIR_SUPPORT */
};

static struct {
@@ -352,24 +348,16 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
struct ibv_pd *pd = NULL;
struct priv *priv = NULL;
struct rte_eth_dev *eth_dev;
-#ifdef HAVE_EXP_QUERY_DEVICE
struct ibv_exp_device_attr exp_device_attr;
-#endif /* HAVE_EXP_QUERY_DEVICE */
struct ether_addr mac;
uint16_t num_vfs = 0;

-#ifdef HAVE_EXP_QUERY_DEVICE
exp_device_attr.comp_mask =
IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS |
IBV_EXP_DEVICE_ATTR_RX_HASH |
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
IBV_EXP_DEVICE_ATTR_VLAN_OFFLOADS |
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
-#ifdef HAVE_VERBS_RX_END_PADDING
IBV_EXP_DEVICE_ATTR_RX_PAD_END_ALIGN |
-#endif /* HAVE_VERBS_RX_END_PADDING */
0;
-#endif /* HAVE_EXP_QUERY_DEVICE */

DEBUG("using port %u (%08" PRIx32 ")", port, test);

@@ -420,7 +408,6 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
priv->port = port;
priv->pd = pd;
priv->mtu = ETHER_MTU;
-#ifdef HAVE_EXP_QUERY_DEVICE
if (ibv_exp_query_device(ctx, &exp_device_attr)) {
ERROR("ibv_exp_query_device() failed");
goto port_error;
@@ -446,30 +433,20 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
priv->ind_table_max_size = RSS_INDIRECTION_TABLE_SIZE;
DEBUG("maximum RX indirection table size is %u",
priv->ind_table_max_size);
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
priv->hw_vlan_strip = !!(exp_device_attr.wq_vlan_offloads_cap &
IBV_EXP_RECEIVE_WQ_CVLAN_STRIP);
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
DEBUG("VLAN stripping is %ssupported",
(priv->hw_vlan_strip ? "" : "not "));

-#ifdef HAVE_VERBS_FCS
priv->hw_fcs_strip = !!(exp_device_attr.exp_device_cap_flags &
IBV_EXP_DEVICE_SCATTER_FCS);
-#endif /* HAVE_VERBS_FCS */
DEBUG("FCS stripping configuration is %ssupported",
(priv->hw_fcs_strip ? "" : "not "));

-#ifdef HAVE_VERBS_RX_END_PADDING
priv->hw_padding = !!exp_device_attr.rx_pad_end_addr_align;
-#endif /* HAVE_VERBS_RX_END_PADDING */
DEBUG("hardware RX end alignment padding is %ssupported",
(priv->hw_padding ? "" : "not "));

-#else /* HAVE_EXP_QUERY_DEVICE */
- priv->ind_table_max_size = RSS_INDIRECTION_TABLE_SIZE;
-#endif /* HAVE_EXP_QUERY_DEVICE */
-
priv_get_num_vfs(priv, &num_vfs);
priv->sriov = (num_vfs || sriov);
priv->mps = mps;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index cbcb8b9..935e1b0 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -68,6 +68,11 @@
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"

+#if !defined(HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE) || \
+ !defined(HAVE_VERBS_MLX5_ETH_VLAN_INLINE_HEADER_SIZE)
+#error Mellanox OFED >= 3.3 is required, please refer to the documentation.
+#endif
+
enum {
PCI_VENDOR_ID_MELLANOX = 0x15b3,
};
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 9a19835..8d2ec7a 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -76,13 +76,4 @@
/* Alarm timeout. */
#define MLX5_ALARM_TIMEOUT_US 100000

-/*
- * Extended flow priorities necessary to support flow director are available
- * since MLNX_OFED 3.2. Considering this version adds support for VLAN
- * offloads as well, their availability means flow director can be used.
- */
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
-#define MLX5_FDIR_SUPPORT 1
-#endif
-
#endif /* RTE_PMD_MLX5_DEFS_H_ */
diff --git a/drivers/net/mlx5/mlx5_fdir.c b/drivers/net/mlx5/mlx5_fdir.c
index e3b97ba..1850218 100644
--- a/drivers/net/mlx5/mlx5_fdir.c
+++ b/drivers/net/mlx5/mlx5_fdir.c
@@ -122,7 +122,6 @@ fdir_filter_to_flow_desc(const struct rte_eth_fdir_filter *fdir_filter,
case RTE_ETH_FLOW_NONFRAG_IPV4_OTHER:
desc->type = HASH_RXQ_IPV4;
break;
-#ifdef HAVE_FLOW_SPEC_IPV6
case RTE_ETH_FLOW_NONFRAG_IPV6_UDP:
desc->type = HASH_RXQ_UDPV6;
break;
@@ -132,7 +131,6 @@ fdir_filter_to_flow_desc(const struct rte_eth_fdir_filter *fdir_filter,
case RTE_ETH_FLOW_NONFRAG_IPV6_OTHER:
desc->type = HASH_RXQ_IPV6;
break;
-#endif /* HAVE_FLOW_SPEC_IPV6 */
default:
break;
}
@@ -147,7 +145,6 @@ fdir_filter_to_flow_desc(const struct rte_eth_fdir_filter *fdir_filter,
desc->src_ip[0] = fdir_filter->input.flow.ip4_flow.src_ip;
desc->dst_ip[0] = fdir_filter->input.flow.ip4_flow.dst_ip;
break;
-#ifdef HAVE_FLOW_SPEC_IPV6
case RTE_ETH_FLOW_NONFRAG_IPV6_UDP:
case RTE_ETH_FLOW_NONFRAG_IPV6_TCP:
desc->src_port = fdir_filter->input.flow.udp6_flow.src_port;
@@ -161,7 +158,6 @@ fdir_filter_to_flow_desc(const struct rte_eth_fdir_filter *fdir_filter,
fdir_filter->input.flow.ipv6_flow.dst_ip,
sizeof(desc->dst_ip));
break;
-#endif /* HAVE_FLOW_SPEC_IPV6 */
default:
break;
}
@@ -211,7 +207,6 @@ priv_fdir_overlap(const struct priv *priv,
(desc2->dst_ip[0] & mask->ipv4_mask.dst_ip)))
return 0;
break;
-#ifdef HAVE_FLOW_SPEC_IPV6
case HASH_RXQ_IPV6:
case HASH_RXQ_UDPV6:
case HASH_RXQ_TCPV6:
@@ -222,7 +217,6 @@ priv_fdir_overlap(const struct priv *priv,
(desc2->dst_ip[i] & mask->ipv6_mask.dst_ip[i])))
return 0;
break;
-#endif /* HAVE_FLOW_SPEC_IPV6 */
default:
break;
}
@@ -258,9 +252,7 @@ priv_fdir_flow_add(struct priv *priv,
uintptr_t spec_offset = (uintptr_t)&data->spec;
struct ibv_exp_flow_spec_eth *spec_eth;
struct ibv_exp_flow_spec_ipv4 *spec_ipv4;
-#ifdef HAVE_FLOW_SPEC_IPV6
struct ibv_exp_flow_spec_ipv6 *spec_ipv6;
-#endif /* HAVE_FLOW_SPEC_IPV6 */
struct ibv_exp_flow_spec_tcp_udp *spec_tcp_udp;
struct mlx5_fdir_filter *iter_fdir_filter;
unsigned int i;
@@ -334,7 +326,6 @@ priv_fdir_flow_add(struct priv *priv,

spec_offset += spec_ipv4->size;
break;
-#ifdef HAVE_FLOW_SPEC_IPV6
case HASH_RXQ_IPV6:
case HASH_RXQ_UDPV6:
case HASH_RXQ_TCPV6:
@@ -368,7 +359,6 @@ priv_fdir_flow_add(struct priv *priv,

spec_offset += spec_ipv6->size;
break;
-#endif /* HAVE_FLOW_SPEC_IPV6 */
default:
ERROR("invalid flow attribute type");
return EINVAL;
diff --git a/drivers/net/mlx5/mlx5_rxmode.c b/drivers/net/mlx5/mlx5_rxmode.c
index 3a55f63..51e2aca 100644
--- a/drivers/net/mlx5/mlx5_rxmode.c
+++ b/drivers/net/mlx5/mlx5_rxmode.c
@@ -67,11 +67,9 @@ static const struct special_flow_init special_flow_init[] = {
1 << HASH_RXQ_TCPV4 |
1 << HASH_RXQ_UDPV4 |
1 << HASH_RXQ_IPV4 |
-#ifdef HAVE_FLOW_SPEC_IPV6
1 << HASH_RXQ_TCPV6 |
1 << HASH_RXQ_UDPV6 |
1 << HASH_RXQ_IPV6 |
-#endif /* HAVE_FLOW_SPEC_IPV6 */
1 << HASH_RXQ_ETH |
0,
.per_vlan = 0,
@@ -82,10 +80,8 @@ static const struct special_flow_init special_flow_init[] = {
.hash_types =
1 << HASH_RXQ_UDPV4 |
1 << HASH_RXQ_IPV4 |
-#ifdef HAVE_FLOW_SPEC_IPV6
1 << HASH_RXQ_UDPV6 |
1 << HASH_RXQ_IPV6 |
-#endif /* HAVE_FLOW_SPEC_IPV6 */
1 << HASH_RXQ_ETH |
0,
.per_vlan = 0,
@@ -96,15 +92,12 @@ static const struct special_flow_init special_flow_init[] = {
.hash_types =
1 << HASH_RXQ_UDPV4 |
1 << HASH_RXQ_IPV4 |
-#ifdef HAVE_FLOW_SPEC_IPV6
1 << HASH_RXQ_UDPV6 |
1 << HASH_RXQ_IPV6 |
-#endif /* HAVE_FLOW_SPEC_IPV6 */
1 << HASH_RXQ_ETH |
0,
.per_vlan = 1,
},
-#ifdef HAVE_FLOW_SPEC_IPV6
[HASH_RXQ_FLOW_TYPE_IPV6MULTI] = {
.dst_mac_val = "\x33\x33\x00\x00\x00\x00",
.dst_mac_mask = "\xff\xff\x00\x00\x00\x00",
@@ -115,7 +108,6 @@ static const struct special_flow_init special_flow_init[] = {
0,
.per_vlan = 1,
},
-#endif /* HAVE_FLOW_SPEC_IPV6 */
};

/**
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 8d32e74..7db4ce7 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -105,7 +105,6 @@ const struct hash_rxq_init hash_rxq_init[] = {
},
.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
},
-#ifdef HAVE_FLOW_SPEC_IPV6
[HASH_RXQ_TCPV6] = {
.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
IBV_EXP_RX_HASH_DST_IPV6 |
@@ -144,7 +143,6 @@ const struct hash_rxq_init hash_rxq_init[] = {
},
.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
},
-#endif /* HAVE_FLOW_SPEC_IPV6 */
[HASH_RXQ_ETH] = {
.hash_fields = 0,
.dpdk_rss_hf = 0,
@@ -168,17 +166,11 @@ static const struct ind_table_init ind_table_init[] = {
1 << HASH_RXQ_TCPV4 |
1 << HASH_RXQ_UDPV4 |
1 << HASH_RXQ_IPV4 |
-#ifdef HAVE_FLOW_SPEC_IPV6
1 << HASH_RXQ_TCPV6 |
1 << HASH_RXQ_UDPV6 |
1 << HASH_RXQ_IPV6 |
-#endif /* HAVE_FLOW_SPEC_IPV6 */
0,
-#ifdef HAVE_FLOW_SPEC_IPV6
.hash_types_n = 6,
-#else /* HAVE_FLOW_SPEC_IPV6 */
- .hash_types_n = 3,
-#endif /* HAVE_FLOW_SPEC_IPV6 */
},
{
.max_size = 1,
@@ -243,12 +235,8 @@ priv_flow_attr(struct priv *priv, struct ibv_exp_flow_attr *flow_attr,
init = &hash_rxq_init[type];
*flow_attr = (struct ibv_exp_flow_attr){
.type = IBV_EXP_FLOW_ATTR_NORMAL,
-#ifdef MLX5_FDIR_SUPPORT
/* Priorities < 3 are reserved for flow director. */
.priority = init->flow_priority + 3,
-#else /* MLX5_FDIR_SUPPORT */
- .priority = init->flow_priority,
-#endif /* MLX5_FDIR_SUPPORT */
.num_of_specs = 0,
.port = priv->port,
.flags = 0,
@@ -589,9 +577,7 @@ priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
case HASH_RXQ_FLOW_TYPE_ALLMULTI:
return !!priv->allmulti_req;
case HASH_RXQ_FLOW_TYPE_BROADCAST:
-#ifdef HAVE_FLOW_SPEC_IPV6
case HASH_RXQ_FLOW_TYPE_IPV6MULTI:
-#endif /* HAVE_FLOW_SPEC_IPV6 */
/* If allmulti is enabled, broadcast and ipv6multi
* are unnecessary. */
return !priv->allmulti_req;
@@ -1038,19 +1024,13 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
.cq = tmpl.rxq.cq,
.comp_mask =
IBV_EXP_CREATE_WQ_RES_DOMAIN |
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
0,
.res_domain = tmpl.rd,
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
.vlan_offloads = (tmpl.rxq.vlan_strip ?
IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
0),
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
};
-
-#ifdef HAVE_VERBS_FCS
/* By default, FCS (CRC) is stripped by hardware. */
if (dev->data->dev_conf.rxmode.hw_strip_crc) {
tmpl.rxq.crc_present = 0;
@@ -1071,9 +1051,6 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
(void *)dev,
tmpl.rxq.crc_present ? "disabled" : "enabled",
tmpl.rxq.crc_present << 2);
-#endif /* HAVE_VERBS_FCS */
-
-#ifdef HAVE_VERBS_RX_END_PADDING
if (!mlx5_getenv_int("MLX5_PMD_ENABLE_PADDING"))
; /* Nothing else to do. */
else if (priv->hw_padding) {
@@ -1086,7 +1063,6 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
" supported, make sure MLNX_OFED and firmware are"
" up to date",
(void *)dev);
-#endif /* HAVE_VERBS_RX_END_PADDING */

tmpl.rxq.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
if (tmpl.rxq.wq == NULL) {
@@ -1106,9 +1082,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
.intf_version = 1,
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
.intf = IBV_EXP_INTF_CQ,
.obj = tmpl.rxq.cq,
};
@@ -1164,11 +1138,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
assert(ret == 0);
/* Assign function in queue. */
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
rxq_ctrl->rxq.poll = rxq_ctrl->if_cq->poll_length_flags_cvlan;
-#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
- rxq_ctrl->rxq.poll = rxq_ctrl->if_cq->poll_length_flags;
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
rxq_ctrl->rxq.recv = rxq_ctrl->if_wq->recv_burst;
return 0;
error:
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index f0b42e9..6a0d707 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -452,11 +452,9 @@ rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
TRANSPOSE(~flags,
IBV_EXP_CQ_RX_IP_CSUM_OK,
PKT_RX_IP_CKSUM_BAD);
-#ifdef HAVE_EXP_CQ_RX_TCP_PACKET
/* Set L4 checksum flag only for TCP/UDP packets. */
if (flags &
(IBV_EXP_CQ_RX_TCP_PACKET | IBV_EXP_CQ_RX_UDP_PACKET))
-#endif /* HAVE_EXP_CQ_RX_TCP_PACKET */
ol_flags |=
TRANSPOSE(~flags,
IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK,
@@ -589,12 +587,10 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
seg->packet_type = rxq_cq_to_pkt_type(flags);
seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
seg->ol_flags |= PKT_RX_VLAN_PKT;
seg->vlan_tci = vlan_tci;
}
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
}
/* Return packet. */
*(pkts++) = seg;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 2c5e447..570345b 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -120,11 +120,7 @@ struct rxq_ctrl {
struct fdir_queue fdir_queue; /* Flow director queue. */
struct ibv_mr *mr; /* Memory Region (for mp). */
struct ibv_exp_wq_family *if_wq; /* WQ burst interface. */
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
struct ibv_exp_cq_family_v1 *if_cq; /* CQ interface. */
-#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
- struct ibv_exp_cq_family *if_cq; /* CQ interface. */
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
unsigned int socket; /* CPU socket ID for allocations. */
struct rxq rxq; /* Data path structure. */
};
@@ -134,11 +130,9 @@ enum hash_rxq_type {
HASH_RXQ_TCPV4,
HASH_RXQ_UDPV4,
HASH_RXQ_IPV4,
-#ifdef HAVE_FLOW_SPEC_IPV6
HASH_RXQ_TCPV6,
HASH_RXQ_UDPV6,
HASH_RXQ_IPV6,
-#endif /* HAVE_FLOW_SPEC_IPV6 */
HASH_RXQ_ETH,
};

@@ -169,9 +163,7 @@ struct hash_rxq_init {
} hdr;
struct ibv_exp_flow_spec_tcp_udp tcp_udp;
struct ibv_exp_flow_spec_ipv4 ipv4;
-#ifdef HAVE_FLOW_SPEC_IPV6
struct ibv_exp_flow_spec_ipv6 ipv6;
-#endif /* HAVE_FLOW_SPEC_IPV6 */
struct ibv_exp_flow_spec_eth eth;
} flow_spec; /* Flow specification template. */
const struct hash_rxq_init *underlayer; /* Pointer to underlayer. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 4683775..9f3a33b 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -375,13 +375,11 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
#ifdef HAVE_VERBS_VLAN_INSERTION
.intf_version = 1,
#endif
-#ifdef HAVE_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR
/* Enable multi-packet send if supported. */
.family_flags =
(priv->mps ?
IBV_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR :
0),
-#endif
};
tmpl.if_qp = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
if (tmpl.if_qp == NULL) {
diff --git a/drivers/net/mlx5/mlx5_vlan.c b/drivers/net/mlx5/mlx5_vlan.c
index ff40538..3b9b771 100644
--- a/drivers/net/mlx5/mlx5_vlan.c
+++ b/drivers/net/mlx5/mlx5_vlan.c
@@ -144,7 +144,6 @@ static void
priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on)
{
struct rxq *rxq = (*priv->rxqs)[idx];
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
struct ibv_exp_wq_attr mod;
uint16_t vlan_offloads =
(on ? IBV_EXP_RECEIVE_WQ_CVLAN_STRIP : 0) |
@@ -165,8 +164,6 @@ priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on)
return;
}

-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
-
/* Update related bits in RX queue. */
rxq->vlan_strip = !!on;
}
--
2.1.4
Nelio Laranjeiro
2016-06-21 07:23:23 UTC
Permalink
These structures and macros extend those exposed by libmlx5 (in mlx5_hw.h)
to let the PMD manage work queue and completion queue elements directly.
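
A hedged sketch of how the CQE accessors below are meant to be used by the
upcoming data path; the cqe_owned_by_sw() helper and the way sw_owner_bit is
derived are assumptions for illustration, while the op_own field and
MLX5_CQE_INVALID come from infiniband/mlx5_hw.h:

#include <stdint.h>
#include <infiniband/mlx5_hw.h>
#include "mlx5_prm.h"

/* Return nonzero when the CQE has been released to software. The caller
 * tracks sw_owner_bit, which toggles on every wrap of the CQ ring. */
static inline int
cqe_owned_by_sw(volatile struct mlx5_cqe64 *cqe, unsigned int sw_owner_bit)
{
	uint8_t op_own = cqe->op_own;

	if (MLX5_CQE_OWNER(op_own) != sw_owner_bit)
		return 0; /* Still owned by hardware. */
	if (MLX5_CQE_OPCODE(op_own) == MLX5_CQE_INVALID)
		return 0; /* Entry is not a valid completion. */
	return 1;
}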

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_prm.h | 163 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 163 insertions(+)
create mode 100644 drivers/net/mlx5/mlx5_prm.h

diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
new file mode 100644
index 0000000..5db219b
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -0,0 +1,163 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2016 6WIND S.A.
+ * Copyright 2016 Mellanox.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef RTE_PMD_MLX5_PRM_H_
+#define RTE_PMD_MLX5_PRM_H_
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-pedantic"
+#endif
+#include <infiniband/mlx5_hw.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-pedantic"
+#endif
+
+/* Get CQE owner bit. */
+#define MLX5_CQE_OWNER(op_own) ((op_own) & MLX5_CQE_OWNER_MASK)
+
+/* Get CQE format. */
+#define MLX5_CQE_FORMAT(op_own) (((op_own) & MLX5E_CQE_FORMAT_MASK) >> 2)
+
+/* Get CQE opcode. */
+#define MLX5_CQE_OPCODE(op_own) (((op_own) & 0xf0) >> 4)
+
+/* Get CQE solicited event. */
+#define MLX5_CQE_SE(op_own) (((op_own) >> 1) & 1)
+
+/* Invalidate a CQE. */
+#define MLX5_CQE_INVALIDATE (MLX5_CQE_INVALID << 4)
+
+/* CQE value to inform that VLAN is stripped. */
+#define MLX5_CQE_VLAN_STRIPPED 0x1
+
+/* Maximum number of packets a multi-packet WQE can handle. */
+#define MLX5_MPW_DSEG_MAX 5
+
+/* Room for inline data in regular work queue element. */
+#define MLX5_WQE64_INL_DATA 12
+
+/* Room for inline data in multi-packet WQE. */
+#define MLX5_MWQE64_INL_DATA 28
+
+/* Subset of struct mlx5_wqe_eth_seg. */
+struct mlx5_wqe_eth_seg_small {
+ uint32_t rsvd0;
+ uint8_t cs_flags;
+ uint8_t rsvd1;
+ uint16_t mss;
+ uint32_t rsvd2;
+ uint16_t inline_hdr_sz;
+};
+
+/* Regular WQE. */
+struct mlx5_wqe_regular {
+ union {
+ struct mlx5_wqe_ctrl_seg ctrl;
+ uint32_t data[4];
+ } ctrl;
+ struct mlx5_wqe_eth_seg eseg;
+ struct mlx5_wqe_data_seg dseg;
+} __rte_aligned(64);
+
+/* Inline WQE. */
+struct mlx5_wqe_inl {
+ union {
+ struct mlx5_wqe_ctrl_seg ctrl;
+ uint32_t data[4];
+ } ctrl;
+ struct mlx5_wqe_eth_seg eseg;
+ uint32_t byte_cnt;
+ uint8_t data[MLX5_WQE64_INL_DATA];
+} __rte_aligned(64);
+
+/* Multi-packet WQE. */
+struct mlx5_wqe_mpw {
+ union {
+ struct mlx5_wqe_ctrl_seg ctrl;
+ uint32_t data[4];
+ } ctrl;
+ struct mlx5_wqe_eth_seg_small eseg;
+ struct mlx5_wqe_data_seg dseg[2];
+} __rte_aligned(64);
+
+/* Multi-packet WQE with inline. */
+struct mlx5_wqe_mpw_inl {
+ union {
+ struct mlx5_wqe_ctrl_seg ctrl;
+ uint32_t data[4];
+ } ctrl;
+ struct mlx5_wqe_eth_seg_small eseg;
+ uint32_t byte_cnt;
+ uint8_t data[MLX5_MWQE64_INL_DATA];
+} __rte_aligned(64);
+
+/* Union of all WQE types. */
+union mlx5_wqe {
+ struct mlx5_wqe_regular wqe;
+ struct mlx5_wqe_inl inl;
+ struct mlx5_wqe_mpw mpw;
+ struct mlx5_wqe_mpw_inl mpw_inl;
+ uint8_t data[64];
+};
+
+/* MPW session status. */
+enum mlx5_mpw_state {
+ MLX5_MPW_STATE_OPENED,
+ MLX5_MPW_INL_STATE_OPENED,
+ MLX5_MPW_STATE_CLOSED,
+};
+
+/* MPW session descriptor. */
+struct mlx5_mpw {
+ enum mlx5_mpw_state state;
+ unsigned int pkts_n;
+ unsigned int len;
+ unsigned int total_len;
+ volatile union mlx5_wqe *wqe;
+ union {
+ volatile struct mlx5_wqe_data_seg *dseg[MLX5_MPW_DSEG_MAX];
+ volatile uint8_t *raw;
+ } data;
+};
+
+/* CQ element structure - should be equal to the cache line size */
+struct mlx5_cqe {
+#if (RTE_CACHE_LINE_SIZE == 128)
+ uint8_t padding[64];
+#endif
+ struct mlx5_cqe64 cqe64;
+};
+
+#endif /* RTE_PMD_MLX5_PRM_H_ */
--
2.1.4
Nelio Laranjeiro
2016-06-21 07:23:24 UTC
Permalink
The intent is to replace the remaining compile-time options and environment
variables with a common means of runtime configuration. This commit only
adds the kvargs handling code; subsequent commits will update the rest.
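
A hedged sketch of how a future parameter would plug into this scaffolding
once later commits define real keys; the "example_param" key, the
priv->example_param field and the MLX5_EXAMPLE_PARAM macro are purely
hypothetical, and the sketch assumes the existing mlx5 context (struct priv,
WARN()) plus <string.h> and <stdlib.h>:

/* Hypothetical device argument, for illustration only. */
#define MLX5_EXAMPLE_PARAM "example_param"

static int
mlx5_args_check(const char *key, const char *val, void *opaque)
{
	struct priv *priv = opaque;

	if (strcmp(key, MLX5_EXAMPLE_PARAM) == 0) {
		priv->example_param = atoi(val); /* hypothetical field */
		return 0;
	}
	WARN("%s: unknown parameter", key);
	return EINVAL;
}

static const char *params[] = {
	MLX5_EXAMPLE_PARAM,
	NULL,
};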

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 72 insertions(+)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 3f45d84..56b1dfc 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -37,6 +37,7 @@
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
+#include <errno.h>
#include <net/if.h>

/* Verbs header. */
@@ -57,6 +58,7 @@
#include <rte_ethdev.h>
#include <rte_pci.h>
#include <rte_common.h>
+#include <rte_kvargs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif
@@ -237,6 +239,70 @@ mlx5_dev_idx(struct rte_pci_addr *pci_addr)
return ret;
}

+/**
+ * Verify and store value for device argument.
+ *
+ * @param[in] key
+ * Key argument to verify.
+ * @param[in] val
+ * Value associated with key.
+ * @param opaque
+ * User data.
+ *
+ * @return
+ * 0 on success, errno value on failure.
+ */
+static int
+mlx5_args_check(const char *key, const char *val, void *opaque)
+{
+ struct priv *priv = opaque;
+
+ /* No parameters are expected at the moment. */
+ (void)priv;
+ (void)val;
+ WARN("%s: unknown parameter", key);
+ return EINVAL;
+}
+
+/**
+ * Parse device parameters.
+ *
+ * @param priv
+ * Pointer to private structure.
+ * @param devargs
+ * Device arguments structure.
+ *
+ * @return
+ * 0 on success, errno value on failure.
+ */
+static int
+mlx5_args(struct priv *priv, struct rte_devargs *devargs)
+{
+ static const char *params[] = {
+ NULL,
+ };
+ struct rte_kvargs *kvlist;
+ int ret = 0;
+ int i;
+
+ if (devargs == NULL)
+ return 0;
+ kvlist = rte_kvargs_parse(devargs->args, params);
+ if (kvlist == NULL)
+ return 0;
+ /* Process parameters. */
+ for (i = 0; (i != RTE_DIM(params)); ++i) {
+ if (rte_kvargs_count(kvlist, params[i])) {
+ ret = rte_kvargs_process(kvlist, params[i],
+ mlx5_args_check, priv);
+ if (ret != 0)
+ return ret;
+ }
+ }
+ rte_kvargs_free(kvlist);
+ return 0;
+}
+
static struct eth_driver mlx5_driver;

/**
@@ -408,6 +474,12 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
priv->port = port;
priv->pd = pd;
priv->mtu = ETHER_MTU;
+ err = mlx5_args(priv, pci_dev->devargs);
+ if (err) {
+ ERROR("failed to process device arguments: %s",
+ strerror(err));
+ goto port_error;
+ }
if (ibv_exp_query_device(ctx, &exp_device_attr)) {
ERROR("ibv_exp_query_device() failed");
goto port_error;
--
2.1.4
Ferruh Yigit
2016-06-21 16:42:42 UTC
Permalink
Post by Nelio Laranjeiro
The intent is to replace the remaining compile-time options and environment
variables with a common means of runtime configuration. This commit only
adds the kvargs handling code; subsequent commits will update the rest.
---
...
Post by Nelio Laranjeiro
+static int
+mlx5_args_check(const char *key, const char *val, void *opaque)
+{
+ struct priv *priv = opaque;
+
+ /* No parameters are expected at the moment. */
+ (void)priv;
+ (void)val;
+ WARN("%s: unknown parameter", key);
+ return EINVAL;
Returning a positive value here will prevent rte_kvargs_process() from
failing; I guess that is the intention, but returning EINVAL is misleading.

It also generates the following checkpatch warning:
WARNING:USE_NEGATIVE_ERRNO: return of an errno should typically be
negative (ie: return -EINVAL)
#71: FILE: drivers/net/mlx5/mlx5.c:264:
+ return EINVAL;
Nélio Laranjeiro
2016-06-22 07:30:42 UTC
Permalink
Post by Ferruh Yigit
Post by Nelio Laranjeiro
The intent is to replace the remaining compile-time options and environment
variables with a common means of runtime configuration. This commit only
adds the kvargs handling code; subsequent commits will update the rest.
---
...
Post by Nelio Laranjeiro
+static int
+mlx5_args_check(const char *key, const char *val, void *opaque)
+{
+ struct priv *priv = opaque;
+
+ /* No parameters are expected at the moment. */
+ (void)priv;
+ (void)val;
+ WARN("%s: unknown parameter", key);
+ return EINVAL;
Returning a positive value here will prevent rte_kvargs_process() from
failing; I guess that is the intention, but returning EINVAL is misleading.
WARNING:USE_NEGATIVE_ERRNO: return of an errno should typically be
negative (ie: return -EINVAL)
+ return EINVAL;
Good catch. In fact, since it is not processed by the PMD itself, it must be
compliant with rte_kvargs_process().

I will fix it in v4.
--
Nélio Laranjeiro
6WIND
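
For reference, a minimal sketch of the corrected handler, assuming the usual
rte_kvargs_process() convention where a negative return from the callback
aborts processing and is reported to the caller (the actual v4 change may
differ). RTE_LOG() stands in for the driver's WARN() macro only to keep the
snippet self-contained:

#include <errno.h>
#include <rte_kvargs.h>
#include <rte_log.h>

/* Hypothetical handler: unknown keys return a negative errno so that
 * rte_kvargs_process() stops and propagates the failure. */
static int
mlx5_args_check(const char *key, const char *val, void *opaque)
{
	(void)opaque;
	(void)val;
	RTE_LOG(WARNING, PMD, "%s: unknown parameter\n", key);
	return -EINVAL;
}

mlx5_args() can then translate the negative value back into a positive errno
for its own error reporting if it wants to keep the existing convention.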
Nelio Laranjeiro
2016-06-21 07:23:25 UTC
Permalink
These wrappers are meant to prevent code duplication later.

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5.h | 2 ++
drivers/net/mlx5/mlx5_ethdev.c | 34 ++++++++++++++++++++++++++++------
drivers/net/mlx5/mlx5_txq.c | 2 +-
3 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 935e1b0..3dca03d 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -196,6 +196,8 @@ void priv_dev_interrupt_handler_install(struct priv *, struct rte_eth_dev *);
int mlx5_set_link_down(struct rte_eth_dev *dev);
int mlx5_set_link_up(struct rte_eth_dev *dev);
struct priv *mlx5_secondary_data_setup(struct priv *priv);
+void priv_select_tx_function(struct priv *);
+void priv_select_rx_function(struct priv *);

/* mlx5_mac.c */

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 3992b2c..771d8b5 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1099,8 +1099,8 @@ priv_set_link(struct priv *priv, int up)
err = priv_set_flags(priv, ~IFF_UP, IFF_UP);
if (err)
return err;
- dev->rx_pkt_burst = mlx5_rx_burst;
- dev->tx_pkt_burst = mlx5_tx_burst;
+ priv_select_tx_function(priv);
+ priv_select_rx_function(priv);
} else {
err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP);
if (err)
@@ -1289,13 +1289,11 @@ mlx5_secondary_data_setup(struct priv *priv)
rte_mb();
priv->dev->data = &sd->data;
rte_mb();
- priv->dev->tx_pkt_burst = mlx5_tx_burst;
- priv->dev->rx_pkt_burst = removed_rx_burst;
+ priv_select_tx_function(priv);
+ priv_select_rx_function(priv);
priv_unlock(priv);
end:
/* More sanity checks. */
- assert(priv->dev->tx_pkt_burst == mlx5_tx_burst);
- assert(priv->dev->rx_pkt_burst == removed_rx_burst);
assert(priv->dev->data == &sd->data);
rte_spinlock_unlock(&sd->lock);
return priv;
@@ -1306,3 +1304,27 @@ error:
rte_spinlock_unlock(&sd->lock);
return NULL;
}
+
+/**
+ * Configure the TX function to use.
+ *
+ * @param priv
+ * Pointer to private structure.
+ */
+void
+priv_select_tx_function(struct priv *priv)
+{
+ priv->dev->tx_pkt_burst = mlx5_tx_burst;
+}
+
+/**
+ * Configure the RX function to use.
+ *
+ * @param priv
+ * Pointer to private structure.
+ */
+void
+priv_select_rx_function(struct priv *priv)
+{
+ priv->dev->rx_pkt_burst = mlx5_rx_burst;
+}
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 9f3a33b..d7cc39d 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -477,7 +477,7 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
(void *)dev, (void *)txq_ctrl);
(*priv->txqs)[idx] = &txq_ctrl->txq;
/* Update send callback. */
- dev->tx_pkt_burst = mlx5_tx_burst;
+ priv_select_tx_function(priv);
}
priv_unlock(priv);
return -ret;
--
2.1.4
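
A side note on the wrappers above: their benefit shows once more than one
burst routine exists and the choice depends on device capabilities. The
sketch below is illustrative only; mlx5_tx_burst_mpw is a hypothetical name
for an alternative TX routine and the priv->mps flag (multi-packet send,
already consulted elsewhere in the driver) is an assumed selection criterion:

/* Sketch: pick the TX burst routine in one place instead of at every
 * call site. Only mlx5_tx_burst is guaranteed to exist at this point. */
void
priv_select_tx_function(struct priv *priv)
{
	if (priv->mps)
		priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw; /* hypothetical */
	else
		priv->dev->tx_pkt_burst = mlx5_tx_burst;
}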
Nelio Laranjeiro
2016-06-21 07:23:26 UTC
Permalink
Bypass Verbs to improve RX performance.

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Yaacov Hazan <***@mellanox.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Vasily Philipov <***@mellanox.com>
---
drivers/net/mlx5/mlx5_ethdev.c | 4 +-
drivers/net/mlx5/mlx5_fdir.c | 2 +-
drivers/net/mlx5/mlx5_rxq.c | 303 ++++++++++++++++++++---------------------
drivers/net/mlx5/mlx5_rxtx.c | 289 ++++++++++++++++++++-------------------
drivers/net/mlx5/mlx5_rxtx.h | 38 +++---
drivers/net/mlx5/mlx5_vlan.c | 3 +-
6 files changed, 325 insertions(+), 314 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 771d8b5..8628321 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1262,7 +1262,9 @@ mlx5_secondary_data_setup(struct priv *priv)
}
/* RX queues. */
for (i = 0; i != nb_rx_queues; ++i) {
- struct rxq *primary_rxq = (*sd->primary_priv->rxqs)[i];
+ struct rxq_ctrl *primary_rxq =
+ container_of((*sd->primary_priv->rxqs)[i],
+ struct rxq_ctrl, rxq);

if (primary_rxq == NULL)
continue;
diff --git a/drivers/net/mlx5/mlx5_fdir.c b/drivers/net/mlx5/mlx5_fdir.c
index 1850218..73eb00e 100644
--- a/drivers/net/mlx5/mlx5_fdir.c
+++ b/drivers/net/mlx5/mlx5_fdir.c
@@ -431,7 +431,7 @@ priv_get_fdir_queue(struct priv *priv, uint16_t idx)
ind_init_attr = (struct ibv_exp_rwq_ind_table_init_attr){
.pd = priv->pd,
.log_ind_tbl_size = 0,
- .ind_tbl = &((*priv->rxqs)[idx]->wq),
+ .ind_tbl = &rxq_ctrl->wq,
.comp_mask = 0,
};

diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 7db4ce7..a8f68a3 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -43,6 +43,8 @@
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <infiniband/verbs.h>
+#include <infiniband/arch.h>
+#include <infiniband/mlx5_hw.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif
@@ -373,8 +375,13 @@ priv_create_hash_rxqs(struct priv *priv)
DEBUG("indirection table extended to assume %u WQs",
priv->reta_idx_n);
}
- for (i = 0; (i != priv->reta_idx_n); ++i)
- wqs[i] = (*priv->rxqs)[(*priv->reta_idx)[i]]->wq;
+ for (i = 0; (i != priv->reta_idx_n); ++i) {
+ struct rxq_ctrl *rxq_ctrl;
+
+ rxq_ctrl = container_of((*priv->rxqs)[(*priv->reta_idx)[i]],
+ struct rxq_ctrl, rxq);
+ wqs[i] = rxq_ctrl->wq;
+ }
/* Get number of hash RX queues to configure. */
for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
hash_rxqs_n += ind_table_init[i].hash_types_n;
@@ -638,21 +645,13 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
struct rte_mbuf **pool)
{
unsigned int i;
- struct rxq_elt (*elts)[elts_n] =
- rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
- rxq_ctrl->socket);
int ret = 0;

- if (elts == NULL) {
- ERROR("%p: can't allocate packets array", (void *)rxq_ctrl);
- ret = ENOMEM;
- goto error;
- }
/* For each WR (packet). */
for (i = 0; (i != elts_n); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct ibv_sge *sge = &(*elts)[i].sge;
struct rte_mbuf *buf;
+ volatile struct mlx5_wqe_data_seg *scat =
+ &(*rxq_ctrl->rxq.wqes)[i];

if (pool != NULL) {
buf = *(pool++);
@@ -666,40 +665,36 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
ret = ENOMEM;
goto error;
}
- elt->buf = buf;
/* Headroom is reserved by rte_pktmbuf_alloc(). */
assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
/* Buffer is supposed to be empty. */
assert(rte_pktmbuf_data_len(buf) == 0);
assert(rte_pktmbuf_pkt_len(buf) == 0);
- /* sge->addr must be able to store a pointer. */
- assert(sizeof(sge->addr) >= sizeof(uintptr_t));
- /* SGE keeps its headroom. */
- sge->addr = (uintptr_t)
- ((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
- sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
- sge->lkey = rxq_ctrl->mr->lkey;
- /* Redundant check for tailroom. */
- assert(sge->length == rte_pktmbuf_tailroom(buf));
+ assert(!buf->next);
+ PORT(buf) = rxq_ctrl->rxq.port_id;
+ DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
+ PKT_LEN(buf) = DATA_LEN(buf);
+ NB_SEGS(buf) = 1;
+ /* scat->addr must be able to store a pointer. */
+ assert(sizeof(scat->addr) >= sizeof(uintptr_t));
+ *scat = (struct mlx5_wqe_data_seg){
+ .addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
+ .byte_count = htonl(DATA_LEN(buf)),
+ .lkey = htonl(rxq_ctrl->mr->lkey),
+ };
+ (*rxq_ctrl->rxq.elts)[i] = buf;
}
DEBUG("%p: allocated and configured %u single-segment WRs",
(void *)rxq_ctrl, elts_n);
- rxq_ctrl->rxq.elts_n = elts_n;
- rxq_ctrl->rxq.elts_head = 0;
- rxq_ctrl->rxq.elts = elts;
assert(ret == 0);
return 0;
error:
- if (elts != NULL) {
- assert(pool == NULL);
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct rte_mbuf *buf = elt->buf;
-
- if (buf != NULL)
- rte_pktmbuf_free_seg(buf);
- }
- rte_free(elts);
+ assert(pool == NULL);
+ elts_n = i;
+ for (i = 0; (i != elts_n); ++i) {
+ if ((*rxq_ctrl->rxq.elts)[i] != NULL)
+ rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
+ (*rxq_ctrl->rxq.elts)[i] = NULL;
}
DEBUG("%p: failed, freed everything", (void *)rxq_ctrl);
assert(ret > 0);
@@ -716,22 +711,16 @@ static void
rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
{
unsigned int i;
- unsigned int elts_n = rxq_ctrl->rxq.elts_n;
- struct rxq_elt (*elts)[elts_n] = rxq_ctrl->rxq.elts;

DEBUG("%p: freeing WRs", (void *)rxq_ctrl);
- rxq_ctrl->rxq.elts_n = 0;
- rxq_ctrl->rxq.elts = NULL;
- if (elts == NULL)
+ if (rxq_ctrl->rxq.elts == NULL)
return;
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct rte_mbuf *buf = elt->buf;

- if (buf != NULL)
- rte_pktmbuf_free_seg(buf);
+ for (i = 0; (i != rxq_ctrl->rxq.elts_n); ++i) {
+ if ((*rxq_ctrl->rxq.elts)[i] != NULL)
+ rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
+ (*rxq_ctrl->rxq.elts)[i] = NULL;
}
- rte_free(elts);
}

/**
@@ -749,42 +738,40 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)

DEBUG("cleaning up %p", (void *)rxq_ctrl);
rxq_free_elts(rxq_ctrl);
- rxq_ctrl->rxq.poll = NULL;
- rxq_ctrl->rxq.recv = NULL;
if (rxq_ctrl->if_wq != NULL) {
- assert(rxq_ctrl->rxq.priv != NULL);
- assert(rxq_ctrl->rxq.priv->ctx != NULL);
- assert(rxq_ctrl->rxq.wq != NULL);
+ assert(rxq_ctrl->priv != NULL);
+ assert(rxq_ctrl->priv->ctx != NULL);
+ assert(rxq_ctrl->wq != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(rxq_ctrl->rxq.priv->ctx,
+ claim_zero(ibv_exp_release_intf(rxq_ctrl->priv->ctx,
rxq_ctrl->if_wq,
&params));
}
if (rxq_ctrl->if_cq != NULL) {
- assert(rxq_ctrl->rxq.priv != NULL);
- assert(rxq_ctrl->rxq.priv->ctx != NULL);
- assert(rxq_ctrl->rxq.cq != NULL);
+ assert(rxq_ctrl->priv != NULL);
+ assert(rxq_ctrl->priv->ctx != NULL);
+ assert(rxq_ctrl->cq != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(rxq_ctrl->rxq.priv->ctx,
+ claim_zero(ibv_exp_release_intf(rxq_ctrl->priv->ctx,
rxq_ctrl->if_cq,
&params));
}
- if (rxq_ctrl->rxq.wq != NULL)
- claim_zero(ibv_exp_destroy_wq(rxq_ctrl->rxq.wq));
- if (rxq_ctrl->rxq.cq != NULL)
- claim_zero(ibv_destroy_cq(rxq_ctrl->rxq.cq));
+ if (rxq_ctrl->wq != NULL)
+ claim_zero(ibv_exp_destroy_wq(rxq_ctrl->wq));
+ if (rxq_ctrl->cq != NULL)
+ claim_zero(ibv_destroy_cq(rxq_ctrl->cq));
if (rxq_ctrl->rd != NULL) {
struct ibv_exp_destroy_res_domain_attr attr = {
.comp_mask = 0,
};

- assert(rxq_ctrl->rxq.priv != NULL);
- assert(rxq_ctrl->rxq.priv->ctx != NULL);
- claim_zero(ibv_exp_destroy_res_domain(rxq_ctrl->rxq.priv->ctx,
+ assert(rxq_ctrl->priv != NULL);
+ assert(rxq_ctrl->priv->ctx != NULL);
+ claim_zero(ibv_exp_destroy_res_domain(rxq_ctrl->priv->ctx,
rxq_ctrl->rd,
&attr));
}
@@ -811,14 +798,13 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
int
rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
{
- struct priv *priv = rxq_ctrl->rxq.priv;
+ struct priv *priv = rxq_ctrl->priv;
struct rxq_ctrl tmpl = *rxq_ctrl;
unsigned int mbuf_n;
unsigned int desc_n;
struct rte_mbuf **pool;
unsigned int i, k;
struct ibv_exp_wq_attr mod;
- struct rxq_elt (*elts)[tmpl.rxq.elts_n];
int err;

DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq_ctrl);
@@ -840,7 +826,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RESET,
};
- err = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
+ err = ibv_exp_modify_wq(tmpl.wq, &mod);
if (err) {
ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
assert(err > 0);
@@ -854,60 +840,33 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
}
/* Snatch mbufs from original queue. */
k = 0;
- elts = rxq_ctrl->rxq.elts;
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct rte_mbuf *buf = elt->buf;
-
- pool[k++] = buf;
- }
+ for (i = 0; (i != desc_n); ++i)
+ pool[k++] = (*rxq_ctrl->rxq.elts)[i];
assert(k == mbuf_n);
- tmpl.rxq.elts_n = 0;
- tmpl.rxq.elts = NULL;
- assert((void *)&tmpl.rxq.elts == NULL);
- err = rxq_alloc_elts(&tmpl, desc_n, pool);
- if (err) {
- ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
- rte_free(pool);
- assert(err > 0);
- return err;
- }
- assert(tmpl.rxq.elts_n == desc_n);
rte_free(pool);
- /* Clean up original data. */
- rxq_ctrl->rxq.elts_n = 0;
- rte_free(rxq_ctrl->rxq.elts);
- rxq_ctrl->rxq.elts = NULL;
/* Change queue state to ready. */
mod = (struct ibv_exp_wq_attr){
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RDY,
};
- err = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
+ err = ibv_exp_modify_wq(tmpl.wq, &mod);
if (err) {
ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
(void *)dev, strerror(err));
goto error;
}
/* Post SGEs. */
- assert(tmpl.if_wq != NULL);
- elts = tmpl.rxq.elts;
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- err = tmpl.if_wq->recv_burst(
- tmpl.rxq.wq,
- &(*elts)[i].sge,
- 1);
- if (err)
- break;
- }
+ err = rxq_alloc_elts(&tmpl, desc_n, pool);
if (err) {
- ERROR("%p: failed to post SGEs with error %d",
- (void *)dev, err);
- /* Set err because it does not contain a valid errno value. */
- err = EIO;
- goto error;
+ ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
+ rte_free(pool);
+ assert(err > 0);
+ return err;
}
- tmpl.rxq.recv = tmpl.if_wq->recv_burst;
+ /* Update doorbell counter. */
+ rxq_ctrl->rxq.rq_ci = desc_n;
+ rte_wmb();
+ *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
error:
*rxq_ctrl = tmpl;
assert(err >= 0);
@@ -915,6 +874,45 @@ error:
}

/**
+ * Initialize RX queue.
+ *
+ * @param tmpl
+ * Pointer to RX queue control template.
+ * @param rxq_ctrl
+ * Pointer to RX queue control.
+ *
+ * @return
+ * 0 on success, errno value on failure.
+ */
+static inline int
+rxq_setup(struct rxq_ctrl *tmpl, struct rxq_ctrl *rxq_ctrl)
+{
+ struct ibv_cq *ibcq = tmpl->cq;
+ struct mlx5_cq *cq = to_mxxx(cq, cq);
+ struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
+
+ if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) {
+ ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
+ "it should be set to %u", RTE_CACHE_LINE_SIZE);
+ return EINVAL;
+ }
+ tmpl->rxq.rq_db = rwq->rq.db;
+ tmpl->rxq.cq_ci = 0;
+ tmpl->rxq.rq_ci = 0;
+ tmpl->rxq.cq_db = cq->dbrec;
+ tmpl->rxq.wqes =
+ (volatile struct mlx5_wqe_data_seg (*)[])
+ (uintptr_t)rwq->rq.buff;
+ tmpl->rxq.cqes =
+ (volatile struct mlx5_cqe (*)[])
+ (uintptr_t)cq->active_buf->buf;
+ tmpl->rxq.elts =
+ (struct rte_mbuf *(*)[tmpl->rxq.elts_n])
+ ((uintptr_t)rxq_ctrl + sizeof(*rxq_ctrl));
+ return 0;
+}
+
+/**
* Configure a RX queue.
*
* @param dev
@@ -934,15 +932,16 @@ error:
* 0 on success, errno value on failure.
*/
int
-rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
- unsigned int socket, const struct rte_eth_rxconf *conf,
- struct rte_mempool *mp)
+rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
+ uint16_t desc, unsigned int socket,
+ const struct rte_eth_rxconf *conf, struct rte_mempool *mp)
{
struct priv *priv = dev->data->dev_private;
struct rxq_ctrl tmpl = {
+ .priv = priv,
.socket = socket,
.rxq = {
- .priv = priv,
+ .elts_n = desc,
.mp = mp,
},
};
@@ -952,17 +951,16 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
struct ibv_exp_cq_init_attr cq;
struct ibv_exp_res_domain_init_attr rd;
struct ibv_exp_wq_init_attr wq;
+ struct ibv_exp_cq_attr cq_attr;
} attr;
enum ibv_exp_query_intf_status status;
unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
- struct rxq_elt (*elts)[desc];
int ret = 0;
- unsigned int i;
- unsigned int cq_size = desc;

(void)conf; /* Thresholds configuration (ignored). */
if (desc == 0) {
- ERROR("%p: invalid number of RX descriptors", (void *)dev);
+ ERROR("%p: invalid number of RX descriptors (must be a"
+ " multiple of 2)", (void *)dev);
return EINVAL;
}
/* Toggle RX checksum offload if hardware supports it. */
@@ -996,9 +994,9 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
.res_domain = tmpl.rd,
};
- tmpl.rxq.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0,
- &attr.cq);
- if (tmpl.rxq.cq == NULL) {
+ tmpl.cq = ibv_exp_create_cq(priv->ctx, desc - 1, NULL, NULL, 0,
+ &attr.cq);
+ if (tmpl.cq == NULL) {
ret = ENOMEM;
ERROR("%p: CQ creation failure: %s",
(void *)dev, strerror(ret));
@@ -1015,13 +1013,13 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
.wq_context = NULL, /* Could be useful in the future. */
.wq_type = IBV_EXP_WQT_RQ,
/* Max number of outstanding WRs. */
- .max_recv_wr = ((priv->device_attr.max_qp_wr < (int)cq_size) ?
+ .max_recv_wr = ((priv->device_attr.max_qp_wr < (int)desc) ?
priv->device_attr.max_qp_wr :
- (int)cq_size),
+ (int)desc),
/* Max number of scatter/gather elements in a WR. */
.max_recv_sge = 1,
.pd = priv->pd,
- .cq = tmpl.rxq.cq,
+ .cq = tmpl.cq,
.comp_mask =
IBV_EXP_CREATE_WQ_RES_DOMAIN |
IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
@@ -1064,19 +1062,13 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
" up to date",
(void *)dev);

- tmpl.rxq.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
- if (tmpl.rxq.wq == NULL) {
+ tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
+ if (tmpl.wq == NULL) {
ret = (errno ? errno : EINVAL);
ERROR("%p: WQ creation failure: %s",
(void *)dev, strerror(ret));
goto error;
}
- ret = rxq_alloc_elts(&tmpl, desc, NULL);
- if (ret) {
- ERROR("%p: RXQ allocation failed: %s",
- (void *)dev, strerror(ret));
- goto error;
- }
/* Save port ID. */
tmpl.rxq.port_id = dev->data->port_id;
DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
@@ -1084,7 +1076,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
.intf_scope = IBV_EXP_INTF_GLOBAL,
.intf_version = 1,
.intf = IBV_EXP_INTF_CQ,
- .obj = tmpl.rxq.cq,
+ .obj = tmpl.cq,
};
tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
if (tmpl.if_cq == NULL) {
@@ -1095,7 +1087,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
.intf = IBV_EXP_INTF_WQ,
- .obj = tmpl.rxq.wq,
+ .obj = tmpl.wq,
};
tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
if (tmpl.if_wq == NULL) {
@@ -1108,38 +1100,34 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RDY,
};
- ret = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
+ ret = ibv_exp_modify_wq(tmpl.wq, &mod);
if (ret) {
ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
(void *)dev, strerror(ret));
goto error;
}
- /* Post SGEs. */
- elts = tmpl.rxq.elts;
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- ret = tmpl.if_wq->recv_burst(
- tmpl.rxq.wq,
- &(*elts)[i].sge,
- 1);
- if (ret)
- break;
+ ret = rxq_setup(&tmpl, rxq_ctrl);
+ if (ret) {
+ ERROR("%p: cannot initialize RX queue structure: %s",
+ (void *)dev, strerror(ret));
+ goto error;
}
+ ret = rxq_alloc_elts(&tmpl, desc, NULL);
if (ret) {
- ERROR("%p: failed to post SGEs with error %d",
- (void *)dev, ret);
- /* Set ret because it does not contain a valid errno value. */
- ret = EIO;
+ ERROR("%p: RXQ allocation failed: %s",
+ (void *)dev, strerror(ret));
goto error;
}
/* Clean up rxq in case we're reinitializing it. */
DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
rxq_cleanup(rxq_ctrl);
*rxq_ctrl = tmpl;
+ /* Update doorbell counter. */
+ rxq_ctrl->rxq.rq_ci = desc;
+ rte_wmb();
+ *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
assert(ret == 0);
- /* Assign function in queue. */
- rxq_ctrl->rxq.poll = rxq_ctrl->if_cq->poll_length_flags_cvlan;
- rxq_ctrl->rxq.recv = rxq_ctrl->if_wq->recv_burst;
return 0;
error:
rxq_cleanup(&tmpl);
@@ -1173,14 +1161,19 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
{
struct priv *priv = dev->data->dev_private;
struct rxq *rxq = (*priv->rxqs)[idx];
- struct rxq_ctrl *rxq_ctrl;
+ struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
int ret;

if (mlx5_is_secondary())
return -E_RTE_SECONDARY;

priv_lock(priv);
- rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
+ if (!rte_is_power_of_2(desc)) {
+ desc = 1 << log2above(desc);
+ WARN("%p: increased number of descriptors in RX queue %u"
+ " to the next power of two (%d)",
+ (void *)dev, idx, desc);
+ }
DEBUG("%p: configuring queue %u for %u descriptors",
(void *)dev, idx, desc);
if (idx >= priv->rxqs_n) {
@@ -1199,8 +1192,9 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
(*priv->rxqs)[idx] = NULL;
rxq_cleanup(rxq_ctrl);
} else {
- rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl), 0,
- socket);
+ rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl) +
+ desc * sizeof(struct rte_mbuf *),
+ 0, socket);
if (rxq_ctrl == NULL) {
ERROR("%p: unable to allocate queue index %u",
(void *)dev, idx);
@@ -1208,7 +1202,7 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
return -ENOMEM;
}
}
- ret = rxq_setup(dev, rxq_ctrl, desc, socket, conf, mp);
+ ret = rxq_ctrl_setup(dev, rxq_ctrl, desc, socket, conf, mp);
if (ret)
rte_free(rxq_ctrl);
else {
@@ -1243,12 +1237,12 @@ mlx5_rx_queue_release(void *dpdk_rxq)
if (rxq == NULL)
return;
rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
- priv = rxq->priv;
+ priv = rxq_ctrl->priv;
priv_lock(priv);
for (i = 0; (i != priv->rxqs_n); ++i)
if ((*priv->rxqs)[i] == rxq) {
DEBUG("%p: removing RX queue %p from list",
- (void *)priv->dev, (void *)rxq);
+ (void *)priv->dev, (void *)rxq_ctrl);
(*priv->rxqs)[i] = NULL;
break;
}
@@ -1278,7 +1272,8 @@ mlx5_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts,
uint16_t pkts_n)
{
struct rxq *rxq = dpdk_rxq;
- struct priv *priv = mlx5_secondary_data_setup(rxq->priv);
+ struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
+ struct priv *priv = mlx5_secondary_data_setup(rxq_ctrl->priv);
struct priv *primary_priv;
unsigned int index;

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 6a0d707..27d8852 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -42,6 +42,8 @@
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <infiniband/verbs.h>
+#include <infiniband/mlx5_hw.h>
+#include <infiniband/arch.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif
@@ -55,7 +57,7 @@
#include <rte_prefetch.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>
-#include <rte_memory.h>
+#include <rte_ether.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif
@@ -65,6 +67,47 @@
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
+#include "mlx5_prm.h"
+
+static inline volatile struct mlx5_cqe64 *
+get_cqe64(volatile struct mlx5_cqe cqes[],
+ unsigned int cqes_n, uint16_t *ci)
+ __attribute__((always_inline));
+
+static inline int
+rx_poll_len(struct rxq *rxq) __attribute__((always_inline));
+
+static volatile struct mlx5_cqe64 *
+get_cqe64(volatile struct mlx5_cqe cqes[],
+ unsigned int cqes_n, uint16_t *ci)
+{
+ volatile struct mlx5_cqe64 *cqe;
+ uint16_t idx = *ci;
+ uint8_t op_own;
+
+ cqe = &cqes[idx & (cqes_n - 1)].cqe64;
+ op_own = cqe->op_own;
+ if (unlikely((op_own & MLX5_CQE_OWNER_MASK) == !(idx & cqes_n))) {
+ return NULL;
+ } else if (unlikely(op_own & 0x80)) {
+ switch (op_own >> 4) {
+ case MLX5_CQE_INVALID:
+ return NULL; /* No CQE */
+ case MLX5_CQE_REQ_ERR:
+ return cqe;
+ case MLX5_CQE_RESP_ERR:
+ ++(*ci);
+ return NULL;
+ default:
+ return NULL;
+ }
+ }
+ if (cqe) {
+ *ci = idx + 1;
+ return cqe;
+ }
+ return NULL;
+}

/**
* Manage TX completions.
@@ -390,8 +433,8 @@ stop:
/**
* Translate RX completion flags to packet type.
*
- * @param flags
- * RX completion flags returned by poll_length_flags().
+ * @param[in] cqe
+ * Pointer to CQE.
*
* @note: fix mlx5_dev_supported_ptypes_get() if any change here.
*
@@ -399,11 +442,13 @@ stop:
* Packet type for struct rte_mbuf.
*/
static inline uint32_t
-rxq_cq_to_pkt_type(uint32_t flags)
+rxq_cq_to_pkt_type(volatile struct mlx5_cqe64 *cqe)
{
uint32_t pkt_type;
+ uint8_t flags = cqe->l4_hdr_type_etc;
+ uint8_t info = cqe->rsvd0[0];

- if (flags & IBV_EXP_CQ_RX_TUNNEL_PACKET)
+ if (info & IBV_EXP_CQ_RX_TUNNEL_PACKET)
pkt_type =
TRANSPOSE(flags,
IBV_EXP_CQ_RX_OUTER_IPV4_PACKET,
@@ -420,11 +465,11 @@ rxq_cq_to_pkt_type(uint32_t flags)
else
pkt_type =
TRANSPOSE(flags,
- IBV_EXP_CQ_RX_IPV4_PACKET,
- RTE_PTYPE_L3_IPV4) |
+ MLX5_CQE_L3_HDR_TYPE_IPV6,
+ RTE_PTYPE_L3_IPV6) |
TRANSPOSE(flags,
- IBV_EXP_CQ_RX_IPV6_PACKET,
- RTE_PTYPE_L3_IPV6);
+ MLX5_CQE_L3_HDR_TYPE_IPV4,
+ RTE_PTYPE_L3_IPV4);
return pkt_type;
}

@@ -433,50 +478,69 @@ rxq_cq_to_pkt_type(uint32_t flags)
*
* @param[in] rxq
* Pointer to RX queue structure.
- * @param flags
- * RX completion flags returned by poll_length_flags().
+ * @param[in] cqe
+ * Pointer to CQE.
*
* @return
* Offload flags (ol_flags) for struct rte_mbuf.
*/
static inline uint32_t
-rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
+rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe)
{
uint32_t ol_flags = 0;
+ uint8_t l3_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L3_HDR_TYPE_MASK;
+ uint8_t l4_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L4_HDR_TYPE_MASK;
+ uint8_t info = cqe->rsvd0[0];

- if (rxq->csum) {
- /* Set IP checksum flag only for IPv4/IPv6 packets. */
- if (flags &
- (IBV_EXP_CQ_RX_IPV4_PACKET | IBV_EXP_CQ_RX_IPV6_PACKET))
- ol_flags |=
- TRANSPOSE(~flags,
- IBV_EXP_CQ_RX_IP_CSUM_OK,
- PKT_RX_IP_CKSUM_BAD);
- /* Set L4 checksum flag only for TCP/UDP packets. */
- if (flags &
- (IBV_EXP_CQ_RX_TCP_PACKET | IBV_EXP_CQ_RX_UDP_PACKET))
- ol_flags |=
- TRANSPOSE(~flags,
- IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK,
- PKT_RX_L4_CKSUM_BAD);
- }
+ if ((l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV4) ||
+ (l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV6))
+ ol_flags |=
+ (!(cqe->hds_ip_ext & MLX5_CQE_L3_OK) *
+ PKT_RX_IP_CKSUM_BAD);
+ if ((l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP) ||
+ (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_EMP_ACK) ||
+ (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_ACK) ||
+ (l4_hdr == MLX5_CQE_L4_HDR_TYPE_UDP))
+ ol_flags |=
+ (!(cqe->hds_ip_ext & MLX5_CQE_L4_OK) *
+ PKT_RX_L4_CKSUM_BAD);
/*
* PKT_RX_IP_CKSUM_BAD and PKT_RX_L4_CKSUM_BAD are used in place
* of PKT_RX_EIP_CKSUM_BAD because the latter is not functional
* (its value is 0).
*/
- if ((flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
+ if ((info & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
ol_flags |=
- TRANSPOSE(~flags,
+ TRANSPOSE(~cqe->l4_hdr_type_etc,
IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK,
PKT_RX_IP_CKSUM_BAD) |
- TRANSPOSE(~flags,
+ TRANSPOSE(~cqe->l4_hdr_type_etc,
IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK,
PKT_RX_L4_CKSUM_BAD);
return ol_flags;
}

/**
+ * Get size of the next packet.
+ *
+ * @param rxq
+ * RX queue to fetch packet from.
+ *
+ * @return
+ * Packet size in bytes.
+ */
+static inline int __attribute__((always_inline))
+rx_poll_len(struct rxq *rxq)
+{
+ volatile struct mlx5_cqe64 *cqe;
+
+ cqe = get_cqe64(*rxq->cqes, rxq->elts_n, &rxq->cq_ci);
+ if (cqe)
+ return ntohl(cqe->byte_cnt);
+ return 0;
+}
+
+/**
* DPDK callback for RX.
*
* @param dpdk_rxq
@@ -492,133 +556,82 @@ rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
- struct rxq *rxq = (struct rxq *)dpdk_rxq;
- struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
- const unsigned int elts_n = rxq->elts_n;
- unsigned int elts_head = rxq->elts_head;
- struct ibv_sge sges[pkts_n];
- unsigned int i;
+ struct rxq *rxq = dpdk_rxq;
unsigned int pkts_ret = 0;
- int ret;
+ unsigned int i;
+ unsigned int rq_ci = rxq->rq_ci;
+ const unsigned int elts_n = rxq->elts_n;
+ const unsigned int wqe_cnt = elts_n - 1;

for (i = 0; (i != pkts_n); ++i) {
- struct rxq_elt *elt = &(*elts)[elts_head];
- unsigned int len;
- struct rte_mbuf *seg = elt->buf;
+ unsigned int idx = rq_ci & wqe_cnt;
struct rte_mbuf *rep;
- uint32_t flags;
- uint16_t vlan_tci;
-
- /* Sanity checks. */
- assert(seg != NULL);
- assert(elts_head < rxq->elts_n);
- assert(rxq->elts_head < rxq->elts_n);
- /*
- * Fetch initial bytes of packet descriptor into a
- * cacheline while allocating rep.
- */
- rte_mbuf_prefetch_part1(seg);
- rte_mbuf_prefetch_part2(seg);
- ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
- if (unlikely(ret < 0)) {
- struct ibv_wc wc;
- int wcs_n;
-
- DEBUG("rxq=%p, poll_length() failed (ret=%d)",
- (void *)rxq, ret);
- /* ibv_poll_cq() must be used in case of failure. */
- wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
- if (unlikely(wcs_n == 0))
- break;
- if (unlikely(wcs_n < 0)) {
- DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
- (void *)rxq, wcs_n);
- break;
- }
- assert(wcs_n == 1);
- if (unlikely(wc.status != IBV_WC_SUCCESS)) {
- /* Whatever, just repost the offending WR. */
- DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
- " completion status (%d): %s",
- (void *)rxq, wc.wr_id, wc.status,
- ibv_wc_status_str(wc.status));
-#ifdef MLX5_PMD_SOFT_COUNTERS
- /* Increment dropped packets counter. */
- ++rxq->stats.idropped;
-#endif
- /* Add SGE to array for repost. */
- sges[i] = elt->sge;
- goto repost;
- }
- ret = wc.byte_len;
- }
- if (ret == 0)
- break;
- assert(ret >= (rxq->crc_present << 2));
- len = ret - (rxq->crc_present << 2);
+ struct rte_mbuf *pkt;
+ unsigned int len;
+ volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
+ volatile struct mlx5_cqe64 *cqe =
+ &(*rxq->cqes)[rxq->cq_ci & wqe_cnt].cqe64;
+
+ pkt = (*rxq->elts)[idx];
+ rte_prefetch0(cqe);
rep = rte_mbuf_raw_alloc(rxq->mp);
if (unlikely(rep == NULL)) {
- /*
- * Unable to allocate a replacement mbuf,
- * repost WR.
- */
- DEBUG("rxq=%p: can't allocate a new mbuf",
- (void *)rxq);
- /* Increment out of memory counters. */
++rxq->stats.rx_nombuf;
- ++rxq->priv->dev->data->rx_mbuf_alloc_failed;
- goto repost;
+ break;
}
-
- /* Reconfigure sge to use rep instead of seg. */
- elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
- elt->buf = rep;
-
- /* Add SGE to array for repost. */
- sges[i] = elt->sge;
-
- /* Update seg information. */
- SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM);
- NB_SEGS(seg) = 1;
- PORT(seg) = rxq->port_id;
- NEXT(seg) = NULL;
- PKT_LEN(seg) = len;
- DATA_LEN(seg) = len;
- if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
- seg->packet_type = rxq_cq_to_pkt_type(flags);
- seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
- if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
- seg->ol_flags |= PKT_RX_VLAN_PKT;
- seg->vlan_tci = vlan_tci;
+ SET_DATA_OFF(rep, RTE_PKTMBUF_HEADROOM);
+ NB_SEGS(rep) = 1;
+ PORT(rep) = rxq->port_id;
+ NEXT(rep) = NULL;
+ len = rx_poll_len(rxq);
+ if (unlikely(len == 0)) {
+ rte_mbuf_refcnt_set(rep, 0);
+ __rte_mbuf_raw_free(rep);
+ break;
+ }
+ /* Fill NIC descriptor with the new buffer. The lkey and size
+ * of the buffers are already known, only the buffer address
+ * changes. */
+ wqe->addr = htonll((uintptr_t)rep->buf_addr +
+ RTE_PKTMBUF_HEADROOM);
+ (*rxq->elts)[idx] = rep;
+ /* Update pkt information. */
+ if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip |
+ rxq->crc_present) {
+ if (rxq->csum) {
+ pkt->packet_type = rxq_cq_to_pkt_type(cqe);
+ pkt->ol_flags = rxq_cq_to_ol_flags(rxq, cqe);
+ }
+ if (cqe->l4_hdr_type_etc & MLX5_CQE_VLAN_STRIPPED) {
+ pkt->ol_flags |= PKT_RX_VLAN_PKT;
+ pkt->vlan_tci = ntohs(cqe->vlan_info);
}
+ if (rxq->crc_present)
+ len -= ETHER_CRC_LEN;
}
- /* Return packet. */
- *(pkts++) = seg;
- ++pkts_ret;
+ PKT_LEN(pkt) = len;
+ DATA_LEN(pkt) = len;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment bytes counter. */
rxq->stats.ibytes += len;
#endif
-repost:
- if (++elts_head >= elts_n)
- elts_head = 0;
- continue;
+ /* Return packet. */
+ *(pkts++) = pkt;
+ ++pkts_ret;
+ ++rq_ci;
}
- if (unlikely(i == 0))
+ if (unlikely((i == 0) && (rq_ci == rxq->rq_ci)))
return 0;
/* Repost WRs. */
#ifdef DEBUG_RECV
DEBUG("%p: reposting %u WRs", (void *)rxq, i);
#endif
- ret = rxq->recv(rxq->wq, sges, i);
- if (unlikely(ret)) {
- /* Inability to repost WRs is fatal. */
- DEBUG("%p: recv_burst(): failed (ret=%d)",
- (void *)rxq->priv,
- ret);
- abort();
- }
- rxq->elts_head = elts_head;
+ /* Update the consumer index. */
+ rxq->rq_ci = rq_ci;
+ rte_wmb();
+ *rxq->cq_db = htonl(rxq->cq_ci);
+ rte_wmb();
+ *rxq->rq_db = htonl(rxq->rq_ci);
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment packets counter. */
rxq->stats.ipackets += pkts_ret;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 570345b..1827123 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -43,6 +43,7 @@
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <infiniband/verbs.h>
+#include <infiniband/mlx5_hw.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif
@@ -61,6 +62,7 @@
#include "mlx5.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
+#include "mlx5_prm.h"

struct mlx5_rxq_stats {
unsigned int idx; /**< Mapping index. */
@@ -81,12 +83,6 @@ struct mlx5_txq_stats {
uint64_t odropped; /**< Total of packets not sent when TX ring full. */
};

-/* RX element. */
-struct rxq_elt {
- struct ibv_sge sge; /* Scatter/Gather Element. */
- struct rte_mbuf *buf; /* SGE buffer. */
-};
-
/* Flow director queue structure. */
struct fdir_queue {
struct ibv_qp *qp; /* Associated RX QP. */
@@ -97,25 +93,28 @@ struct priv;

/* RX queue descriptor. */
struct rxq {
- struct priv *priv; /* Back pointer to private data. */
- struct rte_mempool *mp; /* Memory Pool for allocations. */
- struct ibv_cq *cq; /* Completion Queue. */
- struct ibv_exp_wq *wq; /* Work Queue. */
- int32_t (*poll)(); /* Verbs poll function. */
- int32_t (*recv)(); /* Verbs receive function. */
- unsigned int port_id; /* Port ID for incoming packets. */
- unsigned int elts_n; /* (*elts)[] length. */
- unsigned int elts_head; /* Current index in (*elts)[]. */
unsigned int csum:1; /* Enable checksum offloading. */
unsigned int csum_l2tun:1; /* Same for L2 tunnels. */
unsigned int vlan_strip:1; /* Enable VLAN stripping. */
unsigned int crc_present:1; /* CRC must be subtracted. */
- struct rxq_elt (*elts)[]; /* RX elements. */
- struct mlx5_rxq_stats stats; /* RX queue counters. */
+ uint16_t rq_ci;
+ uint16_t cq_ci;
+ uint16_t elts_n;
+ uint16_t port_id;
+ volatile struct mlx5_wqe_data_seg(*wqes)[];
+ volatile struct mlx5_cqe(*cqes)[];
+ volatile uint32_t *rq_db;
+ volatile uint32_t *cq_db;
+ struct rte_mbuf *(*elts)[];
+ struct rte_mempool *mp;
+ struct mlx5_rxq_stats stats;
} __rte_cache_aligned;

/* RX queue control descriptor. */
struct rxq_ctrl {
+ struct priv *priv; /* Back pointer to private data. */
+ struct ibv_cq *cq; /* Completion Queue. */
+ struct ibv_exp_wq *wq; /* Work Queue. */
struct ibv_exp_res_domain *rd; /* Resource Domain. */
struct fdir_queue fdir_queue; /* Flow director queue. */
struct ibv_mr *mr; /* Memory Region (for mp). */
@@ -284,8 +283,9 @@ int priv_allow_flow_type(struct priv *, enum hash_rxq_flow_type);
int priv_rehash_flows(struct priv *);
void rxq_cleanup(struct rxq_ctrl *);
int rxq_rehash(struct rte_eth_dev *, struct rxq_ctrl *);
-int rxq_setup(struct rte_eth_dev *, struct rxq_ctrl *, uint16_t, unsigned int,
- const struct rte_eth_rxconf *, struct rte_mempool *);
+int rxq_ctrl_setup(struct rte_eth_dev *, struct rxq_ctrl *, uint16_t,
+ unsigned int, const struct rte_eth_rxconf *,
+ struct rte_mempool *);
int mlx5_rx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
const struct rte_eth_rxconf *, struct rte_mempool *);
void mlx5_rx_queue_release(void *);
diff --git a/drivers/net/mlx5/mlx5_vlan.c b/drivers/net/mlx5/mlx5_vlan.c
index 3b9b771..4719e69 100644
--- a/drivers/net/mlx5/mlx5_vlan.c
+++ b/drivers/net/mlx5/mlx5_vlan.c
@@ -144,6 +144,7 @@ static void
priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on)
{
struct rxq *rxq = (*priv->rxqs)[idx];
+ struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
struct ibv_exp_wq_attr mod;
uint16_t vlan_offloads =
(on ? IBV_EXP_RECEIVE_WQ_CVLAN_STRIP : 0) |
@@ -157,7 +158,7 @@ priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on)
.vlan_offloads = vlan_offloads,
};

- err = ibv_exp_modify_wq(rxq->wq, &mod);
+ err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
if (err) {
ERROR("%p: failed to modified stripping mode: %s",
(void *)priv, strerror(err));
--
2.1.4
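
Two conventions in the RX refactor above are worth spelling out: the
descriptor count is rounded up to a power of two so the free-running 16-bit
indices (rq_ci, cq_ci) can be masked into ring slots, and descriptor updates
are made visible to the NIC with a write barrier followed by a doorbell-record
store. A simplified sketch of that pattern, with illustrative names rather
than the driver's own structures:

#include <stdint.h>
#include <arpa/inet.h>   /* htonl() */
#include <rte_atomic.h>  /* rte_wmb() */

#define RING_N 256                       /* must be a power of two */

struct rx_ring {
	uint16_t rq_ci;                  /* free-running consumer index */
	volatile uint32_t *rq_db;        /* doorbell record shared with HW */
};

/* Slot for a free-running index; the mask makes wrap-around automatic. */
static inline unsigned int
rx_slot(uint16_t ci)
{
	return ci & (RING_N - 1);
}

/* Publish new descriptors: order descriptor writes before the doorbell
 * store, mirroring the rte_wmb()/htonl() sequence in the patch. */
static inline void
rx_ring_doorbell(struct rx_ring *r)
{
	rte_wmb();
	*r->rq_db = htonl(r->rq_ci);
}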
Nelio Laranjeiro
2016-06-21 07:23:27 UTC
Permalink
Bypass Verbs to improve TX performance.

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Yaacov Hazan <***@mellanox.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/Makefile | 5 -
drivers/net/mlx5/mlx5_ethdev.c | 10 +-
drivers/net/mlx5/mlx5_mr.c | 4 +-
drivers/net/mlx5/mlx5_rxtx.c | 359 ++++++++++++++++++++++-------------------
drivers/net/mlx5/mlx5_rxtx.h | 52 +++---
drivers/net/mlx5/mlx5_txq.c | 216 +++++++++++++------------
6 files changed, 343 insertions(+), 303 deletions(-)
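
One point worth calling out before the diff: the new TX path does not signal
a completion per packet. It requests one every elts_comp_cd_init packets, and
txq_complete() then frees buffers in batches up to the reported WQE counter.
A reduced sketch of that countdown, with simplified names rather than the
driver's structures:

#include <stdbool.h>
#include <stdint.h>

struct tx_state {
	uint16_t comp_left;   /* packets left before requesting a completion */
	uint16_t comp_init;   /* batch size (elts_comp_cd_init) */
};

/* Returns true when the current descriptor should request a completion,
 * e.g. by setting the completion bits in ctrl.data[2]. */
static inline bool
tx_request_completion(struct tx_state *t)
{
	if (--t->comp_left == 0) {
		t->comp_left = t->comp_init;
		return true;
	}
	return false;
}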

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index dc99797..66687e8 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -106,11 +106,6 @@ mlx5_autoconf.h.new: FORCE
mlx5_autoconf.h.new: $(RTE_SDK)/scripts/auto-config-h.sh
$Q $(RM) -f -- '$@'
$Q sh -- '$<' '$@' \
- HAVE_VERBS_VLAN_INSERTION \
- infiniband/verbs.h \
- enum IBV_EXP_RECEIVE_WQ_CVLAN_INSERTION \
- $(AUTOCONF_OUTPUT)
- $Q sh -- '$<' '$@' \
HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE \
infiniband/verbs_exp.h \
enum IBV_EXP_CQ_COMPRESSED_CQE \
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 8628321..4e125a7 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1242,11 +1242,11 @@ mlx5_secondary_data_setup(struct priv *priv)
txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl), 0,
primary_txq_ctrl->socket);
if (txq_ctrl != NULL) {
- if (txq_setup(priv->dev,
- primary_txq_ctrl,
- primary_txq->elts_n,
- primary_txq_ctrl->socket,
- NULL) == 0) {
+ if (txq_ctrl_setup(priv->dev,
+ primary_txq_ctrl,
+ primary_txq->elts_n,
+ primary_txq_ctrl->socket,
+ NULL) == 0) {
txq_ctrl->txq.stats.idx = primary_txq->stats.idx;
tx_queues[i] = &txq_ctrl->txq;
continue;
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index 79d5568..e5e8a04 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -189,7 +189,7 @@ txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx)
/* Add a new entry, register MR first. */
DEBUG("%p: discovered new memory pool \"%s\" (%p)",
(void *)txq_ctrl, mp->name, (void *)mp);
- mr = mlx5_mp2mr(txq_ctrl->txq.priv->pd, mp);
+ mr = mlx5_mp2mr(txq_ctrl->priv->pd, mp);
if (unlikely(mr == NULL)) {
DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
(void *)txq_ctrl);
@@ -208,7 +208,7 @@ txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx)
/* Store the new entry. */
txq_ctrl->txq.mp2mr[idx].mp = mp;
txq_ctrl->txq.mp2mr[idx].mr = mr;
- txq_ctrl->txq.mp2mr[idx].lkey = mr->lkey;
+ txq_ctrl->txq.mp2mr[idx].lkey = htonl(mr->lkey);
DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
(void *)txq_ctrl, mp->name, (void *)mp,
txq_ctrl->txq.mp2mr[idx].lkey);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 27d8852..95bf981 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -119,68 +119,52 @@ get_cqe64(volatile struct mlx5_cqe cqes[],
*
* @param txq
* Pointer to TX queue structure.
- *
- * @return
- * 0 on success, -1 on failure.
*/
-static int
+static void
txq_complete(struct txq *txq)
{
- unsigned int elts_comp = txq->elts_comp;
- unsigned int elts_tail = txq->elts_tail;
- unsigned int elts_free = txq->elts_tail;
const unsigned int elts_n = txq->elts_n;
- int wcs_n;
-
- if (unlikely(elts_comp == 0))
- return 0;
-#ifdef DEBUG_SEND
- DEBUG("%p: processing %u work requests completions",
- (void *)txq, elts_comp);
-#endif
- wcs_n = txq->poll_cnt(txq->cq, elts_comp);
- if (unlikely(wcs_n == 0))
- return 0;
- if (unlikely(wcs_n < 0)) {
- DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
- (void *)txq, wcs_n);
- return -1;
+ const unsigned int cqe_n = txq->cqe_n;
+ uint16_t elts_free = txq->elts_tail;
+ uint16_t elts_tail;
+ uint16_t cq_ci = txq->cq_ci;
+ unsigned int wqe_ci = (unsigned int)-1;
+ int ret = 0;
+
+ while (ret == 0) {
+ volatile struct mlx5_cqe64 *cqe;
+
+ cqe = get_cqe64(*txq->cqes, cqe_n, &cq_ci);
+ if (cqe == NULL)
+ break;
+ wqe_ci = ntohs(cqe->wqe_counter);
}
- elts_comp -= wcs_n;
- assert(elts_comp <= txq->elts_comp);
- /*
- * Assume WC status is successful as nothing can be done about it
- * anyway.
- */
- elts_tail += wcs_n * txq->elts_comp_cd_init;
- if (elts_tail >= elts_n)
- elts_tail -= elts_n;
-
- while (elts_free != elts_tail) {
- struct txq_elt *elt = &(*txq->elts)[elts_free];
+ if (unlikely(wqe_ci == (unsigned int)-1))
+ return;
+ /* Free buffers. */
+ elts_tail = (wqe_ci + 1) & (elts_n - 1);
+ do {
+ struct rte_mbuf *elt = (*txq->elts)[elts_free];
unsigned int elts_free_next =
- (((elts_free + 1) == elts_n) ? 0 : elts_free + 1);
- struct rte_mbuf *tmp = elt->buf;
- struct txq_elt *elt_next = &(*txq->elts)[elts_free_next];
+ (elts_free + 1) & (elts_n - 1);
+ struct rte_mbuf *elt_next = (*txq->elts)[elts_free_next];

#ifndef NDEBUG
/* Poisoning. */
- memset(elt, 0x66, sizeof(*elt));
+ memset(&(*txq->elts)[elts_free],
+ 0x66,
+ sizeof((*txq->elts)[elts_free]));
#endif
- RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
- /* Faster than rte_pktmbuf_free(). */
- do {
- struct rte_mbuf *next = NEXT(tmp);
-
- rte_pktmbuf_free_seg(tmp);
- tmp = next;
- } while (tmp != NULL);
+ RTE_MBUF_PREFETCH_TO_FREE(elt_next);
+ /* Only one segment needs to be freed. */
+ rte_pktmbuf_free_seg(elt);
elts_free = elts_free_next;
- }
-
+ } while (elts_free != elts_tail);
+ txq->cq_ci = cq_ci;
txq->elts_tail = elts_tail;
- txq->elts_comp = elts_comp;
- return 0;
+ /* Update the consumer index. */
+ rte_wmb();
+ *txq->cq_db = htonl(cq_ci);
}

/**
@@ -231,7 +215,8 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
}
if (txq->mp2mr[i].mp == mp) {
assert(txq->mp2mr[i].lkey != (uint32_t)-1);
- assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
+ assert(htonl(txq->mp2mr[i].mr->lkey) ==
+ txq->mp2mr[i].lkey);
lkey = txq->mp2mr[i].lkey;
break;
}
@@ -242,33 +227,136 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
}

/**
- * Insert VLAN using mbuf headroom space.
- *
- * @param buf
- * Buffer for VLAN insertion.
+ * Write a regular WQE.
*
- * @return
- * 0 on success, errno value on failure.
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param wqe
+ * Pointer to the WQE to fill.
+ * @param addr
+ * Buffer data address.
+ * @param length
+ * Packet length.
+ * @param lkey
+ * Memory region lkey.
*/
-static inline int
-insert_vlan_sw(struct rte_mbuf *buf)
+static inline void
+mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
+ uintptr_t addr, uint32_t length, uint32_t lkey)
{
- uintptr_t addr;
- uint32_t vlan;
- uint16_t head_room_len = rte_pktmbuf_headroom(buf);
+ wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+ wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
+ wqe->wqe.ctrl.data[3] = 0;
+ wqe->inl.eseg.rsvd0 = 0;
+ wqe->inl.eseg.rsvd1 = 0;
+ wqe->inl.eseg.mss = 0;
+ wqe->inl.eseg.rsvd2 = 0;
+ wqe->wqe.eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE);
+ /* Copy the first 16 bytes into inline header. */
+ rte_memcpy((uint8_t *)(uintptr_t)wqe->wqe.eseg.inline_hdr_start,
+ (uint8_t *)(uintptr_t)addr,
+ MLX5_ETH_INLINE_HEADER_SIZE);
+ addr += MLX5_ETH_INLINE_HEADER_SIZE;
+ length -= MLX5_ETH_INLINE_HEADER_SIZE;
+ /* Store remaining data in data segment. */
+ wqe->wqe.dseg.byte_count = htonl(length);
+ wqe->wqe.dseg.lkey = lkey;
+ wqe->wqe.dseg.addr = htonll(addr);
+ /* Increment consumer index. */
+ ++txq->wqe_ci;
+}

- if (head_room_len < 4)
- return EINVAL;
+/**
+ * Write a regular WQE with VLAN.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param wqe
+ * Pointer to the WQE to fill.
+ * @param addr
+ * Buffer data address.
+ * @param length
+ * Packet length.
+ * @param lkey
+ * Memory region lkey.
+ * @param vlan_tci
+ * VLAN field to insert in packet.
+ */
+static inline void
+mlx5_wqe_write_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
+ uintptr_t addr, uint32_t length, uint32_t lkey,
+ uint16_t vlan_tci)
+{
+ uint32_t vlan = htonl(0x81000000 | vlan_tci);
+
+ wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+ wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
+ wqe->wqe.ctrl.data[3] = 0;
+ wqe->inl.eseg.rsvd0 = 0;
+ wqe->inl.eseg.rsvd1 = 0;
+ wqe->inl.eseg.mss = 0;
+ wqe->inl.eseg.rsvd2 = 0;
+ wqe->wqe.eseg.inline_hdr_sz = htons(MLX5_ETH_VLAN_INLINE_HEADER_SIZE);
+ /*
+ * Copy 12 bytes of source & destination MAC address.
+ * Copy 4 bytes of VLAN.
+ * Copy 2 bytes of Ether type.
+ */
+ rte_memcpy((uint8_t *)(uintptr_t)wqe->wqe.eseg.inline_hdr_start,
+ (uint8_t *)(uintptr_t)addr, 12);
+ rte_memcpy((uint8_t *)((uintptr_t)wqe->wqe.eseg.inline_hdr_start + 12),
+ &vlan, sizeof(vlan));
+ rte_memcpy((uint8_t *)((uintptr_t)wqe->wqe.eseg.inline_hdr_start + 16),
+ (uint8_t *)((uintptr_t)addr + 12), 2);
+ addr += MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
+ length -= MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
+ /* Store remaining data in data segment. */
+ wqe->wqe.dseg.byte_count = htonl(length);
+ wqe->wqe.dseg.lkey = lkey;
+ wqe->wqe.dseg.addr = htonll(addr);
+ /* Increment consumer index. */
+ ++txq->wqe_ci;
+}

- addr = rte_pktmbuf_mtod(buf, uintptr_t);
- vlan = htonl(0x81000000 | buf->vlan_tci);
- memmove((void *)(addr - 4), (void *)addr, 12);
- memcpy((void *)(addr + 8), &vlan, sizeof(vlan));
+/**
+ * Ring TX queue doorbell.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ */
+static inline void
+mlx5_tx_dbrec(struct txq *txq)
+{
+ uint8_t *dst = (uint8_t *)((uintptr_t)txq->bf_reg + txq->bf_offset);
+ uint32_t data[4] = {
+ htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND),
+ htonl(txq->qp_num_8s),
+ 0,
+ 0,
+ };
+ rte_wmb();
+ *txq->qp_db = htonl(txq->wqe_ci);
+ /* Ensure ordering between DB record and BF copy. */
+ rte_wmb();
+ rte_mov16(dst, (uint8_t *)data);
+ txq->bf_offset ^= txq->bf_buf_size;
+}

- SET_DATA_OFF(buf, head_room_len - 4);
- DATA_LEN(buf) += 4;
+/**
+ * Prefetch a CQE.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param cqe_ci
+ * CQE consumer index.
+ */
+static inline void
+tx_prefetch_cqe(struct txq *txq, uint16_t ci)
+{
+ volatile struct mlx5_cqe64 *cqe;

- return 0;
+ cqe = &(*txq->cqes)[ci & (txq->cqe_n - 1)].cqe64;
+ rte_prefetch0(cqe);
}

/**
@@ -288,18 +376,21 @@ uint16_t
mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct txq *txq = (struct txq *)dpdk_txq;
- unsigned int elts_head = txq->elts_head;
+ uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
- unsigned int elts_comp_cd = txq->elts_comp_cd;
- unsigned int elts_comp = 0;
unsigned int i;
unsigned int max;
- int err;
- struct rte_mbuf *buf = pkts[0];
+ volatile union mlx5_wqe *wqe;
+ struct rte_mbuf *buf;

- assert(elts_comp_cd != 0);
+ if (unlikely(!pkts_n))
+ return 0;
+ buf = pkts[0];
/* Prefetch first packet cacheline. */
+ tx_prefetch_cqe(txq, txq->cq_ci);
+ tx_prefetch_cqe(txq, txq->cq_ci + 1);
rte_prefetch0(buf);
+ /* Start processing. */
txq_complete(txq);
max = (elts_n - (elts_head - txq->elts_tail));
if (max > elts_n)
@@ -313,101 +404,51 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (max > pkts_n)
max = pkts_n;
for (i = 0; (i != max); ++i) {
- struct rte_mbuf *buf_next = pkts[i + 1];
- unsigned int elts_head_next =
- (((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
- struct txq_elt *elt = &(*txq->elts)[elts_head];
- uint32_t send_flags = 0;
-#ifdef HAVE_VERBS_VLAN_INSERTION
- int insert_vlan = 0;
-#endif /* HAVE_VERBS_VLAN_INSERTION */
+ unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
uintptr_t addr;
uint32_t length;
uint32_t lkey;
- uintptr_t buf_next_addr;

+ wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
+ rte_prefetch0(wqe);
if (i + 1 < max)
- rte_prefetch0(buf_next);
- /* Request TX completion. */
- if (unlikely(--elts_comp_cd == 0)) {
- elts_comp_cd = txq->elts_comp_cd_init;
- ++elts_comp;
- send_flags |= IBV_EXP_QP_BURST_SIGNALED;
- }
- /* Should we enable HW CKSUM offload */
- if (buf->ol_flags &
- (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
- send_flags |= IBV_EXP_QP_BURST_IP_CSUM;
- /* HW does not support checksum offloads at arbitrary
- * offsets but automatically recognizes the packet
- * type. For inner L3/L4 checksums, only VXLAN (UDP)
- * tunnels are currently supported. */
- if (RTE_ETH_IS_TUNNEL_PKT(buf->packet_type))
- send_flags |= IBV_EXP_QP_BURST_TUNNEL;
- }
- if (buf->ol_flags & PKT_TX_VLAN_PKT) {
-#ifdef HAVE_VERBS_VLAN_INSERTION
- if (!txq->priv->mps)
- insert_vlan = 1;
- else
-#endif /* HAVE_VERBS_VLAN_INSERTION */
- {
- err = insert_vlan_sw(buf);
- if (unlikely(err))
- goto stop;
- }
- }
+ rte_prefetch0(pkts[i + 1]);
/* Retrieve buffer information. */
addr = rte_pktmbuf_mtod(buf, uintptr_t);
length = DATA_LEN(buf);
/* Update element. */
- elt->buf = buf;
- if (txq->priv->sriov)
- rte_prefetch0((volatile void *)
- (uintptr_t)addr);
+ (*txq->elts)[elts_head] = buf;
/* Prefetch next buffer data. */
- if (i + 1 < max) {
- buf_next_addr =
- rte_pktmbuf_mtod(buf_next, uintptr_t);
- rte_prefetch0((volatile void *)
- (uintptr_t)buf_next_addr);
- }
+ if (i + 1 < max)
+ rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 1],
+ volatile void *));
/* Retrieve Memory Region key for this memory pool. */
lkey = txq_mp2mr(txq, txq_mb2mp(buf));
- if (unlikely(lkey == (uint32_t)-1)) {
- /* MR does not exist. */
- DEBUG("%p: unable to get MP <-> MR"
- " association", (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
-#ifdef HAVE_VERBS_VLAN_INSERTION
- if (insert_vlan)
- err = txq->send_pending_vlan
- (txq->qp,
- addr,
- length,
- lkey,
- send_flags,
- &buf->vlan_tci);
+ if (buf->ol_flags & PKT_TX_VLAN_PKT)
+ mlx5_wqe_write_vlan(txq, wqe, addr, length, lkey,
+ buf->vlan_tci);
else
-#endif /* HAVE_VERBS_VLAN_INSERTION */
- err = txq->send_pending
- (txq->qp,
- addr,
- length,
- lkey,
- send_flags);
- if (unlikely(err))
- goto stop;
+ mlx5_wqe_write(txq, wqe, addr, length, lkey);
+ /* Request completion if needed. */
+ if (unlikely(--txq->elts_comp == 0)) {
+ wqe->wqe.ctrl.data[2] = htonl(8);
+ txq->elts_comp = txq->elts_comp_cd_init;
+ } else
+ wqe->wqe.ctrl.data[2] = 0;
+ /* Should we enable HW CKSUM offload */
+ if (buf->ol_flags &
+ (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
+ wqe->wqe.eseg.cs_flags =
+ MLX5_ETH_WQE_L3_CSUM |
+ MLX5_ETH_WQE_L4_CSUM;
+ } else
+ wqe->wqe.eseg.cs_flags = 0;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent bytes counter. */
txq->stats.obytes += length;
#endif
-stop:
elts_head = elts_head_next;
- buf = buf_next;
+ buf = pkts[i + 1];
}
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
@@ -417,16 +458,8 @@ stop:
txq->stats.opackets += i;
#endif
/* Ring QP doorbell. */
- err = txq->send_flush(txq->qp);
- if (unlikely(err)) {
- /* A nonzero value is not supposed to be returned.
- * Nothing can be done about it. */
- DEBUG("%p: send_flush() failed with error %d",
- (void *)txq, err);
- }
+ mlx5_tx_dbrec(txq);
txq->elts_head = elts_head;
- txq->elts_comp += elts_comp;
- txq->elts_comp_cd = elts_comp_cd;
return i;
}

diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 1827123..6b3bb2d 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -223,44 +223,40 @@ struct hash_rxq {
[MLX5_MAX_SPECIAL_FLOWS][MLX5_MAX_VLAN_IDS];
};

-/* TX element. */
-struct txq_elt {
- struct rte_mbuf *buf;
-};
-
/* TX queue descriptor. */
struct txq {
- struct priv *priv; /* Back pointer to private data. */
- int32_t (*poll_cnt)(struct ibv_cq *cq, uint32_t max);
- int (*send_pending)();
-#ifdef HAVE_VERBS_VLAN_INSERTION
- int (*send_pending_vlan)();
-#endif
- int (*send_flush)(struct ibv_qp *qp);
- struct ibv_cq *cq; /* Completion Queue. */
- struct ibv_qp *qp; /* Queue Pair. */
- struct txq_elt (*elts)[]; /* TX elements. */
- unsigned int elts_n; /* (*elts)[] length. */
- unsigned int elts_head; /* Current index in (*elts)[]. */
- unsigned int elts_tail; /* First element awaiting completion. */
- unsigned int elts_comp; /* Number of completion requests. */
- unsigned int elts_comp_cd; /* Countdown for next completion request. */
- unsigned int elts_comp_cd_init; /* Initial value for countdown. */
+ uint16_t elts_head; /* Current index in (*elts)[]. */
+ uint16_t elts_tail; /* First element awaiting completion. */
+ uint16_t elts_comp_cd_init; /* Initial value for countdown. */
+ uint16_t elts_comp; /* Elements before asking a completion. */
+ uint16_t elts_n; /* (*elts)[] length. */
+ uint16_t cq_ci; /* Consumer index for completion queue. */
+ uint16_t cqe_n; /* Number of CQ elements. */
+ uint16_t wqe_ci; /* Consumer index for work queue. */
+ uint16_t wqe_n; /* Number of WQ elements. */
+ uint16_t bf_offset; /* Blueflame offset. */
+ uint16_t bf_buf_size; /* Blueflame size. */
+ volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
+ volatile union mlx5_wqe (*wqes)[]; /* Work queue. */
+ volatile uint32_t *qp_db; /* Work queue doorbell. */
+ volatile uint32_t *cq_db; /* Completion queue doorbell. */
+ volatile void *bf_reg; /* Blueflame register. */
struct {
const struct rte_mempool *mp; /* Cached Memory Pool. */
struct ibv_mr *mr; /* Memory Region (for mp). */
- uint32_t lkey; /* mr->lkey */
+ uint32_t lkey; /* htonl(mr->lkey) */
} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
+ struct rte_mbuf *(*elts)[]; /* TX elements. */
struct mlx5_txq_stats stats; /* TX queue counters. */
+ uint32_t qp_num_8s; /* QP number shifted by 8. */
} __rte_cache_aligned;

/* TX queue control descriptor. */
struct txq_ctrl {
-#ifdef HAVE_VERBS_VLAN_INSERTION
- struct ibv_exp_qp_burst_family_v1 *if_qp; /* QP burst interface. */
-#else
+ struct priv *priv; /* Back pointer to private data. */
+ struct ibv_cq *cq; /* Completion Queue. */
+ struct ibv_qp *qp; /* Queue Pair. */
struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
-#endif
struct ibv_exp_cq_family *if_cq; /* CQ interface. */
struct ibv_exp_res_domain *rd; /* Resource Domain. */
unsigned int socket; /* CPU socket ID for allocations. */
@@ -294,8 +290,8 @@ uint16_t mlx5_rx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
/* mlx5_txq.c */

void txq_cleanup(struct txq_ctrl *);
-int txq_setup(struct rte_eth_dev *, struct txq_ctrl *, uint16_t, unsigned int,
- const struct rte_eth_txconf *);
+int txq_ctrl_setup(struct rte_eth_dev *, struct txq_ctrl *, uint16_t,
+ unsigned int, const struct rte_eth_txconf *);
int mlx5_tx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
const struct rte_eth_txconf *);
void mlx5_tx_queue_release(void *);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index d7cc39d..dbf9c04 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -60,6 +60,7 @@
#endif

#include "mlx5_utils.h"
+#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
@@ -72,48 +73,22 @@
* Pointer to TX queue structure.
* @param elts_n
* Number of elements to allocate.
- *
- * @return
- * 0 on success, errno value on failure.
*/
-static int
+static void
txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
{
unsigned int i;
- struct txq_elt (*elts)[elts_n] =
- rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq_ctrl->socket);
- int ret = 0;

- if (elts == NULL) {
- ERROR("%p: can't allocate packets array", (void *)txq_ctrl);
- ret = ENOMEM;
- goto error;
- }
- for (i = 0; (i != elts_n); ++i) {
- struct txq_elt *elt = &(*elts)[i];
+ for (i = 0; (i != elts_n); ++i)
+ (*txq_ctrl->txq.elts)[i] = NULL;
+ for (i = 0; (i != txq_ctrl->txq.wqe_n); ++i) {
+ volatile union mlx5_wqe *wqe = &(*txq_ctrl->txq.wqes)[i];

- elt->buf = NULL;
+ memset((void *)(uintptr_t)wqe, 0x0, sizeof(*wqe));
}
DEBUG("%p: allocated and configured %u WRs", (void *)txq_ctrl, elts_n);
- txq_ctrl->txq.elts_n = elts_n;
- txq_ctrl->txq.elts = elts;
txq_ctrl->txq.elts_head = 0;
txq_ctrl->txq.elts_tail = 0;
- txq_ctrl->txq.elts_comp = 0;
- /* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
- * at least 4 times per ring. */
- txq_ctrl->txq.elts_comp_cd_init =
- ((MLX5_PMD_TX_PER_COMP_REQ < (elts_n / 4)) ?
- MLX5_PMD_TX_PER_COMP_REQ : (elts_n / 4));
- txq_ctrl->txq.elts_comp_cd = txq_ctrl->txq.elts_comp_cd_init;
- assert(ret == 0);
- return 0;
-error:
- rte_free(elts);
-
- DEBUG("%p: failed, freed everything", (void *)txq_ctrl);
- assert(ret > 0);
- return ret;
}

/**
@@ -128,32 +103,26 @@ txq_free_elts(struct txq_ctrl *txq_ctrl)
unsigned int elts_n = txq_ctrl->txq.elts_n;
unsigned int elts_head = txq_ctrl->txq.elts_head;
unsigned int elts_tail = txq_ctrl->txq.elts_tail;
- struct txq_elt (*elts)[elts_n] = txq_ctrl->txq.elts;
+ struct rte_mbuf *(*elts)[elts_n] = txq_ctrl->txq.elts;

DEBUG("%p: freeing WRs", (void *)txq_ctrl);
- txq_ctrl->txq.elts_n = 0;
txq_ctrl->txq.elts_head = 0;
txq_ctrl->txq.elts_tail = 0;
- txq_ctrl->txq.elts_comp = 0;
- txq_ctrl->txq.elts_comp_cd = 0;
- txq_ctrl->txq.elts_comp_cd_init = 0;
- txq_ctrl->txq.elts = NULL;

- if (elts == NULL)
- return;
while (elts_tail != elts_head) {
- struct txq_elt *elt = &(*elts)[elts_tail];
+ struct rte_mbuf *elt = (*elts)[elts_tail];

- assert(elt->buf != NULL);
- rte_pktmbuf_free(elt->buf);
+ assert(elt != NULL);
+ rte_pktmbuf_free(elt);
#ifndef NDEBUG
/* Poisoning. */
- memset(elt, 0x77, sizeof(*elt));
+ memset(&(*elts)[elts_tail],
+ 0x77,
+ sizeof((*elts)[elts_tail]));
#endif
if (++elts_tail == elts_n)
elts_tail = 0;
}
- rte_free(elts);
}

/**
@@ -172,42 +141,40 @@ txq_cleanup(struct txq_ctrl *txq_ctrl)

DEBUG("cleaning up %p", (void *)txq_ctrl);
txq_free_elts(txq_ctrl);
- txq_ctrl->txq.poll_cnt = NULL;
- txq_ctrl->txq.send_flush = NULL;
if (txq_ctrl->if_qp != NULL) {
- assert(txq_ctrl->txq.priv != NULL);
- assert(txq_ctrl->txq.priv->ctx != NULL);
- assert(txq_ctrl->txq.qp != NULL);
+ assert(txq_ctrl->priv != NULL);
+ assert(txq_ctrl->priv->ctx != NULL);
+ assert(txq_ctrl->qp != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(txq_ctrl->txq.priv->ctx,
+ claim_zero(ibv_exp_release_intf(txq_ctrl->priv->ctx,
txq_ctrl->if_qp,
&params));
}
if (txq_ctrl->if_cq != NULL) {
- assert(txq_ctrl->txq.priv != NULL);
- assert(txq_ctrl->txq.priv->ctx != NULL);
- assert(txq_ctrl->txq.cq != NULL);
+ assert(txq_ctrl->priv != NULL);
+ assert(txq_ctrl->priv->ctx != NULL);
+ assert(txq_ctrl->cq != NULL);
params = (struct ibv_exp_release_intf_params){
.comp_mask = 0,
};
- claim_zero(ibv_exp_release_intf(txq_ctrl->txq.priv->ctx,
+ claim_zero(ibv_exp_release_intf(txq_ctrl->priv->ctx,
txq_ctrl->if_cq,
&params));
}
- if (txq_ctrl->txq.qp != NULL)
- claim_zero(ibv_destroy_qp(txq_ctrl->txq.qp));
- if (txq_ctrl->txq.cq != NULL)
- claim_zero(ibv_destroy_cq(txq_ctrl->txq.cq));
+ if (txq_ctrl->qp != NULL)
+ claim_zero(ibv_destroy_qp(txq_ctrl->qp));
+ if (txq_ctrl->cq != NULL)
+ claim_zero(ibv_destroy_cq(txq_ctrl->cq));
if (txq_ctrl->rd != NULL) {
struct ibv_exp_destroy_res_domain_attr attr = {
.comp_mask = 0,
};

- assert(txq_ctrl->txq.priv != NULL);
- assert(txq_ctrl->txq.priv->ctx != NULL);
- claim_zero(ibv_exp_destroy_res_domain(txq_ctrl->txq.priv->ctx,
+ assert(txq_ctrl->priv != NULL);
+ assert(txq_ctrl->priv->ctx != NULL);
+ claim_zero(ibv_exp_destroy_res_domain(txq_ctrl->priv->ctx,
txq_ctrl->rd,
&attr));
}
@@ -221,6 +188,49 @@ txq_cleanup(struct txq_ctrl *txq_ctrl)
}

/**
+ * Initialize TX queue.
+ *
+ * @param tmpl
+ * Pointer to TX queue control template.
+ * @param txq_ctrl
+ * Pointer to TX queue control.
+ *
+ * @return
+ * 0 on success, errno value on failure.
+ */
+static inline int
+txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
+{
+ struct mlx5_qp *qp = to_mqp(tmpl->qp);
+ struct ibv_cq *ibcq = tmpl->cq;
+ struct mlx5_cq *cq = to_mxxx(cq, cq);
+
+ if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) {
+ ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
+ "it should be set to %u", RTE_CACHE_LINE_SIZE);
+ return EINVAL;
+ }
+ tmpl->txq.cqe_n = ibcq->cqe + 1;
+ tmpl->txq.qp_num_8s = qp->ctrl_seg.qp_num << 8;
+ tmpl->txq.wqes =
+ (volatile union mlx5_wqe (*)[])
+ (uintptr_t)qp->gen_data.sqstart;
+ tmpl->txq.wqe_n = qp->sq.wqe_cnt;
+ tmpl->txq.qp_db = &qp->gen_data.db[MLX5_SND_DBR];
+ tmpl->txq.bf_reg = qp->gen_data.bf->reg;
+ tmpl->txq.bf_offset = qp->gen_data.bf->offset;
+ tmpl->txq.bf_buf_size = qp->gen_data.bf->buf_size;
+ tmpl->txq.cq_db = cq->dbrec;
+ tmpl->txq.cqes =
+ (volatile struct mlx5_cqe (*)[])
+ (uintptr_t)cq->active_buf->buf;
+ tmpl->txq.elts =
+ (struct rte_mbuf *(*)[tmpl->txq.elts_n])
+ ((uintptr_t)txq_ctrl + sizeof(*txq_ctrl));
+ return 0;
+}
+
+/**
* Configure a TX queue.
*
* @param dev
@@ -238,15 +248,14 @@ txq_cleanup(struct txq_ctrl *txq_ctrl)
* 0 on success, errno value on failure.
*/
int
-txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
- unsigned int socket, const struct rte_eth_txconf *conf)
+txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
+ uint16_t desc, unsigned int socket,
+ const struct rte_eth_txconf *conf)
{
struct priv *priv = mlx5_get_priv(dev);
struct txq_ctrl tmpl = {
+ .priv = priv,
.socket = socket,
- .txq = {
- .priv = priv,
- },
};
union {
struct ibv_exp_query_intf_params params;
@@ -254,15 +263,19 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
struct ibv_exp_res_domain_init_attr rd;
struct ibv_exp_cq_init_attr cq;
struct ibv_exp_qp_attr mod;
+ struct ibv_exp_cq_attr cq_attr;
} attr;
enum ibv_exp_query_intf_status status;
int ret = 0;

(void)conf; /* Thresholds configuration (ignored). */
- if (desc == 0) {
- ERROR("%p: invalid number of TX descriptors", (void *)dev);
- return EINVAL;
- }
+ tmpl.txq.elts_n = desc;
+ /* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
+ * at least 4 times per ring. */
+ tmpl.txq.elts_comp_cd_init =
+ ((MLX5_PMD_TX_PER_COMP_REQ < (desc / 4)) ?
+ MLX5_PMD_TX_PER_COMP_REQ : (desc / 4));
+ tmpl.txq.elts_comp = tmpl.txq.elts_comp_cd_init;
/* MRs will be registered in mp2mr[] later. */
attr.rd = (struct ibv_exp_res_domain_init_attr){
.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
@@ -281,8 +294,10 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
.res_domain = tmpl.rd,
};
- tmpl.txq.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq);
- if (tmpl.txq.cq == NULL) {
+ tmpl.cq = ibv_exp_create_cq(priv->ctx,
+ (desc / tmpl.txq.elts_comp_cd_init) - 1,
+ NULL, NULL, 0, &attr.cq);
+ if (tmpl.cq == NULL) {
ret = ENOMEM;
ERROR("%p: CQ creation failure: %s",
(void *)dev, strerror(ret));
@@ -294,9 +309,9 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
priv->device_attr.max_sge);
attr.init = (struct ibv_exp_qp_init_attr){
/* CQ to be associated with the send queue. */
- .send_cq = tmpl.txq.cq,
+ .send_cq = tmpl.cq,
/* CQ to be associated with the receive queue. */
- .recv_cq = tmpl.txq.cq,
+ .recv_cq = tmpl.cq,
.cap = {
/* Max number of outstanding WRs. */
.max_send_wr = ((priv->device_attr.max_qp_wr < desc) ?
@@ -314,8 +329,8 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
IBV_EXP_QP_INIT_ATTR_RES_DOMAIN),
};
- tmpl.txq.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
- if (tmpl.txq.qp == NULL) {
+ tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
+ if (tmpl.qp == NULL) {
ret = (errno ? errno : EINVAL);
ERROR("%p: QP creation failure: %s",
(void *)dev, strerror(ret));
@@ -327,30 +342,31 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
/* Primary port number. */
.port_num = priv->port
};
- ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod,
+ ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod,
(IBV_EXP_QP_STATE | IBV_EXP_QP_PORT));
if (ret) {
ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
(void *)dev, strerror(ret));
goto error;
}
- ret = txq_alloc_elts(&tmpl, desc);
+ ret = txq_setup(&tmpl, txq_ctrl);
if (ret) {
- ERROR("%p: TXQ allocation failed: %s",
+ ERROR("%p: cannot initialize TX queue structure: %s",
(void *)dev, strerror(ret));
goto error;
}
+ txq_alloc_elts(&tmpl, desc);
attr.mod = (struct ibv_exp_qp_attr){
.qp_state = IBV_QPS_RTR
};
- ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod, IBV_EXP_QP_STATE);
+ ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
if (ret) {
ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
(void *)dev, strerror(ret));
goto error;
}
attr.mod.qp_state = IBV_QPS_RTS;
- ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod, IBV_EXP_QP_STATE);
+ ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
if (ret) {
ERROR("%p: QP state to IBV_QPS_RTS failed: %s",
(void *)dev, strerror(ret));
@@ -359,7 +375,7 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
.intf = IBV_EXP_INTF_CQ,
- .obj = tmpl.txq.cq,
+ .obj = tmpl.cq,
};
tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
if (tmpl.if_cq == NULL) {
@@ -371,10 +387,8 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
attr.params = (struct ibv_exp_query_intf_params){
.intf_scope = IBV_EXP_INTF_GLOBAL,
.intf = IBV_EXP_INTF_QP_BURST,
- .obj = tmpl.txq.qp,
-#ifdef HAVE_VERBS_VLAN_INSERTION
.intf_version = 1,
-#endif
+ .obj = tmpl.qp,
/* Enable multi-packet send if supported. */
.family_flags =
(priv->mps ?
@@ -392,12 +406,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
DEBUG("%p: cleaning-up old txq just in case", (void *)txq_ctrl);
txq_cleanup(txq_ctrl);
*txq_ctrl = tmpl;
- txq_ctrl->txq.poll_cnt = txq_ctrl->if_cq->poll_cnt;
- txq_ctrl->txq.send_pending = txq_ctrl->if_qp->send_pending;
-#ifdef HAVE_VERBS_VLAN_INSERTION
- txq_ctrl->txq.send_pending_vlan = txq_ctrl->if_qp->send_pending_vlan;
-#endif
- txq_ctrl->txq.send_flush = txq_ctrl->if_qp->send_flush;
DEBUG("%p: txq updated with %p", (void *)txq_ctrl, (void *)&tmpl);
/* Pre-register known mempools. */
rte_mempool_walk(txq_mp2mr_iter, txq_ctrl);
@@ -432,15 +440,19 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
{
struct priv *priv = dev->data->dev_private;
struct txq *txq = (*priv->txqs)[idx];
- struct txq_ctrl *txq_ctrl;
+ struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq);
int ret;

if (mlx5_is_secondary())
return -E_RTE_SECONDARY;

priv_lock(priv);
- if (txq)
- txq_ctrl = container_of(txq, struct txq_ctrl, txq);
+ if (!rte_is_power_of_2(desc)) {
+ desc = 1 << log2above(desc);
+ WARN("%p: increased number of descriptors in TX queue %u"
+ " to the next power of two (%d)",
+ (void *)dev, idx, desc);
+ }
DEBUG("%p: configuring queue %u for %u descriptors",
(void *)dev, idx, desc);
if (idx >= priv->txqs_n) {
@@ -459,8 +471,11 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
(*priv->txqs)[idx] = NULL;
txq_cleanup(txq_ctrl);
} else {
- txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl),
- 0, socket);
+ txq_ctrl =
+ rte_calloc_socket("TXQ", 1,
+ sizeof(*txq_ctrl) +
+ desc * sizeof(struct rte_mbuf *),
+ 0, socket);
if (txq_ctrl == NULL) {
ERROR("%p: unable to allocate queue index %u",
(void *)dev, idx);
@@ -468,7 +483,7 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
return -ENOMEM;
}
}
- ret = txq_setup(dev, txq_ctrl, desc, socket, conf);
+ ret = txq_ctrl_setup(dev, txq_ctrl, desc, socket, conf);
if (ret)
rte_free(txq_ctrl);
else {
@@ -503,7 +518,7 @@ mlx5_tx_queue_release(void *dpdk_txq)
if (txq == NULL)
return;
txq_ctrl = container_of(txq, struct txq_ctrl, txq);
- priv = txq->priv;
+ priv = txq_ctrl->priv;
priv_lock(priv);
for (i = 0; (i != priv->txqs_n); ++i)
if ((*priv->txqs)[i] == txq) {
@@ -538,7 +553,8 @@ mlx5_tx_burst_secondary_setup(void *dpdk_txq, struct rte_mbuf **pkts,
uint16_t pkts_n)
{
struct txq *txq = dpdk_txq;
- struct priv *priv = mlx5_secondary_data_setup(txq->priv);
+ struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq);
+ struct priv *priv = mlx5_secondary_data_setup(txq_ctrl->priv);
struct priv *primary_priv;
unsigned int index;
--
2.1.4
Nelio Laranjeiro
2016-06-21 07:23:28 UTC
Permalink
Mini (compressed) CQEs are returned by the NIC when PCI back pressure is
detected. In that case, the first CQE64 contains common packet information,
followed by a number of CQE8 entries providing the rest, and by a matching
number of empty CQE64 entries to be used by software for decompression.

Before decompression:

    0           1           2           6         7         8
+-------+  +---------+  +-------+   +-------+ +-------+ +-------+
| CQE64 |  |  CQE64  |  | CQE64 |   | CQE64 | | CQE64 | | CQE64 |
|-------|  |---------|  |-------|   |-------| |-------| |-------|
| ..... |  | cqe8[0] |  |       | . |       | |       | | ..... |
| ..... |  | cqe8[1] |  |       | . |       | |       | | ..... |
| ..... |  | ....... |  |       | . |       | |       | | ..... |
| ..... |  | cqe8[7] |  |       |   |       | |       | | ..... |
+-------+  +---------+  +-------+   +-------+ +-------+ +-------+

After decompression:

    0          1      ...      8
+-------+  +-------+       +-------+
| CQE64 |  | CQE64 |       | CQE64 |
|-------|  |-------|       |-------|
| ..... |  | ..... |   .   | ..... |
| ..... |  | ..... |   .   | ..... |
| ..... |  | ..... |   .   | ..... |
| ..... |  | ..... |       | ..... |
+-------+  +-------+       +-------+

This patch does not perform the entire decompression step, as it would be
prohibitively expensive; instead, the first CQE64 is consumed and an internal
context is maintained to interpret the following CQE8 entries directly.

Intermediate empty CQE64 entries are handed back to HW without further
processing.
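
The sketch below is a minimal, self-contained illustration of the bookkeeping
such a decompression context performs; all names, types and sizes are made up
for the example (the driver itself operates on volatile hardware structures
with big-endian fields):

#include <stdint.h>
#include <stdio.h>

#define CQ_SIZE 16            /* hypothetical CQ ring size (power of two) */
#define MINI_PER_ARRAY 8      /* CQE8 entries per mini array */

struct mini_cqe { uint32_t byte_cnt; };                 /* stands in for CQE8 */
struct cqe { struct mini_cqe mini[MINI_PER_ARRAY]; };   /* stands in for CQE64 */

struct zip_ctx {
	uint16_t ai;   /* index inside the current mini array */
	uint16_t ca;   /* CQ index holding the current mini array */
	uint16_t na;   /* CQ index holding the next mini array */
	uint32_t cnt;  /* compressed completions left to consume */
};

/* Return the byte count of the next compressed completion. */
static uint32_t
zip_next(struct zip_ctx *zip, const struct cqe cq[CQ_SIZE])
{
	uint32_t len = cq[zip->ca & (CQ_SIZE - 1)].mini[zip->ai & 7].byte_cnt;

	if ((++zip->ai & 7) == 0) {
		/* Move on to the next mini array stored in a later CQE. */
		zip->ca = zip->na;
		zip->na += MINI_PER_ARRAY;
	}
	--zip->cnt;
	return len;
}

int
main(void)
{
	struct cqe cq[CQ_SIZE] = { { { { 0 } } } };
	/* Second mini array sits 7 CQEs after the first one, as described. */
	struct zip_ctx zip = { .ai = 0, .ca = 1, .na = 1 + 7, .cnt = 10 };
	unsigned int i;

	for (i = 0; i < MINI_PER_ARRAY; ++i) {
		cq[1].mini[i].byte_cnt = 64 + i;
		cq[8].mini[i].byte_cnt = 128 + i;
	}
	while (zip.cnt)
		printf("packet length %u\n", zip_next(&zip, cq));
	return 0;
}

In the patch itself the equivalent state lives in struct rxq_zip (ai, ca, na,
cq_ci, cqe_cnt) and is driven by mlx5_rx_poll_len().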

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Olga Shern <***@mellanox.com>
Signed-off-by: Vasily Philipov <***@mellanox.com>
---
doc/guides/nics/mlx5.rst | 6 +
drivers/net/mlx5/mlx5.c | 25 ++++-
drivers/net/mlx5/mlx5.h | 1 +
drivers/net/mlx5/mlx5_rxq.c | 9 +-
drivers/net/mlx5/mlx5_rxtx.c | 260 ++++++++++++++++++++++++++++++++-----------
drivers/net/mlx5/mlx5_rxtx.h | 11 ++
drivers/net/mlx5/mlx5_txq.c | 5 +
7 files changed, 248 insertions(+), 69 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 3a07928..756153b 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -148,6 +148,12 @@ Run-time configuration

- **ethtool** operations on related kernel interfaces also affect the PMD.

+- ``rxq_cqe_comp_en`` parameter [int]
+
+ A nonzero value enables CQE compression on the RX side. This feature
+ saves PCI bandwidth and improves performance at the cost of slightly
+ higher CPU usage. Enabled by default.
+
Prerequisites
-------------

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 56b1dfc..7e8c579 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -69,6 +69,9 @@
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"

+/* Device parameter to enable RX completion queue compression. */
+#define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"
+
/**
* Retrieve integer value from environment variable.
*
@@ -256,12 +259,21 @@ static int
mlx5_args_check(const char *key, const char *val, void *opaque)
{
struct priv *priv = opaque;
+ unsigned long tmp;

- /* No parameters are expected at the moment. */
- (void)priv;
- (void)val;
- WARN("%s: unknown parameter", key);
- return EINVAL;
+ errno = 0;
+ tmp = strtoul(val, NULL, 0);
+ if (errno) {
+ WARN("%s: \"%s\" is not a valid integer", key, val);
+ return errno;
+ }
+ if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0)
+ priv->cqe_comp = !!tmp;
+ else {
+ WARN("%s: unknown parameter", key);
+ return EINVAL;
+ }
+ return 0;
}

/**
@@ -279,7 +291,7 @@ static int
mlx5_args(struct priv *priv, struct rte_devargs *devargs)
{
static const char *params[] = {
- NULL,
+ MLX5_RXQ_CQE_COMP_EN,
};
struct rte_kvargs *kvlist;
int ret = 0;
@@ -474,6 +486,7 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
priv->port = port;
priv->pd = pd;
priv->mtu = ETHER_MTU;
+ priv->cqe_comp = 1; /* Enable compression by default. */
err = mlx5_args(priv, pci_dev->devargs);
if (err) {
ERROR("failed to process device arguments: %s",
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 3dca03d..8f5a6df 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -111,6 +111,7 @@ struct priv {
unsigned int hw_padding:1; /* End alignment padding is supported. */
unsigned int sriov:1; /* This is a VF or PF with VF devices. */
unsigned int mps:1; /* Whether multi-packet send is supported. */
+ unsigned int cqe_comp:1; /* Whether CQE compression is enabled. */
unsigned int pending_alarm:1; /* An alarm is pending. */
/* RX/TX queues. */
unsigned int rxqs_n; /* RX queues array size. */
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index a8f68a3..6881cdd 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -897,6 +897,7 @@ rxq_setup(struct rxq_ctrl *tmpl, struct rxq_ctrl *rxq_ctrl)
return EINVAL;
}
tmpl->rxq.rq_db = rwq->rq.db;
+ tmpl->rxq.cqe_n = ibcq->cqe + 1;
tmpl->rxq.cq_ci = 0;
tmpl->rxq.rq_ci = 0;
tmpl->rxq.cq_db = cq->dbrec;
@@ -955,6 +956,7 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
} attr;
enum ibv_exp_query_intf_status status;
unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
+ unsigned int cqe_n = desc - 1;
int ret = 0;

(void)conf; /* Thresholds configuration (ignored). */
@@ -994,7 +996,12 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
.res_domain = tmpl.rd,
};
- tmpl.cq = ibv_exp_create_cq(priv->ctx, desc - 1, NULL, NULL, 0,
+ if (priv->cqe_comp) {
+ attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS;
+ attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE;
+ cqe_n = (desc * 2) - 1; /* Double the number of CQEs. */
+ }
+ tmpl.cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, NULL, 0,
&attr.cq);
if (tmpl.cq == NULL) {
ret = ENOMEM;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 95bf981..30d413c 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -69,44 +69,85 @@
#include "mlx5_defs.h"
#include "mlx5_prm.h"

-static inline volatile struct mlx5_cqe64 *
-get_cqe64(volatile struct mlx5_cqe cqes[],
- unsigned int cqes_n, uint16_t *ci)
- __attribute__((always_inline));
+#ifndef NDEBUG
+
+/**
+ * Verify or set magic value in CQE.
+ *
+ * @param cqe
+ * Pointer to CQE.
+ *
+ * @return
+ * 0 the first time.
+ */
+static inline int
+check_cqe64_seen(volatile struct mlx5_cqe64 *cqe)
+{
+ static const uint8_t magic[] = "seen";
+ volatile uint8_t (*buf)[sizeof(cqe->rsvd40)] = &cqe->rsvd40;
+ int ret = 1;
+ unsigned int i;
+
+ for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i)
+ if (!ret || !(ret = ((*buf)[i] == magic[i])))
+ (*buf)[i] = magic[i];
+ return ret;
+}
+
+#endif /* NDEBUG */

static inline int
-rx_poll_len(struct rxq *rxq) __attribute__((always_inline));
+check_cqe64(volatile struct mlx5_cqe64 *cqe,
+ unsigned int cqes_n, const uint16_t ci)
+ __attribute__((always_inline));

-static volatile struct mlx5_cqe64 *
-get_cqe64(volatile struct mlx5_cqe cqes[],
- unsigned int cqes_n, uint16_t *ci)
+/**
+ * Check whether CQE is valid.
+ *
+ * @param cqe
+ * Pointer to CQE.
+ * @param cqes_n
+ * Size of completion queue.
+ * @param ci
+ * Consumer index.
+ *
+ * @return
+ * 0 on success, 1 on failure.
+ */
+static inline int
+check_cqe64(volatile struct mlx5_cqe64 *cqe,
+ unsigned int cqes_n, const uint16_t ci)
{
- volatile struct mlx5_cqe64 *cqe;
- uint16_t idx = *ci;
- uint8_t op_own;
-
- cqe = &cqes[idx & (cqes_n - 1)].cqe64;
- op_own = cqe->op_own;
- if (unlikely((op_own & MLX5_CQE_OWNER_MASK) == !(idx & cqes_n))) {
- return NULL;
- } else if (unlikely(op_own & 0x80)) {
- switch (op_own >> 4) {
- case MLX5_CQE_INVALID:
- return NULL; /* No CQE */
- case MLX5_CQE_REQ_ERR:
- return cqe;
- case MLX5_CQE_RESP_ERR:
- ++(*ci);
- return NULL;
- default:
- return NULL;
- }
- }
- if (cqe) {
- *ci = idx + 1;
- return cqe;
+ uint16_t idx = ci & cqes_n;
+ uint8_t op_own = cqe->op_own;
+ uint8_t op_owner = MLX5_CQE_OWNER(op_own);
+ uint8_t op_code = MLX5_CQE_OPCODE(op_own);
+
+ if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID)))
+ return 1; /* No CQE. */
+#ifndef NDEBUG
+ if ((op_code == MLX5_CQE_RESP_ERR) ||
+ (op_code == MLX5_CQE_REQ_ERR)) {
+ volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe;
+ uint8_t syndrome = err_cqe->syndrome;
+
+ if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) ||
+ (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR))
+ return 0;
+ if (!check_cqe64_seen(cqe))
+ ERROR("unexpected CQE error %u (0x%02x)"
+ " syndrome 0x%02x",
+ op_code, op_code, syndrome);
+ return 1;
+ } else if ((op_code != MLX5_CQE_RESP_SEND) &&
+ (op_code != MLX5_CQE_REQ)) {
+ if (!check_cqe64_seen(cqe))
+ ERROR("unexpected CQE opcode %u (0x%02x)",
+ op_code, op_code);
+ return 1;
}
- return NULL;
+#endif /* NDEBUG */
+ return 0;
}

/**
@@ -125,20 +166,34 @@ txq_complete(struct txq *txq)
{
const unsigned int elts_n = txq->elts_n;
const unsigned int cqe_n = txq->cqe_n;
+ const unsigned int cqe_cnt = cqe_n - 1;
uint16_t elts_free = txq->elts_tail;
uint16_t elts_tail;
uint16_t cq_ci = txq->cq_ci;
unsigned int wqe_ci = (unsigned int)-1;
- int ret = 0;

- while (ret == 0) {
- volatile struct mlx5_cqe64 *cqe;
+ do {
+ unsigned int idx = cq_ci & cqe_cnt;
+ volatile struct mlx5_cqe64 *cqe = &(*txq->cqes)[idx].cqe64;

- cqe = get_cqe64(*txq->cqes, cqe_n, &cq_ci);
- if (cqe == NULL)
+ if (check_cqe64(cqe, cqe_n, cq_ci) == 1)
break;
+#ifndef NDEBUG
+ if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) {
+ if (!check_cqe64_seen(cqe))
+ ERROR("unexpected compressed CQE, TX stopped");
+ return;
+ }
+ if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) ||
+ (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) {
+ if (!check_cqe64_seen(cqe))
+ ERROR("unexpected error CQE, TX stopped");
+ return;
+ }
+#endif /* NDEBUG */
wqe_ci = ntohs(cqe->wqe_counter);
- }
+ ++cq_ci;
+ } while (1);
if (unlikely(wqe_ci == (unsigned int)-1))
return;
/* Free buffers. */
@@ -507,6 +562,97 @@ rxq_cq_to_pkt_type(volatile struct mlx5_cqe64 *cqe)
}

/**
+ * Get size of the next packet for a given CQE. For compressed CQEs, the
+ * consumer index is updated only once all packets of the current one have
+ * been processed.
+ *
+ * @param rxq
+ * Pointer to RX queue.
+ * @param cqe
+ * CQE to process.
+ *
+ * @return
+ * Packet size in bytes (0 if there is none), -1 in case of completion
+ * with error.
+ */
+static inline int
+mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe,
+ uint16_t cqe_cnt)
+{
+ struct rxq_zip *zip = &rxq->zip;
+ uint16_t cqe_n = cqe_cnt + 1;
+ int len = 0;
+
+ /* Process compressed data in the CQE and mini arrays. */
+ if (zip->ai) {
+ volatile struct mlx5_mini_cqe8 (*mc)[8] =
+ (volatile struct mlx5_mini_cqe8 (*)[8])
+ (uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].cqe64);
+
+ len = ntohl((*mc)[zip->ai & 7].byte_cnt);
+ if ((++zip->ai & 7) == 0) {
+ /* Increment consumer index to skip the number of
+ * CQEs consumed. Hardware leaves holes in the CQ
+ * ring for software use. */
+ zip->ca = zip->na;
+ zip->na += 8;
+ }
+ if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
+ uint16_t idx = rxq->cq_ci;
+ uint16_t end = zip->cq_ci;
+
+ while (idx != end) {
+ (*rxq->cqes)[idx & cqe_cnt].cqe64.op_own =
+ MLX5_CQE_INVALIDATE;
+ ++idx;
+ }
+ rxq->cq_ci = zip->cq_ci;
+ zip->ai = 0;
+ }
+ /* No compressed data, get next CQE and verify if it is compressed. */
+ } else {
+ int ret;
+ int8_t op_own;
+
+ ret = check_cqe64(cqe, cqe_n, rxq->cq_ci);
+ if (unlikely(ret == 1))
+ return 0;
+ ++rxq->cq_ci;
+ op_own = cqe->op_own;
+ if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
+ volatile struct mlx5_mini_cqe8 (*mc)[8] =
+ (volatile struct mlx5_mini_cqe8 (*)[8])
+ (uintptr_t)&(*rxq->cqes)[rxq->cq_ci &
+ cqe_cnt].cqe64;
+
+ /* Fix endianness. */
+ zip->cqe_cnt = ntohl(cqe->byte_cnt);
+ /*
+ * Current mini array position is the one returned by
+ * check_cqe64().
+ *
+ * If completion comprises several mini arrays, as a
+ * special case the second one is located 7 CQEs after
+ * the initial CQE instead of 8 for subsequent ones.
+ */
+ zip->ca = rxq->cq_ci & cqe_cnt;
+ zip->na = zip->ca + 7;
+ /* Compute the next non compressed CQE. */
+ --rxq->cq_ci;
+ zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
+ /* Get packet size to return. */
+ len = ntohl((*mc)[0].byte_cnt);
+ zip->ai = 1;
+ } else
+ len = ntohl(cqe->byte_cnt);
+ /* Error while receiving packet. */
+ if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
+ return -1;
+ }
+ return len;
+}
+
+/**
* Translate RX completion flags to offload flags.
*
* @param[in] rxq
@@ -554,26 +700,6 @@ rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe)
}

/**
- * Get size of the next packet.
- *
- * @param rxq
- * RX queue to fetch packet from.
- *
- * @return
- * Packet size in bytes.
- */
-static inline int __attribute__((always_inline))
-rx_poll_len(struct rxq *rxq)
-{
- volatile struct mlx5_cqe64 *cqe;
-
- cqe = get_cqe64(*rxq->cqes, rxq->elts_n, &rxq->cq_ci);
- if (cqe)
- return ntohl(cqe->byte_cnt);
- return 0;
-}
-
-/**
* DPDK callback for RX.
*
* @param dpdk_rxq
@@ -595,15 +721,16 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
unsigned int rq_ci = rxq->rq_ci;
const unsigned int elts_n = rxq->elts_n;
const unsigned int wqe_cnt = elts_n - 1;
+ const unsigned int cqe_cnt = rxq->cqe_n - 1;

for (i = 0; (i != pkts_n); ++i) {
unsigned int idx = rq_ci & wqe_cnt;
+ int len;
struct rte_mbuf *rep;
struct rte_mbuf *pkt;
- unsigned int len;
volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
volatile struct mlx5_cqe64 *cqe =
- &(*rxq->cqes)[rxq->cq_ci & wqe_cnt].cqe64;
+ &(*rxq->cqes)[rxq->cq_ci & cqe_cnt].cqe64;

pkt = (*rxq->elts)[idx];
rte_prefetch0(cqe);
@@ -616,12 +743,20 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
NB_SEGS(rep) = 1;
PORT(rep) = rxq->port_id;
NEXT(rep) = NULL;
- len = rx_poll_len(rxq);
+ len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt);
if (unlikely(len == 0)) {
rte_mbuf_refcnt_set(rep, 0);
__rte_mbuf_raw_free(rep);
break;
}
+ if (unlikely(len == -1)) {
+ /* RX error, packet is likely too large. */
+ rte_mbuf_refcnt_set(rep, 0);
+ __rte_mbuf_raw_free(rep);
+ ++rxq->stats.idropped;
+ --i;
+ goto skip;
+ }
/* Fill NIC descriptor with the new buffer. The lkey and size
* of the buffers are already known, only the buffer address
* changes. */
@@ -651,6 +786,7 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
/* Return packet. */
*(pkts++) = pkt;
++pkts_ret;
+ skip:
++rq_ci;
}
if (unlikely((i == 0) && (rq_ci == rxq->rq_ci)))
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 6b3bb2d..77b0fde 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -91,6 +91,15 @@ struct fdir_queue {

struct priv;

+/* Compressed CQE context. */
+struct rxq_zip {
+ uint16_t ai; /* Array index. */
+ uint16_t ca; /* Current array index. */
+ uint16_t na; /* Next array index. */
+ uint16_t cq_ci; /* The next CQE. */
+ uint32_t cqe_cnt; /* Number of CQEs. */
+};
+
/* RX queue descriptor. */
struct rxq {
unsigned int csum:1; /* Enable checksum offloading. */
@@ -100,9 +109,11 @@ struct rxq {
uint16_t rq_ci;
uint16_t cq_ci;
uint16_t elts_n;
+ uint16_t cqe_n; /* Number of CQ elements. */
uint16_t port_id;
volatile struct mlx5_wqe_data_seg(*wqes)[];
volatile struct mlx5_cqe(*cqes)[];
+ struct rxq_zip zip; /* Compressed context. */
volatile uint32_t *rq_db;
volatile uint32_t *cq_db;
struct rte_mbuf *(*elts)[];
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index dbf9c04..ddcd6b6 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -268,6 +268,11 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
enum ibv_exp_query_intf_status status;
int ret = 0;

+ if (mlx5_getenv_int("MLX5_ENABLE_CQE_COMPRESSION")) {
+ ret = ENOTSUP;
+ ERROR("MLX5_ENABLE_CQE_COMPRESSION must never be set");
+ goto error;
+ }
(void)conf; /* Thresholds configuration (ignored). */
tmpl.txq.elts_n = desc;
/* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
--
2.1.4
Nelio Laranjeiro
2016-06-21 07:23:29 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

Replacing the variable countdown (which depends on the number of
descriptors) with a fixed relative threshold known at compile time improves
performance by reducing the TX queue structure footprint and the amount of
code to manage completions during a burst.

Completions are now requested at most once per burst after the threshold
is reached.
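
As a rough, hedged sketch of this logic (names are illustrative, not the
driver's), the per-burst bookkeeping boils down to:

#include <stdint.h>
#include <stdbool.h>

#define TX_COMP_THRESH 32  /* fixed threshold, known at compile time */

struct tx_state {
	uint16_t elts_comp;  /* descriptors sent since the last request */
};

/* Decide whether the last WQE of this burst should request a completion. */
bool
tx_burst_needs_completion(struct tx_state *tx, uint16_t sent)
{
	uint16_t comp = tx->elts_comp + sent;

	if (comp >= TX_COMP_THRESH) {
		tx->elts_comp = 0;  /* completion requested, restart count */
		return true;
	}
	tx->elts_comp = comp;       /* keep accumulating across bursts */
	return false;
}

In the actual patch the request is written into ctrl.data[2] of the burst's
last WQE, and elts_head is stashed in the otherwise unused immediate field
(ctrl.data[3]) so that txq_complete() can free buffers up to that point.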

Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Vasily Philipov <***@mellanox.com>
---
drivers/net/mlx5/mlx5_defs.h | 7 +++++--
drivers/net/mlx5/mlx5_rxtx.c | 42 ++++++++++++++++++++++++------------------
drivers/net/mlx5/mlx5_rxtx.h | 5 ++---
drivers/net/mlx5/mlx5_txq.c | 19 ++++++++++++-------
4 files changed, 43 insertions(+), 30 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 8d2ec7a..cc2a6f3 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -48,8 +48,11 @@
/* Maximum number of special flows. */
#define MLX5_MAX_SPECIAL_FLOWS 4

-/* Request send completion once in every 64 sends, might be less. */
-#define MLX5_PMD_TX_PER_COMP_REQ 64
+/*
+ * Request TX completion every time descriptors reach this threshold since
+ * the previous request. Must be a power of two for performance reasons.
+ */
+#define MLX5_TX_COMP_THRESH 32

/* RSS Indirection table size. */
#define RSS_INDIRECTION_TABLE_SIZE 256
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 30d413c..d56c9e9 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -154,9 +154,6 @@ check_cqe64(volatile struct mlx5_cqe64 *cqe,
* Manage TX completions.
*
* When sending a burst, mlx5_tx_burst() posts several WRs.
- * To improve performance, a completion event is only required once every
- * MLX5_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
- * for other WRs, but this information would not be used anyway.
*
* @param txq
* Pointer to TX queue structure.
@@ -170,14 +167,16 @@ txq_complete(struct txq *txq)
uint16_t elts_free = txq->elts_tail;
uint16_t elts_tail;
uint16_t cq_ci = txq->cq_ci;
- unsigned int wqe_ci = (unsigned int)-1;
+ volatile struct mlx5_cqe64 *cqe = NULL;
+ volatile union mlx5_wqe *wqe;

do {
- unsigned int idx = cq_ci & cqe_cnt;
- volatile struct mlx5_cqe64 *cqe = &(*txq->cqes)[idx].cqe64;
+ volatile struct mlx5_cqe64 *tmp;

- if (check_cqe64(cqe, cqe_n, cq_ci) == 1)
+ tmp = &(*txq->cqes)[cq_ci & cqe_cnt].cqe64;
+ if (check_cqe64(tmp, cqe_n, cq_ci))
break;
+ cqe = tmp;
#ifndef NDEBUG
if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) {
if (!check_cqe64_seen(cqe))
@@ -191,14 +190,15 @@ txq_complete(struct txq *txq)
return;
}
#endif /* NDEBUG */
- wqe_ci = ntohs(cqe->wqe_counter);
++cq_ci;
} while (1);
- if (unlikely(wqe_ci == (unsigned int)-1))
+ if (unlikely(cqe == NULL))
return;
+ wqe = &(*txq->wqes)[htons(cqe->wqe_counter) & (txq->wqe_n - 1)];
+ elts_tail = wqe->wqe.ctrl.data[3];
+ assert(elts_tail < txq->wqe_n);
/* Free buffers. */
- elts_tail = (wqe_ci + 1) & (elts_n - 1);
- do {
+ while (elts_free != elts_tail) {
struct rte_mbuf *elt = (*txq->elts)[elts_free];
unsigned int elts_free_next =
(elts_free + 1) & (elts_n - 1);
@@ -214,7 +214,7 @@ txq_complete(struct txq *txq)
/* Only one segment needs to be freed. */
rte_pktmbuf_free_seg(elt);
elts_free = elts_free_next;
- } while (elts_free != elts_tail);
+ }
txq->cq_ci = cq_ci;
txq->elts_tail = elts_tail;
/* Update the consumer index. */
@@ -435,6 +435,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
const unsigned int elts_n = txq->elts_n;
unsigned int i;
unsigned int max;
+ unsigned int comp;
volatile union mlx5_wqe *wqe;
struct rte_mbuf *buf;

@@ -484,12 +485,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
buf->vlan_tci);
else
mlx5_wqe_write(txq, wqe, addr, length, lkey);
- /* Request completion if needed. */
- if (unlikely(--txq->elts_comp == 0)) {
- wqe->wqe.ctrl.data[2] = htonl(8);
- txq->elts_comp = txq->elts_comp_cd_init;
- } else
- wqe->wqe.ctrl.data[2] = 0;
+ wqe->wqe.ctrl.data[2] = 0;
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
@@ -508,6 +504,16 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
+ /* Check whether completion threshold has been reached. */
+ comp = txq->elts_comp + i;
+ if (comp >= MLX5_TX_COMP_THRESH) {
+ /* Request completion on last WQE. */
+ wqe->wqe.ctrl.data[2] = htonl(8);
+ /* Save elts_head in unused "immediate" field of WQE. */
+ wqe->wqe.ctrl.data[3] = elts_head;
+ txq->elts_comp = 0;
+ } else
+ txq->elts_comp = comp;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent packets counter. */
txq->stats.opackets += i;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 77b0fde..f900e65 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -238,8 +238,7 @@ struct hash_rxq {
struct txq {
uint16_t elts_head; /* Current index in (*elts)[]. */
uint16_t elts_tail; /* First element awaiting completion. */
- uint16_t elts_comp_cd_init; /* Initial value for countdown. */
- uint16_t elts_comp; /* Elements before asking a completion. */
+ uint16_t elts_comp; /* Counter since last completion request. */
uint16_t elts_n; /* (*elts)[] length. */
uint16_t cq_ci; /* Consumer index for completion queue. */
uint16_t cqe_n; /* Number of CQ elements. */
@@ -247,6 +246,7 @@ struct txq {
uint16_t wqe_n; /* Number of WQ elements. */
uint16_t bf_offset; /* Blueflame offset. */
uint16_t bf_buf_size; /* Blueflame size. */
+ uint32_t qp_num_8s; /* QP number shifted by 8. */
volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
volatile union mlx5_wqe (*wqes)[]; /* Work queue. */
volatile uint32_t *qp_db; /* Work queue doorbell. */
@@ -259,7 +259,6 @@ struct txq {
} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
struct rte_mbuf *(*elts)[]; /* TX elements. */
struct mlx5_txq_stats stats; /* TX queue counters. */
- uint32_t qp_num_8s; /* QP number shifted by 8. */
} __rte_cache_aligned;

/* TX queue control descriptor. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index ddcd6b6..7b2dc7c 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -89,6 +89,7 @@ txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
DEBUG("%p: allocated and configured %u WRs", (void *)txq_ctrl, elts_n);
txq_ctrl->txq.elts_head = 0;
txq_ctrl->txq.elts_tail = 0;
+ txq_ctrl->txq.elts_comp = 0;
}

/**
@@ -108,6 +109,7 @@ txq_free_elts(struct txq_ctrl *txq_ctrl)
DEBUG("%p: freeing WRs", (void *)txq_ctrl);
txq_ctrl->txq.elts_head = 0;
txq_ctrl->txq.elts_tail = 0;
+ txq_ctrl->txq.elts_comp = 0;

while (elts_tail != elts_head) {
struct rte_mbuf *elt = (*elts)[elts_tail];
@@ -274,13 +276,8 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
goto error;
}
(void)conf; /* Thresholds configuration (ignored). */
+ assert(desc > MLX5_TX_COMP_THRESH);
tmpl.txq.elts_n = desc;
- /* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
- * at least 4 times per ring. */
- tmpl.txq.elts_comp_cd_init =
- ((MLX5_PMD_TX_PER_COMP_REQ < (desc / 4)) ?
- MLX5_PMD_TX_PER_COMP_REQ : (desc / 4));
- tmpl.txq.elts_comp = tmpl.txq.elts_comp_cd_init;
/* MRs will be registered in mp2mr[] later. */
attr.rd = (struct ibv_exp_res_domain_init_attr){
.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
@@ -300,7 +297,8 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
.res_domain = tmpl.rd,
};
tmpl.cq = ibv_exp_create_cq(priv->ctx,
- (desc / tmpl.txq.elts_comp_cd_init) - 1,
+ (((desc / MLX5_TX_COMP_THRESH) - 1) ?
+ ((desc / MLX5_TX_COMP_THRESH) - 1) : 1),
NULL, NULL, 0, &attr.cq);
if (tmpl.cq == NULL) {
ret = ENOMEM;
@@ -452,6 +450,13 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
return -E_RTE_SECONDARY;

priv_lock(priv);
+ if (desc <= MLX5_TX_COMP_THRESH) {
+ WARN("%p: number of descriptors requested for TX queue %u"
+ " must be higher than MLX5_TX_COMP_THRESH, using"
+ " %u instead of %u",
+ (void *)dev, idx, MLX5_TX_COMP_THRESH + 1, desc);
+ desc = MLX5_TX_COMP_THRESH + 1;
+ }
if (!rte_is_power_of_2(desc)) {
desc = 1 << log2above(desc);
WARN("%p: increased number of descriptors in TX queue %u"
--
2.1.4
Nelio Laranjeiro
2016-06-21 07:23:30 UTC
Permalink
From: Yaacov Hazan <***@mellanox.com>

Implement the send inline feature, which copies packet data directly into
WQEs for improved latency. The maximum packet size and the minimum number
of Tx queues required to qualify for inline send are user-configurable.

This feature is effective when HW causes a performance bottleneck.
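
A hedged sketch of the decision this feature adds to the data path (inline
short packets into the descriptor, otherwise post a pointer), using purely
illustrative names and a fake descriptor layout; the real WQE format and
size computation are more involved:

#include <stdint.h>
#include <string.h>

/* Illustrative stand-in for a send descriptor; not the real WQE layout. */
struct fake_wqe {
	uint8_t  inl_data[64];  /* room for inlined payload */
	uint64_t addr;          /* buffer address when not inlined */
	uint32_t lkey;          /* memory region key when not inlined */
	uint32_t len;           /* packet length */
};

void
post_packet(struct fake_wqe *wqe, const void *data, uint32_t len,
	    uint32_t lkey, uint32_t max_inline)
{
	if (len <= max_inline && len <= sizeof(wqe->inl_data)) {
		/* Copy the payload into the descriptor itself: the NIC does
		 * not have to DMA-read the buffer afterwards. */
		memcpy(wqe->inl_data, data, len);
	} else {
		/* Too large to inline: reference the buffer by address and
		 * memory region key as usual. */
		wqe->addr = (uintptr_t)data;
		wqe->lkey = lkey;
	}
	wqe->len = len;
}

In the patch, txq_inline sets the size threshold (max_inline above) and
txqs_min_inline gates whether the inline burst function is selected at all
in priv_select_tx_function().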

Signed-off-by: Yaacov Hazan <***@mellanox.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Nelio Laranjeiro <***@6wind.com>
---
doc/guides/nics/mlx5.rst | 17 +++
drivers/net/mlx5/mlx5.c | 13 ++
drivers/net/mlx5/mlx5.h | 2 +
drivers/net/mlx5/mlx5_ethdev.c | 5 +
drivers/net/mlx5/mlx5_rxtx.c | 271 +++++++++++++++++++++++++++++++++++++++++
drivers/net/mlx5/mlx5_rxtx.h | 2 +
drivers/net/mlx5/mlx5_txq.c | 4 +
7 files changed, 314 insertions(+)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 756153b..9ada221 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -154,6 +154,23 @@ Run-time configuration
saves PCI bandwidth and improves performance at the cost of slightly
higher CPU usage. Enabled by default.

+- ``txq_inline`` parameter [int]
+
+ Amount of data to be inlined during TX operations. Improves latency.
+ Can improve PPS performance when PCI back pressure is detected and may be
+ useful for scenarios involving heavy traffic on many queues.
+
+ It is not enabled by default (set to 0) since the additional software
+ logic necessary to handle this mode can lower performance when back
+ pressure is not expected.
+
+- ``txqs_min_inline`` parameter [int]
+
+ Enable inline send only when the number of TX queues is greater than or
+ equal to this value.
+
+ This option should be used in combination with ``txq_inline`` above.
+
Prerequisites
-------------

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 7e8c579..8c8c5e4 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -72,6 +72,13 @@
/* Device parameter to enable RX completion queue compression. */
#define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"

+/* Device parameter to configure inline send. */
+#define MLX5_TXQ_INLINE "txq_inline"
+
+/* Device parameter to configure the number of TX queues threshold for
+ * enabling inline send. */
+#define MLX5_TXQS_MIN_INLINE "txqs_min_inline"
+
/**
* Retrieve integer value from environment variable.
*
@@ -269,6 +276,10 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
}
if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0)
priv->cqe_comp = !!tmp;
+ else if (strcmp(MLX5_TXQ_INLINE, key) == 0)
+ priv->txq_inline = tmp;
+ else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0)
+ priv->txqs_inline = tmp;
else {
WARN("%s: unknown parameter", key);
return EINVAL;
@@ -292,6 +303,8 @@ mlx5_args(struct priv *priv, struct rte_devargs *devargs)
{
static const char *params[] = {
MLX5_RXQ_CQE_COMP_EN,
+ MLX5_TXQ_INLINE,
+ MLX5_TXQS_MIN_INLINE,
};
struct rte_kvargs *kvlist;
int ret = 0;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 8f5a6df..3a86609 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -113,6 +113,8 @@ struct priv {
unsigned int mps:1; /* Whether multi-packet send is supported. */
unsigned int cqe_comp:1; /* Whether CQE compression is enabled. */
unsigned int pending_alarm:1; /* An alarm is pending. */
+ unsigned int txq_inline; /* Maximum packet size for inlining. */
+ unsigned int txqs_inline; /* Queue number threshold for inlining. */
/* RX/TX queues. */
unsigned int rxqs_n; /* RX queues array size. */
unsigned int txqs_n; /* TX queues array size. */
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 4e125a7..a2bdc56 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1317,6 +1317,11 @@ void
priv_select_tx_function(struct priv *priv)
{
priv->dev->tx_pkt_burst = mlx5_tx_burst;
+ if (priv->txq_inline && (priv->txqs_n >= priv->txqs_inline)) {
+ priv->dev->tx_pkt_burst = mlx5_tx_burst_inline;
+ DEBUG("selected inline TX function (%u >= %u queues)",
+ priv->txqs_n, priv->txqs_inline);
+ }
}

/**
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index d56c9e9..43fe532 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -374,6 +374,139 @@ mlx5_wqe_write_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
}

/**
+ * Write an inline WQE.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param wqe
+ * Pointer to the WQE to fill.
+ * @param addr
+ * Buffer data address.
+ * @param length
+ * Packet length.
+ * @param lkey
+ * Memory region lkey.
+ */
+static inline void
+mlx5_wqe_write_inline(struct txq *txq, volatile union mlx5_wqe *wqe,
+ uintptr_t addr, uint32_t length)
+{
+ uint32_t size;
+ uint16_t wqe_cnt = txq->wqe_n - 1;
+ uint16_t wqe_ci = txq->wqe_ci + 1;
+
+ /* Copy the first 16 bytes into inline header. */
+ rte_memcpy((void *)(uintptr_t)wqe->inl.eseg.inline_hdr_start,
+ (void *)(uintptr_t)addr,
+ MLX5_ETH_INLINE_HEADER_SIZE);
+ addr += MLX5_ETH_INLINE_HEADER_SIZE;
+ length -= MLX5_ETH_INLINE_HEADER_SIZE;
+ size = 3 + ((4 + length + 15) / 16);
+ wqe->inl.byte_cnt = htonl(length | MLX5_INLINE_SEG);
+ rte_memcpy((void *)(uintptr_t)&wqe->inl.data[0],
+ (void *)addr, MLX5_WQE64_INL_DATA);
+ addr += MLX5_WQE64_INL_DATA;
+ length -= MLX5_WQE64_INL_DATA;
+ while (length) {
+ volatile union mlx5_wqe *wqe_next =
+ &(*txq->wqes)[wqe_ci & wqe_cnt];
+ uint32_t copy_bytes = (length > sizeof(*wqe)) ?
+ sizeof(*wqe) :
+ length;
+
+ rte_mov64((uint8_t *)(uintptr_t)&wqe_next->data[0],
+ (uint8_t *)addr);
+ addr += copy_bytes;
+ length -= copy_bytes;
+ ++wqe_ci;
+ }
+ assert(size < 64);
+ wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+ wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size);
+ wqe->inl.ctrl.data[3] = 0;
+ wqe->inl.eseg.rsvd0 = 0;
+ wqe->inl.eseg.rsvd1 = 0;
+ wqe->inl.eseg.mss = 0;
+ wqe->inl.eseg.rsvd2 = 0;
+ wqe->inl.eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE);
+ /* Increment consumer index. */
+ txq->wqe_ci = wqe_ci;
+}
+
+/**
+ * Write an inline WQE with VLAN.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param wqe
+ * Pointer to the WQE to fill.
+ * @param addr
+ * Buffer data address.
+ * @param length
+ * Packet length.
+ * @param lkey
+ * Memory region lkey.
+ * @param vlan_tci
+ * VLAN field to insert in packet.
+ */
+static inline void
+mlx5_wqe_write_inline_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
+ uintptr_t addr, uint32_t length, uint16_t vlan_tci)
+{
+ uint32_t size;
+ uint32_t wqe_cnt = txq->wqe_n - 1;
+ uint16_t wqe_ci = txq->wqe_ci + 1;
+ uint32_t vlan = htonl(0x81000000 | vlan_tci);
+
+ /*
+ * Copy 12 bytes of source & destination MAC address.
+ * Copy 4 bytes of VLAN.
+ * Copy 2 bytes of Ether type.
+ */
+ rte_memcpy((uint8_t *)(uintptr_t)wqe->inl.eseg.inline_hdr_start,
+ (uint8_t *)addr, 12);
+ rte_memcpy((uint8_t *)(uintptr_t)wqe->inl.eseg.inline_hdr_start + 12,
+ &vlan, sizeof(vlan));
+ rte_memcpy((uint8_t *)(uintptr_t)wqe->inl.eseg.inline_hdr_start + 16,
+ ((uint8_t *)addr + 12), 2);
+ addr += MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
+ length -= MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
+ size = (sizeof(wqe->inl.ctrl.ctrl) +
+ sizeof(wqe->inl.eseg) +
+ sizeof(wqe->inl.byte_cnt) +
+ length + 15) / 16;
+ wqe->inl.byte_cnt = htonl(length | MLX5_INLINE_SEG);
+ rte_memcpy((void *)(uintptr_t)&wqe->inl.data[0],
+ (void *)addr, MLX5_WQE64_INL_DATA);
+ addr += MLX5_WQE64_INL_DATA;
+ length -= MLX5_WQE64_INL_DATA;
+ while (length) {
+ volatile union mlx5_wqe *wqe_next =
+ &(*txq->wqes)[wqe_ci & wqe_cnt];
+ uint32_t copy_bytes = (length > sizeof(*wqe)) ?
+ sizeof(*wqe) :
+ length;
+
+ rte_mov64((uint8_t *)(uintptr_t)&wqe_next->data[0],
+ (uint8_t *)addr);
+ addr += copy_bytes;
+ length -= copy_bytes;
+ ++wqe_ci;
+ }
+ assert(size < 64);
+ wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+ wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size);
+ wqe->inl.ctrl.data[3] = 0;
+ wqe->inl.eseg.rsvd0 = 0;
+ wqe->inl.eseg.rsvd1 = 0;
+ wqe->inl.eseg.mss = 0;
+ wqe->inl.eseg.rsvd2 = 0;
+ wqe->inl.eseg.inline_hdr_sz = htons(MLX5_ETH_VLAN_INLINE_HEADER_SIZE);
+ /* Increment consumer index. */
+ txq->wqe_ci = wqe_ci;
+}
+
+/**
* Ring TX queue doorbell.
*
* @param txq
@@ -415,6 +548,23 @@ tx_prefetch_cqe(struct txq *txq, uint16_t ci)
}

/**
+ * Prefetch a WQE.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param ci
+ * WQE consumer index.
+ */
+static inline void
+tx_prefetch_wqe(struct txq *txq, uint16_t ci)
+{
+ volatile union mlx5_wqe *wqe;
+
+ wqe = &(*txq->wqes)[ci & (txq->wqe_n - 1)];
+ rte_prefetch0(wqe);
+}
+
+/**
* DPDK callback for TX.
*
* @param dpdk_txq
@@ -525,6 +675,127 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
}

/**
+ * DPDK callback for TX with inline support.
+ *
+ * @param dpdk_txq
+ * Generic pointer to TX queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ *
+ * @return
+ * Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ struct txq *txq = (struct txq *)dpdk_txq;
+ uint16_t elts_head = txq->elts_head;
+ const unsigned int elts_n = txq->elts_n;
+ unsigned int i;
+ unsigned int max;
+ unsigned int comp;
+ volatile union mlx5_wqe *wqe;
+ struct rte_mbuf *buf;
+ unsigned int max_inline = txq->max_inline;
+
+ if (unlikely(!pkts_n))
+ return 0;
+ buf = pkts[0];
+ /* Prefetch first packet cacheline. */
+ tx_prefetch_cqe(txq, txq->cq_ci);
+ tx_prefetch_cqe(txq, txq->cq_ci + 1);
+ rte_prefetch0(buf);
+ /* Start processing. */
+ txq_complete(txq);
+ max = (elts_n - (elts_head - txq->elts_tail));
+ if (max > elts_n)
+ max -= elts_n;
+ assert(max >= 1);
+ assert(max <= elts_n);
+ /* Always leave one free entry in the ring. */
+ --max;
+ if (max == 0)
+ return 0;
+ if (max > pkts_n)
+ max = pkts_n;
+ for (i = 0; (i != max); ++i) {
+ unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+ uintptr_t addr;
+ uint32_t length;
+ uint32_t lkey;
+
+ wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
+ tx_prefetch_wqe(txq, txq->wqe_ci);
+ tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+ if (i + 1 < max)
+ rte_prefetch0(pkts[i + 1]);
+ /* Should we enable HW CKSUM offload */
+ if (buf->ol_flags &
+ (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
+ wqe->inl.eseg.cs_flags =
+ MLX5_ETH_WQE_L3_CSUM |
+ MLX5_ETH_WQE_L4_CSUM;
+ } else
+ wqe->inl.eseg.cs_flags = 0;
+ /* Retrieve buffer information. */
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ length = DATA_LEN(buf);
+ /* Update element. */
+ (*txq->elts)[elts_head] = buf;
+ /* Prefetch next buffer data. */
+ if (i + 1 < max)
+ rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 1],
+ volatile void *));
+ if (length <= max_inline) {
+ if (buf->ol_flags & PKT_TX_VLAN_PKT)
+ mlx5_wqe_write_inline_vlan(txq, wqe,
+ addr, length,
+ buf->vlan_tci);
+ else
+ mlx5_wqe_write_inline(txq, wqe, addr, length);
+ } else {
+ /* Retrieve Memory Region key for this memory pool. */
+ lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+ if (buf->ol_flags & PKT_TX_VLAN_PKT)
+ mlx5_wqe_write_vlan(txq, wqe, addr, length,
+ lkey, buf->vlan_tci);
+ else
+ mlx5_wqe_write(txq, wqe, addr, length, lkey);
+ }
+ wqe->inl.ctrl.data[2] = 0;
+ elts_head = elts_head_next;
+ buf = pkts[i + 1];
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment sent bytes counter. */
+ txq->stats.obytes += length;
+#endif
+ }
+ /* Take a shortcut if nothing must be sent. */
+ if (unlikely(i == 0))
+ return 0;
+ /* Check whether completion threshold has been reached. */
+ comp = txq->elts_comp + i;
+ if (comp >= MLX5_TX_COMP_THRESH) {
+ /* Request completion on last WQE. */
+ wqe->inl.ctrl.data[2] = htonl(8);
+ /* Save elts_head in unused "immediate" field of WQE. */
+ wqe->inl.ctrl.data[3] = elts_head;
+ txq->elts_comp = 0;
+ } else
+ txq->elts_comp = comp;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment sent packets counter. */
+ txq->stats.opackets += i;
+#endif
+ /* Ring QP doorbell. */
+ mlx5_tx_dbrec(txq);
+ txq->elts_head = elts_head;
+ return i;
+}
+
+/**
* Translate RX completion flags to packet type.
*
* @param[in] cqe
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index f900e65..3c83148 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -246,6 +246,7 @@ struct txq {
uint16_t wqe_n; /* Number of WQ elements. */
uint16_t bf_offset; /* Blueflame offset. */
uint16_t bf_buf_size; /* Blueflame size. */
+ uint16_t max_inline; /* Maximum size to inline in a WQE. */
uint32_t qp_num_8s; /* QP number shifted by 8. */
volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
volatile union mlx5_wqe (*wqes)[]; /* Work queue. */
@@ -310,6 +311,7 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
/* mlx5_rxtx.c */

uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
+uint16_t mlx5_tx_burst_inline(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 7b2dc7c..6a4a96e 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -332,6 +332,10 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
IBV_EXP_QP_INIT_ATTR_RES_DOMAIN),
};
+ if (priv->txq_inline && priv->txqs_n >= priv->txqs_inline) {
+ tmpl.txq.max_inline = priv->txq_inline;
+ attr.init.cap.max_inline_data = tmpl.txq.max_inline;
+ }
tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
if (tmpl.qp == NULL) {
ret = (errno ? errno : EINVAL);
--
2.1.4
Nelio Laranjeiro
2016-06-21 07:23:31 UTC
Permalink
This feature enables the TX burst function to emit up to 5 packets using
only two WQEs on devices that support it. This saves PCI bandwidth and
improves performance.
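
A hedged sketch of the grouping rule, with illustrative names only (the real
session also records per-segment descriptor pointers and WQE indexes):
packets are appended to the current session as long as they share the same
length and checksum flags, and a session holds at most five data segments.

#include <stdint.h>
#include <stdbool.h>

#define MPW_DSEG_MAX 5

struct mpw_session {
	bool     open;
	uint32_t len;       /* common packet length for this session */
	uint8_t  cs_flags;  /* common checksum flags */
	unsigned int pkts;  /* data segments used so far */
};

void
mpw_append(struct mpw_session *mpw, uint32_t len, uint8_t cs_flags)
{
	/* Close the session when the next packet no longer matches it. */
	if (mpw->open && (mpw->len != len || mpw->cs_flags != cs_flags))
		mpw->open = false;
	if (!mpw->open) {
		/* Open a new session keyed on this packet's length/flags. */
		*mpw = (struct mpw_session){ .open = true, .len = len,
					     .cs_flags = cs_flags };
	}
	++mpw->pkts;                      /* record one more data segment */
	if (mpw->pkts == MPW_DSEG_MAX)
		mpw->open = false;        /* session is full, close it */
}

In the patch itself this bookkeeping is done by mlx5_mpw_new() and
mlx5_mpw_close() directly on the WQE ring.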

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Olga Shern <***@mellanox.com>
---
doc/guides/nics/mlx5.rst | 10 ++
drivers/net/mlx5/mlx5.c | 14 +-
drivers/net/mlx5/mlx5_ethdev.c | 15 +-
drivers/net/mlx5/mlx5_rxtx.c | 400 +++++++++++++++++++++++++++++++++++++++++
drivers/net/mlx5/mlx5_rxtx.h | 2 +
drivers/net/mlx5/mlx5_txq.c | 2 +-
6 files changed, 439 insertions(+), 4 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 9ada221..063c4a5 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -171,6 +171,16 @@ Run-time configuration

This option should be used in combination with ``txq_inline`` above.

+- ``txq_mpw_en`` parameter [int]
+
+ A nonzero value enables multi-packet send. This feature allows the TX
+ burst function to pack up to five packets in two descriptors in order to
+ save PCI bandwidth and improve performance at the cost of slightly
+ higher CPU usage.
+
+ It is currently only supported on the ConnectX-4 Lx family of adapters.
+ Enabled by default.
+
Prerequisites
-------------

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 8c8c5e4..b85030a 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -79,6 +79,9 @@
* enabling inline send. */
#define MLX5_TXQS_MIN_INLINE "txqs_min_inline"

+/* Device parameter to enable multi-packet send WQEs. */
+#define MLX5_TXQ_MPW_EN "txq_mpw_en"
+
/**
* Retrieve integer value from environment variable.
*
@@ -280,6 +283,8 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
priv->txq_inline = tmp;
else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0)
priv->txqs_inline = tmp;
+ else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0)
+ priv->mps = !!tmp;
else {
WARN("%s: unknown parameter", key);
return EINVAL;
@@ -305,6 +310,7 @@ mlx5_args(struct priv *priv, struct rte_devargs *devargs)
MLX5_RXQ_CQE_COMP_EN,
MLX5_TXQ_INLINE,
MLX5_TXQS_MIN_INLINE,
+ MLX5_TXQ_MPW_EN,
};
struct rte_kvargs *kvlist;
int ret = 0;
@@ -499,6 +505,7 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
priv->port = port;
priv->pd = pd;
priv->mtu = ETHER_MTU;
+ priv->mps = mps; /* Enable MPW by default if supported. */
priv->cqe_comp = 1; /* Enable compression by default. */
err = mlx5_args(priv, pci_dev->devargs);
if (err) {
@@ -547,7 +554,12 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)

priv_get_num_vfs(priv, &num_vfs);
priv->sriov = (num_vfs || sriov);
- priv->mps = mps;
+ if (priv->mps && !mps) {
+ ERROR("multi-packet send not supported on this device"
+ " (" MLX5_TXQ_MPW_EN ")");
+ err = ENOTSUP;
+ goto port_error;
+ }
/* Allocate and register default RSS hash keys. */
priv->rss_conf = rte_calloc(__func__, hash_rxq_init_n,
sizeof((*priv->rss_conf)[0]), 0);
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index a2bdc56..69bfe03 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -584,7 +584,8 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
DEV_RX_OFFLOAD_UDP_CKSUM |
DEV_RX_OFFLOAD_TCP_CKSUM) :
0);
- info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT;
+ if (!priv->mps)
+ info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT;
if (priv->hw_csum)
info->tx_offload_capa |=
(DEV_TX_OFFLOAD_IPV4_CKSUM |
@@ -1317,7 +1318,17 @@ void
priv_select_tx_function(struct priv *priv)
{
priv->dev->tx_pkt_burst = mlx5_tx_burst;
- if (priv->txq_inline && (priv->txqs_n >= priv->txqs_inline)) {
+ /* Display warning for unsupported configurations. */
+ if (priv->sriov && priv->mps)
+ WARN("multi-packet send WQE cannot be used on a SR-IOV setup");
+ /* Select appropriate TX function. */
+ if ((priv->sriov == 0) && priv->mps && priv->txq_inline) {
+ priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw_inline;
+ DEBUG("selected MPW inline TX function");
+ } else if ((priv->sriov == 0) && priv->mps) {
+ priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw;
+ DEBUG("selected MPW TX function");
+ } else if (priv->txq_inline && (priv->txqs_n >= priv->txqs_inline)) {
priv->dev->tx_pkt_burst = mlx5_tx_burst_inline;
DEBUG("selected inline TX function (%u >= %u queues)",
priv->txqs_n, priv->txqs_inline);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 43fe532..2ee504d 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -796,6 +796,406 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
}

/**
+ * Open a MPW session.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param mpw
+ * Pointer to MPW session structure.
+ * @param length
+ * Packet length.
+ */
+static inline void
+mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
+{
+ uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
+ volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
+ (volatile struct mlx5_wqe_data_seg (*)[])
+ (uintptr_t)&(*txq->wqes)[(idx + 1) & (txq->wqe_n - 1)];
+
+ mpw->state = MLX5_MPW_STATE_OPENED;
+ mpw->pkts_n = 0;
+ mpw->len = length;
+ mpw->total_len = 0;
+ mpw->wqe = &(*txq->wqes)[idx];
+ mpw->wqe->mpw.eseg.mss = htons(length);
+ mpw->wqe->mpw.eseg.inline_hdr_sz = 0;
+ mpw->wqe->mpw.eseg.rsvd0 = 0;
+ mpw->wqe->mpw.eseg.rsvd1 = 0;
+ mpw->wqe->mpw.eseg.rsvd2 = 0;
+ mpw->wqe->mpw.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
+ (txq->wqe_ci << 8) |
+ MLX5_OPCODE_LSO_MPW);
+ mpw->wqe->mpw.ctrl.data[2] = 0;
+ mpw->wqe->mpw.ctrl.data[3] = 0;
+ mpw->data.dseg[0] = &mpw->wqe->mpw.dseg[0];
+ mpw->data.dseg[1] = &mpw->wqe->mpw.dseg[1];
+ mpw->data.dseg[2] = &(*dseg)[0];
+ mpw->data.dseg[3] = &(*dseg)[1];
+ mpw->data.dseg[4] = &(*dseg)[2];
+}
+
+/**
+ * Close a MPW session.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param mpw
+ * Pointer to MPW session structure.
+ */
+static inline void
+mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
+{
+ unsigned int num = mpw->pkts_n;
+
+ /* Store size in multiple of 16 bytes. Control and Ethernet segments
+ * count as 2. */
+ mpw->wqe->mpw.ctrl.data[1] = htonl(txq->qp_num_8s | (2 + num));
+ mpw->state = MLX5_MPW_STATE_CLOSED;
+ if (num < 3)
+ ++txq->wqe_ci;
+ else
+ txq->wqe_ci += 2;
+ tx_prefetch_wqe(txq, txq->wqe_ci);
+ tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+}
+
+/**
+ * DPDK callback for TX with MPW support.
+ *
+ * @param dpdk_txq
+ * Generic pointer to TX queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ *
+ * @return
+ * Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ struct txq *txq = (struct txq *)dpdk_txq;
+ uint16_t elts_head = txq->elts_head;
+ const unsigned int elts_n = txq->elts_n;
+ unsigned int i;
+ unsigned int max;
+ unsigned int comp;
+ struct mlx5_mpw mpw = {
+ .state = MLX5_MPW_STATE_CLOSED,
+ };
+
+ /* Prefetch first packet cacheline. */
+ tx_prefetch_cqe(txq, txq->cq_ci);
+ tx_prefetch_wqe(txq, txq->wqe_ci);
+ tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+ /* Start processing. */
+ txq_complete(txq);
+ max = (elts_n - (elts_head - txq->elts_tail));
+ if (max > elts_n)
+ max -= elts_n;
+ assert(max >= 1);
+ assert(max <= elts_n);
+ /* Always leave one free entry in the ring. */
+ --max;
+ if (max == 0)
+ return 0;
+ if (max > pkts_n)
+ max = pkts_n;
+ for (i = 0; (i != max); ++i) {
+ struct rte_mbuf *buf = pkts[i];
+ volatile struct mlx5_wqe_data_seg *dseg;
+ unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+ uintptr_t addr;
+ uint32_t length;
+ uint32_t cs_flags = 0;
+
+ /* Should we enable HW CKSUM offload */
+ if (buf->ol_flags &
+ (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
+ cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
+ /* Retrieve buffer information. */
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ length = DATA_LEN(buf);
+ /* Update element. */
+ (*txq->elts)[elts_head] = buf;
+ /* Start new session if packet differs. */
+ if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
+ ((mpw.len != length) ||
+ (mpw.wqe->mpw.eseg.cs_flags != cs_flags)))
+ mlx5_mpw_close(txq, &mpw);
+ if (mpw.state == MLX5_MPW_STATE_CLOSED) {
+ mlx5_mpw_new(txq, &mpw, length);
+ mpw.wqe->mpw.eseg.cs_flags = cs_flags;
+ }
+ dseg = mpw.data.dseg[mpw.pkts_n];
+ *dseg = (struct mlx5_wqe_data_seg){
+ .byte_count = htonl(length),
+ .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+ .addr = htonll(addr),
+ };
+ ++mpw.pkts_n;
+ if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
+ mlx5_mpw_close(txq, &mpw);
+ elts_head = elts_head_next;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment sent bytes counter. */
+ txq->stats.obytes += length;
+#endif
+ }
+ /* Take a shortcut if nothing must be sent. */
+ if (unlikely(i == 0))
+ return 0;
+ /* Check whether completion threshold has been reached. */
+ comp = txq->elts_comp + i;
+ if (comp >= MLX5_TX_COMP_THRESH) {
+ volatile union mlx5_wqe *wqe = mpw.wqe;
+
+ /* Request completion on last WQE. */
+ wqe->mpw.ctrl.data[2] = htonl(8);
+ /* Save elts_head in unused "immediate" field of WQE. */
+ wqe->mpw.ctrl.data[3] = elts_head;
+ txq->elts_comp = 0;
+ } else
+ txq->elts_comp = comp;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment sent packets counter. */
+ txq->stats.opackets += i;
+#endif
+ /* Ring QP doorbell. */
+ if (mpw.state == MLX5_MPW_STATE_OPENED)
+ mlx5_mpw_close(txq, &mpw);
+ mlx5_tx_dbrec(txq);
+ txq->elts_head = elts_head;
+ return i;
+}
+
+/**
+ * Open a MPW inline session.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param mpw
+ * Pointer to MPW session structure.
+ * @param length
+ * Packet length.
+ */
+static inline void
+mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
+{
+ uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
+
+ mpw->state = MLX5_MPW_INL_STATE_OPENED;
+ mpw->pkts_n = 0;
+ mpw->len = length;
+ mpw->total_len = 0;
+ mpw->wqe = &(*txq->wqes)[idx];
+ mpw->wqe->mpw_inl.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
+ (txq->wqe_ci << 8) |
+ MLX5_OPCODE_LSO_MPW);
+ mpw->wqe->mpw_inl.ctrl.data[2] = 0;
+ mpw->wqe->mpw_inl.ctrl.data[3] = 0;
+ mpw->wqe->mpw_inl.eseg.mss = htons(length);
+ mpw->wqe->mpw_inl.eseg.inline_hdr_sz = 0;
+ mpw->wqe->mpw_inl.eseg.cs_flags = 0;
+ mpw->wqe->mpw_inl.eseg.rsvd0 = 0;
+ mpw->wqe->mpw_inl.eseg.rsvd1 = 0;
+ mpw->wqe->mpw_inl.eseg.rsvd2 = 0;
+ mpw->data.raw = &mpw->wqe->mpw_inl.data[0];
+}
+
+/**
+ * Close a MPW inline session.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param mpw
+ * Pointer to MPW session structure.
+ */
+static inline void
+mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw)
+{
+ unsigned int size;
+
+ size = sizeof(*mpw->wqe) - MLX5_MWQE64_INL_DATA + mpw->total_len;
+ /* Store size in multiple of 16 bytes. Control and Ethernet segments
+ * count as 2. */
+ mpw->wqe->mpw_inl.ctrl.data[1] =
+ htonl(txq->qp_num_8s | ((size + 15) / 16));
+ mpw->state = MLX5_MPW_STATE_CLOSED;
+ mpw->wqe->mpw_inl.byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
+ txq->wqe_ci += (size + (sizeof(*mpw->wqe) - 1)) / sizeof(*mpw->wqe);
+}
+
+/**
+ * DPDK callback for TX with MPW inline support.
+ *
+ * @param dpdk_txq
+ * Generic pointer to TX queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ *
+ * @return
+ * Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
+ uint16_t pkts_n)
+{
+ struct txq *txq = (struct txq *)dpdk_txq;
+ uint16_t elts_head = txq->elts_head;
+ const unsigned int elts_n = txq->elts_n;
+ unsigned int i;
+ unsigned int max;
+ unsigned int comp;
+ unsigned int inline_room = txq->max_inline;
+ struct mlx5_mpw mpw = {
+ .state = MLX5_MPW_STATE_CLOSED,
+ };
+
+ /* Prefetch first packet cacheline. */
+ tx_prefetch_cqe(txq, txq->cq_ci);
+ tx_prefetch_wqe(txq, txq->wqe_ci);
+ tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+ /* Start processing. */
+ txq_complete(txq);
+ max = (elts_n - (elts_head - txq->elts_tail));
+ if (max > elts_n)
+ max -= elts_n;
+ assert(max >= 1);
+ assert(max <= elts_n);
+ /* Always leave one free entry in the ring. */
+ --max;
+ if (max == 0)
+ return 0;
+ if (max > pkts_n)
+ max = pkts_n;
+ for (i = 0; (i != max); ++i) {
+ struct rte_mbuf *buf = pkts[i];
+ unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+ uintptr_t addr;
+ uint32_t length;
+ uint32_t cs_flags = 0;
+
+ /* Should we enable HW CKSUM offload */
+ if (buf->ol_flags &
+ (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
+ cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
+ /* Retrieve buffer information. */
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ length = DATA_LEN(buf);
+ /* Update element. */
+ (*txq->elts)[elts_head] = buf;
+ /* Start new session if packet differs. */
+ if (mpw.state == MLX5_MPW_STATE_OPENED) {
+ if ((mpw.len != length) ||
+ (mpw.wqe->mpw.eseg.cs_flags != cs_flags))
+ mlx5_mpw_close(txq, &mpw);
+ } else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
+ if ((mpw.len != length) ||
+ (length > inline_room) ||
+ (mpw.wqe->mpw_inl.eseg.cs_flags != cs_flags)) {
+ mlx5_mpw_inline_close(txq, &mpw);
+ inline_room = txq->max_inline;
+ }
+ }
+ if (mpw.state == MLX5_MPW_STATE_CLOSED) {
+ if (length > inline_room) {
+ mlx5_mpw_new(txq, &mpw, length);
+ mpw.wqe->mpw.eseg.cs_flags = cs_flags;
+ } else {
+ mlx5_mpw_inline_new(txq, &mpw, length);
+ mpw.wqe->mpw_inl.eseg.cs_flags = cs_flags;
+ }
+ }
+ if (mpw.state == MLX5_MPW_STATE_OPENED) {
+ volatile struct mlx5_wqe_data_seg *dseg;
+
+ assert(inline_room == txq->max_inline);
+ dseg = mpw.data.dseg[mpw.pkts_n];
+ *dseg = (struct mlx5_wqe_data_seg){
+ .byte_count = htonl(length),
+ .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+ .addr = htonll(addr),
+ };
+ ++mpw.pkts_n;
+ if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
+ mlx5_mpw_close(txq, &mpw);
+ } else {
+ unsigned int max;
+
+ assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
+ assert(length <= inline_room);
+ /* Maximum number of bytes before wrapping. */
+ max = ((uintptr_t)&(*txq->wqes)[txq->wqe_n] -
+ (uintptr_t)mpw.data.raw);
+ if (length > max) {
+ rte_memcpy((void *)(uintptr_t)mpw.data.raw,
+ (void *)addr,
+ max);
+ mpw.data.raw =
+ (volatile void *)&(*txq->wqes)[0];
+ rte_memcpy((void *)(uintptr_t)mpw.data.raw,
+ (void *)(addr + max),
+ length - max);
+ mpw.data.raw += length - max;
+ } else {
+ rte_memcpy((void *)(uintptr_t)mpw.data.raw,
+ (void *)addr,
+ length);
+ mpw.data.raw += length;
+ }
+ if ((uintptr_t)mpw.data.raw ==
+ (uintptr_t)&(*txq->wqes)[txq->wqe_n])
+ mpw.data.raw =
+ (volatile void *)&(*txq->wqes)[0];
+ ++mpw.pkts_n;
+ if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
+ mlx5_mpw_inline_close(txq, &mpw);
+ inline_room = txq->max_inline;
+ } else
+ inline_room -= length;
+ }
+ mpw.total_len += length;
+ elts_head = elts_head_next;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment sent bytes counter. */
+ txq->stats.obytes += length;
+#endif
+ }
+ /* Take a shortcut if nothing must be sent. */
+ if (unlikely(i == 0))
+ return 0;
+ /* Check whether completion threshold has been reached. */
+ comp = txq->elts_comp + i;
+ if (comp >= MLX5_TX_COMP_THRESH) {
+ volatile union mlx5_wqe *wqe = mpw.wqe;
+
+ /* Request completion on last WQE. */
+ wqe->mpw_inl.ctrl.data[2] = htonl(8);
+ /* Save elts_head in unused "immediate" field of WQE. */
+ wqe->mpw_inl.ctrl.data[3] = elts_head;
+ txq->elts_comp = 0;
+ } else
+ txq->elts_comp = comp;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment sent packets counter. */
+ txq->stats.opackets += i;
+#endif
+ /* Ring QP doorbell. */
+ if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
+ mlx5_mpw_inline_close(txq, &mpw);
+ else if (mpw.state == MLX5_MPW_STATE_OPENED)
+ mlx5_mpw_close(txq, &mpw);
+ mlx5_tx_dbrec(txq);
+ txq->elts_head = elts_head;
+ return i;
+}
+
+/**
* Translate RX completion flags to packet type.
*
* @param[in] cqe
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 3c83148..41605f9 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -312,6 +312,8 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);

uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_tx_burst_inline(void *, struct rte_mbuf **, uint16_t);
+uint16_t mlx5_tx_burst_mpw(void *, struct rte_mbuf **, uint16_t);
+uint16_t mlx5_tx_burst_mpw_inline(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 6a4a96e..4f17fb0 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -398,7 +398,7 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
.obj = tmpl.qp,
/* Enable multi-packet send if supported. */
.family_flags =
- (priv->mps ?
+ ((priv->mps && !priv->sriov) ?
IBV_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR :
0),
};
--
2.1.4
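
For readers following the WQE accounting in mlx5_mpw_new() and
mlx5_mpw_close() above, here is a minimal illustrative helper (not part
of the patch) that mirrors the arithmetic: the control and Ethernet
segments together count as two 16-byte units, each packet adds one data
segment, the first 64-byte WQE slot has room for two data segments
(dseg[0..1]) and up to three more spill into the following slot
(dseg[2..4]), hence MLX5_MPW_DSEG_MAX is 5.

/* Illustrative only; mirrors the accounting done by mlx5_mpw_close(). */
static void
mpw_accounting(unsigned int pkts_n, unsigned int *ds, unsigned int *wqe_slots)
{
        /* 16-byte units reported in ctrl.data[1]: control + Ethernet
         * segments count as 2, plus one data segment per packet. */
        *ds = 2 + pkts_n;
        /* Up to two data segments fit in the first WQE slot, the rest
         * spill into the next one. */
        *wqe_slots = (pkts_n < 3) ? 1 : 2;
}
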
Nelio Laranjeiro
2016-06-21 07:23:32 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_txq.c | 5 +++++
1 file changed, 5 insertions(+)

diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 4f17fb0..bae9f3d 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -343,6 +343,11 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
(void *)dev, strerror(ret));
goto error;
}
+ DEBUG("TX queue capabilities: max_send_wr=%u, max_send_sge=%u,"
+ " max_inline_data=%u",
+ attr.init.cap.max_send_wr,
+ attr.init.cap.max_send_sge,
+ attr.init.cap.max_inline_data);
attr.mod = (struct ibv_exp_qp_attr){
/* Move the QP to this state. */
.qp_state = IBV_QPS_INIT,
--
2.1.4
Nelio Laranjeiro
2016-06-21 07:23:33 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

The space necessary to store segmented packets cannot be known in advance
and must be verified for each of them.
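
A minimal sketch of the check added to each TX burst loop, assuming a
ring of elts_n entries where one entry must always remain unused to tell
a full ring from an empty one; the helper name is illustrative only.

#include <stdbool.h>

/* "free_entries" is the number of unused ring entries; a packet that
 * needs "entries_needed" of them may only proceed if one spare entry
 * remains afterwards. */
static inline bool
txq_has_room(unsigned int free_entries, unsigned int entries_needed)
{
        return free_entries >= entries_needed + 1;
}

In this patch entries_needed is always 1 since packets are still
single-segment; a later commit in the series generalizes it to the
number of mbuf segments.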

Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_rxtx.c | 136 ++++++++++++++++++++++---------------------
1 file changed, 70 insertions(+), 66 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 2ee504d..7097713 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -583,50 +583,49 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
struct txq *txq = (struct txq *)dpdk_txq;
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
- unsigned int i;
+ unsigned int i = 0;
unsigned int max;
unsigned int comp;
volatile union mlx5_wqe *wqe;
- struct rte_mbuf *buf;

if (unlikely(!pkts_n))
return 0;
- buf = pkts[0];
/* Prefetch first packet cacheline. */
tx_prefetch_cqe(txq, txq->cq_ci);
tx_prefetch_cqe(txq, txq->cq_ci + 1);
- rte_prefetch0(buf);
+ rte_prefetch0(*pkts);
/* Start processing. */
txq_complete(txq);
max = (elts_n - (elts_head - txq->elts_tail));
if (max > elts_n)
max -= elts_n;
- assert(max >= 1);
- assert(max <= elts_n);
- /* Always leave one free entry in the ring. */
- --max;
- if (max == 0)
- return 0;
- if (max > pkts_n)
- max = pkts_n;
- for (i = 0; (i != max); ++i) {
- unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+ do {
+ struct rte_mbuf *buf;
+ unsigned int elts_head_next;
uintptr_t addr;
uint32_t length;
uint32_t lkey;

+ /* Make sure there is enough room to store this packet and
+ * that one ring entry remains unused. */
+ if (max < 1 + 1)
+ break;
+ --max;
+ --pkts_n;
+ buf = *(pkts++);
+ elts_head_next = (elts_head + 1) & (elts_n - 1);
wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
rte_prefetch0(wqe);
- if (i + 1 < max)
- rte_prefetch0(pkts[i + 1]);
+ if (pkts_n)
+ rte_prefetch0(*pkts);
/* Retrieve buffer information. */
addr = rte_pktmbuf_mtod(buf, uintptr_t);
length = DATA_LEN(buf);
/* Update element. */
(*txq->elts)[elts_head] = buf;
/* Prefetch next buffer data. */
- if (i + 1 < max)
- rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 1],
+ if (pkts_n)
+ rte_prefetch0(rte_pktmbuf_mtod(*pkts,
volatile void *));
/* Retrieve Memory Region key for this memory pool. */
lkey = txq_mp2mr(txq, txq_mb2mp(buf));
@@ -649,8 +648,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
txq->stats.obytes += length;
#endif
elts_head = elts_head_next;
- buf = pkts[i + 1];
- }
+ ++i;
+ } while (pkts_n);
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
@@ -693,44 +692,43 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
struct txq *txq = (struct txq *)dpdk_txq;
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
- unsigned int i;
+ unsigned int i = 0;
unsigned int max;
unsigned int comp;
volatile union mlx5_wqe *wqe;
- struct rte_mbuf *buf;
unsigned int max_inline = txq->max_inline;

if (unlikely(!pkts_n))
return 0;
- buf = pkts[0];
/* Prefetch first packet cacheline. */
tx_prefetch_cqe(txq, txq->cq_ci);
tx_prefetch_cqe(txq, txq->cq_ci + 1);
- rte_prefetch0(buf);
+ rte_prefetch0(*pkts);
/* Start processing. */
txq_complete(txq);
max = (elts_n - (elts_head - txq->elts_tail));
if (max > elts_n)
max -= elts_n;
- assert(max >= 1);
- assert(max <= elts_n);
- /* Always leave one free entry in the ring. */
- --max;
- if (max == 0)
- return 0;
- if (max > pkts_n)
- max = pkts_n;
- for (i = 0; (i != max); ++i) {
- unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+ do {
+ struct rte_mbuf *buf;
+ unsigned int elts_head_next;
uintptr_t addr;
uint32_t length;
uint32_t lkey;

+ /* Make sure there is enough room to store this packet and
+ * that one ring entry remains unused. */
+ if (max < 1 + 1)
+ break;
+ --max;
+ --pkts_n;
+ buf = *(pkts++);
+ elts_head_next = (elts_head + 1) & (elts_n - 1);
wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
tx_prefetch_wqe(txq, txq->wqe_ci);
tx_prefetch_wqe(txq, txq->wqe_ci + 1);
- if (i + 1 < max)
- rte_prefetch0(pkts[i + 1]);
+ if (pkts_n)
+ rte_prefetch0(*pkts);
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
@@ -745,8 +743,8 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
/* Update element. */
(*txq->elts)[elts_head] = buf;
/* Prefetch next buffer data. */
- if (i + 1 < max)
- rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 1],
+ if (pkts_n)
+ rte_prefetch0(rte_pktmbuf_mtod(*pkts,
volatile void *));
if (length <= max_inline) {
if (buf->ol_flags & PKT_TX_VLAN_PKT)
@@ -766,12 +764,12 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
}
wqe->inl.ctrl.data[2] = 0;
elts_head = elts_head_next;
- buf = pkts[i + 1];
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent bytes counter. */
txq->stats.obytes += length;
#endif
- }
+ ++i;
+ } while (pkts_n);
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
@@ -879,13 +877,15 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
struct txq *txq = (struct txq *)dpdk_txq;
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
- unsigned int i;
+ unsigned int i = 0;
unsigned int max;
unsigned int comp;
struct mlx5_mpw mpw = {
.state = MLX5_MPW_STATE_CLOSED,
};

+ if (unlikely(!pkts_n))
+ return 0;
/* Prefetch first packet cacheline. */
tx_prefetch_cqe(txq, txq->cq_ci);
tx_prefetch_wqe(txq, txq->wqe_ci);
@@ -895,22 +895,22 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
max = (elts_n - (elts_head - txq->elts_tail));
if (max > elts_n)
max -= elts_n;
- assert(max >= 1);
- assert(max <= elts_n);
- /* Always leave one free entry in the ring. */
- --max;
- if (max == 0)
- return 0;
- if (max > pkts_n)
- max = pkts_n;
- for (i = 0; (i != max); ++i) {
- struct rte_mbuf *buf = pkts[i];
+ do {
+ struct rte_mbuf *buf;
volatile struct mlx5_wqe_data_seg *dseg;
- unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+ unsigned int elts_head_next;
uintptr_t addr;
uint32_t length;
uint32_t cs_flags = 0;

+ /* Make sure there is enough room to store this packet and
+ * that one ring entry remains unused. */
+ if (max < 1 + 1)
+ break;
+ --max;
+ --pkts_n;
+ buf = *(pkts++);
+ elts_head_next = (elts_head + 1) & (elts_n - 1);
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
@@ -943,7 +943,8 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
/* Increment sent bytes counter. */
txq->stats.obytes += length;
#endif
- }
+ ++i;
+ } while (pkts_n);
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
@@ -1048,7 +1049,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
struct txq *txq = (struct txq *)dpdk_txq;
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
- unsigned int i;
+ unsigned int i = 0;
unsigned int max;
unsigned int comp;
unsigned int inline_room = txq->max_inline;
@@ -1056,6 +1057,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
.state = MLX5_MPW_STATE_CLOSED,
};

+ if (unlikely(!pkts_n))
+ return 0;
/* Prefetch first packet cacheline. */
tx_prefetch_cqe(txq, txq->cq_ci);
tx_prefetch_wqe(txq, txq->wqe_ci);
@@ -1065,21 +1068,21 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
max = (elts_n - (elts_head - txq->elts_tail));
if (max > elts_n)
max -= elts_n;
- assert(max >= 1);
- assert(max <= elts_n);
- /* Always leave one free entry in the ring. */
- --max;
- if (max == 0)
- return 0;
- if (max > pkts_n)
- max = pkts_n;
- for (i = 0; (i != max); ++i) {
- struct rte_mbuf *buf = pkts[i];
- unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+ do {
+ struct rte_mbuf *buf;
+ unsigned int elts_head_next;
uintptr_t addr;
uint32_t length;
uint32_t cs_flags = 0;

+ /* Make sure there is enough room to store this packet and
+ * that one ring entry remains unused. */
+ if (max < 1 + 1)
+ break;
+ --max;
+ --pkts_n;
+ buf = *(pkts++);
+ elts_head_next = (elts_head + 1) & (elts_n - 1);
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
@@ -1165,7 +1168,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
/* Increment sent bytes counter. */
txq->stats.obytes += length;
#endif
- }
+ ++i;
+ } while (pkts_n);
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
--
2.1.4
Nelio Laranjeiro
2016-06-21 07:23:34 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

Compared to its previous incarnation, there is no longer a software limit
on the number of mbuf segments (previously MLX5_PMD_SGE_WR_N, set to 4 by
default), hence no need for the linearization code and related buffers
that permanently consumed a non-negligible amount of memory to handle
oversized mbufs.

The resulting code is both lighter and faster.
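
For readers following along, a self-contained sketch of the gather
principle implemented here: every segment of a chained packet becomes
one 16-byte data segment in the send WQE, so no linearization buffer is
required. The structure below is a stand-in for rte_mbuf, purely for
illustration.

#include <stdint.h>
#include <stddef.h>

struct seg {
        struct seg *next;  /* next segment in the chain, NULL at the end */
        uint32_t data_len; /* bytes carried by this segment */
};

/* Count the WQE data segments and total bytes of a chained packet,
 * mirroring the "while (--segs_n)" loop added by this patch. */
static unsigned int
count_data_segments(const struct seg *pkt, uint32_t *total_len)
{
        unsigned int ds = 0;

        for (*total_len = 0; pkt != NULL; pkt = pkt->next) {
                *total_len += pkt->data_len;
                ++ds;
        }
        return ds;
}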

Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Nelio Laranjeiro <***@6wind.com>
---
drivers/net/mlx5/mlx5_rxtx.c | 231 +++++++++++++++++++++++++++++++++----------
drivers/net/mlx5/mlx5_txq.c | 6 +-
2 files changed, 182 insertions(+), 55 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 7097713..db784c0 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -301,6 +301,7 @@ mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
{
wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
+ wqe->wqe.ctrl.data[2] = 0;
wqe->wqe.ctrl.data[3] = 0;
wqe->inl.eseg.rsvd0 = 0;
wqe->inl.eseg.rsvd1 = 0;
@@ -346,6 +347,7 @@ mlx5_wqe_write_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,

wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
+ wqe->wqe.ctrl.data[2] = 0;
wqe->wqe.ctrl.data[3] = 0;
wqe->inl.eseg.rsvd0 = 0;
wqe->inl.eseg.rsvd1 = 0;
@@ -423,6 +425,7 @@ mlx5_wqe_write_inline(struct txq *txq, volatile union mlx5_wqe *wqe,
assert(size < 64);
wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size);
+ wqe->inl.ctrl.data[2] = 0;
wqe->inl.ctrl.data[3] = 0;
wqe->inl.eseg.rsvd0 = 0;
wqe->inl.eseg.rsvd1 = 0;
@@ -496,6 +499,7 @@ mlx5_wqe_write_inline_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
assert(size < 64);
wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size);
+ wqe->inl.ctrl.data[2] = 0;
wqe->inl.ctrl.data[3] = 0;
wqe->inl.eseg.rsvd0 = 0;
wqe->inl.eseg.rsvd1 = 0;
@@ -584,6 +588,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
unsigned int i = 0;
+ unsigned int j = 0;
unsigned int max;
unsigned int comp;
volatile union mlx5_wqe *wqe;
@@ -600,21 +605,25 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (max > elts_n)
max -= elts_n;
do {
- struct rte_mbuf *buf;
+ struct rte_mbuf *buf = *(pkts++);
unsigned int elts_head_next;
uintptr_t addr;
uint32_t length;
uint32_t lkey;
+ unsigned int segs_n = buf->nb_segs;
+ volatile struct mlx5_wqe_data_seg *dseg;
+ unsigned int ds = sizeof(*wqe) / 16;

/* Make sure there is enough room to store this packet and
* that one ring entry remains unused. */
- if (max < 1 + 1)
+ assert(segs_n);
+ if (max < segs_n + 1)
break;
- --max;
+ max -= segs_n;
--pkts_n;
- buf = *(pkts++);
elts_head_next = (elts_head + 1) & (elts_n - 1);
wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
+ dseg = &wqe->wqe.dseg;
rte_prefetch0(wqe);
if (pkts_n)
rte_prefetch0(*pkts);
@@ -634,7 +643,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
buf->vlan_tci);
else
mlx5_wqe_write(txq, wqe, addr, length, lkey);
- wqe->wqe.ctrl.data[2] = 0;
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
@@ -643,6 +651,35 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
MLX5_ETH_WQE_L4_CSUM;
} else
wqe->wqe.eseg.cs_flags = 0;
+ while (--segs_n) {
+ /* Spill on next WQE when the current one does not have
+ * enough room left. Size of WQE must be a multiple
+ * of data segment size. */
+ assert(!(sizeof(*wqe) % sizeof(*dseg)));
+ if (!(ds % (sizeof(*wqe) / 16)))
+ dseg = (volatile void *)
+ &(*txq->wqes)[txq->wqe_ci++ &
+ (txq->wqe_n - 1)];
+ else
+ ++dseg;
+ ++ds;
+ buf = buf->next;
+ assert(buf);
+ /* Store segment information. */
+ dseg->byte_count = htonl(DATA_LEN(buf));
+ dseg->lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+ dseg->addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
+ (*txq->elts)[elts_head_next] = buf;
+ elts_head_next = (elts_head_next + 1) & (elts_n - 1);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ length += DATA_LEN(buf);
+#endif
+ ++j;
+ }
+ /* Update DS field in WQE. */
+ wqe->wqe.ctrl.data[1] &= htonl(0xffffffc0);
+ wqe->wqe.ctrl.data[1] |= htonl(ds & 0x3f);
+ elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent bytes counter. */
txq->stats.obytes += length;
@@ -654,7 +691,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (unlikely(i == 0))
return 0;
/* Check whether completion threshold has been reached. */
- comp = txq->elts_comp + i;
+ comp = txq->elts_comp + i + j;
if (comp >= MLX5_TX_COMP_THRESH) {
/* Request completion on last WQE. */
wqe->wqe.ctrl.data[2] = htonl(8);
@@ -693,6 +730,7 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
unsigned int i = 0;
+ unsigned int j = 0;
unsigned int max;
unsigned int comp;
volatile union mlx5_wqe *wqe;
@@ -710,21 +748,25 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (max > elts_n)
max -= elts_n;
do {
- struct rte_mbuf *buf;
+ struct rte_mbuf *buf = *(pkts++);
unsigned int elts_head_next;
uintptr_t addr;
uint32_t length;
uint32_t lkey;
+ unsigned int segs_n = buf->nb_segs;
+ volatile struct mlx5_wqe_data_seg *dseg;
+ unsigned int ds = sizeof(*wqe) / 16;

/* Make sure there is enough room to store this packet and
* that one ring entry remains unused. */
- if (max < 1 + 1)
+ assert(segs_n);
+ if (max < segs_n + 1)
break;
- --max;
+ max -= segs_n;
--pkts_n;
- buf = *(pkts++);
elts_head_next = (elts_head + 1) & (elts_n - 1);
wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
+ dseg = &wqe->wqe.dseg;
tx_prefetch_wqe(txq, txq->wqe_ci);
tx_prefetch_wqe(txq, txq->wqe_ci + 1);
if (pkts_n)
@@ -746,13 +788,14 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (pkts_n)
rte_prefetch0(rte_pktmbuf_mtod(*pkts,
volatile void *));
- if (length <= max_inline) {
+ if ((length <= max_inline) && (segs_n == 1)) {
if (buf->ol_flags & PKT_TX_VLAN_PKT)
mlx5_wqe_write_inline_vlan(txq, wqe,
addr, length,
buf->vlan_tci);
else
mlx5_wqe_write_inline(txq, wqe, addr, length);
+ goto skip_segs;
} else {
/* Retrieve Memory Region key for this memory pool. */
lkey = txq_mp2mr(txq, txq_mb2mp(buf));
@@ -762,7 +805,35 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
else
mlx5_wqe_write(txq, wqe, addr, length, lkey);
}
- wqe->inl.ctrl.data[2] = 0;
+ while (--segs_n) {
+ /* Spill on next WQE when the current one does not have
+ * enough room left. Size of WQE must be a multiple
+ * of data segment size. */
+ assert(!(sizeof(*wqe) % sizeof(*dseg)));
+ if (!(ds % (sizeof(*wqe) / 16)))
+ dseg = (volatile void *)
+ &(*txq->wqes)[txq->wqe_ci++ &
+ (txq->wqe_n - 1)];
+ else
+ ++dseg;
+ ++ds;
+ buf = buf->next;
+ assert(buf);
+ /* Store segment information. */
+ dseg->byte_count = htonl(DATA_LEN(buf));
+ dseg->lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+ dseg->addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
+ (*txq->elts)[elts_head_next] = buf;
+ elts_head_next = (elts_head_next + 1) & (elts_n - 1);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ length += DATA_LEN(buf);
+#endif
+ ++j;
+ }
+ /* Update DS field in WQE. */
+ wqe->inl.ctrl.data[1] &= htonl(0xffffffc0);
+ wqe->inl.ctrl.data[1] |= htonl(ds & 0x3f);
+ skip_segs:
elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent bytes counter. */
@@ -774,7 +845,7 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (unlikely(i == 0))
return 0;
/* Check whether completion threshold has been reached. */
- comp = txq->elts_comp + i;
+ comp = txq->elts_comp + i + j;
if (comp >= MLX5_TX_COMP_THRESH) {
/* Request completion on last WQE. */
wqe->inl.ctrl.data[2] = htonl(8);
@@ -878,6 +949,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
unsigned int i = 0;
+ unsigned int j = 0;
unsigned int max;
unsigned int comp;
struct mlx5_mpw mpw = {
@@ -896,46 +968,67 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (max > elts_n)
max -= elts_n;
do {
- struct rte_mbuf *buf;
- volatile struct mlx5_wqe_data_seg *dseg;
+ struct rte_mbuf *buf = *(pkts++);
unsigned int elts_head_next;
- uintptr_t addr;
uint32_t length;
+ unsigned int segs_n = buf->nb_segs;
uint32_t cs_flags = 0;

/* Make sure there is enough room to store this packet and
* that one ring entry remains unused. */
- if (max < 1 + 1)
+ assert(segs_n);
+ if (max < segs_n + 1)
+ break;
+ /* Do not bother with large packets MPW cannot handle. */
+ if (segs_n > MLX5_MPW_DSEG_MAX)
break;
- --max;
+ max -= segs_n;
--pkts_n;
- buf = *(pkts++);
- elts_head_next = (elts_head + 1) & (elts_n - 1);
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
- /* Retrieve buffer information. */
- addr = rte_pktmbuf_mtod(buf, uintptr_t);
- length = DATA_LEN(buf);
- /* Update element. */
- (*txq->elts)[elts_head] = buf;
+ /* Retrieve packet information. */
+ length = PKT_LEN(buf);
+ assert(length);
/* Start new session if packet differs. */
if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
((mpw.len != length) ||
+ (segs_n != 1) ||
(mpw.wqe->mpw.eseg.cs_flags != cs_flags)))
mlx5_mpw_close(txq, &mpw);
if (mpw.state == MLX5_MPW_STATE_CLOSED) {
mlx5_mpw_new(txq, &mpw, length);
mpw.wqe->mpw.eseg.cs_flags = cs_flags;
}
- dseg = mpw.data.dseg[mpw.pkts_n];
- *dseg = (struct mlx5_wqe_data_seg){
- .byte_count = htonl(length),
- .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
- .addr = htonll(addr),
- };
- ++mpw.pkts_n;
+ /* Multi-segment packets must be alone in their MPW. */
+ assert((segs_n == 1) || (mpw.pkts_n == 0));
+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
+ length = 0;
+#endif
+ do {
+ volatile struct mlx5_wqe_data_seg *dseg;
+ uintptr_t addr;
+
+ elts_head_next = (elts_head + 1) & (elts_n - 1);
+ assert(buf);
+ (*txq->elts)[elts_head] = buf;
+ dseg = mpw.data.dseg[mpw.pkts_n];
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ *dseg = (struct mlx5_wqe_data_seg){
+ .byte_count = htonl(DATA_LEN(buf)),
+ .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+ .addr = htonll(addr),
+ };
+ elts_head = elts_head_next;
+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
+ length += DATA_LEN(buf);
+#endif
+ buf = buf->next;
+ ++mpw.pkts_n;
+ ++j;
+ } while (--segs_n);
+ assert(length == mpw.len);
if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
mlx5_mpw_close(txq, &mpw);
elts_head = elts_head_next;
@@ -949,7 +1042,8 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (unlikely(i == 0))
return 0;
/* Check whether completion threshold has been reached. */
- comp = txq->elts_comp + i;
+ /* "j" includes both packets and segments. */
+ comp = txq->elts_comp + j;
if (comp >= MLX5_TX_COMP_THRESH) {
volatile union mlx5_wqe *wqe = mpw.wqe;

@@ -1050,6 +1144,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
uint16_t elts_head = txq->elts_head;
const unsigned int elts_n = txq->elts_n;
unsigned int i = 0;
+ unsigned int j = 0;
unsigned int max;
unsigned int comp;
unsigned int inline_room = txq->max_inline;
@@ -1069,36 +1164,38 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
if (max > elts_n)
max -= elts_n;
do {
- struct rte_mbuf *buf;
+ struct rte_mbuf *buf = *(pkts++);
unsigned int elts_head_next;
uintptr_t addr;
uint32_t length;
+ unsigned int segs_n = buf->nb_segs;
uint32_t cs_flags = 0;

/* Make sure there is enough room to store this packet and
* that one ring entry remains unused. */
- if (max < 1 + 1)
+ assert(segs_n);
+ if (max < segs_n + 1)
break;
- --max;
+ /* Do not bother with large packets MPW cannot handle. */
+ if (segs_n > MLX5_MPW_DSEG_MAX)
+ break;
+ max -= segs_n;
--pkts_n;
- buf = *(pkts++);
- elts_head_next = (elts_head + 1) & (elts_n - 1);
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
- /* Retrieve buffer information. */
- addr = rte_pktmbuf_mtod(buf, uintptr_t);
- length = DATA_LEN(buf);
- /* Update element. */
- (*txq->elts)[elts_head] = buf;
+ /* Retrieve packet information. */
+ length = PKT_LEN(buf);
/* Start new session if packet differs. */
if (mpw.state == MLX5_MPW_STATE_OPENED) {
if ((mpw.len != length) ||
+ (segs_n != 1) ||
(mpw.wqe->mpw.eseg.cs_flags != cs_flags))
mlx5_mpw_close(txq, &mpw);
} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
if ((mpw.len != length) ||
+ (segs_n != 1) ||
(length > inline_room) ||
(mpw.wqe->mpw_inl.eseg.cs_flags != cs_flags)) {
mlx5_mpw_inline_close(txq, &mpw);
@@ -1106,7 +1203,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
}
}
if (mpw.state == MLX5_MPW_STATE_CLOSED) {
- if (length > inline_room) {
+ if ((segs_n != 1) ||
+ (length > inline_room)) {
mlx5_mpw_new(txq, &mpw, length);
mpw.wqe->mpw.eseg.cs_flags = cs_flags;
} else {
@@ -1114,17 +1212,36 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
mpw.wqe->mpw_inl.eseg.cs_flags = cs_flags;
}
}
+ /* Multi-segment packets must be alone in their MPW. */
+ assert((segs_n == 1) || (mpw.pkts_n == 0));
if (mpw.state == MLX5_MPW_STATE_OPENED) {
- volatile struct mlx5_wqe_data_seg *dseg;
-
assert(inline_room == txq->max_inline);
- dseg = mpw.data.dseg[mpw.pkts_n];
- *dseg = (struct mlx5_wqe_data_seg){
- .byte_count = htonl(length),
- .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
- .addr = htonll(addr),
- };
- ++mpw.pkts_n;
+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
+ length = 0;
+#endif
+ do {
+ volatile struct mlx5_wqe_data_seg *dseg;
+
+ elts_head_next =
+ (elts_head + 1) & (elts_n - 1);
+ assert(buf);
+ (*txq->elts)[elts_head] = buf;
+ dseg = mpw.data.dseg[mpw.pkts_n];
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ *dseg = (struct mlx5_wqe_data_seg){
+ .byte_count = htonl(DATA_LEN(buf)),
+ .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+ .addr = htonll(addr),
+ };
+ elts_head = elts_head_next;
+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
+ length += DATA_LEN(buf);
+#endif
+ buf = buf->next;
+ ++mpw.pkts_n;
+ ++j;
+ } while (--segs_n);
+ assert(length == mpw.len);
if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
mlx5_mpw_close(txq, &mpw);
} else {
@@ -1132,6 +1249,10 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,

assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
assert(length <= inline_room);
+ assert(length == DATA_LEN(buf));
+ elts_head_next = (elts_head + 1) & (elts_n - 1);
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ (*txq->elts)[elts_head] = buf;
/* Maximum number of bytes before wrapping. */
max = ((uintptr_t)&(*txq->wqes)[txq->wqe_n] -
(uintptr_t)mpw.data.raw);
@@ -1156,6 +1277,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
mpw.data.raw =
(volatile void *)&(*txq->wqes)[0];
++mpw.pkts_n;
+ ++j;
if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
mlx5_mpw_inline_close(txq, &mpw);
inline_room = txq->max_inline;
@@ -1174,7 +1296,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
if (unlikely(i == 0))
return 0;
/* Check whether completion threshold has been reached. */
- comp = txq->elts_comp + i;
+ /* "j" includes both packets and segments. */
+ comp = txq->elts_comp + j;
if (comp >= MLX5_TX_COMP_THRESH) {
volatile union mlx5_wqe *wqe = mpw.wqe;

diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index bae9f3d..f1c17e0 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -320,7 +320,11 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
.max_send_wr = ((priv->device_attr.max_qp_wr < desc) ?
priv->device_attr.max_qp_wr :
desc),
- /* Max number of scatter/gather elements in a WR. */
+ /* Max number of scatter/gather elements in a WR,
+ * must be 1 to prevent libmlx5 from trying to allocate
+ * too much memory. TX gather is not impacted by the
+ * priv->device_attr.max_sge limit and will still work
+ * properly. */
.max_send_sge = 1,
},
.qp_type = IBV_QPT_RAW_PACKET,
--
2.1.4
Nelio Laranjeiro
2016-06-21 07:23:35 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

Since commit "mlx5: resurrect Tx gather support", older GCC versions (such
as 4.8.5) may complain about the following:

mlx5_rxtx.c: In function `mlx5_tx_burst':
mlx5_rxtx.c:705:25: error: `wqe' may be used uninitialized in this
function [-Werror=maybe-uninitialized]

mlx5_rxtx.c: In function `mlx5_tx_burst_inline':
mlx5_rxtx.c:864:25: error: `wqe' may be used uninitialized in this
function [-Werror=maybe-uninitialized]

In both cases, this code cannot be reached when wqe is not initialized.

Considering older GCC versions are still widely used, work around this
issue by initializing wqe preemptively, even if it should not be necessary.
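
The pattern is generic and not specific to the PMD; a minimal standalone
example of the same false positive and its workaround (illustrative code,
not taken from the driver):

#include <stddef.h>

/* Without the "= NULL" initializer, GCC 4.8 may claim "last" can be
 * used uninitialized, even though it is only dereferenced when the
 * loop has run at least once. The initializer costs nothing and
 * silences -Wmaybe-uninitialized. */
static int
last_element(const int *v, unsigned int n)
{
        const int *last = NULL;
        unsigned int i;

        for (i = 0; i != n; ++i)
                last = &v[i];
        return (n != 0) ? *last : -1;
}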

Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_rxtx.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index db784c0..2fc57dc 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -591,7 +591,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
unsigned int j = 0;
unsigned int max;
unsigned int comp;
- volatile union mlx5_wqe *wqe;
+ volatile union mlx5_wqe *wqe = NULL;

if (unlikely(!pkts_n))
return 0;
@@ -733,7 +733,7 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
unsigned int j = 0;
unsigned int max;
unsigned int comp;
- volatile union mlx5_wqe *wqe;
+ volatile union mlx5_wqe *wqe = NULL;
unsigned int max_inline = txq->max_inline;

if (unlikely(!pkts_n))
--
2.1.4
Nelio Laranjeiro
2016-06-21 07:23:36 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

Toggling RX checksum offloads is already done at initialization time. This
code does not belong in rxq_rehash().

Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Nelio Laranjeiro <***@6wind.com>
---
drivers/net/mlx5/mlx5_rxq.c | 10 ----------
1 file changed, 10 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 6881cdd..707296c 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -798,7 +798,6 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
int
rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
{
- struct priv *priv = rxq_ctrl->priv;
struct rxq_ctrl tmpl = *rxq_ctrl;
unsigned int mbuf_n;
unsigned int desc_n;
@@ -811,15 +810,6 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
/* Number of descriptors and mbufs currently allocated. */
desc_n = tmpl.rxq.elts_n;
mbuf_n = desc_n;
- /* Toggle RX checksum offload if hardware supports it. */
- if (priv->hw_csum) {
- tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- rxq_ctrl->rxq.csum = tmpl.rxq.csum;
- }
- if (priv->hw_csum_l2tun) {
- tmpl.rxq.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- rxq_ctrl->rxq.csum_l2tun = tmpl.rxq.csum_l2tun;
- }
/* From now on, any failure will render the queue unusable.
* Reinitialize WQ. */
mod = (struct ibv_exp_wq_attr){
--
2.1.4
Nelio Laranjeiro
2016-06-21 07:23:37 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

The primary purpose of the rxq_rehash() function is to stop and restart
reception on a queue after re-posting buffers. This may fail if the array
that temporarily stores existing buffers for reuse cannot be allocated.

Update rxq_rehash() to work on the target queue directly (not through a
template copy) and avoid this allocation.

rxq_alloc_elts() is modified accordingly to take buffers from an existing
queue directly and update their refcount.

Unlike rxq_rehash(), rxq_setup() must work on a temporary structure but
should not allocate new mbufs from the pool while reinitializing an
existing queue. This is achieved by using the refcount-aware
rxq_alloc_elts() before overwriting queue data.
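
A minimal sketch of the refcount handling described above, using the
regular rte_mbuf helpers but a simplified ring slot; illustrative only,
not the patch code itself.

#include <rte_mbuf.h>

/* Re-post an mbuf the queue already owns: take an extra reference for
 * the ring, store the buffer, then drop the original reference. The
 * buffer ends up with refcount 1 and never goes back to the mempool. */
static void
repost_mbuf(struct rte_mbuf **ring_slot, struct rte_mbuf *buf)
{
        rte_pktmbuf_reset(buf);
        rte_pktmbuf_refcnt_update(buf, 1); /* now 2: caller + ring */
        *ring_slot = buf;
        rte_pktmbuf_free_seg(buf);         /* back to 1: ring only */
}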

Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Vasily Philipov <***@mellanox.com>
---
drivers/net/mlx5/mlx5_rxq.c | 83 ++++++++++++++++++++++-----------------------
1 file changed, 41 insertions(+), 42 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 707296c..0a3225e 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -642,7 +642,7 @@ priv_rehash_flows(struct priv *priv)
*/
static int
rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
- struct rte_mbuf **pool)
+ struct rte_mbuf *(*pool)[])
{
unsigned int i;
int ret = 0;
@@ -654,9 +654,10 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
&(*rxq_ctrl->rxq.wqes)[i];

if (pool != NULL) {
- buf = *(pool++);
+ buf = (*pool)[i];
assert(buf != NULL);
rte_pktmbuf_reset(buf);
+ rte_pktmbuf_refcnt_update(buf, 1);
} else
buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
if (buf == NULL) {
@@ -781,7 +782,7 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
}

/**
- * Reconfigure a RX queue with new parameters.
+ * Reconfigure RX queue buffers.
*
* rxq_rehash() does not allocate mbufs, which, if not done from the right
* thread (such as a control thread), may corrupt the pool.
@@ -798,67 +799,48 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
int
rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
{
- struct rxq_ctrl tmpl = *rxq_ctrl;
- unsigned int mbuf_n;
- unsigned int desc_n;
- struct rte_mbuf **pool;
- unsigned int i, k;
+ unsigned int elts_n = rxq_ctrl->rxq.elts_n;
+ unsigned int i;
struct ibv_exp_wq_attr mod;
int err;

DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq_ctrl);
- /* Number of descriptors and mbufs currently allocated. */
- desc_n = tmpl.rxq.elts_n;
- mbuf_n = desc_n;
/* From now on, any failure will render the queue unusable.
* Reinitialize WQ. */
mod = (struct ibv_exp_wq_attr){
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RESET,
};
- err = ibv_exp_modify_wq(tmpl.wq, &mod);
+ err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
if (err) {
ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
assert(err > 0);
return err;
}
- /* Allocate pool. */
- pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0);
- if (pool == NULL) {
- ERROR("%p: cannot allocate memory", (void *)dev);
- return ENOBUFS;
- }
/* Snatch mbufs from original queue. */
- k = 0;
- for (i = 0; (i != desc_n); ++i)
- pool[k++] = (*rxq_ctrl->rxq.elts)[i];
- assert(k == mbuf_n);
- rte_free(pool);
+ claim_zero(rxq_alloc_elts(rxq_ctrl, elts_n, rxq_ctrl->rxq.elts));
+ for (i = 0; i != elts_n; ++i) {
+ struct rte_mbuf *buf = (*rxq_ctrl->rxq.elts)[i];
+
+ assert(rte_mbuf_refcnt_read(buf) == 2);
+ rte_pktmbuf_free_seg(buf);
+ }
/* Change queue state to ready. */
mod = (struct ibv_exp_wq_attr){
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
.wq_state = IBV_EXP_WQS_RDY,
};
- err = ibv_exp_modify_wq(tmpl.wq, &mod);
+ err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
if (err) {
ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
(void *)dev, strerror(err));
goto error;
}
- /* Post SGEs. */
- err = rxq_alloc_elts(&tmpl, desc_n, pool);
- if (err) {
- ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
- rte_free(pool);
- assert(err > 0);
- return err;
- }
/* Update doorbell counter. */
- rxq_ctrl->rxq.rq_ci = desc_n;
+ rxq_ctrl->rxq.rq_ci = elts_n;
rte_wmb();
*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
error:
- *rxq_ctrl = tmpl;
assert(err >= 0);
return err;
}
@@ -868,24 +850,26 @@ error:
*
* @param tmpl
* Pointer to RX queue control template.
- * @param rxq_ctrl
- * Pointer to RX queue control.
*
* @return
* 0 on success, errno value on failure.
*/
static inline int
-rxq_setup(struct rxq_ctrl *tmpl, struct rxq_ctrl *rxq_ctrl)
+rxq_setup(struct rxq_ctrl *tmpl)
{
struct ibv_cq *ibcq = tmpl->cq;
struct mlx5_cq *cq = to_mxxx(cq, cq);
struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
+ struct rte_mbuf *(*elts)[tmpl->rxq.elts_n] =
+ rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, tmpl->socket);

if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) {
ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
"it should be set to %u", RTE_CACHE_LINE_SIZE);
return EINVAL;
}
+ if (elts == NULL)
+ return ENOMEM;
tmpl->rxq.rq_db = rwq->rq.db;
tmpl->rxq.cqe_n = ibcq->cqe + 1;
tmpl->rxq.cq_ci = 0;
@@ -897,9 +881,7 @@ rxq_setup(struct rxq_ctrl *tmpl, struct rxq_ctrl *rxq_ctrl)
tmpl->rxq.cqes =
(volatile struct mlx5_cqe (*)[])
(uintptr_t)cq->active_buf->buf;
- tmpl->rxq.elts =
- (struct rte_mbuf *(*)[tmpl->rxq.elts_n])
- ((uintptr_t)rxq_ctrl + sizeof(*rxq_ctrl));
+ tmpl->rxq.elts = elts;
return 0;
}

@@ -947,6 +929,7 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
enum ibv_exp_query_intf_status status;
unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
unsigned int cqe_n = desc - 1;
+ struct rte_mbuf *(*elts)[desc] = NULL;
int ret = 0;

(void)conf; /* Thresholds configuration (ignored). */
@@ -1103,13 +1086,19 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
(void *)dev, strerror(ret));
goto error;
}
- ret = rxq_setup(&tmpl, rxq_ctrl);
+ ret = rxq_setup(&tmpl);
if (ret) {
ERROR("%p: cannot initialize RX queue structure: %s",
(void *)dev, strerror(ret));
goto error;
}
- ret = rxq_alloc_elts(&tmpl, desc, NULL);
+ /* Reuse buffers from original queue if possible. */
+ if (rxq_ctrl->rxq.elts_n) {
+ assert(rxq_ctrl->rxq.elts_n == desc);
+ assert(rxq_ctrl->rxq.elts != tmpl.rxq.elts);
+ ret = rxq_alloc_elts(&tmpl, desc, rxq_ctrl->rxq.elts);
+ } else
+ ret = rxq_alloc_elts(&tmpl, desc, NULL);
if (ret) {
ERROR("%p: RXQ allocation failed: %s",
(void *)dev, strerror(ret));
@@ -1118,6 +1107,14 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
/* Clean up rxq in case we're reinitializing it. */
DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
rxq_cleanup(rxq_ctrl);
+ /* Move mbuf pointers to dedicated storage area in RX queue. */
+ elts = (void *)(rxq_ctrl + 1);
+ rte_memcpy(elts, tmpl.rxq.elts, sizeof(*elts));
+#ifndef NDEBUG
+ memset(tmpl.rxq.elts, 0x55, sizeof(*elts));
+#endif
+ rte_free(tmpl.rxq.elts);
+ tmpl.rxq.elts = elts;
*rxq_ctrl = tmpl;
/* Update doorbell counter. */
rxq_ctrl->rxq.rq_ci = desc;
@@ -1127,7 +1124,9 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
assert(ret == 0);
return 0;
error:
+ elts = tmpl.rxq.elts;
rxq_cleanup(&tmpl);
+ rte_free(elts);
assert(ret > 0);
return ret;
}
--
2.1.4
Nelio Laranjeiro
2016-06-21 07:23:38 UTC
Permalink
From: Adrien Mazarguil <***@6wind.com>

This commit brings back Rx scatter support, together with the related
handling in the MTU update function. The maximum number of segments per
packet is no longer a fixed value (previously MLX5_PMD_SGE_WR_N, set to 4
by default), as a fixed limit caused performance issues when fewer
segments were actually needed, as well as limitations on the maximum
packet size that could be received with the default mbuf size (supporting
at most 8576 bytes).

These limitations are now lifted as the number of SGEs is derived from the
MTU (which implies MRU) at queue initialization and during MTU update.
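
Both mlx5_dev_set_mtu() and rxq_ctrl_setup() now perform the same SGE
sizing; a standalone sketch of that computation follows, with a local
reimplementation of log2above() and illustrative numbers in the comment.

/* Exponent of the smallest power of two greater than or equal to v. */
static unsigned int
log2above(unsigned int v)
{
        unsigned int l;
        unsigned int r;

        for (l = 0, r = 0; (v >> 1); ++l, v >>= 1)
                r |= (v & 1);
        return l + r;
}

/* SGEs per packet, as a power-of-two exponent, to receive a frame of
 * "frame_len" bytes into mbufs with "mb_len" bytes of data room while
 * reserving "headroom" bytes, mirroring the computation in this patch.
 * Example: a 9018-byte frame with 2048-byte mbufs and 128 bytes of
 * headroom needs ceil(9146 / 2048) = 5 buffers, rounded up to 8
 * (sges_n == 3). */
static unsigned int
rx_sges_n(unsigned int frame_len, unsigned int mb_len, unsigned int headroom)
{
        unsigned int size = headroom + frame_len;

        return log2above((size / mb_len) + !!(size % mb_len));
}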

Signed-off-by: Adrien Mazarguil <***@6wind.com>
Signed-off-by: Vasily Philipov <***@mellanox.com>
Signed-off-by: Nelio Laranjeiro <***@6wind.com>
---
drivers/net/mlx5/mlx5_ethdev.c | 84 +++++++++++++++++++++----
drivers/net/mlx5/mlx5_rxq.c | 73 +++++++++++++++++-----
drivers/net/mlx5/mlx5_rxtx.c | 139 ++++++++++++++++++++++++-----------------
drivers/net/mlx5/mlx5_rxtx.h | 1 +
4 files changed, 215 insertions(+), 82 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 69bfe03..757f8e4 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -725,6 +725,9 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
unsigned int i;
uint16_t (*rx_func)(void *, struct rte_mbuf **, uint16_t) =
mlx5_rx_burst;
+ unsigned int max_frame_len;
+ int rehash;
+ int restart = priv->started;

if (mlx5_is_secondary())
return -E_RTE_SECONDARY;
@@ -738,7 +741,6 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
goto out;
} else
DEBUG("adapter port %u MTU set to %u", priv->port, mtu);
- priv->mtu = mtu;
/* Temporarily replace RX handler with a fake one, assuming it has not
* been copied elsewhere. */
dev->rx_pkt_burst = removed_rx_burst;
@@ -746,28 +748,88 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
* removed_rx_burst() instead. */
rte_wmb();
usleep(1000);
+ /* MTU does not include header and CRC. */
+ max_frame_len = ETHER_HDR_LEN + mtu + ETHER_CRC_LEN;
+ /* Check if at least one queue is going to need a SGE update. */
+ for (i = 0; i != priv->rxqs_n; ++i) {
+ struct rxq *rxq = (*priv->rxqs)[i];
+ unsigned int mb_len;
+ unsigned int size = RTE_PKTMBUF_HEADROOM + max_frame_len;
+ unsigned int sges_n;
+
+ if (rxq == NULL)
+ continue;
+ mb_len = rte_pktmbuf_data_room_size(rxq->mp);
+ assert(mb_len >= RTE_PKTMBUF_HEADROOM);
+ /* Determine the number of SGEs needed for a full packet
+ * and round it to the next power of two. */
+ sges_n = log2above((size / mb_len) + !!(size % mb_len));
+ if (sges_n != rxq->sges_n)
+ break;
+ }
+ /* If all queues have the right number of SGEs, a simple rehash
+ * of their buffers is enough, otherwise SGE information can only
+ * be updated in a queue by recreating it. All resources that depend
+ * on queues (flows, indirection tables) must be recreated as well in
+ * that case. */
+ rehash = (i == priv->rxqs_n);
+ if (!rehash) {
+ /* Clean up everything as with mlx5_dev_stop(). */
+ priv_special_flow_disable_all(priv);
+ priv_mac_addrs_disable(priv);
+ priv_destroy_hash_rxqs(priv);
+ priv_fdir_disable(priv);
+ priv_dev_interrupt_handler_uninstall(priv, dev);
+ }
+recover:
/* Reconfigure each RX queue. */
for (i = 0; (i != priv->rxqs_n); ++i) {
struct rxq *rxq = (*priv->rxqs)[i];
- unsigned int mb_len;
- unsigned int max_frame_len;
+ struct rxq_ctrl *rxq_ctrl =
+ container_of(rxq, struct rxq_ctrl, rxq);
int sp;
+ unsigned int mb_len;
+ unsigned int tmp;

if (rxq == NULL)
continue;
- /* Calculate new maximum frame length according to MTU and
- * toggle scattered support (sp) if necessary. */
- max_frame_len = (priv->mtu + ETHER_HDR_LEN +
- (ETHER_MAX_VLAN_FRAME_LEN - ETHER_MAX_LEN));
mb_len = rte_pktmbuf_data_room_size(rxq->mp);
assert(mb_len >= RTE_PKTMBUF_HEADROOM);
+ /* Toggle scattered support (sp) if necessary. */
sp = (max_frame_len > (mb_len - RTE_PKTMBUF_HEADROOM));
- if (sp) {
- ERROR("%p: RX scatter is not supported", (void *)dev);
- ret = ENOTSUP;
- goto out;
+ /* Provide new values to rxq_setup(). */
+ dev->data->dev_conf.rxmode.jumbo_frame = sp;
+ dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len;
+ if (rehash)
+ ret = rxq_rehash(dev, rxq_ctrl);
+ else
+ ret = rxq_ctrl_setup(dev, rxq_ctrl, rxq->elts_n,
+ rxq_ctrl->socket, NULL, rxq->mp);
+ if (!ret)
+ continue;
+ /* Attempt to roll back in case of error. */
+ tmp = (mb_len << rxq->sges_n) - RTE_PKTMBUF_HEADROOM;
+ if (max_frame_len != tmp) {
+ max_frame_len = tmp;
+ goto recover;
}
+ /* Double fault, disable RX. */
+ break;
}
+ /* Use a safe RX burst function in case of error, otherwise mimic
+ * mlx5_dev_start(). */
+ if (ret) {
+ ERROR("unable to reconfigure RX queues, RX disabled");
+ rx_func = removed_rx_burst;
+ } else if (restart &&
+ !rehash &&
+ !priv_create_hash_rxqs(priv) &&
+ !priv_rehash_flows(priv)) {
+ if (dev->data->dev_conf.fdir_conf.mode == RTE_FDIR_MODE_NONE)
+ priv_fdir_enable(priv);
+ priv_dev_interrupt_handler_install(priv, dev);
+ }
+ priv->mtu = mtu;
/* Burst functions can now be called again. */
rte_wmb();
dev->rx_pkt_burst = rx_func;
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 0a3225e..38e3caa 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -644,10 +644,11 @@ static int
rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
struct rte_mbuf *(*pool)[])
{
+ const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
unsigned int i;
int ret = 0;

- /* For each WR (packet). */
+ /* Iterate on segments. */
for (i = 0; (i != elts_n); ++i) {
struct rte_mbuf *buf;
volatile struct mlx5_wqe_data_seg *scat =
@@ -672,6 +673,9 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
assert(rte_pktmbuf_data_len(buf) == 0);
assert(rte_pktmbuf_pkt_len(buf) == 0);
assert(!buf->next);
+ /* Only the first segment keeps headroom. */
+ if (i % sges_n)
+ SET_DATA_OFF(buf, 0);
PORT(buf) = rxq_ctrl->rxq.port_id;
DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
PKT_LEN(buf) = DATA_LEN(buf);
@@ -685,8 +689,8 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
};
(*rxq_ctrl->rxq.elts)[i] = buf;
}
- DEBUG("%p: allocated and configured %u single-segment WRs",
- (void *)rxq_ctrl, elts_n);
+ DEBUG("%p: allocated and configured %u segments (max %u packets)",
+ (void *)rxq_ctrl, elts_n, elts_n / (1 << rxq_ctrl->rxq.sges_n));
assert(ret == 0);
return 0;
error:
@@ -804,7 +808,9 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
struct ibv_exp_wq_attr mod;
int err;

- DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq_ctrl);
+ DEBUG("%p: rehashing queue %p with %u SGE(s) per packet",
+ (void *)dev, (void *)rxq_ctrl, 1 << rxq_ctrl->rxq.sges_n);
+ assert(!(elts_n % (1 << rxq_ctrl->rxq.sges_n)));
/* From now on, any failure will render the queue unusable.
* Reinitialize WQ. */
mod = (struct ibv_exp_wq_attr){
@@ -837,7 +843,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
goto error;
}
/* Update doorbell counter. */
- rxq_ctrl->rxq.rq_ci = elts_n;
+ rxq_ctrl->rxq.rq_ci = elts_n >> rxq_ctrl->rxq.sges_n;
rte_wmb();
*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
error:
@@ -933,9 +939,40 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
int ret = 0;

(void)conf; /* Thresholds configuration (ignored). */
- if (desc == 0) {
- ERROR("%p: invalid number of RX descriptors (must be a"
- " multiple of 2)", (void *)dev);
+ /* Enable scattered packets support for this queue if necessary. */
+ assert(mb_len >= RTE_PKTMBUF_HEADROOM);
+ if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
+ (dev->data->dev_conf.rxmode.max_rx_pkt_len >
+ (mb_len - RTE_PKTMBUF_HEADROOM))) {
+ unsigned int size =
+ RTE_PKTMBUF_HEADROOM +
+ dev->data->dev_conf.rxmode.max_rx_pkt_len;
+ unsigned int sges_n;
+
+ /* Determine the number of SGEs needed for a full packet
+ * and round it to the next power of two. */
+ sges_n = log2above((size / mb_len) + !!(size % mb_len));
+ tmpl.rxq.sges_n = sges_n;
+ /* Make sure rxq.sges_n did not overflow. */
+ size = mb_len * (1 << tmpl.rxq.sges_n);
+ size -= RTE_PKTMBUF_HEADROOM;
+ if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
+ ERROR("%p: too many SGEs (%u) needed to handle"
+ " requested maximum packet size %u",
+ (void *)dev,
+ 1 << sges_n,
+ dev->data->dev_conf.rxmode.max_rx_pkt_len);
+ return EOVERFLOW;
+ }
+ }
+ DEBUG("%p: maximum number of segments per packet: %u",
+ (void *)dev, 1 << tmpl.rxq.sges_n);
+ if (desc % (1 << tmpl.rxq.sges_n)) {
+ ERROR("%p: number of RX queue descriptors (%u) is not a"
+ " multiple of SGEs per packet (%u)",
+ (void *)dev,
+ desc,
+ 1 << tmpl.rxq.sges_n);
return EINVAL;
}
/* Toggle RX checksum offload if hardware supports it. */
@@ -943,7 +980,6 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
if (priv->hw_csum_l2tun)
tmpl.rxq.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- (void)mb_len; /* I'll be back! */
/* Use the entire RX mempool as the memory region. */
tmpl.mr = mlx5_mp2mr(priv->pd, mp);
if (tmpl.mr == NULL) {
@@ -993,11 +1029,9 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
.wq_context = NULL, /* Could be useful in the future. */
.wq_type = IBV_EXP_WQT_RQ,
/* Max number of outstanding WRs. */
- .max_recv_wr = ((priv->device_attr.max_qp_wr < (int)desc) ?
- priv->device_attr.max_qp_wr :
- (int)desc),
+ .max_recv_wr = desc >> tmpl.rxq.sges_n,
/* Max number of scatter/gather elements in a WR. */
- .max_recv_sge = 1,
+ .max_recv_sge = 1 << tmpl.rxq.sges_n,
.pd = priv->pd,
.cq = tmpl.cq,
.comp_mask =
@@ -1049,6 +1083,17 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
(void *)dev, strerror(ret));
goto error;
}
+ /* Make sure number of WRs*SGEs match expectations since a queue
+ * cannot allocate more than "desc" buffers. */
+ if (((int)attr.wq.max_recv_wr != (desc >> tmpl.rxq.sges_n)) ||
+ ((int)attr.wq.max_recv_sge != (1 << tmpl.rxq.sges_n))) {
+ ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
+ (void *)dev,
+ (desc >> tmpl.rxq.sges_n), (1 << tmpl.rxq.sges_n),
+ attr.wq.max_recv_wr, attr.wq.max_recv_sge);
+ ret = EINVAL;
+ goto error;
+ }
/* Save port ID. */
tmpl.rxq.port_id = dev->data->port_id;
DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
@@ -1117,7 +1162,7 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
tmpl.rxq.elts = elts;
*rxq_ctrl = tmpl;
/* Update doorbell counter. */
- rxq_ctrl->rxq.rq_ci = desc;
+ rxq_ctrl->rxq.rq_ci = desc >> rxq_ctrl->rxq.sges_n;
rte_wmb();
*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 2fc57dc..71ecdcd 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1520,96 +1520,121 @@ uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct rxq *rxq = dpdk_rxq;
- unsigned int pkts_ret = 0;
- unsigned int i;
- unsigned int rq_ci = rxq->rq_ci;
- const unsigned int elts_n = rxq->elts_n;
- const unsigned int wqe_cnt = elts_n - 1;
+ const unsigned int wqe_cnt = rxq->elts_n - 1;
const unsigned int cqe_cnt = rxq->cqe_n - 1;
+ const unsigned int sges_n = rxq->sges_n;
+ struct rte_mbuf *pkt = NULL;
+ struct rte_mbuf *seg = NULL;
+ volatile struct mlx5_cqe64 *cqe =
+ &(*rxq->cqes)[rxq->cq_ci & cqe_cnt].cqe64;
+ unsigned int i = 0;
+ unsigned int rq_ci = rxq->rq_ci << sges_n;
+ int len;

- for (i = 0; (i != pkts_n); ++i) {
+ while (pkts_n) {
unsigned int idx = rq_ci & wqe_cnt;
- int len;
- struct rte_mbuf *rep;
- struct rte_mbuf *pkt;
volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
- volatile struct mlx5_cqe64 *cqe =
- &(*rxq->cqes)[rxq->cq_ci & cqe_cnt].cqe64;
+ struct rte_mbuf *rep = (*rxq->elts)[idx];

- pkt = (*rxq->elts)[idx];
+ if (pkt)
+ NEXT(seg) = rep;
+ seg = rep;
+ rte_prefetch0(seg);
rte_prefetch0(cqe);
+ rte_prefetch0(wqe);
rep = rte_mbuf_raw_alloc(rxq->mp);
if (unlikely(rep == NULL)) {
+ while (pkt) {
+ seg = NEXT(pkt);
+ rte_mbuf_refcnt_set(pkt, 0);
+ __rte_mbuf_raw_free(pkt);
+ pkt = seg;
+ }
++rxq->stats.rx_nombuf;
break;
}
- SET_DATA_OFF(rep, RTE_PKTMBUF_HEADROOM);
- NB_SEGS(rep) = 1;
- PORT(rep) = rxq->port_id;
- NEXT(rep) = NULL;
- len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt);
- if (unlikely(len == 0)) {
- rte_mbuf_refcnt_set(rep, 0);
- __rte_mbuf_raw_free(rep);
- break;
- }
- if (unlikely(len == -1)) {
- /* RX error, packet is likely too large. */
- rte_mbuf_refcnt_set(rep, 0);
- __rte_mbuf_raw_free(rep);
- ++rxq->stats.idropped;
- --i;
- goto skip;
+ if (!pkt) {
+ cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt].cqe64;
+ len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt);
+ if (len == 0) {
+ rte_mbuf_refcnt_set(rep, 0);
+ __rte_mbuf_raw_free(rep);
+ break;
+ }
+ if (unlikely(len == -1)) {
+ /* RX error, packet is likely too large. */
+ rte_mbuf_refcnt_set(rep, 0);
+ __rte_mbuf_raw_free(rep);
+ ++rxq->stats.idropped;
+ goto skip;
+ }
+ pkt = seg;
+ assert(len >= (rxq->crc_present << 2));
+ /* Update packet information. */
+ if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip |
+ rxq->crc_present) {
+ if (rxq->csum) {
+ pkt->packet_type =
+ rxq_cq_to_pkt_type(cqe);
+ pkt->ol_flags =
+ rxq_cq_to_ol_flags(rxq, cqe);
+ }
+ if (cqe->l4_hdr_type_etc &
+ MLX5_CQE_VLAN_STRIPPED) {
+ pkt->ol_flags |= PKT_RX_VLAN_PKT;
+ pkt->vlan_tci = ntohs(cqe->vlan_info);
+ }
+ if (rxq->crc_present)
+ len -= ETHER_CRC_LEN;
+ }
+ PKT_LEN(pkt) = len;
}
+ DATA_LEN(rep) = DATA_LEN(seg);
+ PKT_LEN(rep) = PKT_LEN(seg);
+ SET_DATA_OFF(rep, DATA_OFF(seg));
+ NB_SEGS(rep) = NB_SEGS(seg);
+ PORT(rep) = PORT(seg);
+ NEXT(rep) = NULL;
+ (*rxq->elts)[idx] = rep;
/* Fill NIC descriptor with the new buffer. The lkey and size
* of the buffers are already known, only the buffer address
* changes. */
- wqe->addr = htonll((uintptr_t)rep->buf_addr +
- RTE_PKTMBUF_HEADROOM);
- (*rxq->elts)[idx] = rep;
- /* Update pkt information. */
- if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip |
- rxq->crc_present) {
- if (rxq->csum) {
- pkt->packet_type = rxq_cq_to_pkt_type(cqe);
- pkt->ol_flags = rxq_cq_to_ol_flags(rxq, cqe);
- }
- if (cqe->l4_hdr_type_etc & MLX5_CQE_VLAN_STRIPPED) {
- pkt->ol_flags |= PKT_RX_VLAN_PKT;
- pkt->vlan_tci = ntohs(cqe->vlan_info);
- }
- if (rxq->crc_present)
- len -= ETHER_CRC_LEN;
+ wqe->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t));
+ if (len > DATA_LEN(seg)) {
+ len -= DATA_LEN(seg);
+ ++NB_SEGS(pkt);
+ ++rq_ci;
+ continue;
}
- PKT_LEN(pkt) = len;
- DATA_LEN(pkt) = len;
+ DATA_LEN(seg) = len;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment bytes counter. */
- rxq->stats.ibytes += len;
+ rxq->stats.ibytes += PKT_LEN(pkt);
#endif
/* Return packet. */
*(pkts++) = pkt;
- ++pkts_ret;
+ pkt = NULL;
+ --pkts_n;
+ ++i;
skip:
+ /* Align consumer index to the next stride. */
+ rq_ci >>= sges_n;
++rq_ci;
+ rq_ci <<= sges_n;
}
- if (unlikely((i == 0) && (rq_ci == rxq->rq_ci)))
+ if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
return 0;
- /* Repost WRs. */
-#ifdef DEBUG_RECV
- DEBUG("%p: reposting %u WRs", (void *)rxq, i);
-#endif
/* Update the consumer index. */
- rxq->rq_ci = rq_ci;
+ rxq->rq_ci = rq_ci >> sges_n;
rte_wmb();
*rxq->cq_db = htonl(rxq->cq_ci);
rte_wmb();
*rxq->rq_db = htonl(rxq->rq_ci);
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment packets counter. */
- rxq->stats.ipackets += pkts_ret;
+ rxq->stats.ipackets += i;
#endif
- return pkts_ret;
+ return i;
}

/**
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 41605f9..f6e2cba 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -106,6 +106,7 @@ struct rxq {
unsigned int csum_l2tun:1; /* Same for L2 tunnels. */
unsigned int vlan_strip:1; /* Enable VLAN stripping. */
unsigned int crc_present:1; /* CRC must be subtracted. */
+ unsigned int sges_n:2; /* Log 2 of SGEs (max buffers per packet). */
uint16_t rq_ci;
uint16_t cq_ci;
uint16_t elts_n;
--
2.1.4
Yuanhan Liu
2016-06-21 07:43:08 UTC
Permalink
Hi,

Here is an off-topic comment: would you please add following line to
the sendemail section of your git config file?

chainreplyto = false

That would let me break the long threads in my client much more easily.
Otherwise, it's hard for me to do, and your thread ends up occupying
several screens on my side.

It seems that Tetsuya also has the issue, thus CC'ed.

Thanks.

--yliu
Nélio Laranjeiro
2016-06-21 08:00:34 UTC
Permalink
Post by Yuanhan Liu
Hi,
Here is an off-topic comment: would you please add following line to
the sendemail section of your git config file?
chainreplyto = false
That would let me to break the long threads in my client much easier.
Otherwise, it's hard for me to do it, leading that your thread occupies
several screens on my side.
It seems that Tetsuya also has the issue, thus CC'ed.
Thanks.
--yliu
I already have it in my sendemail section (copied from
http://dpdk.org/dev web page).

I was wondering why it did not split the patchset threads correctly.

I will try to use the command line "--no-chain-reply-to" option next
time.
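Something along these lines, I suppose (the patch file names below are
only placeholders):

  git send-email --thread --no-chain-reply-to outgoing/*.patch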

Thanks.
--
Nélio Laranjeiro
6WIND
Yuanhan Liu
2016-06-21 08:05:44 UTC
Permalink
Post by Nélio Laranjeiro
Post by Yuanhan Liu
Hi,
Here is an off-topic comment: would you please add following line to
the sendemail section of your git config file?
chainreplyto = false
That would let me to break the long threads in my client much easier.
Otherwise, it's hard for me to do it, leading that your thread occupies
several screens on my side.
It seems that Tetsuya also has the issue, thus CC'ed.
Thanks.
--yliu
I already have it in my sendemail section (copied from
http://dpdk.org/dev web page).
I was wondering it did not split correctly the patchset threads.
No idea, and here is a blind guess: maybe you have a local git config
file that overrides the global options?
Post by Nélio Laranjeiro
I will try to use the command line "--no-chain-reply-to" option next
time.
Thanks!

--yliu
Nélio Laranjeiro
2016-06-21 08:49:04 UTC
Permalink
Post by Yuanhan Liu
Post by Nélio Laranjeiro
Post by Yuanhan Liu
Hi,
Here is an off-topic comment: would you please add following line to
the sendemail section of your git config file?
chainreplyto = false
That would let me to break the long threads in my client much easier.
Otherwise, it's hard for me to do it, leading that your thread occupies
several screens on my side.
It seems that Tetsuya also has the issue, thus CC'ed.
Thanks.
--yliu
I already have it in my sendemail section (copied from
http://dpdk.org/dev web page).
I was wondering it did not split correctly the patchset threads.
No idea, and here is a blind guess: maybe you have a local git config
file that overwrites the globle options?
No, my local .git/config does not have any sendemail section. It worked
once; maybe an update of the package on my machine broke the script.
Post by Yuanhan Liu
Post by Nélio Laranjeiro
I will try to use the command line "--no-chain-reply-to" option next
time.
Thanks!
--yliu
--
Nélio Laranjeiro
6WIND
Ferruh Yigit
2016-06-21 10:44:23 UTC
Permalink
Post by Yuanhan Liu
Hi,
Here is an off-topic comment: would you please add following line to
the sendemail section of your git config file?
chainreplyto = false
That would let me to break the long threads in my client much easier.
Otherwise, it's hard for me to do it, leading that your thread occupies
several screens on my side.
It seems that Tetsuya also has the issue, thus CC'ed.
As far as I can see this is not related to the chainreplyto option;
rather, "--no-thread" seems to be set, because all patchsets are sent as
replies to the first mail of the first patchset [C].
The correct setting should be "--thread and --no-chain-reply-to".
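In configuration file terms, that would be something like the following
(a sketch, assuming the [sendemail] section of ~/.gitconfig):

  [sendemail]
          thread = true
          chainreplyto = false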

Although this is really a detail: for multi-version patchsets, if there is
a preferred way between (A) and (B), I would like to learn it too.

A)

- [0/N]
- - [1/N]
- - [2/N]
- - [v2 0/N]
- - - [v2 1/N]
- - - [v2 N/N]
- - - [v3 0/N]
- - - - [v3 1/N]
- - - - [v3 N/N]
- - - - [v4 0/N]
- - - - - [v4 1/N]
- - - - - [v4 N/N]



B)

- [0/N]
- - [1/N]
- - [2/N]
- - [v2 0/N]
- - - [v2 1/N]
- - - [v2 N/N]
- - [v3 0/N]
- - - [v3 1/N]
- - - [v3 N/N]
- - [v4 0/N]
- - - [v4 1/N]
- - - [v4 N/N]


C)

- [0/N]
- - [1/N]
- - [2/N]
- - [v2 0/N]
- - [v2 1/N]
- - [v2 N/N]
- - [v3 0/N]
- - [v3 1/N]
- - [v3 N/N]
Thomas Monjalon
2016-06-21 12:26:43 UTC
Permalink
Post by Ferruh Yigit
Although this is really detail, for multi version patchsets, if there is
a preferred way between (A) or (B) I would like to learn too?
In my opinion, A and B are fine.
And I prefer B.
Post by Ferruh Yigit
A)
- [0/N]
- - [1/N]
- - [2/N]
- - [v2 0/N]
- - - [v2 1/N]
- - - [v2 N/N]
- - - [v3 0/N]
- - - - [v3 1/N]
- - - - [v3 N/N]
- - - - [v4 0/N]
- - - - - [v4 1/N]
- - - - - [v4 N/N]
B)
- [0/N]
- - [1/N]
- - [2/N]
- - [v2 0/N]
- - - [v2 1/N]
- - - [v2 N/N]
- - [v3 0/N]
- - - [v3 1/N]
- - - [v3 N/N]
- - [v4 0/N]
- - - [v4 1/N]
- - - [v4 N/N]
C)
- [0/N]
- - [1/N]
- - [2/N]
- - [v2 0/N]
- - [v2 1/N]
- - [v2 N/N]
- - [v3 0/N]
- - [v3 1/N]
- - [v3 N/N]
Ferruh Yigit
2016-06-21 16:42:29 UTC
Permalink
Post by Nelio Laranjeiro
Enhance mlx5 with a data path that bypasses Verbs.
The first half of this patchset removes support for functionality completely
rewritten in the second half (scatter/gather, inline send), while the data
path is refactored without Verbs.
The PMD remains usable during the transition.
This patchset must be applied after "Miscellaneous fixes for mlx4 and mlx5".
- Rebased patchset on top of next-net/rel_16_07.
- Rebased patchset on top of dpdk/master.
- Fixed CQE size on Power8.
- Fixed mbuf assertion failure in debug mode.
- Fixed missing class_id field in rte_pci_id by using RTE_PCI_DEVICE.
mlx5: replace countdown with threshold for Tx completions
mlx5: add debugging information about Tx queues capabilities
mlx5: check remaining space while processing Tx burst
mlx5: resurrect Tx gather support
mlx5: work around spurious compilation errors
mlx5: remove redundant Rx queue initialization code
mlx5: make Rx queue reinitialization safer
mlx5: resurrect Rx scatter support
drivers: fix PCI class id support
mlx5: split memory registration function
mlx5: remove Tx gather support
mlx5: remove Rx scatter support
mlx5: remove configuration variable
mlx5: remove inline Tx support
mlx5: split Tx queue structure
mlx5: split Rx queue structure
mlx5: update prerequisites for upcoming enhancements
mlx5: add definitions for data path without Verbs
mlx5: add support for configuration through kvargs
mlx5: add Tx/Rx burst function selection wrapper
mlx5: refactor Rx data path
mlx5: refactor Tx data path
mlx5: handle Rx CQE compression
mlx5: add support for multi-packet send
mlx5: add support for inline send
Patchset applies and compiles fine, thanks.

But it still has some checkpatch warnings. By the way, I am using the
checkpatch script from the latest master branch of the Linux repo.
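Roughly the following invocation (paths and patch file names are only
illustrative):

  linux/scripts/checkpatch.pl --no-tree dpdk-patches/*.patch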

The following are sample warning types (not every instance; there is more
than one instance per type):

WARNING:UNSPECIFIED_INT: Prefer 'unsigned int' to bare use of 'unsigned'
#112: FILE: drivers/net/mlx5/mlx5_mr.c:65:
+ unsigned mem_idx)

WARNING:BLOCK_COMMENT_STYLE: Block comments use a trailing */ on a
separate line
#288: FILE: drivers/net/mlx5/mlx5_mr.c:241:
+ * pointer is valid. */

WARNING:USE_NEGATIVE_ERRNO: return of an errno should typically be
negative (ie: return -EINVAL)
#524: FILE: drivers/net/mlx5/mlx5_txq.c:265:
+ return EINVAL;

WARNING:LONG_LINE: line over 80 characters
#108: FILE: drivers/net/mlx5/mlx5_ethdev.c:1250:
+ txq_ctrl->txq.stats.idx =
primary_txq->stats.idx;

WARNING:STATIC_CONST_CHAR_ARRAY: static const char * array should
probably be static const char * const
#88: FILE: drivers/net/mlx5/mlx5.c:281:
+ static const char *params[] = {

ERROR:ASSIGN_IN_IF: do not use assignment in if condition
#218: FILE: drivers/net/mlx5/mlx5_rxtx.c:92:
+ if (!ret || !(ret = ((*buf)[i] == magic[i])))

CHECK:SPACING: spaces preferred around that '&' (ctx:VxV)
#414: FILE: drivers/net/mlx5/mlx5_rxtx.c:625:
+ (uintptr_t)&(*rxq->cqes)[rxq->cq_ci &
^

WARNING:INDENTED_LABEL: labels should not be indented
#520: FILE: drivers/net/mlx5/mlx5_rxtx.c:789:
+ skip:
Adrien Mazarguil
2016-06-22 08:20:54 UTC
Permalink
Post by Ferruh Yigit
Post by Nelio Laranjeiro
Enhance mlx5 with a data path that bypasses Verbs.
The first half of this patchset removes support for functionality completely
rewritten in the second half (scatter/gather, inline send), while the data
path is refactored without Verbs.
The PMD remains usable during the transition.
This patchset must be applied after "Miscellaneous fixes for mlx4 and mlx5".
- Rebased patchset on top of next-net/rel_16_07.
- Rebased patchset on top of dpdk/master.
- Fixed CQE size on Power8.
- Fixed mbuf assertion failure in debug mode.
- Fixed missing class_id field in rte_pci_id by using RTE_PCI_DEVICE.
mlx5: replace countdown with threshold for Tx completions
mlx5: add debugging information about Tx queues capabilities
mlx5: check remaining space while processing Tx burst
mlx5: resurrect Tx gather support
mlx5: work around spurious compilation errors
mlx5: remove redundant Rx queue initialization code
mlx5: make Rx queue reinitialization safer
mlx5: resurrect Rx scatter support
drivers: fix PCI class id support
mlx5: split memory registration function
mlx5: remove Tx gather support
mlx5: remove Rx scatter support
mlx5: remove configuration variable
mlx5: remove inline Tx support
mlx5: split Tx queue structure
mlx5: split Rx queue structure
mlx5: update prerequisites for upcoming enhancements
mlx5: add definitions for data path without Verbs
mlx5: add support for configuration through kvargs
mlx5: add Tx/Rx burst function selection wrapper
mlx5: refactor Rx data path
mlx5: refactor Tx data path
mlx5: handle Rx CQE compression
mlx5: add support for multi-packet send
mlx5: add support for inline send
Patchset applies and compiles fine, thanks.
But still has some checkpatch warnings, -btw, I am using the checkpatch
script from latest master branch of Linux repo.
Following is the sample type of warnings (not instances, there are more
While Nelio is preparing a v4 to address the kvargs issue, the remaining
warnings can be safely ignored.

A few of them are in relocated but unmodified code, as this patchset
refactors the entire PMD; others are documented. We settled on positive
errno values internally because mlx5 uses syscalls, and switching the sign
bit all over the place quickly became unmanageable. They are made negative
when returning from public callbacks (except for kvargs, by mistake).

In short, we did run checkpatch, fixed a million warnings and other
errors, and left the remaining ones on purpose; nothing to worry about.
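As a rough illustration of that convention (a minimal sketch, not actual
mlx5 code), an internal helper returns a positive errno value and the
public wrapper negates it at the boundary:

  #include <errno.h>
  #include <stdio.h>

  /* Hypothetical internal helper: returns a positive errno value on
   * failure and 0 on success, as in the internal convention above. */
  static int
  priv_setup(void *priv)
  {
          if (priv == NULL)
                  return EINVAL;
          return 0;
  }

  /* Hypothetical public callback: callers expect a negative errno,
   * so the sign is flipped only here, at the public boundary. */
  int
  dev_setup(void *priv)
  {
          int ret = priv_setup(priv);

          return ret ? -ret : 0;
  }

  int
  main(void)
  {
          /* Prints -22 (-EINVAL) on Linux. */
          printf("dev_setup(NULL) -> %d\n", dev_setup(NULL));
          return 0;
  }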
Post by Ferruh Yigit
WARNING:UNSPECIFIED_INT: Prefer 'unsigned int' to bare use of 'unsigned'
+ unsigned mem_idx)
WARNING:BLOCK_COMMENT_STYLE: Block comments use a trailing */ on a
separate line
+ * pointer is valid. */
WARNING:USE_NEGATIVE_ERRNO: return of an errno should typically be
negative (ie: return -EINVAL)
+ return EINVAL;
WARNING:LONG_LINE: line over 80 characters
+ txq_ctrl->txq.stats.idx =
primary_txq->stats.idx;
WARNING:STATIC_CONST_CHAR_ARRAY: static const char * array should
probably be static const char * const
+ static const char *params[] = {
ERROR:ASSIGN_IN_IF: do not use assignment in if condition
+ if (!ret || !(ret = ((*buf)[i] == magic[i])))
CHECK:SPACING: spaces preferred around that '&' (ctx:VxV)
+ (uintptr_t)&(*rxq->cqes)[rxq->cq_ci &
^
WARNING:INDENTED_LABEL: labels should not be indented
--
Adrien Mazarguil
6WIND
Nelio Laranjeiro
2016-06-22 09:05:30 UTC
Permalink
Enhance mlx5 with a data path that bypasses Verbs.

The first half of this patchset removes support for functionality completely
rewritten in the second half (scatter/gather, inline send), while the data
path is refactored without Verbs.

The PMD remains usable during the transition.

This patchset must be applied after "Miscellaneous fixes for mlx4 and mlx5".

Changes in v4:
- Fixed errno return value of mlx5_args().
- Fixed long line above 80 characters.

Changes in v3:
- Rebased patchset on top of next-net/rel_16_07.

Changes in v2:
- Rebased patchset on top of dpdk/master.
- Fixed CQE size on Power8.
- Fixed mbuf assertion failure in debug mode.
- Fixed missing class_id field in rte_pci_id by using RTE_PCI_DEVICE.

Adrien Mazarguil (8):
mlx5: replace countdown with threshold for Tx completions
mlx5: add debugging information about Tx queues capabilities
mlx5: check remaining space while processing Tx burst
mlx5: resurrect Tx gather support
mlx5: work around spurious compilation errors
mlx5: remove redundant Rx queue initialization code
mlx5: make Rx queue reinitialization safer
mlx5: resurrect Rx scatter support

Nelio Laranjeiro (16):
drivers: fix PCI class id support
mlx5: split memory registration function
mlx5: remove Tx gather support
mlx5: remove Rx scatter support
mlx5: remove configuration variable
mlx5: remove inline Tx support
mlx5: split Tx queue structure
mlx5: split Rx queue structure
mlx5: update prerequisites for upcoming enhancements
mlx5: add definitions for data path without Verbs
mlx5: add support for configuration through kvargs
mlx5: add Tx/Rx burst function selection wrapper
mlx5: refactor Rx data path
mlx5: refactor Tx data path
mlx5: handle Rx CQE compression
mlx5: add support for multi-packet send

Yaacov Hazan (1):
mlx5: add support for inline send

config/common_base | 2 -
doc/guides/nics/mlx5.rst | 94 +-
drivers/crypto/qat/rte_qat_cryptodev.c | 5 +-
drivers/net/mlx4/mlx4.c | 18 +-
drivers/net/mlx5/Makefile | 49 +-
drivers/net/mlx5/mlx5.c | 182 ++-
drivers/net/mlx5/mlx5.h | 10 +
drivers/net/mlx5/mlx5_defs.h | 26 +-
drivers/net/mlx5/mlx5_ethdev.c | 189 ++-
drivers/net/mlx5/mlx5_fdir.c | 20 +-
drivers/net/mlx5/mlx5_mr.c | 280 ++++
drivers/net/mlx5/mlx5_prm.h | 163 +++
drivers/net/mlx5/mlx5_rxmode.c | 8 -
drivers/net/mlx5/mlx5_rxq.c | 762 ++++-------
drivers/net/mlx5/mlx5_rxtx.c | 2210 +++++++++++++++++++-------------
drivers/net/mlx5/mlx5_rxtx.h | 176 ++-
drivers/net/mlx5/mlx5_txq.c | 368 +++---
drivers/net/mlx5/mlx5_vlan.c | 6 +-
drivers/net/nfp/nfp_net.c | 12 +-
19 files changed, 2625 insertions(+), 1955 deletions(-)
create mode 100644 drivers/net/mlx5/mlx5_mr.c
create mode 100644 drivers/net/mlx5/mlx5_prm.h
--
2.1.4
Nelio Laranjeiro
2016-06-22 09:05:31 UTC
Permalink
Fixes: 701c8d80c820 ("pci: support class id probing")

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
---
drivers/crypto/qat/rte_qat_cryptodev.c | 5 +----
drivers/net/mlx4/mlx4.c | 18 ++++++------------
drivers/net/mlx5/mlx5.c | 24 ++++++++----------------
drivers/net/nfp/nfp_net.c | 12 ++++--------
4 files changed, 19 insertions(+), 40 deletions(-)

diff --git a/drivers/crypto/qat/rte_qat_cryptodev.c b/drivers/crypto/qat/rte_qat_cryptodev.c
index a7912f5..f46ec85 100644
--- a/drivers/crypto/qat/rte_qat_cryptodev.c
+++ b/drivers/crypto/qat/rte_qat_cryptodev.c
@@ -69,10 +69,7 @@ static struct rte_cryptodev_ops crypto_qat_ops = {

static struct rte_pci_id pci_id_qat_map[] = {
{
- .vendor_id = 0x8086,
- .device_id = 0x0443,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(0x8086, 0x0443),
},
{.device_id = 0},
};
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index 9e94630..6228688 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -5807,22 +5807,16 @@ error:

static const struct rte_pci_id mlx4_pci_id_map[] = {
{
- .vendor_id = PCI_VENDOR_ID_MELLANOX,
- .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX3,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX3)
},
{
- .vendor_id = PCI_VENDOR_ID_MELLANOX,
- .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO)
},
{
- .vendor_id = PCI_VENDOR_ID_MELLANOX,
- .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX3VF,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX3VF)
},
{
.vendor_id = 0
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 67a541c..350028b 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -610,28 +610,20 @@ error:

static const struct rte_pci_id mlx5_pci_id_map[] = {
{
- .vendor_id = PCI_VENDOR_ID_MELLANOX,
- .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX4,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX4)
},
{
- .vendor_id = PCI_VENDOR_ID_MELLANOX,
- .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX4VF,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX4VF)
},
{
- .vendor_id = PCI_VENDOR_ID_MELLANOX,
- .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX4LX,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX4LX)
},
{
- .vendor_id = PCI_VENDOR_ID_MELLANOX,
- .device_id = PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
+ PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF)
},
{
.vendor_id = 0
diff --git a/drivers/net/nfp/nfp_net.c b/drivers/net/nfp/nfp_net.c
index ea5a2a3..dd0c559 100644
--- a/drivers/net/nfp/nfp_net.c
+++ b/drivers/net/nfp/nfp_net.c
@@ -2446,16 +2446,12 @@ nfp_net_init(struct rte_eth_dev *eth_dev)

static struct rte_pci_id pci_id_nfp_net_map[] = {
{
- .vendor_id = PCI_VENDOR_ID_NETRONOME,
- .device_id = PCI_DEVICE_ID_NFP6000_PF_NIC,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID,
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_NETRONOME,
+ PCI_DEVICE_ID_NFP6000_PF_NIC)
},
{
- .vendor_id = PCI_VENDOR_ID_NETRONOME,
- .device_id = PCI_DEVICE_ID_NFP6000_VF_NIC,
- .subsystem_vendor_id = PCI_ANY_ID,
- .subsystem_device_id = PCI_ANY_ID,
+ RTE_PCI_DEVICE(PCI_VENDOR_ID_NETRONOME,
+ PCI_DEVICE_ID_NFP6000_VF_NIC)
},
{
.vendor_id = 0,
--
2.1.4
Nelio Laranjeiro
2016-06-22 09:05:32 UTC
Permalink
Except for the first time when memory registration occurs, the lkey is
always cached. Since memory registration is slow and performs system calls,
performance can be improved by moving that code to its own function outside
of the data path so only the lookup code is left in the original inlined
function.

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/Makefile | 1 +
drivers/net/mlx5/mlx5_mr.c | 277 +++++++++++++++++++++++++++++++++++++++++++
drivers/net/mlx5/mlx5_rxtx.c | 209 ++------------------------------
drivers/net/mlx5/mlx5_rxtx.h | 8 +-
4 files changed, 295 insertions(+), 200 deletions(-)
create mode 100644 drivers/net/mlx5/mlx5_mr.c

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 82558aa..999ada5 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -47,6 +47,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_vlan.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_stats.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rss.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_fdir.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mr.c

# Dependencies.
DEPDIRS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += lib/librte_ether
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
new file mode 100644
index 0000000..7c3e87f
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -0,0 +1,277 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2016 6WIND S.A.
+ * Copyright 2016 Mellanox.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-pedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-pedantic"
+#endif
+
+/* DPDK headers don't like -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-pedantic"
+#endif
+#include <rte_mempool.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-pedantic"
+#endif
+
+#include "mlx5.h"
+#include "mlx5_rxtx.h"
+
+struct mlx5_check_mempool_data {
+ int ret;
+ char *start;
+ char *end;
+};
+
+/* Called by mlx5_check_mempool() when iterating the memory chunks. */
+static void mlx5_check_mempool_cb(struct rte_mempool *mp,
+ void *opaque, struct rte_mempool_memhdr *memhdr,
+ unsigned mem_idx)
+{
+ struct mlx5_check_mempool_data *data = opaque;
+
+ (void)mp;
+ (void)mem_idx;
+
+ /* It already failed, skip the next chunks. */
+ if (data->ret != 0)
+ return;
+ /* It is the first chunk. */
+ if (data->start == NULL && data->end == NULL) {
+ data->start = memhdr->addr;
+ data->end = data->start + memhdr->len;
+ return;
+ }
+ if (data->end == memhdr->addr) {
+ data->end += memhdr->len;
+ return;
+ }
+ if (data->start == (char *)memhdr->addr + memhdr->len) {
+ data->start -= memhdr->len;
+ return;
+ }
+ /* Error, mempool is not virtually contiguous. */
+ data->ret = -1;
+}
+
+/**
+ * Check if a mempool can be used: it must be virtually contiguous.
+ *
+ * @param[in] mp
+ * Pointer to memory pool.
+ * @param[out] start
+ * Pointer to the start address of the mempool virtual memory area
+ * @param[out] end
+ * Pointer to the end address of the mempool virtual memory area
+ *
+ * @return
+ * 0 on success (mempool is virtually contiguous), -1 on error.
+ */
+static int mlx5_check_mempool(struct rte_mempool *mp, uintptr_t *start,
+ uintptr_t *end)
+{
+ struct mlx5_check_mempool_data data;
+
+ memset(&data, 0, sizeof(data));
+ rte_mempool_mem_iter(mp, mlx5_check_mempool_cb, &data);
+ *start = (uintptr_t)data.start;
+ *end = (uintptr_t)data.end;
+
+ return data.ret;
+}
+
+/**
+ * Register mempool as a memory region.
+ *
+ * @param pd
+ * Pointer to protection domain.
+ * @param mp
+ * Pointer to memory pool.
+ *
+ * @return
+ * Memory region pointer, NULL in case of error.
+ */
+struct ibv_mr *
+mlx5_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
+{
+ const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+ uintptr_t start;
+ uintptr_t end;
+ unsigned int i;
+
+ if (mlx5_check_mempool(mp, &start, &end) != 0) {
+ ERROR("mempool %p: not virtually contiguous",
+ (void *)mp);
+ return NULL;
+ }
+
+ DEBUG("mempool %p area start=%p end=%p size=%zu",
+ (void *)mp, (void *)start, (void *)end,
+ (size_t)(end - start));
+ /* Round start and end to page boundary if found in memory segments. */
+ for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
+ uintptr_t addr = (uintptr_t)ms[i].addr;
+ size_t len = ms[i].len;
+ unsigned int align = ms[i].hugepage_sz;
+
+ if ((start > addr) && (start < addr + len))
+ start = RTE_ALIGN_FLOOR(start, align);
+ if ((end > addr) && (end < addr + len))
+ end = RTE_ALIGN_CEIL(end, align);
+ }
+ DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
+ (void *)mp, (void *)start, (void *)end,
+ (size_t)(end - start));
+ return ibv_reg_mr(pd,
+ (void *)start,
+ end - start,
+ IBV_ACCESS_LOCAL_WRITE);
+}
+
+/**
+ * Register a Memory Region (MR) <-> Memory Pool (MP) association in
+ * txq->mp2mr[]. If mp2mr[] is full, remove an entry first.
+ *
+ * This function should only be called by txq_mp2mr().
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param[in] mp
+ * Memory Pool for which a Memory Region lkey must be returned.
+ * @param idx
+ * Index of the next available entry.
+ *
+ * @return
+ * mr->lkey on success, (uint32_t)-1 on failure.
+ */
+uint32_t
+txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx)
+{
+ struct ibv_mr *mr;
+
+ /* Add a new entry, register MR first. */
+ DEBUG("%p: discovered new memory pool \"%s\" (%p)",
+ (void *)txq, mp->name, (void *)mp);
+ mr = mlx5_mp2mr(txq->priv->pd, mp);
+ if (unlikely(mr == NULL)) {
+ DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
+ (void *)txq);
+ return (uint32_t)-1;
+ }
+ if (unlikely(idx == RTE_DIM(txq->mp2mr))) {
+ /* Table is full, remove oldest entry. */
+ DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
+ (void *)txq);
+ --idx;
+ claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
+ memmove(&txq->mp2mr[0], &txq->mp2mr[1],
+ (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
+ }
+ /* Store the new entry. */
+ txq->mp2mr[idx].mp = mp;
+ txq->mp2mr[idx].mr = mr;
+ txq->mp2mr[idx].lkey = mr->lkey;
+ DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
+ (void *)txq, mp->name, (void *)mp, txq->mp2mr[idx].lkey);
+ return txq->mp2mr[idx].lkey;
+}
+
+struct txq_mp2mr_mbuf_check_data {
+ int ret;
+};
+
+/**
+ * Callback function for rte_mempool_obj_iter() to check whether a given
+ * mempool object looks like a mbuf.
+ *
+ * @param[in] mp
+ * The mempool pointer
+ * @param[in] arg
+ * Context data (struct txq_mp2mr_mbuf_check_data). Contains the
+ * return value.
+ * @param[in] obj
+ * Object address.
+ * @param index
+ * Object index, unused.
+ */
+static void
+txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj,
+ uint32_t index __rte_unused)
+{
+ struct txq_mp2mr_mbuf_check_data *data = arg;
+ struct rte_mbuf *buf = obj;
+
+ /* Check whether mbuf structure fits element size and whether mempool
+ * pointer is valid. */
+ if (sizeof(*buf) > mp->elt_size || buf->pool != mp)
+ data->ret = -1;
+}
+
+/**
+ * Iterator function for rte_mempool_walk() to register existing mempools and
+ * fill the MP to MR cache of a TX queue.
+ *
+ * @param[in] mp
+ * Memory Pool to register.
+ * @param *arg
+ * Pointer to TX queue structure.
+ */
+void
+txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
+{
+ struct txq *txq = arg;
+ struct txq_mp2mr_mbuf_check_data data = {
+ .ret = 0,
+ };
+ unsigned int i;
+
+ /* Register mempool only if the first element looks like a mbuf. */
+ if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 ||
+ data.ret == -1)
+ return;
+ for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
+ if (unlikely(txq->mp2mr[i].mp == NULL)) {
+ /* Unknown MP, add a new MR for it. */
+ break;
+ }
+ if (txq->mp2mr[i].mp == mp)
+ return;
+ }
+ txq_mp2mr_reg(txq, mp, i);
+}
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 9cb1dfa..616cf7a 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -140,121 +140,6 @@ txq_complete(struct txq *txq)
return 0;
}

-struct mlx5_check_mempool_data {
- int ret;
- char *start;
- char *end;
-};
-
-/* Called by mlx5_check_mempool() when iterating the memory chunks. */
-static void mlx5_check_mempool_cb(struct rte_mempool *mp,
- void *opaque, struct rte_mempool_memhdr *memhdr,
- unsigned mem_idx)
-{
- struct mlx5_check_mempool_data *data = opaque;
-
- (void)mp;
- (void)mem_idx;
-
- /* It already failed, skip the next chunks. */
- if (data->ret != 0)
- return;
- /* It is the first chunk. */
- if (data->start == NULL && data->end == NULL) {
- data->start = memhdr->addr;
- data->end = data->start + memhdr->len;
- return;
- }
- if (data->end == memhdr->addr) {
- data->end += memhdr->len;
- return;
- }
- if (data->start == (char *)memhdr->addr + memhdr->len) {
- data->start -= memhdr->len;
- return;
- }
- /* Error, mempool is not virtually contigous. */
- data->ret = -1;
-}
-
-/**
- * Check if a mempool can be used: it must be virtually contiguous.
- *
- * @param[in] mp
- * Pointer to memory pool.
- * @param[out] start
- * Pointer to the start address of the mempool virtual memory area
- * @param[out] end
- * Pointer to the end address of the mempool virtual memory area
- *
- * @return
- * 0 on success (mempool is virtually contiguous), -1 on error.
- */
-static int mlx5_check_mempool(struct rte_mempool *mp, uintptr_t *start,
- uintptr_t *end)
-{
- struct mlx5_check_mempool_data data;
-
- memset(&data, 0, sizeof(data));
- rte_mempool_mem_iter(mp, mlx5_check_mempool_cb, &data);
- *start = (uintptr_t)data.start;
- *end = (uintptr_t)data.end;
-
- return data.ret;
-}
-
-/* For best performance, this function should not be inlined. */
-struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, struct rte_mempool *)
- __attribute__((noinline));
-
-/**
- * Register mempool as a memory region.
- *
- * @param pd
- * Pointer to protection domain.
- * @param mp
- * Pointer to memory pool.
- *
- * @return
- * Memory region pointer, NULL in case of error.
- */
-struct ibv_mr *
-mlx5_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
-{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
- uintptr_t start;
- uintptr_t end;
- unsigned int i;
-
- if (mlx5_check_mempool(mp, &start, &end) != 0) {
- ERROR("mempool %p: not virtually contiguous",
- (void *)mp);
- return NULL;
- }
-
- DEBUG("mempool %p area start=%p end=%p size=%zu",
- (void *)mp, (void *)start, (void *)end,
- (size_t)(end - start));
- /* Round start and end to page boundary if found in memory segments. */
- for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
- uintptr_t addr = (uintptr_t)ms[i].addr;
- size_t len = ms[i].len;
- unsigned int align = ms[i].hugepage_sz;
-
- if ((start > addr) && (start < addr + len))
- start = RTE_ALIGN_FLOOR(start, align);
- if ((end > addr) && (end < addr + len))
- end = RTE_ALIGN_CEIL(end, align);
- }
- DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
- (void *)mp, (void *)start, (void *)end,
- (size_t)(end - start));
- return ibv_reg_mr(pd,
- (void *)start,
- end - start,
- IBV_ACCESS_LOCAL_WRITE);
-}
-
/**
* Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
* the cloned mbuf is allocated is returned instead.
@@ -273,6 +158,10 @@ txq_mb2mp(struct rte_mbuf *buf)
return buf->pool;
}

+static inline uint32_t
+txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+ __attribute__((always_inline));
+
/**
* Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
* Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
@@ -286,11 +175,11 @@ txq_mb2mp(struct rte_mbuf *buf)
* @return
* mr->lkey on success, (uint32_t)-1 on failure.
*/
-static uint32_t
+static inline uint32_t
txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
{
unsigned int i;
- struct ibv_mr *mr;
+ uint32_t lkey = (uint32_t)-1;

for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
if (unlikely(txq->mp2mr[i].mp == NULL)) {
@@ -300,89 +189,13 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
if (txq->mp2mr[i].mp == mp) {
assert(txq->mp2mr[i].lkey != (uint32_t)-1);
assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
- return txq->mp2mr[i].lkey;
+ lkey = txq->mp2mr[i].lkey;
+ break;
}
}
- /* Add a new entry, register MR first. */
- DEBUG("%p: discovered new memory pool \"%s\" (%p)",
- (void *)txq, mp->name, (void *)mp);
- mr = mlx5_mp2mr(txq->priv->pd, mp);
- if (unlikely(mr == NULL)) {
- DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
- (void *)txq);
- return (uint32_t)-1;
- }
- if (unlikely(i == RTE_DIM(txq->mp2mr))) {
- /* Table is full, remove oldest entry. */
- DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
- (void *)txq);
- --i;
- claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
- memmove(&txq->mp2mr[0], &txq->mp2mr[1],
- (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
- }
- /* Store the new entry. */
- txq->mp2mr[i].mp = mp;
- txq->mp2mr[i].mr = mr;
- txq->mp2mr[i].lkey = mr->lkey;
- DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
- (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
- return txq->mp2mr[i].lkey;
-}
-
-struct txq_mp2mr_mbuf_check_data {
- int ret;
-};
-
-/**
- * Callback function for rte_mempool_obj_iter() to check whether a given
- * mempool object looks like a mbuf.
- *
- * @param[in] mp
- * The mempool pointer
- * @param[in] arg
- * Context data (struct txq_mp2mr_mbuf_check_data). Contains the
- * return value.
- * @param[in] obj
- * Object address.
- * @param index
- * Object index, unused.
- */
-static void
-txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj,
- uint32_t index __rte_unused)
-{
- struct txq_mp2mr_mbuf_check_data *data = arg;
- struct rte_mbuf *buf = obj;
-
- /* Check whether mbuf structure fits element size and whether mempool
- * pointer is valid. */
- if (sizeof(*buf) > mp->elt_size || buf->pool != mp)
- data->ret = -1;
-}
-
-/**
- * Iterator function for rte_mempool_walk() to register existing mempools and
- * fill the MP to MR cache of a TX queue.
- *
- * @param[in] mp
- * Memory Pool to register.
- * @param *arg
- * Pointer to TX queue structure.
- */
-void
-txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
-{
- struct txq *txq = arg;
- struct txq_mp2mr_mbuf_check_data data = {
- .ret = 0,
- };
-
- /* Register mempool only if the first element looks like a mbuf. */
- if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 ||
- data.ret == -1)
- return;
- txq_mp2mr(txq, mp);
+ if (unlikely(lkey == (uint32_t)-1))
+ lkey = txq_mp2mr_reg(txq, mp, i);
+ return lkey;
}

/**
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 47f6299..462eddf 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -337,12 +337,16 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);

/* mlx5_rxtx.c */

-struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, struct rte_mempool *);
-void txq_mp2mr_iter(struct rte_mempool *, void *);
uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_rx_burst_sp(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);

+/* mlx5_mr.c */
+
+struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, struct rte_mempool *);
+void txq_mp2mr_iter(struct rte_mempool *, void *);
+uint32_t txq_mp2mr_reg(struct txq *, struct rte_mempool *, unsigned int);
+
#endif /* RTE_PMD_MLX5_RXTX_H_ */
--
2.1.4
Nelio Laranjeiro
2016-06-22 09:05:33 UTC
Permalink
This is done in preparation for bypassing Verbs entirely for the data path
as a performance improvement. TX gather cannot be maintained during the
transition and will be reimplemented later.

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_ethdev.c | 2 +-
drivers/net/mlx5/mlx5_rxtx.c | 315 ++++++++---------------------------------
drivers/net/mlx5/mlx5_rxtx.h | 17 ---
drivers/net/mlx5/mlx5_txq.c | 49 ++-----
4 files changed, 69 insertions(+), 314 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 0a881b6..280a90a 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1260,7 +1260,7 @@ mlx5_secondary_data_setup(struct priv *priv)
if (txq != NULL) {
if (txq_setup(priv->dev,
txq,
- primary_txq->elts_n * MLX5_PMD_SGE_WR_N,
+ primary_txq->elts_n,
primary_txq->socket,
NULL) == 0) {
txq->stats.idx = primary_txq->stats.idx;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 616cf7a..6e184c3 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -228,156 +228,6 @@ insert_vlan_sw(struct rte_mbuf *buf)
return 0;
}

-#if MLX5_PMD_SGE_WR_N > 1
-
-/**
- * Copy scattered mbuf contents to a single linear buffer.
- *
- * @param[out] linear
- * Linear output buffer.
- * @param[in] buf
- * Scattered input buffer.
- *
- * @return
- * Number of bytes copied to the output buffer or 0 if not large enough.
- */
-static unsigned int
-linearize_mbuf(linear_t *linear, struct rte_mbuf *buf)
-{
- unsigned int size = 0;
- unsigned int offset;
-
- do {
- unsigned int len = DATA_LEN(buf);
-
- offset = size;
- size += len;
- if (unlikely(size > sizeof(*linear)))
- return 0;
- memcpy(&(*linear)[offset],
- rte_pktmbuf_mtod(buf, uint8_t *),
- len);
- buf = NEXT(buf);
- } while (buf != NULL);
- return size;
-}
-
-/**
- * Handle scattered buffers for mlx5_tx_burst().
- *
- * @param txq
- * TX queue structure.
- * @param segs
- * Number of segments in buf.
- * @param elt
- * TX queue element to fill.
- * @param[in] buf
- * Buffer to process.
- * @param elts_head
- * Index of the linear buffer to use if necessary (normally txq->elts_head).
- * @param[out] sges
- * Array filled with SGEs on success.
- *
- * @return
- * A structure containing the processed packet size in bytes and the
- * number of SGEs. Both fields are set to (unsigned int)-1 in case of
- * failure.
- */
-static struct tx_burst_sg_ret {
- unsigned int length;
- unsigned int num;
-}
-tx_burst_sg(struct txq *txq, unsigned int segs, struct txq_elt *elt,
- struct rte_mbuf *buf, unsigned int elts_head,
- struct ibv_sge (*sges)[MLX5_PMD_SGE_WR_N])
-{
- unsigned int sent_size = 0;
- unsigned int j;
- int linearize = 0;
-
- /* When there are too many segments, extra segments are
- * linearized in the last SGE. */
- if (unlikely(segs > RTE_DIM(*sges))) {
- segs = (RTE_DIM(*sges) - 1);
- linearize = 1;
- }
- /* Update element. */
- elt->buf = buf;
- /* Register segments as SGEs. */
- for (j = 0; (j != segs); ++j) {
- struct ibv_sge *sge = &(*sges)[j];
- uint32_t lkey;
-
- /* Retrieve Memory Region key for this memory pool. */
- lkey = txq_mp2mr(txq, txq_mb2mp(buf));
- if (unlikely(lkey == (uint32_t)-1)) {
- /* MR does not exist. */
- DEBUG("%p: unable to get MP <-> MR association",
- (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
- /* Update SGE. */
- sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
- if (txq->priv->sriov)
- rte_prefetch0((volatile void *)
- (uintptr_t)sge->addr);
- sge->length = DATA_LEN(buf);
- sge->lkey = lkey;
- sent_size += sge->length;
- buf = NEXT(buf);
- }
- /* If buf is not NULL here and is not going to be linearized,
- * nb_segs is not valid. */
- assert(j == segs);
- assert((buf == NULL) || (linearize));
- /* Linearize extra segments. */
- if (linearize) {
- struct ibv_sge *sge = &(*sges)[segs];
- linear_t *linear = &(*txq->elts_linear)[elts_head];
- unsigned int size = linearize_mbuf(linear, buf);
-
- assert(segs == (RTE_DIM(*sges) - 1));
- if (size == 0) {
- /* Invalid packet. */
- DEBUG("%p: packet too large to be linearized.",
- (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
- /* If MLX5_PMD_SGE_WR_N is 1, free mbuf immediately. */
- if (RTE_DIM(*sges) == 1) {
- do {
- struct rte_mbuf *next = NEXT(buf);
-
- rte_pktmbuf_free_seg(buf);
- buf = next;
- } while (buf != NULL);
- elt->buf = NULL;
- }
- /* Update SGE. */
- sge->addr = (uintptr_t)&(*linear)[0];
- sge->length = size;
- sge->lkey = txq->mr_linear->lkey;
- sent_size += size;
- /* Include last segment. */
- segs++;
- }
- return (struct tx_burst_sg_ret){
- .length = sent_size,
- .num = segs,
- };
-stop:
- return (struct tx_burst_sg_ret){
- .length = -1,
- .num = -1,
- };
-}
-
-#endif /* MLX5_PMD_SGE_WR_N > 1 */
-
/**
* DPDK callback for TX.
*
@@ -424,14 +274,14 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
unsigned int elts_head_next =
(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
struct txq_elt *elt = &(*txq->elts)[elts_head];
- unsigned int segs = NB_SEGS(buf);
-#ifdef MLX5_PMD_SOFT_COUNTERS
- unsigned int sent_size = 0;
-#endif
uint32_t send_flags = 0;
#ifdef HAVE_VERBS_VLAN_INSERTION
int insert_vlan = 0;
#endif /* HAVE_VERBS_VLAN_INSERTION */
+ uintptr_t addr;
+ uint32_t length;
+ uint32_t lkey;
+ uintptr_t buf_next_addr;

if (i + 1 < max)
rte_prefetch0(buf_next);
@@ -464,126 +314,81 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
goto stop;
}
}
- if (likely(segs == 1)) {
- uintptr_t addr;
- uint32_t length;
- uint32_t lkey;
- uintptr_t buf_next_addr;
-
- /* Retrieve buffer information. */
- addr = rte_pktmbuf_mtod(buf, uintptr_t);
- length = DATA_LEN(buf);
- /* Update element. */
- elt->buf = buf;
- if (txq->priv->sriov)
- rte_prefetch0((volatile void *)
- (uintptr_t)addr);
- /* Prefetch next buffer data. */
- if (i + 1 < max) {
- buf_next_addr =
- rte_pktmbuf_mtod(buf_next, uintptr_t);
- rte_prefetch0((volatile void *)
- (uintptr_t)buf_next_addr);
- }
- /* Put packet into send queue. */
+ /* Retrieve buffer information. */
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ length = DATA_LEN(buf);
+ /* Update element. */
+ elt->buf = buf;
+ if (txq->priv->sriov)
+ rte_prefetch0((volatile void *)
+ (uintptr_t)addr);
+ /* Prefetch next buffer data. */
+ if (i + 1 < max) {
+ buf_next_addr =
+ rte_pktmbuf_mtod(buf_next, uintptr_t);
+ rte_prefetch0((volatile void *)
+ (uintptr_t)buf_next_addr);
+ }
+ /* Put packet into send queue. */
#if MLX5_PMD_MAX_INLINE > 0
- if (length <= txq->max_inline) {
+ if (length <= txq->max_inline) {
#ifdef HAVE_VERBS_VLAN_INSERTION
- if (insert_vlan)
- err = txq->send_pending_inline_vlan
- (txq->qp,
- (void *)addr,
- length,
- send_flags,
- &buf->vlan_tci);
- else
-#endif /* HAVE_VERBS_VLAN_INSERTION */
- err = txq->send_pending_inline
- (txq->qp,
- (void *)addr,
- length,
- send_flags);
- } else
-#endif
- {
- /* Retrieve Memory Region key for this
- * memory pool. */
- lkey = txq_mp2mr(txq, txq_mb2mp(buf));
- if (unlikely(lkey == (uint32_t)-1)) {
- /* MR does not exist. */
- DEBUG("%p: unable to get MP <-> MR"
- " association", (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
-#ifdef HAVE_VERBS_VLAN_INSERTION
- if (insert_vlan)
- err = txq->send_pending_vlan
- (txq->qp,
- addr,
- length,
- lkey,
- send_flags,
- &buf->vlan_tci);
- else
+ if (insert_vlan)
+ err = txq->send_pending_inline_vlan
+ (txq->qp,
+ (void *)addr,
+ length,
+ send_flags,
+ &buf->vlan_tci);
+ else
#endif /* HAVE_VERBS_VLAN_INSERTION */
- err = txq->send_pending
- (txq->qp,
- addr,
- length,
- lkey,
- send_flags);
- }
- if (unlikely(err))
- goto stop;
-#ifdef MLX5_PMD_SOFT_COUNTERS
- sent_size += length;
+ err = txq->send_pending_inline
+ (txq->qp,
+ (void *)addr,
+ length,
+ send_flags);
+ } else
#endif
- } else {
-#if MLX5_PMD_SGE_WR_N > 1
- struct ibv_sge sges[MLX5_PMD_SGE_WR_N];
- struct tx_burst_sg_ret ret;
-
- ret = tx_burst_sg(txq, segs, elt, buf, elts_head,
- &sges);
- if (ret.length == (unsigned int)-1)
+ {
+ /* Retrieve Memory Region key for this
+ * memory pool. */
+ lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+ if (unlikely(lkey == (uint32_t)-1)) {
+ /* MR does not exist. */
+ DEBUG("%p: unable to get MP <-> MR"
+ " association", (void *)txq);
+ /* Clean up TX element. */
+ elt->buf = NULL;
goto stop;
- /* Put SG list into send queue. */
+ }
#ifdef HAVE_VERBS_VLAN_INSERTION
if (insert_vlan)
- err = txq->send_pending_sg_list_vlan
+ err = txq->send_pending_vlan
(txq->qp,
- sges,
- ret.num,
+ addr,
+ length,
+ lkey,
send_flags,
&buf->vlan_tci);
else
#endif /* HAVE_VERBS_VLAN_INSERTION */
- err = txq->send_pending_sg_list
+ err = txq->send_pending
(txq->qp,
- sges,
- ret.num,
+ addr,
+ length,
+ lkey,
send_flags);
- if (unlikely(err))
- goto stop;
-#ifdef MLX5_PMD_SOFT_COUNTERS
- sent_size += ret.length;
-#endif
-#else /* MLX5_PMD_SGE_WR_N > 1 */
- DEBUG("%p: TX scattered buffers support not"
- " compiled in", (void *)txq);
- goto stop;
-#endif /* MLX5_PMD_SGE_WR_N > 1 */
}
- elts_head = elts_head_next;
- buf = buf_next;
+ if (unlikely(err))
+ goto stop;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent bytes counter. */
- txq->stats.obytes += sent_size;
+ txq->stats.obytes += length;
#endif
- }
stop:
+ elts_head = elts_head_next;
+ buf = buf_next;
+ }
/* Take a shortcut if nothing must be sent. */
if (unlikely(i == 0))
return 0;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 462eddf..8358ccb 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -242,14 +242,6 @@ struct txq_elt {
struct rte_mbuf *buf;
};

-/* Linear buffer type. It is used when transmitting buffers with too many
- * segments that do not fit the hardware queue (see max_send_sge).
- * Extra segments are copied (linearized) in such buffers, replacing the
- * last SGE during TX.
- * The size is arbitrary but large enough to hold a jumbo frame with
- * 8 segments considering mbuf.buf_len is about 2048 bytes. */
-typedef uint8_t linear_t[16384];
-
/* TX queue descriptor. */
struct txq {
struct priv *priv; /* Back pointer to private data. */
@@ -264,12 +256,6 @@ struct txq {
int (*send_pending_inline_vlan)();
#endif
#endif
-#if MLX5_PMD_SGE_WR_N > 1
- int (*send_pending_sg_list)();
-#ifdef HAVE_VERBS_VLAN_INSERTION
- int (*send_pending_sg_list_vlan)();
-#endif
-#endif
int (*send_flush)(struct ibv_qp *qp);
struct ibv_cq *cq; /* Completion Queue. */
struct ibv_qp *qp; /* Queue Pair. */
@@ -289,9 +275,6 @@ struct txq {
uint32_t lkey; /* mr->lkey */
} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
struct mlx5_txq_stats stats; /* TX queue counters. */
- /* Elements used only for init part are here. */
- linear_t (*elts_linear)[]; /* Linearized buffers. */
- struct ibv_mr *mr_linear; /* Memory Region for linearized buffers. */
#ifdef HAVE_VERBS_VLAN_INSERTION
struct ibv_exp_qp_burst_family_v1 *if_qp; /* QP burst interface. */
#else
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index e20df21..5a248c9 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -82,26 +82,13 @@ txq_alloc_elts(struct txq *txq, unsigned int elts_n)
unsigned int i;
struct txq_elt (*elts)[elts_n] =
rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq->socket);
- linear_t (*elts_linear)[elts_n] =
- rte_calloc_socket("TXQ", 1, sizeof(*elts_linear), 0,
- txq->socket);
- struct ibv_mr *mr_linear = NULL;
int ret = 0;

- if ((elts == NULL) || (elts_linear == NULL)) {
+ if (elts == NULL) {
ERROR("%p: can't allocate packets array", (void *)txq);
ret = ENOMEM;
goto error;
}
- mr_linear =
- ibv_reg_mr(txq->priv->pd, elts_linear, sizeof(*elts_linear),
- IBV_ACCESS_LOCAL_WRITE);
- if (mr_linear == NULL) {
- ERROR("%p: unable to configure MR, ibv_reg_mr() failed",
- (void *)txq);
- ret = EINVAL;
- goto error;
- }
for (i = 0; (i != elts_n); ++i) {
struct txq_elt *elt = &(*elts)[i];

@@ -119,15 +106,9 @@ txq_alloc_elts(struct txq *txq, unsigned int elts_n)
((MLX5_PMD_TX_PER_COMP_REQ < (elts_n / 4)) ?
MLX5_PMD_TX_PER_COMP_REQ : (elts_n / 4));
txq->elts_comp_cd = txq->elts_comp_cd_init;
- txq->elts_linear = elts_linear;
- txq->mr_linear = mr_linear;
assert(ret == 0);
return 0;
error:
- if (mr_linear != NULL)
- claim_zero(ibv_dereg_mr(mr_linear));
-
- rte_free(elts_linear);
rte_free(elts);

DEBUG("%p: failed, freed everything", (void *)txq);
@@ -148,8 +129,6 @@ txq_free_elts(struct txq *txq)
unsigned int elts_head = txq->elts_head;
unsigned int elts_tail = txq->elts_tail;
struct txq_elt (*elts)[elts_n] = txq->elts;
- linear_t (*elts_linear)[elts_n] = txq->elts_linear;
- struct ibv_mr *mr_linear = txq->mr_linear;

DEBUG("%p: freeing WRs", (void *)txq);
txq->elts_n = 0;
@@ -159,12 +138,7 @@ txq_free_elts(struct txq *txq)
txq->elts_comp_cd = 0;
txq->elts_comp_cd_init = 0;
txq->elts = NULL;
- txq->elts_linear = NULL;
- txq->mr_linear = NULL;
- if (mr_linear != NULL)
- claim_zero(ibv_dereg_mr(mr_linear));

- rte_free(elts_linear);
if (elts == NULL)
return;
while (elts_tail != elts_head) {
@@ -286,12 +260,14 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
int ret = 0;

(void)conf; /* Thresholds configuration (ignored). */
- if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
- ERROR("%p: invalid number of TX descriptors (must be a"
- " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
+ if (desc == 0) {
+ ERROR("%p: invalid number of TX descriptors", (void *)dev);
+ return EINVAL;
+ }
+ if (MLX5_PMD_SGE_WR_N > 1) {
+ ERROR("%p: TX gather is not supported", (void *)dev);
return EINVAL;
}
- desc /= MLX5_PMD_SGE_WR_N;
/* MRs will be registered in mp2mr[] later. */
attr.rd = (struct ibv_exp_res_domain_init_attr){
.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
@@ -332,10 +308,7 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
priv->device_attr.max_qp_wr :
desc),
/* Max number of scatter/gather elements in a WR. */
- .max_send_sge = ((priv->device_attr.max_sge <
- MLX5_PMD_SGE_WR_N) ?
- priv->device_attr.max_sge :
- MLX5_PMD_SGE_WR_N),
+ .max_send_sge = 1,
#if MLX5_PMD_MAX_INLINE > 0
.max_inline_data = MLX5_PMD_MAX_INLINE,
#endif
@@ -440,12 +413,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
txq->send_pending_inline_vlan = txq->if_qp->send_pending_inline_vlan;
#endif
#endif
-#if MLX5_PMD_SGE_WR_N > 1
- txq->send_pending_sg_list = txq->if_qp->send_pending_sg_list;
-#ifdef HAVE_VERBS_VLAN_INSERTION
- txq->send_pending_sg_list_vlan = txq->if_qp->send_pending_sg_list_vlan;
-#endif
-#endif
txq->send_pending = txq->if_qp->send_pending;
#ifdef HAVE_VERBS_VLAN_INSERTION
txq->send_pending_vlan = txq->if_qp->send_pending_vlan;
--
2.1.4
Nelio Laranjeiro
2016-06-22 09:05:34 UTC
Permalink
This is done in preparation for bypassing Verbs entirely for the data path
as a performance improvement. RX scatter cannot be maintained during the
transition and will be reimplemented later.

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
drivers/net/mlx5/mlx5_ethdev.c | 31 +---
drivers/net/mlx5/mlx5_rxq.c | 314 ++++++-----------------------------------
drivers/net/mlx5/mlx5_rxtx.c | 211 +--------------------------
drivers/net/mlx5/mlx5_rxtx.h | 13 +-
4 files changed, 53 insertions(+), 516 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 280a90a..ca57021 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -623,8 +623,7 @@ mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)

};

- if (dev->rx_pkt_burst == mlx5_rx_burst ||
- dev->rx_pkt_burst == mlx5_rx_burst_sp)
+ if (dev->rx_pkt_burst == mlx5_rx_burst)
return ptypes;
return NULL;
}
@@ -762,19 +761,11 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
mb_len = rte_pktmbuf_data_room_size(rxq->mp);
assert(mb_len >= RTE_PKTMBUF_HEADROOM);
sp = (max_frame_len > (mb_len - RTE_PKTMBUF_HEADROOM));
- /* Provide new values to rxq_setup(). */
- dev->data->dev_conf.rxmode.jumbo_frame = sp;
- dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len;
- ret = rxq_rehash(dev, rxq);
- if (ret) {
- /* Force SP RX if that queue requires it and abort. */
- if (rxq->sp)
- rx_func = mlx5_rx_burst_sp;
- break;
+ if (sp) {
+ ERROR("%p: RX scatter is not supported", (void *)dev);
+ ret = ENOTSUP;
+ goto out;
}
- /* Scattered burst function takes priority. */
- if (rxq->sp)
- rx_func = mlx5_rx_burst_sp;
}
/* Burst functions can now be called again. */
rte_wmb();
@@ -1103,22 +1094,12 @@ priv_set_link(struct priv *priv, int up)
{
struct rte_eth_dev *dev = priv->dev;
int err;
- unsigned int i;

if (up) {
err = priv_set_flags(priv, ~IFF_UP, IFF_UP);
if (err)
return err;
- for (i = 0; i < priv->rxqs_n; i++)
- if ((*priv->rxqs)[i]->sp)
- break;
- /* Check if an sp queue exists.
- * Note: Some old frames might be received.
- */
- if (i == priv->rxqs_n)
- dev->rx_pkt_burst = mlx5_rx_burst;
- else
- dev->rx_pkt_burst = mlx5_rx_burst_sp;
+ dev->rx_pkt_burst = mlx5_rx_burst;
dev->tx_pkt_burst = mlx5_tx_burst;
} else {
err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP);
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 0bcf55b..38ff9fd 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -634,145 +634,6 @@ priv_rehash_flows(struct priv *priv)
}

/**
- * Allocate RX queue elements with scattered packets support.
- *
- * @param rxq
- * Pointer to RX queue structure.
- * @param elts_n
- * Number of elements to allocate.
- * @param[in] pool
- * If not NULL, fetch buffers from this array instead of allocating them
- * with rte_pktmbuf_alloc().
- *
- * @return
- * 0 on success, errno value on failure.
- */
-static int
-rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n,
- struct rte_mbuf **pool)
-{
- unsigned int i;
- struct rxq_elt_sp (*elts)[elts_n] =
- rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
- rxq->socket);
- int ret = 0;
-
- if (elts == NULL) {
- ERROR("%p: can't allocate packets array", (void *)rxq);
- ret = ENOMEM;
- goto error;
- }
- /* For each WR (packet). */
- for (i = 0; (i != elts_n); ++i) {
- unsigned int j;
- struct rxq_elt_sp *elt = &(*elts)[i];
- struct ibv_sge (*sges)[RTE_DIM(elt->sges)] = &elt->sges;
-
- /* These two arrays must have the same size. */
- assert(RTE_DIM(elt->sges) == RTE_DIM(elt->bufs));
- /* For each SGE (segment). */
- for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
- struct ibv_sge *sge = &(*sges)[j];
- struct rte_mbuf *buf;
-
- if (pool != NULL) {
- buf = *(pool++);
- assert(buf != NULL);
- rte_pktmbuf_reset(buf);
- } else
- buf = rte_pktmbuf_alloc(rxq->mp);
- if (buf == NULL) {
- assert(pool == NULL);
- ERROR("%p: empty mbuf pool", (void *)rxq);
- ret = ENOMEM;
- goto error;
- }
- elt->bufs[j] = buf;
- /* Headroom is reserved by rte_pktmbuf_alloc(). */
- assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
- /* Buffer is supposed to be empty. */
- assert(rte_pktmbuf_data_len(buf) == 0);
- assert(rte_pktmbuf_pkt_len(buf) == 0);
- /* sge->addr must be able to store a pointer. */
- assert(sizeof(sge->addr) >= sizeof(uintptr_t));
- if (j == 0) {
- /* The first SGE keeps its headroom. */
- sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
- sge->length = (buf->buf_len -
- RTE_PKTMBUF_HEADROOM);
- } else {
- /* Subsequent SGEs lose theirs. */
- assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
- SET_DATA_OFF(buf, 0);
- sge->addr = (uintptr_t)buf->buf_addr;
- sge->length = buf->buf_len;
- }
- sge->lkey = rxq->mr->lkey;
- /* Redundant check for tailroom. */
- assert(sge->length == rte_pktmbuf_tailroom(buf));
- }
- }
- DEBUG("%p: allocated and configured %u WRs (%zu segments)",
- (void *)rxq, elts_n, (elts_n * RTE_DIM((*elts)[0].sges)));
- rxq->elts_n = elts_n;
- rxq->elts_head = 0;
- rxq->elts.sp = elts;
- assert(ret == 0);
- return 0;
-error:
- if (elts != NULL) {
- assert(pool == NULL);
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- unsigned int j;
- struct rxq_elt_sp *elt = &(*elts)[i];
-
- for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
- struct rte_mbuf *buf = elt->bufs[j];
-
- if (buf != NULL)
- rte_pktmbuf_free_seg(buf);
- }
- }
- rte_free(elts);
- }
- DEBUG("%p: failed, freed everything", (void *)rxq);
- assert(ret > 0);
- return ret;
-}
-
-/**
- * Free RX queue elements with scattered packets support.
- *
- * @param rxq
- * Pointer to RX queue structure.
- */
-static void
-rxq_free_elts_sp(struct rxq *rxq)
-{
- unsigned int i;
- unsigned int elts_n = rxq->elts_n;
- struct rxq_elt_sp (*elts)[elts_n] = rxq->elts.sp;
-
- DEBUG("%p: freeing WRs", (void *)rxq);
- rxq->elts_n = 0;
- rxq->elts.sp = NULL;
- if (elts == NULL)
- return;
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- unsigned int j;
- struct rxq_elt_sp *elt = &(*elts)[i];
-
- for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
- struct rte_mbuf *buf = elt->bufs[j];
-
- if (buf != NULL)
- rte_pktmbuf_free_seg(buf);
- }
- }
- rte_free(elts);
-}
-
-/**
* Allocate RX queue elements.
*
* @param rxq
@@ -838,7 +699,7 @@ rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
(void *)rxq, elts_n);
rxq->elts_n = elts_n;
rxq->elts_head = 0;
- rxq->elts.no_sp = elts;
+ rxq->elts = elts;
assert(ret == 0);
return 0;
error:
@@ -869,11 +730,11 @@ rxq_free_elts(struct rxq *rxq)
{
unsigned int i;
unsigned int elts_n = rxq->elts_n;
- struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp;
+ struct rxq_elt (*elts)[elts_n] = rxq->elts;

DEBUG("%p: freeing WRs", (void *)rxq);
rxq->elts_n = 0;
- rxq->elts.no_sp = NULL;
+ rxq->elts = NULL;
if (elts == NULL)
return;
for (i = 0; (i != RTE_DIM(*elts)); ++i) {
@@ -900,10 +761,7 @@ rxq_cleanup(struct rxq *rxq)
struct ibv_exp_release_intf_params params;

DEBUG("cleaning up %p", (void *)rxq);
- if (rxq->sp)
- rxq_free_elts_sp(rxq);
- else
- rxq_free_elts(rxq);
+ rxq_free_elts(rxq);
rxq->poll = NULL;
rxq->recv = NULL;
if (rxq->if_wq != NULL) {
@@ -973,12 +831,12 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
struct rte_mbuf **pool;
unsigned int i, k;
struct ibv_exp_wq_attr mod;
- unsigned int mb_len = rte_pktmbuf_data_room_size(rxq->mp);
+ struct rxq_elt (*elts)[tmpl.elts_n];
int err;

DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
/* Number of descriptors and mbufs currently allocated. */
- desc_n = (tmpl.elts_n * (tmpl.sp ? MLX5_PMD_SGE_WR_N : 1));
+ desc_n = tmpl.elts_n;
mbuf_n = desc_n;
/* Toggle RX checksum offload if hardware supports it. */
if (priv->hw_csum) {
@@ -989,22 +847,6 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
rxq->csum_l2tun = tmpl.csum_l2tun;
}
- /* Enable scattered packets support for this queue if necessary. */
- assert(mb_len >= RTE_PKTMBUF_HEADROOM);
- if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
- (dev->data->dev_conf.rxmode.max_rx_pkt_len >
- (mb_len - RTE_PKTMBUF_HEADROOM))) {
- tmpl.sp = 1;
- desc_n /= MLX5_PMD_SGE_WR_N;
- } else
- tmpl.sp = 0;
- DEBUG("%p: %s scattered packets support (%u WRs)",
- (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc_n);
- /* If scatter mode is the same as before, nothing to do. */
- if (tmpl.sp == rxq->sp) {
- DEBUG("%p: nothing to do", (void *)dev);
- return 0;
- }
/* From now on, any failure will render the queue unusable.
* Reinitialize WQ. */
mod = (struct ibv_exp_wq_attr){
@@ -1025,35 +867,18 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
}
/* Snatch mbufs from original queue. */
k = 0;
- if (rxq->sp) {
- struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- struct rxq_elt_sp *elt = &(*elts)[i];
- unsigned int j;
-
- for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
- assert(elt->bufs[j] != NULL);
- pool[k++] = elt->bufs[j];
- }
- }
- } else {
- struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct rte_mbuf *buf = elt->buf;
+ elts = rxq->elts;
+ for (i = 0; (i != RTE_DIM(*elts)); ++i) {
+ struct rxq_elt *elt = &(*elts)[i];
+ struct rte_mbuf *buf = elt->buf;

- pool[k++] = buf;
- }
+ pool[k++] = buf;
}
assert(k == mbuf_n);
tmpl.elts_n = 0;
- tmpl.elts.sp = NULL;
- assert((void *)&tmpl.elts.sp == (void *)&tmpl.elts.no_sp);
- err = ((tmpl.sp) ?
- rxq_alloc_elts_sp(&tmpl, desc_n, pool) :
- rxq_alloc_elts(&tmpl, desc_n, pool));
+ tmpl.elts = NULL;
+ assert((void *)&tmpl.elts == NULL);
+ err = rxq_alloc_elts(&tmpl, desc_n, pool);
if (err) {
ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
rte_free(pool);
@@ -1061,12 +886,11 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
return err;
}
assert(tmpl.elts_n == desc_n);
- assert(tmpl.elts.sp != NULL);
rte_free(pool);
/* Clean up original data. */
rxq->elts_n = 0;
- rte_free(rxq->elts.sp);
- rxq->elts.sp = NULL;
+ rte_free(rxq->elts);
+ rxq->elts = NULL;
/* Change queue state to ready. */
mod = (struct ibv_exp_wq_attr){
.attr_mask = IBV_EXP_WQ_ATTR_STATE,
@@ -1080,28 +904,14 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
}
/* Post SGEs. */
assert(tmpl.if_wq != NULL);
- if (tmpl.sp) {
- struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- err = tmpl.if_wq->recv_sg_list
- (tmpl.wq,
- (*elts)[i].sges,
- RTE_DIM((*elts)[i].sges));
- if (err)
- break;
- }
- } else {
- struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- err = tmpl.if_wq->recv_burst(
- tmpl.wq,
- &(*elts)[i].sge,
- 1);
- if (err)
- break;
- }
+ elts = tmpl.elts;
+ for (i = 0; (i != RTE_DIM(*elts)); ++i) {
+ err = tmpl.if_wq->recv_burst(
+ tmpl.wq,
+ &(*elts)[i].sge,
+ 1);
+ if (err)
+ break;
}
if (err) {
ERROR("%p: failed to post SGEs with error %d",
@@ -1110,10 +920,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
err = EIO;
goto error;
}
- if (tmpl.sp)
- tmpl.recv = tmpl.if_wq->recv_sg_list;
- else
- tmpl.recv = tmpl.if_wq->recv_burst;
+ tmpl.recv = tmpl.if_wq->recv_burst;
error:
*rxq = tmpl;
assert(err >= 0);
@@ -1159,31 +966,26 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
} attr;
enum ibv_exp_query_intf_status status;
unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
+ struct rxq_elt (*elts)[desc];
int ret = 0;
unsigned int i;
unsigned int cq_size = desc;

(void)conf; /* Thresholds configuration (ignored). */
- if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
- ERROR("%p: invalid number of RX descriptors (must be a"
- " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
+ if (desc == 0) {
+ ERROR("%p: invalid number of RX descriptors", (void *)dev);
return EINVAL;
}
+ if (MLX5_PMD_SGE_WR_N > 1) {
+ ERROR("%p: RX scatter is not supported", (void *)dev);
+ return ENOTSUP;
+ }
/* Toggle RX checksum offload if hardware supports it. */
if (priv->hw_csum)
tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
if (priv->hw_csum_l2tun)
tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
- /* Enable scattered packets support for this queue if necessary. */
- assert(mb_len >= RTE_PKTMBUF_HEADROOM);
- if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
- (dev->data->dev_conf.rxmode.max_rx_pkt_len >
- (mb_len - RTE_PKTMBUF_HEADROOM))) {
- tmpl.sp = 1;
- desc /= MLX5_PMD_SGE_WR_N;
- }
- DEBUG("%p: %s scattered packets support (%u WRs)",
- (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
+ (void)mb_len; /* I'll be back! */
/* Use the entire RX mempool as the memory region. */
tmpl.mr = mlx5_mp2mr(priv->pd, mp);
if (tmpl.mr == NULL) {
@@ -1232,10 +1034,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
priv->device_attr.max_qp_wr :
(int)cq_size),
/* Max number of scatter/gather elements in a WR. */
- .max_recv_sge = ((priv->device_attr.max_sge <
- MLX5_PMD_SGE_WR_N) ?
- priv->device_attr.max_sge :
- MLX5_PMD_SGE_WR_N),
+ .max_recv_sge = 1,
.pd = priv->pd,
.cq = tmpl.cq,
.comp_mask =
@@ -1297,10 +1096,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
(void *)dev, strerror(ret));
goto error;
}
- if (tmpl.sp)
- ret = rxq_alloc_elts_sp(&tmpl, desc, NULL);
- else
- ret = rxq_alloc_elts(&tmpl, desc, NULL);
+ ret = rxq_alloc_elts(&tmpl, desc, NULL);
if (ret) {
ERROR("%p: RXQ allocation failed: %s",
(void *)dev, strerror(ret));
@@ -1346,28 +1142,14 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
goto error;
}
/* Post SGEs. */
- if (tmpl.sp) {
- struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- ret = tmpl.if_wq->recv_sg_list
- (tmpl.wq,
- (*elts)[i].sges,
- RTE_DIM((*elts)[i].sges));
- if (ret)
- break;
- }
- } else {
- struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;
-
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- ret = tmpl.if_wq->recv_burst(
- tmpl.wq,
- &(*elts)[i].sge,
- 1);
- if (ret)
- break;
- }
+ elts = tmpl.elts;
+ for (i = 0; (i != RTE_DIM(*elts)); ++i) {
+ ret = tmpl.if_wq->recv_burst(
+ tmpl.wq,
+ &(*elts)[i].sge,
+ 1);
+ if (ret)
+ break;
}
if (ret) {
ERROR("%p: failed to post SGEs with error %d",
@@ -1388,10 +1170,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
rxq->poll = rxq->if_cq->poll_length_flags;
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
- if (rxq->sp)
- rxq->recv = rxq->if_wq->recv_sg_list;
- else
- rxq->recv = rxq->if_wq->recv_burst;
+ rxq->recv = rxq->if_wq->recv_burst;
return 0;
error:
rxq_cleanup(&tmpl);
@@ -1466,10 +1245,7 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
(void *)dev, (void *)rxq);
(*priv->rxqs)[idx] = rxq;
/* Update receive callback. */
- if (rxq->sp)
- dev->rx_pkt_burst = mlx5_rx_burst_sp;
- else
- dev->rx_pkt_burst = mlx5_rx_burst;
+ dev->rx_pkt_burst = mlx5_rx_burst;
}
priv_unlock(priv);
return -ret;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 6e184c3..07d95eb 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -502,215 +502,8 @@ rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
}

/**
- * DPDK callback for RX with scattered packets support.
- *
- * @param dpdk_rxq
- * Generic pointer to RX queue structure.
- * @param[out] pkts
- * Array to store received packets.
- * @param pkts_n
- * Maximum number of packets in array.
- *
- * @return
- * Number of packets successfully received (<= pkts_n).
- */
-uint16_t
-mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
- struct rxq *rxq = (struct rxq *)dpdk_rxq;
- struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
- const unsigned int elts_n = rxq->elts_n;
- unsigned int elts_head = rxq->elts_head;
- unsigned int i;
- unsigned int pkts_ret = 0;
- int ret;
-
- if (unlikely(!rxq->sp))
- return mlx5_rx_burst(dpdk_rxq, pkts, pkts_n);
- if (unlikely(elts == NULL)) /* See RTE_DEV_CMD_SET_MTU. */
- return 0;
- for (i = 0; (i != pkts_n); ++i) {
- struct rxq_elt_sp *elt = &(*elts)[elts_head];
- unsigned int len;
- unsigned int pkt_buf_len;
- struct rte_mbuf *pkt_buf = NULL; /* Buffer returned in pkts. */
- struct rte_mbuf **pkt_buf_next = &pkt_buf;
- unsigned int seg_headroom = RTE_PKTMBUF_HEADROOM;
- unsigned int j = 0;
- uint32_t flags;
- uint16_t vlan_tci;
-
- /* Sanity checks. */
- assert(elts_head < rxq->elts_n);
- assert(rxq->elts_head < rxq->elts_n);
- ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
- if (unlikely(ret < 0)) {
- struct ibv_wc wc;
- int wcs_n;
-
- DEBUG("rxq=%p, poll_length() failed (ret=%d)",
- (void *)rxq, ret);
- /* ibv_poll_cq() must be used in case of failure. */
- wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
- if (unlikely(wcs_n == 0))
- break;
- if (unlikely(wcs_n < 0)) {
- DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
- (void *)rxq, wcs_n);
- break;
- }
- assert(wcs_n == 1);
- if (unlikely(wc.status != IBV_WC_SUCCESS)) {
- /* Whatever, just repost the offending WR. */
- DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
- " completion status (%d): %s",
- (void *)rxq, wc.wr_id, wc.status,
- ibv_wc_status_str(wc.status));
-#ifdef MLX5_PMD_SOFT_COUNTERS
- /* Increment dropped packets counter. */
- ++rxq->stats.idropped;
-#endif
- goto repost;
- }
- ret = wc.byte_len;
- }
- if (ret == 0)
- break;
- assert(ret >= (rxq->crc_present << 2));
- len = ret - (rxq->crc_present << 2);
- pkt_buf_len = len;
- /*
- * Replace spent segments with new ones, concatenate and
- * return them as pkt_buf.
- */
- while (1) {
- struct ibv_sge *sge = &elt->sges[j];
- struct rte_mbuf *seg = elt->bufs[j];
- struct rte_mbuf *rep;
- unsigned int seg_tailroom;
-
- assert(seg != NULL);
- /*
- * Fetch initial bytes of packet descriptor into a
- * cacheline while allocating rep.
- */
- rte_prefetch0(seg);
- rep = rte_mbuf_raw_alloc(rxq->mp);
- if (unlikely(rep == NULL)) {
- /*
- * Unable to allocate a replacement mbuf,
- * repost WR.
- */
- DEBUG("rxq=%p: can't allocate a new mbuf",
- (void *)rxq);
- if (pkt_buf != NULL) {
- *pkt_buf_next = NULL;
- rte_pktmbuf_free(pkt_buf);
- }
- /* Increment out of memory counters. */
- ++rxq->stats.rx_nombuf;
- ++rxq->priv->dev->data->rx_mbuf_alloc_failed;
- goto repost;
- }
-#ifndef NDEBUG
- /* Poison user-modifiable fields in rep. */
- NEXT(rep) = (void *)((uintptr_t)-1);
- SET_DATA_OFF(rep, 0xdead);
- DATA_LEN(rep) = 0xd00d;
- PKT_LEN(rep) = 0xdeadd00d;
- NB_SEGS(rep) = 0x2a;
- PORT(rep) = 0x2a;
- rep->ol_flags = -1;
-#endif
- assert(rep->buf_len == seg->buf_len);
- /* Reconfigure sge to use rep instead of seg. */
- assert(sge->lkey == rxq->mr->lkey);
- sge->addr = ((uintptr_t)rep->buf_addr + seg_headroom);
- elt->bufs[j] = rep;
- ++j;
- /* Update pkt_buf if it's the first segment, or link
- * seg to the previous one and update pkt_buf_next. */
- *pkt_buf_next = seg;
- pkt_buf_next = &NEXT(seg);
- /* Update seg information. */
- seg_tailroom = (seg->buf_len - seg_headroom);
- assert(sge->length == seg_tailroom);
- SET_DATA_OFF(seg, seg_headroom);
- if (likely(len <= seg_tailroom)) {
- /* Last segment. */
- DATA_LEN(seg) = len;
- PKT_LEN(seg) = len;
- /* Sanity check. */
- assert(rte_pktmbuf_headroom(seg) ==
- seg_headroom);
- assert(rte_pktmbuf_tailroom(seg) ==
- (seg_tailroom - len));
- break;
- }
- DATA_LEN(seg) = seg_tailroom;
- PKT_LEN(seg) = seg_tailroom;
- /* Sanity check. */
- assert(rte_pktmbuf_headroom(seg) == seg_headroom);
- assert(rte_pktmbuf_tailroom(seg) == 0);
- /* Fix len and clear headroom for next segments. */
- len -= seg_tailroom;
- seg_headroom = 0;
- }
- /* Update head and tail segments. */
- *pkt_buf_next = NULL;
- assert(pkt_buf != NULL);
- assert(j != 0);
- NB_SEGS(pkt_buf) = j;
- PORT(pkt_buf) = rxq->port_id;
- PKT_LEN(pkt_buf) = pkt_buf_len;
- if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
- pkt_buf->packet_type = rxq_cq_to_pkt_type(flags);
- pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
- if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
- pkt_buf->ol_flags |= PKT_RX_VLAN_PKT;
- pkt_buf->vlan_tci = vlan_tci;
- }
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
- }
-
- /* Return packet. */
- *(pkts++) = pkt_buf;
- ++pkts_ret;
-#ifdef MLX5_PMD_SOFT_COUNTERS
- /* Increment bytes counter. */
- rxq->stats.ibytes += pkt_buf_len;
-#endif
-repost:
- ret = rxq->recv(rxq->wq, elt->sges, RTE_DIM(elt->sges));
- if (unlikely(ret)) {
- /* Inability to repost WRs is fatal. */
- DEBUG("%p: recv_sg_list(): failed (ret=%d)",
- (void *)rxq->priv,
- ret);
- abort();
- }
- if (++elts_head >= elts_n)
- elts_head = 0;
- continue;
- }
- if (unlikely(i == 0))
- return 0;
- rxq->elts_head = elts_head;
-#ifdef MLX5_PMD_SOFT_COUNTERS
- /* Increment packets counter. */
- rxq->stats.ipackets += pkts_ret;
-#endif
- return pkts_ret;
-}
-
-/**
* DPDK callback for RX.
*
- * The following function is the same as mlx5_rx_burst_sp(), except it doesn't
- * manage scattered packets. Improves performance when MRU is lower than the
- * size of the first segment.
- *
* @param dpdk_rxq
* Generic pointer to RX queue structure.
* @param[out] pkts
@@ -725,7 +518,7 @@ uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
struct rxq *rxq = (struct rxq *)dpdk_rxq;
- struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
+ struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
const unsigned int elts_n = rxq->elts_n;
unsigned int elts_head = rxq->elts_head;
struct ibv_sge sges[pkts_n];
@@ -733,8 +526,6 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
unsigned int pkts_ret = 0;
int ret;

- if (unlikely(rxq->sp))
- return mlx5_rx_burst_sp(dpdk_rxq, pkts, pkts_n);
for (i = 0; (i != pkts_n); ++i) {
struct rxq_elt *elt = &(*elts)[elts_head];
unsigned int len;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 8358ccb..2e1f83b 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -81,12 +81,6 @@ struct mlx5_txq_stats {
uint64_t odropped; /**< Total of packets not sent when TX ring full. */
};

-/* RX element (scattered packets). */
-struct rxq_elt_sp {
- struct ibv_sge sges[MLX5_PMD_SGE_WR_N]; /* Scatter/Gather Elements. */
- struct rte_mbuf *bufs[MLX5_PMD_SGE_WR_N]; /* SGEs buffers. */
-};
-
/* RX element. */
struct rxq_elt {
struct ibv_sge sge; /* Scatter/Gather Element. */
@@ -112,15 +106,11 @@ struct rxq {
unsigned int port_id; /* Port ID for incoming packets. */
unsigned int elts_n; /* (*elts)[] length. */
unsigned int elts_head; /* Current index in (*elts)[]. */
- unsigned int sp:1; /* Use scattered RX elements. */
unsigned int csum:1; /* Enable checksum offloading. */
unsigned int csum_l2tun:1; /* Same for L2 tunnels. */
unsigned int vlan_strip:1; /* Enable VLAN stripping. */
unsigned int crc_present:1; /* CRC must be subtracted. */
- union {
- struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */
- struct rxq_elt (*no_sp)[]; /* RX elements. */
- } elts;
+ struct rxq_elt (*elts)[]; /* RX elements. */
unsigned int socket; /* CPU socket ID for allocations. */
struct mlx5_rxq_stats stats; /* RX queue counters. */
struct ibv_exp_res_domain *rd; /* Resource Domain. */
@@ -321,7 +311,6 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
/* mlx5_rxtx.c */

uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
-uint16_t mlx5_rx_burst_sp(void *, struct rte_mbuf **, uint16_t);
uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
--
2.1.4
Nelio Laranjeiro
2016-06-22 09:05:35 UTC
Permalink
Now that scatter/gather support has been removed, CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N
no longer serves any purpose and can be removed.
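
A sketch of the application-side consequence (assumed helper, not part of
this patch): with no per-WR SGE count left to split large frames across
segments, the mbuf data room has to be sized for the largest expected frame
when the RX mempool is created.

#include <rte_mbuf.h>

/* Hypothetical illustration: size the data room so a whole frame fits in
 * one segment, since multi-SGE RX can no longer spread it across buffers. */
static struct rte_mempool *
make_rx_pool(const char *name, unsigned int n, uint16_t max_frame_len,
	     int socket)
{
	uint16_t data_room = RTE_PKTMBUF_HEADROOM + max_frame_len;

	return rte_pktmbuf_pool_create(name, n, 256 /* per-lcore cache */,
				       0 /* priv size */, data_room, socket);
}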

Signed-off-by: Nelio Laranjeiro <***@6wind.com>
Signed-off-by: Adrien Mazarguil <***@6wind.com>
---
config/common_base | 1 -
doc/guides/nics/mlx5.rst | 7 -------
drivers/net/mlx5/Makefile | 4 ----
drivers/net/mlx5/mlx5_defs.h | 5 -----
drivers/net/mlx5/mlx5_rxq.c | 4 ----
drivers/net/mlx5/mlx5_txq.c | 4 ----
6 files changed, 25 deletions(-)

diff --git a/config/common_base b/config/common_base
index ead5984..39e6333 100644
--- a/config/common_base
+++ b/config/common_base
@@ -207,7 +207,6 @@ CONFIG_RTE_LIBRTE_MLX4_SOFT_COUNTERS=1
#
CONFIG_RTE_LIBRTE_MLX5_PMD=n
CONFIG_RTE_LIBRTE_MLX5_DEBUG=n
-CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N=4
CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE=0
CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE=8

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index d9196d1..84c35a0 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -114,13 +114,6 @@ These options can be modified in the ``.config`` file.
adds additional run-time checks and debugging messages at the cost of
lower performance.

-- ``CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N`` (default **4**)
-
- Number of scatter/gather elements (SGEs) per work request (WR). Lowering
- this number improves performance but also limits the ability to receive
- scattered packets (packets that do not fit a single mbuf). The default
- value is a safe tradeoff.
-
- ``CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE`` (default **0**)

Amount of data to be inlined during TX operations. Improves latency.
diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 999ada5..656a6e1 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -86,10 +86,6 @@ else
CFLAGS += -DNDEBUG -UPEDANTIC
endif

-ifdef CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N
-CFLAGS += -DMLX5_PMD_SGE_WR_N=$(CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N)
-endif
-
ifdef CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE
CFLAGS += -DMLX5_PMD_MAX_INLINE=$(CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE)
endif
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 09207d9..da1c90e 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -54,11 +54,6 @@
/* RSS Indirection table size. */
#define RSS_INDIRECTION_TABLE_SIZE 256

-/* Maximum number of Scatter/Gather Elements per Work Request. */
-#ifndef MLX5_PMD_SGE_WR_N
-#define MLX5_PMD_SGE_WR_N 4
-#endif
-
/* Maximum size for inline data. */
#ifndef MLX5_PMD_MAX_INLINE
#define MLX5_PMD_MAX_INLINE 0
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 38ff9fd..4000624 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -976,10 +976,6 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
ERROR("%p: invalid number of RX descriptors", (void *)dev);
return EINVAL;
}
- if (MLX5_PMD_SGE_WR_N > 1) {
- ERROR("%p: RX scatter is not supported", (void *)dev);
- return ENOTSUP;
- }
/* Toggle RX checksum offload if hardware supports it. */
if (priv->hw_csum)
tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 5a248c9..59974c5 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -264,10 +264,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
ERROR("%p: invalid number of TX descriptors", (void *)dev);
return EINVAL;
}
- if (MLX5_PMD_SGE_WR_N > 1) {
- ERROR("%p: TX gather is not supported", (void *)dev);
- return EINVAL;
- }
/* MRs will be registered in mp2mr[] later. */
attr.rd = (struct ibv_exp_res_domain_init_attr){
.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
--
2.1.4