Discussion:
[dpdk-dev] [RFC] net/tap: add queues when attaching from secondary process
Raslan Darawsheh
2018-06-07 12:29:51 UTC
Permalink
In the case where the device is created by the primary process.
Currently, secondary process only contains the eth dev without being
able to do any Rx/Tx.

When attaching the device from secondary process this patch adds queues
info got from IPC massaging.

Signed-off-by: Raslan Darawsheh <***@mellanox.com>
---
drivers/net/tap/Makefile | 2 +
drivers/net/tap/rte_eth_tap.c | 105 +++++++++++++++++++++++++++++++++++++++++-
2 files changed, 106 insertions(+), 1 deletion(-)

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index ccc5c5f..913423c 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -27,6 +27,8 @@ LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
LDLIBS += -lrte_bus_vdev

CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+

#
# all source are stored in SRCS-y
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 5531fe9..0f4c8d9 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -16,6 +16,8 @@
#include <rte_debug.h>
#include <rte_ip.h>
#include <rte_string_fns.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>

#include <sys/types.h>
#include <sys/stat.h>
@@ -55,6 +57,9 @@
#define ETH_TAP_CMP_MAC_FMT "0123456789ABCDEFabcdef"
#define ETH_TAP_MAC_ARG_FMT ETH_TAP_MAC_FIXED "|" ETH_TAP_USR_MAC_FMT

+/* IPC key for communication of queue fds between processes. */
+#define TAP_MP_KEY "tap_mp_exchange_fds"
+
static struct rte_vdev_driver pmd_tap_drv;
static struct rte_vdev_driver pmd_tun_drv;

@@ -93,6 +98,15 @@ enum ioctl_mode {
REMOTE_ONLY,
};

+/* To communicate queue infos between processes */
+struct queues_fds {
+ char name[RTE_DEV_NAME_MAX_LEN];
+ int num_rxq_fds;
+ int num_txq_fds;
+ int rxq_fds[RTE_PMD_TAP_MAX_QUEUES];
+ int txq_fds[RTE_PMD_TAP_MAX_QUEUES];
+};
+
static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);

/**
@@ -1731,6 +1745,47 @@ rte_pmd_tun_probe(struct rte_vdev_device *dev)
return ret;
}

+/*
+ * Send the queues fds from the primary process to secondary.
+ */
+static int
+tap_exchange_queues_fds(const struct rte_mp_msg *mp_msg, const void *peer)
+{
+ struct rte_eth_dev *eth_dev;
+ struct rte_mp_msg mp_resp;
+ struct queues_fds *out = (struct queues_fds *)&mp_resp.param;
+ const struct queues_fds *in = (const struct queues_fds *)mp_msg->param;
+ uint16_t port_id;
+ int i, ret;
+
+ TAP_LOG(DEBUG, "received request");
+ strlcpy(out->name, in->name, sizeof(out->name));
+ ret = rte_eth_dev_get_port_by_name(in->name, &port_id);
+ if (ret) {
+ TAP_LOG(ERR, "Failed to get dev %s", in->name);
+ return -1;
+ }
+ eth_dev = &rte_eth_devices[port_id];
+ struct pmd_internals *pmd = eth_dev->data->dev_private;
+
+ /* fill the queues fds data in the reply msg */
+ strlcpy(mp_resp.name, TAP_MP_KEY, sizeof(mp_resp.name));
+ out->num_rxq_fds = eth_dev->data->nb_rx_queues;
+ for (i = 0; i < eth_dev->data->nb_rx_queues; i++)
+ out->rxq_fds[i] = pmd->rxq[i].fd;
+ out->num_txq_fds = eth_dev->data->nb_tx_queues;
+ for (i = 0; i < eth_dev->data->nb_tx_queues; i++)
+ out->txq_fds[i] = pmd->txq[i].fd;
+ mp_resp.len_param = sizeof(*out);
+ mp_resp.num_fds = 0;
+ if (rte_mp_reply(&mp_resp, peer) < 0) {
+ TAP_LOG(ERR, "Failed to reply a fds request");
+ return -1;
+ }
+
+ return 0;
+}
+
/* Open a TAP interface device.
*/
static int
@@ -1744,6 +1799,7 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
char remote_iface[RTE_ETH_NAME_MAX_LEN];
struct ether_addr user_mac = { .addr_bytes = {0} };
struct rte_eth_dev *eth_dev;
+ int queue_id;

strcpy(tuntap_name, "TAP");

@@ -1757,8 +1813,46 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(ERR, "Failed to probe %s", name);
return -1;
}
- /* TODO: request info from primary to set up Rx and Tx */
eth_dev->dev_ops = &ops;
+ /* request a sync from the primary process to get queues fds */
+ eth_dev->rx_pkt_burst = pmd_rx_burst;
+ eth_dev->tx_pkt_burst = pmd_tx_burst;
+ if (!rte_eal_primary_proc_alive(NULL)) {
+ TAP_LOG(ERR, "cannot initialize secondary process "
+ "without a primary one");
+ return -1;
+ }
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct queues_fds *req = (struct queues_fds *)mp_req.param;
+ struct queues_fds *resp;
+
+ strlcpy(req->name, name, sizeof(mp_req.name));
+ strlcpy(mp_req.name, TAP_MP_KEY, sizeof(mp_req.name));
+ mp_req.len_param = sizeof(*req);
+ /* request for sync from primary process to get queues fds. */
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ resp = (struct queues_fds *)mp_rep->param;
+ TAP_LOG(INFO, "Received fds for %d rx_queues and "
+ "%d tx_queues", resp->num_rxq_fds,
+ resp->num_txq_fds);
+ } else {
+ TAP_LOG(ERR, "Failed to request queues from primary");
+ return -1;
+ }
+
+ struct pmd_internals *pmd = eth_dev->data->dev_private;
+ for (queue_id = 0; queue_id < resp->num_rxq_fds; queue_id++)
+ pmd->rxq[queue_id].fd = resp->rxq_fds[queue_id];
+
+ for (queue_id = 0; queue_id < resp->num_txq_fds; queue_id++)
+ pmd->txq[queue_id].fd = resp->txq_fds[queue_id];
+
+ TAP_LOG(NOTICE, "Initializing secondary process pmd_tap for %s",
+ name);
rte_eth_dev_probing_finish(eth_dev);
return 0;
}
@@ -1806,6 +1900,14 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(NOTICE, "Initializing pmd_tap for %s as %s",
name, tap_name);

+ /* register for mp communication between secondary and primary */
+ if (rte_mp_action_register(TAP_MP_KEY, tap_exchange_queues_fds) &&
+ rte_errno != EEXIST) {
+ TAP_LOG(ERR, "%s : %s fail to register MP action : %s",
+ tuntap_name, name, strerror(errno));
+ return -1;
+ }
+
ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac,
ETH_TUNTAP_TYPE_TAP);

@@ -1813,6 +1915,7 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
if (ret == -1) {
TAP_LOG(ERR, "Failed to create pmd for %s as %s",
name, tap_name);
+ rte_mp_action_unregister(TAP_MP_KEY);
tap_unit--; /* Restore the unit number */
}
rte_kvargs_free(kvlist);
--
2.7.4
Wiles, Keith
2018-06-07 19:09:50 UTC
Permalink
Post by Raslan Darawsheh
In the case where the device is created by the primary process.
Currently, secondary process only contains the eth dev without being
able to do any Rx/Tx.
When attaching the device from secondary process this patch adds queues
info got from IPC massaging.
Can you explain this details a bit more here, not sure I follow the real problem and the solution?
Post by Raslan Darawsheh
---
drivers/net/tap/Makefile | 2 +
drivers/net/tap/rte_eth_tap.c | 105 +++++++++++++++++++++++++++++++++++++++++-
2 files changed, 106 insertions(+), 1 deletion(-)
diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index ccc5c5f..913423c 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -27,6 +27,8 @@ LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
LDLIBS += -lrte_bus_vdev
CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
#
# all source are stored in SRCS-y
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 5531fe9..0f4c8d9 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -16,6 +16,8 @@
#include <rte_debug.h>
#include <rte_ip.h>
#include <rte_string_fns.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>
#include <sys/types.h>
#include <sys/stat.h>
@@ -55,6 +57,9 @@
#define ETH_TAP_CMP_MAC_FMT "0123456789ABCDEFabcdef"
#define ETH_TAP_MAC_ARG_FMT ETH_TAP_MAC_FIXED "|" ETH_TAP_USR_MAC_FMT
+/* IPC key for communication of queue fds between processes. */
+#define TAP_MP_KEY "tap_mp_exchange_fds"
+
static struct rte_vdev_driver pmd_tap_drv;
static struct rte_vdev_driver pmd_tun_drv;
@@ -93,6 +98,15 @@ enum ioctl_mode {
REMOTE_ONLY,
};
+/* To communicate queue infos between processes */
+struct queues_fds {
+ char name[RTE_DEV_NAME_MAX_LEN];
+ int num_rxq_fds;
+ int num_txq_fds;
+ int rxq_fds[RTE_PMD_TAP_MAX_QUEUES];
+ int txq_fds[RTE_PMD_TAP_MAX_QUEUES];
+};
+
static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);
/**
@@ -1731,6 +1745,47 @@ rte_pmd_tun_probe(struct rte_vdev_device *dev)
return ret;
}
+/*
+ * Send the queues fds from the primary process to secondary.
+ */
+static int
+tap_exchange_queues_fds(const struct rte_mp_msg *mp_msg, const void *peer)
+{
+ struct rte_eth_dev *eth_dev;
+ struct rte_mp_msg mp_resp;
+ struct queues_fds *out = (struct queues_fds *)&mp_resp.param;
+ const struct queues_fds *in = (const struct queues_fds *)mp_msg->param;
+ uint16_t port_id;
+ int i, ret;
+
+ TAP_LOG(DEBUG, "received request");
+ strlcpy(out->name, in->name, sizeof(out->name));
+ ret = rte_eth_dev_get_port_by_name(in->name, &port_id);
+ if (ret) {
+ TAP_LOG(ERR, "Failed to get dev %s", in->name);
+ return -1;
+ }
+ eth_dev = &rte_eth_devices[port_id];
+ struct pmd_internals *pmd = eth_dev->data->dev_private;
+
+ /* fill the queues fds data in the reply msg */
+ strlcpy(mp_resp.name, TAP_MP_KEY, sizeof(mp_resp.name));
+ out->num_rxq_fds = eth_dev->data->nb_rx_queues;
+ for (i = 0; i < eth_dev->data->nb_rx_queues; i++)
+ out->rxq_fds[i] = pmd->rxq[i].fd;
+ out->num_txq_fds = eth_dev->data->nb_tx_queues;
+ for (i = 0; i < eth_dev->data->nb_tx_queues; i++)
+ out->txq_fds[i] = pmd->txq[i].fd;
+ mp_resp.len_param = sizeof(*out);
+ mp_resp.num_fds = 0;
+ if (rte_mp_reply(&mp_resp, peer) < 0) {
+ TAP_LOG(ERR, "Failed to reply a fds request");
+ return -1;
+ }
+
+ return 0;
+}
+
/* Open a TAP interface device.
*/
static int
@@ -1744,6 +1799,7 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
char remote_iface[RTE_ETH_NAME_MAX_LEN];
struct ether_addr user_mac = { .addr_bytes = {0} };
struct rte_eth_dev *eth_dev;
+ int queue_id;
strcpy(tuntap_name, "TAP");
@@ -1757,8 +1813,46 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(ERR, "Failed to probe %s", name);
return -1;
}
- /* TODO: request info from primary to set up Rx and Tx */
eth_dev->dev_ops = &ops;
+ /* request a sync from the primary process to get queues fds */
+ eth_dev->rx_pkt_burst = pmd_rx_burst;
+ eth_dev->tx_pkt_burst = pmd_tx_burst;
+ if (!rte_eal_primary_proc_alive(NULL)) {
+ TAP_LOG(ERR, "cannot initialize secondary process "
+ "without a primary one");
+ return -1;
+ }
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct queues_fds *req = (struct queues_fds *)mp_req.param;
+ struct queues_fds *resp;
What is the rule for DPDK coding style of having declares in the middle of a block, I thought we only wanted them at the beginning of block of code.
Post by Raslan Darawsheh
+
+ strlcpy(req->name, name, sizeof(mp_req.name));
+ strlcpy(mp_req.name, TAP_MP_KEY, sizeof(mp_req.name));
+ mp_req.len_param = sizeof(*req);
+ /* request for sync from primary process to get queues fds. */
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ resp = (struct queues_fds *)mp_rep->param;
+ TAP_LOG(INFO, "Received fds for %d rx_queues and "
+ "%d tx_queues", resp->num_rxq_fds,
+ resp->num_txq_fds);
+ } else {
+ TAP_LOG(ERR, "Failed to request queues from primary");
+ return -1;
+ }
I thought passing a FD from process to process you had to have the kernel convert the FD to the local process value. At least that was the way it was done in mmap memory FD. Is this the same problem and needs to be send in a message using a control structure with the correct flags set?
Post by Raslan Darawsheh
+
+ struct pmd_internals *pmd = eth_dev->data->dev_private;
+ for (queue_id = 0; queue_id < resp->num_rxq_fds; queue_id++)
+ pmd->rxq[queue_id].fd = resp->rxq_fds[queue_id];
+
+ for (queue_id = 0; queue_id < resp->num_txq_fds; queue_id++)
+ pmd->txq[queue_id].fd = resp->txq_fds[queue_id];
+
+ TAP_LOG(NOTICE, "Initializing secondary process pmd_tap for %s",
+ name);
rte_eth_dev_probing_finish(eth_dev);
return 0;
}
@@ -1806,6 +1900,14 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(NOTICE, "Initializing pmd_tap for %s as %s",
name, tap_name);
+ /* register for mp communication between secondary and primary */
+ if (rte_mp_action_register(TAP_MP_KEY, tap_exchange_queues_fds) &&
+ rte_errno != EEXIST) {
+ TAP_LOG(ERR, "%s : %s fail to register MP action : %s",
+ tuntap_name, name, strerror(errno));
+ return -1;
+ }
+
ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac,
ETH_TUNTAP_TYPE_TAP);
@@ -1813,6 +1915,7 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
if (ret == -1) {
TAP_LOG(ERR, "Failed to create pmd for %s as %s",
name, tap_name);
+ rte_mp_action_unregister(TAP_MP_KEY);
tap_unit--; /* Restore the unit number */
}
rte_kvargs_free(kvlist);
--
2.7.4
Regards,
Keith
Raslan Darawsheh
2018-06-07 23:24:01 UTC
Permalink
Hi,

As you know that currently TAP pmd support attaching a secondary process to a primary process.
But, it's still lacking the ability to do Rx/Tx burst since it's lacking the necessary fds for RX/TX queues,
And the setting of Rx/Tx burst function.

This patch the main purpose is to exchange the fds between the processes throw the IPC massages
And to set the Rx/Tx functions for the secondary.

I hope I explained it properly, please let me know if you still didn't get it.

Kindest regards,
Raslan Darawsheh

-----Original Message-----
From: Wiles, Keith [mailto:***@intel.com]
Sent: Thursday, June 7, 2018 10:10 PM
To: Raslan Darawsheh <***@mellanox.com>
Cc: Thomas Monjalon <***@monjalon.net>; ***@dpdk.org
Subject: Re: [dpdk-dev] [RFC] net/tap: add queues when attaching from secondary process
Post by Raslan Darawsheh
In the case where the device is created by the primary process.
Currently, secondary process only contains the eth dev without being
able to do any Rx/Tx.
When attaching the device from secondary process this patch adds queues
info got from IPC massaging.
Can you explain this details a bit more here, not sure I follow the real problem and the solution?
Post by Raslan Darawsheh
---
drivers/net/tap/Makefile | 2 +
drivers/net/tap/rte_eth_tap.c | 105 +++++++++++++++++++++++++++++++++++++++++-
2 files changed, 106 insertions(+), 1 deletion(-)
diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index ccc5c5f..913423c 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -27,6 +27,8 @@ LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
LDLIBS += -lrte_bus_vdev
CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
#
# all source are stored in SRCS-y
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 5531fe9..0f4c8d9 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -16,6 +16,8 @@
#include <rte_debug.h>
#include <rte_ip.h>
#include <rte_string_fns.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>
#include <sys/types.h>
#include <sys/stat.h>
@@ -55,6 +57,9 @@
#define ETH_TAP_CMP_MAC_FMT "0123456789ABCDEFabcdef"
#define ETH_TAP_MAC_ARG_FMT ETH_TAP_MAC_FIXED "|" ETH_TAP_USR_MAC_FMT
+/* IPC key for communication of queue fds between processes. */
+#define TAP_MP_KEY "tap_mp_exchange_fds"
+
static struct rte_vdev_driver pmd_tap_drv;
static struct rte_vdev_driver pmd_tun_drv;
@@ -93,6 +98,15 @@ enum ioctl_mode {
REMOTE_ONLY,
};
+/* To communicate queue infos between processes */
+struct queues_fds {
+ char name[RTE_DEV_NAME_MAX_LEN];
+ int num_rxq_fds;
+ int num_txq_fds;
+ int rxq_fds[RTE_PMD_TAP_MAX_QUEUES];
+ int txq_fds[RTE_PMD_TAP_MAX_QUEUES];
+};
+
static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);
/**
@@ -1731,6 +1745,47 @@ rte_pmd_tun_probe(struct rte_vdev_device *dev)
return ret;
}
+/*
+ * Send the queues fds from the primary process to secondary.
+ */
+static int
+tap_exchange_queues_fds(const struct rte_mp_msg *mp_msg, const void *peer)
+{
+ struct rte_eth_dev *eth_dev;
+ struct rte_mp_msg mp_resp;
+ struct queues_fds *out = (struct queues_fds *)&mp_resp.param;
+ const struct queues_fds *in = (const struct queues_fds *)mp_msg->param;
+ uint16_t port_id;
+ int i, ret;
+
+ TAP_LOG(DEBUG, "received request");
+ strlcpy(out->name, in->name, sizeof(out->name));
+ ret = rte_eth_dev_get_port_by_name(in->name, &port_id);
+ if (ret) {
+ TAP_LOG(ERR, "Failed to get dev %s", in->name);
+ return -1;
+ }
+ eth_dev = &rte_eth_devices[port_id];
+ struct pmd_internals *pmd = eth_dev->data->dev_private;
+
+ /* fill the queues fds data in the reply msg */
+ strlcpy(mp_resp.name, TAP_MP_KEY, sizeof(mp_resp.name));
+ out->num_rxq_fds = eth_dev->data->nb_rx_queues;
+ for (i = 0; i < eth_dev->data->nb_rx_queues; i++)
+ out->rxq_fds[i] = pmd->rxq[i].fd;
+ out->num_txq_fds = eth_dev->data->nb_tx_queues;
+ for (i = 0; i < eth_dev->data->nb_tx_queues; i++)
+ out->txq_fds[i] = pmd->txq[i].fd;
+ mp_resp.len_param = sizeof(*out);
+ mp_resp.num_fds = 0;
+ if (rte_mp_reply(&mp_resp, peer) < 0) {
+ TAP_LOG(ERR, "Failed to reply a fds request");
+ return -1;
+ }
+
+ return 0;
+}
+
/* Open a TAP interface device.
*/
static int
@@ -1744,6 +1799,7 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
char remote_iface[RTE_ETH_NAME_MAX_LEN];
struct ether_addr user_mac = { .addr_bytes = {0} };
struct rte_eth_dev *eth_dev;
+ int queue_id;
strcpy(tuntap_name, "TAP");
@@ -1757,8 +1813,46 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(ERR, "Failed to probe %s", name);
return -1;
}
- /* TODO: request info from primary to set up Rx and Tx */
eth_dev->dev_ops = &ops;
+ /* request a sync from the primary process to get queues fds */
+ eth_dev->rx_pkt_burst = pmd_rx_burst;
+ eth_dev->tx_pkt_burst = pmd_tx_burst;
+ if (!rte_eal_primary_proc_alive(NULL)) {
+ TAP_LOG(ERR, "cannot initialize secondary process "
+ "without a primary one");
+ return -1;
+ }
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct queues_fds *req = (struct queues_fds *)mp_req.param;
+ struct queues_fds *resp;
What is the rule for DPDK coding style of having declares in the middle of a block, I thought we only wanted them at the beginning of block of code.
Post by Raslan Darawsheh
+
+ strlcpy(req->name, name, sizeof(mp_req.name));
+ strlcpy(mp_req.name, TAP_MP_KEY, sizeof(mp_req.name));
+ mp_req.len_param = sizeof(*req);
+ /* request for sync from primary process to get queues fds. */
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ resp = (struct queues_fds *)mp_rep->param;
+ TAP_LOG(INFO, "Received fds for %d rx_queues and "
+ "%d tx_queues", resp->num_rxq_fds,
+ resp->num_txq_fds);
+ } else {
+ TAP_LOG(ERR, "Failed to request queues from primary");
+ return -1;
+ }
I thought passing a FD from process to process you had to have the kernel convert the FD to the local process value. At least that was the way it was done in mmap memory FD. Is this the same problem and needs to be send in a message using a control structure with the correct flags set?
Post by Raslan Darawsheh
+
+ struct pmd_internals *pmd = eth_dev->data->dev_private;
+ for (queue_id = 0; queue_id < resp->num_rxq_fds; queue_id++)
+ pmd->rxq[queue_id].fd = resp->rxq_fds[queue_id];
+
+ for (queue_id = 0; queue_id < resp->num_txq_fds; queue_id++)
+ pmd->txq[queue_id].fd = resp->txq_fds[queue_id];
+
+ TAP_LOG(NOTICE, "Initializing secondary process pmd_tap for %s",
+ name);
rte_eth_dev_probing_finish(eth_dev);
return 0;
}
@@ -1806,6 +1900,14 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(NOTICE, "Initializing pmd_tap for %s as %s",
name, tap_name);
+ /* register for mp communication between secondary and primary */
+ if (rte_mp_action_register(TAP_MP_KEY, tap_exchange_queues_fds) &&
+ rte_errno != EEXIST) {
+ TAP_LOG(ERR, "%s : %s fail to register MP action : %s",
+ tuntap_name, name, strerror(errno));
+ return -1;
+ }
+
ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac,
ETH_TUNTAP_TYPE_TAP);
@@ -1813,6 +1915,7 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
if (ret == -1) {
TAP_LOG(ERR, "Failed to create pmd for %s as %s",
name, tap_name);
+ rte_mp_action_unregister(TAP_MP_KEY);
tap_unit--; /* Restore the unit number */
}
rte_kvargs_free(kvlist);
--
2.7.4
Regards,
Keith
Wiles, Keith
2018-06-08 02:52:51 UTC
Permalink
On Jun 7, 2018, at 4:24 PM, Raslan Darawsheh <***@mellanox.com<mailto:***@mellanox.com>> wrote:

Hi,

As you know that currently TAP pmd support attaching a secondary process to a primary process.
But, it's still lacking the ability to do Rx/Tx burst since it's lacking the necessary fds for RX/TX queues,
And the setting of Rx/Tx burst function.

This patch the main purpose is to exchange the fds between the processes throw the IPC massages
And to set the Rx/Tx functions for the secondary.

I hope I explained it properly, please let me know if you still didn't get it.

I see the code sending the FD’s of primary and secondary to each other and the code looks fine. The problem I see is what I asked before in the comments below, which is the FDs on one process can not be used on another process without the kernel converting the FD for the given process. Is this the case here or not?

https://stackoverflow.com/questions/8037746/is-there-an-easier-way-to-share-file-descriptors-between-unrelated-processes-on


Kindest regards,
Raslan Darawsheh

-----Original Message-----
From: Wiles, Keith [mailto:***@intel.com]
Sent: Thursday, June 7, 2018 10:10 PM
To: Raslan Darawsheh <***@mellanox.com<mailto:***@mellanox.com>>
Cc: Thomas Monjalon <***@monjalon.net<mailto:***@monjalon.net>>; ***@dpdk.org<mailto:***@dpdk.org>
Subject: Re: [dpdk-dev] [RFC] net/tap: add queues when attaching from secondary process



On Jun 7, 2018, at 5:29 AM, Raslan Darawsheh <***@mellanox.com<mailto:***@mellanox.com>> wrote:

In the case where the device is created by the primary process.
Currently, secondary process only contains the eth dev without being
able to do any Rx/Tx.

When attaching the device from secondary process this patch adds queues
info got from IPC massaging.

Can you explain this details a bit more here, not sure I follow the real problem and the solution?


Signed-off-by: Raslan Darawsheh <***@mellanox.com<mailto:***@mellanox.com>>
---
drivers/net/tap/Makefile | 2 +
drivers/net/tap/rte_eth_tap.c | 105 +++++++++++++++++++++++++++++++++++++++++-
2 files changed, 106 insertions(+), 1 deletion(-)

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index ccc5c5f..913423c 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -27,6 +27,8 @@ LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
LDLIBS += -lrte_bus_vdev

CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+

#
# all source are stored in SRCS-y
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 5531fe9..0f4c8d9 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -16,6 +16,8 @@
#include <rte_debug.h>
#include <rte_ip.h>
#include <rte_string_fns.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>

#include <sys/types.h>
#include <sys/stat.h>
@@ -55,6 +57,9 @@
#define ETH_TAP_CMP_MAC_FMT "0123456789ABCDEFabcdef"
#define ETH_TAP_MAC_ARG_FMT ETH_TAP_MAC_FIXED "|" ETH_TAP_USR_MAC_FMT

+/* IPC key for communication of queue fds between processes. */
+#define TAP_MP_KEY "tap_mp_exchange_fds"
+
static struct rte_vdev_driver pmd_tap_drv;
static struct rte_vdev_driver pmd_tun_drv;

@@ -93,6 +98,15 @@ enum ioctl_mode {
REMOTE_ONLY,
};

+/* To communicate queue infos between processes */
+struct queues_fds {
+ char name[RTE_DEV_NAME_MAX_LEN];
+ int num_rxq_fds;
+ int num_txq_fds;
+ int rxq_fds[RTE_PMD_TAP_MAX_QUEUES];
+ int txq_fds[RTE_PMD_TAP_MAX_QUEUES];
+};
+
static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);

/**
@@ -1731,6 +1745,47 @@ rte_pmd_tun_probe(struct rte_vdev_device *dev)
return ret;
}

+/*
+ * Send the queues fds from the primary process to secondary.
+ */
+static int
+tap_exchange_queues_fds(const struct rte_mp_msg *mp_msg, const void *peer)
+{
+ struct rte_eth_dev *eth_dev;
+ struct rte_mp_msg mp_resp;
+ struct queues_fds *out = (struct queues_fds *)&mp_resp.param;
+ const struct queues_fds *in = (const struct queues_fds *)mp_msg->param;
+ uint16_t port_id;
+ int i, ret;
+
+ TAP_LOG(DEBUG, "received request");
+ strlcpy(out->name, in->name, sizeof(out->name));
+ ret = rte_eth_dev_get_port_by_name(in->name, &port_id);
+ if (ret) {
+ TAP_LOG(ERR, "Failed to get dev %s", in->name);
+ return -1;
+ }
+ eth_dev = &rte_eth_devices[port_id];
+ struct pmd_internals *pmd = eth_dev->data->dev_private;
+
+ /* fill the queues fds data in the reply msg */
+ strlcpy(mp_resp.name, TAP_MP_KEY, sizeof(mp_resp.name));
+ out->num_rxq_fds = eth_dev->data->nb_rx_queues;
+ for (i = 0; i < eth_dev->data->nb_rx_queues; i++)
+ out->rxq_fds[i] = pmd->rxq[i].fd;
+ out->num_txq_fds = eth_dev->data->nb_tx_queues;
+ for (i = 0; i < eth_dev->data->nb_tx_queues; i++)
+ out->txq_fds[i] = pmd->txq[i].fd;
+ mp_resp.len_param = sizeof(*out);
+ mp_resp.num_fds = 0;
+ if (rte_mp_reply(&mp_resp, peer) < 0) {
+ TAP_LOG(ERR, "Failed to reply a fds request");
+ return -1;
+ }
+
+ return 0;
+}
+
/* Open a TAP interface device.
*/
static int
@@ -1744,6 +1799,7 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
char remote_iface[RTE_ETH_NAME_MAX_LEN];
struct ether_addr user_mac = { .addr_bytes = {0} };
struct rte_eth_dev *eth_dev;
+ int queue_id;

strcpy(tuntap_name, "TAP");

@@ -1757,8 +1813,46 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(ERR, "Failed to probe %s", name);
return -1;
}
- /* TODO: request info from primary to set up Rx and Tx */
eth_dev->dev_ops = &ops;
+ /* request a sync from the primary process to get queues fds */
+ eth_dev->rx_pkt_burst = pmd_rx_burst;
+ eth_dev->tx_pkt_burst = pmd_tx_burst;
+ if (!rte_eal_primary_proc_alive(NULL)) {
+ TAP_LOG(ERR, "cannot initialize secondary process "
+ "without a primary one");
+ return -1;
+ }
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct queues_fds *req = (struct queues_fds *)mp_req.param;
+ struct queues_fds *resp;

What is the rule for DPDK coding style of having declares in the middle of a block, I thought we only wanted them at the beginning of block of code.

+
+ strlcpy(req->name, name, sizeof(mp_req.name));
+ strlcpy(mp_req.name, TAP_MP_KEY, sizeof(mp_req.name));
+ mp_req.len_param = sizeof(*req);
+ /* request for sync from primary process to get queues fds. */
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ resp = (struct queues_fds *)mp_rep->param;
+ TAP_LOG(INFO, "Received fds for %d rx_queues and "
+ "%d tx_queues", resp->num_rxq_fds,
+ resp->num_txq_fds);
+ } else {
+ TAP_LOG(ERR, "Failed to request queues from primary");
+ return -1;
+ }

I thought passing a FD from process to process you had to have the kernel convert the FD to the local process value. At least that was the way it was done in mmap memory FD. Is this the same problem and needs to be send in a message using a control structure with the correct flags set?

+
+ struct pmd_internals *pmd = eth_dev->data->dev_private;
+ for (queue_id = 0; queue_id < resp->num_rxq_fds; queue_id++)
+ pmd->rxq[queue_id].fd = resp->rxq_fds[queue_id];
+
+ for (queue_id = 0; queue_id < resp->num_txq_fds; queue_id++)
+ pmd->txq[queue_id].fd = resp->txq_fds[queue_id];
+
+ TAP_LOG(NOTICE, "Initializing secondary process pmd_tap for %s",
+ name);
rte_eth_dev_probing_finish(eth_dev);
return 0;
}
@@ -1806,6 +1900,14 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(NOTICE, "Initializing pmd_tap for %s as %s",
name, tap_name);

+ /* register for mp communication between secondary and primary */
+ if (rte_mp_action_register(TAP_MP_KEY, tap_exchange_queues_fds) &&
+ rte_errno != EEXIST) {
+ TAP_LOG(ERR, "%s : %s fail to register MP action : %s",
+ tuntap_name, name, strerror(errno));
+ return -1;
+ }
+
ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac,
ETH_TUNTAP_TYPE_TAP);

@@ -1813,6 +1915,7 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
if (ret == -1) {
TAP_LOG(ERR, "Failed to create pmd for %s as %s",
name, tap_name);
+ rte_mp_action_unregister(TAP_MP_KEY);
tap_unit--; /* Restore the unit number */
}
rte_kvargs_free(kvlist);
--
2.7.4
Wiles, Keith
2018-06-12 12:46:29 UTC
Permalink
Post by Raslan Darawsheh
Hi,
As you know that currently TAP pmd support attaching a secondary process to a primary process.
But, it's still lacking the ability to do Rx/Tx burst since it's lacking the necessary fds for RX/TX queues,
And the setting of Rx/Tx burst function.
This patch the main purpose is to exchange the fds between the processes throw the IPC massages
And to set the Rx/Tx functions for the secondary.
I hope I explained it properly, please let me know if you still didn't get it.
I see the code sending the FD’s of primary and secondary to each other and the code looks fine. The problem I see is what I asked before in the comments below, which is the FDs on one process can not be used on another process without the kernel converting the FD for the given process. Is this the case here or not?

https://stackoverflow.com/questions/8037746/is-there-an-easier-way-to-share-file-descriptors-between-unre
Raslan Darawsheh
2018-06-12 13:21:24 UTC
Permalink
Hi Keith,
Yes you are right about that,
This is exactly the case and I'll need to provide a V2 for this with the kernel mapping.

Kindest regards,
Raslan Darawsheh

-----Original Message-----
From: Wiles, Keith [mailto:***@intel.com]
Sent: Tuesday, June 12, 2018 3:46 PM
To: Raslan Darawsheh <***@mellanox.com>
Cc: Thomas Monjalon <***@monjalon.net>; ***@dpdk.org
Subject: Re: [dpdk-dev] [RFC] net/tap: add queues when attaching from secondary process
Post by Raslan Darawsheh
Hi,
As you know that currently TAP pmd support attaching a secondary process to a primary process.
But, it's still lacking the ability to do Rx/Tx burst since it's
lacking the necessary fds for RX/TX queues, And the setting of Rx/Tx burst function.
This patch the main purpose is to exchange the fds between the
processes throw the IPC massages And to set the Rx/Tx functions for the secondary.
I hope I explained it properly, please let me know if you still didn't get it.
I see the code sending the FD’s of primary and secondary to each other and the code looks fine. The problem I see is what I asked before in the comments below, which is the FDs on one process can not be used on another process without the kernel converting the FD for the given process. Is this the case here or not?

https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fstackoverflow.com%2Fquestions%2F8037746%2Fis-there-an-easier-way-to-share-file-descriptors-between-unrelated-processes-on&data=02%7C01%7Crasland%40mellanox.com%7C6356c8b95d8042516c1108d5d0628740%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636644043986930914&sdata=nX7%2FhXRDKrccz%2BVDUyZ%2BwB088r4R9KEQGeoZ0i2CJZk%3D&reserved=0


Regards,
Keith
Thomas Monjalon
2018-07-20 10:57:42 UTC
Permalink
From: Raslan Darawsheh <***@mellanox.com>

In the case the device is created by the primary process,
the secondary must request some file descriptors to attach the queues.
The file descriptors are shared via IPC Unix socket.

Thanks to the IPC synchronization, the secondary process
is now able to do Rx/Tx on a TAP created by the primary process.

Signed-off-by: Raslan Darawsheh <***@mellanox.com>
Signed-off-by: Thomas Monjalon <***@monjalon.net>
---
Note: there is a bug in EAL IPC regarding fd translation.
A fix will be sent later for EAL.

v2:
- translate file descriptors via IPC API
- add documentation
---
doc/guides/nics/tap.rst | 16 ++++
doc/guides/rel_notes/release_18_08.rst | 5 +
drivers/net/tap/Makefile | 1 +
drivers/net/tap/rte_eth_tap.c | 127 ++++++++++++++++++++++++-
4 files changed, 148 insertions(+), 1 deletion(-)

diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst
index 27148681c..d1f3e1c24 100644
--- a/doc/guides/nics/tap.rst
+++ b/doc/guides/nics/tap.rst
@@ -152,6 +152,22 @@ Distribute IPv4 TCP packets using RSS to a given MAC address over queues 0-3::
testpmd> flow create 0 priority 4 ingress pattern eth dst is 0a:0b:0c:0d:0e:0f \
/ ipv4 / tcp / end actions rss queues 0 1 2 3 end / end

+Multi-process sharing
+---------------------
+
+It is possible to attach an existing TAP device in a secondary process,
+by declaring it as a vdev with the same name as in the primary process,
+and without any parameter.
+
+The port attached in a secondary process will give access to the
+statistics and the queues.
+Therefore it can be used for monitoring or Rx/Tx processing.
+
+The IPC synchronization of Rx/Tx queues is currently limited:
+
+ - Only 8 queues
+ - Synchronized on probing, but not on later port update
+
Example
-------

diff --git a/doc/guides/rel_notes/release_18_08.rst b/doc/guides/rel_notes/release_18_08.rst
index df0ad61f7..771befecb 100644
--- a/doc/guides/rel_notes/release_18_08.rst
+++ b/doc/guides/rel_notes/release_18_08.rst
@@ -77,6 +77,11 @@ New Features
* Add handlers to add/delete VxLAN port number.
* Add devarg to specify ingress VLAN rewrite mode.

+* **Added TAP Rx/Tx queues sharing with a secondary process.**
+
+ A secondary process can attach a TAP device created in the primary process,
+ probe the queues, and process Rx/Tx in a secondary process.
+

API Changes
-----------
diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index 324336535..3dcf05a72 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -27,6 +27,7 @@ LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
LDLIBS += -lrte_bus_vdev -lrte_gso

CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
+CFLAGS += -DALLOW_EXPERIMENTAL_API

#
# all source are stored in SRCS-y
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 22ba872ed..8d70126c0 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -16,6 +16,8 @@
#include <rte_debug.h>
#include <rte_ip.h>
#include <rte_string_fns.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>

#include <assert.h>
#include <sys/types.h>
@@ -62,6 +64,9 @@
#define TAP_GSO_MBUFS_NUM \
(TAP_GSO_MBUFS_PER_CORE * TAP_GSO_MBUF_CACHE_SIZE)

+/* IPC key for queue fds sync */
+#define TAP_MP_KEY "tap_mp_sync_queues"
+
static struct rte_vdev_driver pmd_tap_drv;
static struct rte_vdev_driver pmd_tun_drv;

@@ -100,6 +105,17 @@ enum ioctl_mode {
REMOTE_ONLY,
};

+/* Message header to synchronize queues via IPC */
+struct ipc_queues {
+ char port_name[RTE_DEV_NAME_MAX_LEN];
+ int rxq_count;
+ int txq_count;
+ /*
+ * The file descriptors are in the dedicated part
+ * of the Unix message to be translated by the kernel.
+ */
+};
+
static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);

/**
@@ -1972,6 +1988,96 @@ rte_pmd_tun_probe(struct rte_vdev_device *dev)
return ret;
}

+/* Request queue file descriptors from secondary to primary. */
+static int
+tap_mp_attach_queues(const char *port_name, struct rte_eth_dev *dev)
+{
+ int ret;
+ struct pmd_internals *devpriv;
+ struct timespec timeout = {.tv_sec = 1, .tv_nsec = 0};
+ struct rte_mp_msg request, *reply;
+ struct rte_mp_reply replies;
+ struct ipc_queues *request_param = (struct ipc_queues *)request.param;
+ struct ipc_queues *reply_param;
+ int queue, fd_iterator;
+
+ /* Prepare the request */
+ strcpy(request.name, TAP_MP_KEY);
+ strcpy(request_param->port_name, port_name);
+ request.len_param = sizeof(*request_param);
+
+ /* Send request and receive reply */
+ ret = rte_mp_request_sync(&request, &replies, &timeout);
+ if (ret < 0) {
+ TAP_LOG(ERR, "Failed to request queues from primary: %d", rte_errno);
+ return -1;
+ }
+ /* FIXME: handle replies.nb_received > 1 */
+ reply = &replies.msgs[0];
+ reply_param = (struct ipc_queues *)reply->param;
+ TAP_LOG(DEBUG, "Received IPC reply for %s", reply_param->port_name);
+
+ /* Attach the queues from received file descriptors */
+ devpriv = dev->data->dev_private;
+ fd_iterator = 0;
+ for (queue = 0; queue < reply_param->rxq_count; queue++)
+ devpriv->rxq[queue].fd = reply->fds[fd_iterator++];
+ for (queue = 0; queue < reply_param->txq_count; queue++)
+ devpriv->txq[queue].fd = reply->fds[fd_iterator++];
+
+ return 0;
+}
+
+/* Send the queue file descriptors from the primary process to secondary. */
+static int
+tap_mp_sync_queues(const struct rte_mp_msg *request, const void *peer)
+{
+ struct rte_eth_dev *dev;
+ struct pmd_internals *devpriv;
+ struct rte_mp_msg reply;
+ const struct ipc_queues *request_param = (const struct ipc_queues *)request->param;
+ struct ipc_queues *reply_param = (struct ipc_queues *)reply.param;
+ uint16_t port_id;
+ int queue;
+ int ret;
+
+ /* Get requested port */
+ TAP_LOG(DEBUG, "Received IPC request for %s", request_param->port_name);
+ ret = rte_eth_dev_get_port_by_name(request_param->port_name, &port_id);
+ if (ret) {
+ TAP_LOG(ERR, "Failed to get port id for %s", request_param->port_name);
+ return -1;
+ }
+ dev = &rte_eth_devices[port_id];
+ devpriv = dev->data->dev_private;
+
+ /* Fill file descriptors for all queues */
+ reply.num_fds = 0;
+ reply_param->rxq_count = 0;
+ for (queue = 0; queue < dev->data->nb_rx_queues; queue++) {
+ reply.fds[reply.num_fds++] = devpriv->rxq[queue].fd;
+ reply_param->rxq_count++;
+ }
+ reply_param->txq_count = 0;
+ for (queue = 0; queue < dev->data->nb_tx_queues; queue++) {
+ reply.fds[reply.num_fds++] = devpriv->txq[queue].fd;
+ reply_param->txq_count++;
+ }
+ /* FIXME: split message if more queues than RTE_MP_MAX_FD_NUM */
+ RTE_ASSERT(reply.num_fds <= RTE_MP_MAX_FD_NUM);
+
+ /* Send reply */
+ strcpy(reply.name, request->name);
+ strcpy(reply_param->port_name, request_param->port_name);
+ reply.len_param = sizeof(*reply_param);
+ if (rte_mp_reply(&reply, peer) < 0) {
+ TAP_LOG(ERR, "Failed to reply an IPC request to sync queues");
+ return -1;
+ }
+
+ return 0;
+}
+
/* Open a TAP interface device.
*/
static int
@@ -1998,8 +2104,18 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(ERR, "Failed to probe %s", name);
return -1;
}
- /* TODO: request info from primary to set up Rx and Tx */
eth_dev->dev_ops = &ops;
+ eth_dev->rx_pkt_burst = pmd_rx_burst;
+ eth_dev->tx_pkt_burst = pmd_tx_burst;
+
+ if (!rte_eal_primary_proc_alive(NULL)) {
+ TAP_LOG(ERR, "Primary process is missing");
+ return -1;
+ }
+ ret = tap_mp_attach_queues(name, eth_dev);
+ if (ret != 0)
+ return -1;
+
rte_eth_dev_probing_finish(eth_dev);
return 0;
}
@@ -2050,10 +2166,19 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac,
ETH_TUNTAP_TYPE_TAP);

+ /* Register IPC feed callback */
+ ret = rte_mp_action_register(TAP_MP_KEY, tap_mp_sync_queues);
+ if (ret < 0 && rte_errno != EEXIST) {
+ TAP_LOG(ERR, "%s: Failed to register IPC callback: %s",
+ tuntap_name, strerror(rte_errno));
+ goto leave;
+ }
+
leave:
if (ret == -1) {
TAP_LOG(ERR, "Failed to create pmd for %s as %s",
name, tap_name);
+ rte_mp_action_unregister(TAP_MP_KEY);
tap_unit--; /* Restore the unit number */
}
rte_kvargs_free(kvlist);
--
2.17.1
Raslan Darawsheh
2018-09-27 11:19:55 UTC
Permalink
change the fds for the queues to be pointers and add new process private
structure and make the queue fds point to it.

Signed-off-by: Raslan Darawsheh <***@mellanox.com>
---
drivers/net/tap/rte_eth_tap.c | 63 ++++++++++++++++++++++++-------------------
drivers/net/tap/rte_eth_tap.h | 9 +++++--
drivers/net/tap/tap_intr.c | 4 +--
3 files changed, 44 insertions(+), 32 deletions(-)

diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index ad5ae98..8cc4552 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -64,6 +64,7 @@

static struct rte_vdev_driver pmd_tap_drv;
static struct rte_vdev_driver pmd_tun_drv;
+static struct pmd_process_private *process_private;

static const char *valid_arguments[] = {
ETH_TAP_IFACE_ARG,
@@ -331,7 +332,7 @@ pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
uint16_t data_off = rte_pktmbuf_headroom(mbuf);
int len;

- len = readv(rxq->fd, *rxq->iovecs,
+ len = readv(*rxq->fd, *rxq->iovecs,
1 +
(rxq->rxmode->offloads & DEV_RX_OFFLOAD_SCATTER ?
rxq->nb_rx_desc : 1));
@@ -595,7 +596,7 @@ tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
tap_tx_l4_cksum(l4_cksum, l4_phdr_cksum, l4_raw_cksum);

/* copy the tx frame data */
- n = writev(txq->fd, iovecs, j);
+ n = writev(*txq->fd, iovecs, j);
if (n <= 0)
break;
(*num_packets)++;
@@ -976,13 +977,13 @@ tap_dev_close(struct rte_eth_dev *dev)
tap_flow_implicit_flush(internals, NULL);

for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
- if (internals->rxq[i].fd != -1) {
- close(internals->rxq[i].fd);
- internals->rxq[i].fd = -1;
+ if (*internals->rxq[i].fd != -1) {
+ close(*internals->rxq[i].fd);
+ *internals->rxq[i].fd = -1;
}
- if (internals->txq[i].fd != -1) {
- close(internals->txq[i].fd);
- internals->txq[i].fd = -1;
+ if (*internals->txq[i].fd != -1) {
+ close(*internals->txq[i].fd);
+ *internals->txq[i].fd = -1;
}
}

@@ -1007,9 +1008,9 @@ tap_rx_queue_release(void *queue)
{
struct rx_queue *rxq = queue;

- if (rxq && (rxq->fd > 0)) {
- close(rxq->fd);
- rxq->fd = -1;
+ if (rxq && rxq->fd && (*rxq->fd > 0)) {
+ close(*rxq->fd);
+ *rxq->fd = -1;
rte_pktmbuf_free(rxq->pool);
rte_free(rxq->iovecs);
rxq->pool = NULL;
@@ -1022,9 +1023,9 @@ tap_tx_queue_release(void *queue)
{
struct tx_queue *txq = queue;

- if (txq && (txq->fd > 0)) {
- close(txq->fd);
- txq->fd = -1;
+ if (txq && txq->fd && (*txq->fd > 0)) {
+ close(*txq->fd);
+ *txq->fd = -1;
}
}

@@ -1214,13 +1215,13 @@ tap_setup_queue(struct rte_eth_dev *dev,
struct rte_gso_ctx *gso_ctx;

if (is_rx) {
- fd = &rx->fd;
- other_fd = &tx->fd;
+ fd = rx->fd;
+ other_fd = tx->fd;
dir = "rx";
gso_ctx = NULL;
} else {
- fd = &tx->fd;
- other_fd = &rx->fd;
+ fd = tx->fd;
+ other_fd = rx->fd;
dir = "tx";
gso_ctx = &tx->gso_ctx;
}
@@ -1331,7 +1332,7 @@ tap_rx_queue_setup(struct rte_eth_dev *dev,
}

TAP_LOG(DEBUG, " RX TUNTAP device name %s, qid %d on fd %d",
- internals->name, rx_queue_id, internals->rxq[rx_queue_id].fd);
+ internals->name, rx_queue_id, *internals->rxq[rx_queue_id].fd);

return 0;

@@ -1371,7 +1372,7 @@ tap_tx_queue_setup(struct rte_eth_dev *dev,
return -1;
TAP_LOG(DEBUG,
" TX TUNTAP device name %s, qid %d on fd %d csum %s",
- internals->name, tx_queue_id, internals->txq[tx_queue_id].fd,
+ internals->name, tx_queue_id, *internals->txq[tx_queue_id].fd,
txq->csum ? "on" : "off");

return 0;
@@ -1633,6 +1634,10 @@ eth_dev_tap_create(struct rte_vdev_device *vdev, char *tap_name,
goto error_exit_nodev;
}

+ process_private = (struct pmd_process_private *)
+ rte_zmalloc_socket(tap_name, sizeof(struct pmd_process_private),
+ RTE_CACHE_LINE_SIZE, dev->device->numa_node);
+
pmd = dev->data->dev_private;
pmd->dev = dev;
snprintf(pmd->name, sizeof(pmd->name), "%s", tap_name);
@@ -1669,8 +1674,10 @@ eth_dev_tap_create(struct rte_vdev_device *vdev, char *tap_name,
/* Presetup the fds to -1 as being not valid */
pmd->ka_fd = -1;
for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
- pmd->rxq[i].fd = -1;
- pmd->txq[i].fd = -1;
+ process_private->rxq_fds[i] = -1;
+ process_private->txq_fds[i] = -1;
+ pmd->rxq[i].fd = &process_private->rxq_fds[i];
+ pmd->txq[i].fd = &process_private->txq_fds[i];
}

if (pmd->type == ETH_TUNTAP_TYPE_TAP) {
@@ -2089,13 +2096,13 @@ rte_pmd_tap_remove(struct rte_vdev_device *dev)
tap_nl_final(internals->nlsk_fd);
}
for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
- if (internals->rxq[i].fd != -1) {
- close(internals->rxq[i].fd);
- internals->rxq[i].fd = -1;
+ if (*internals->rxq[i].fd != -1) {
+ close(*internals->rxq[i].fd);
+ *internals->rxq[i].fd = -1;
}
- if (internals->txq[i].fd != -1) {
- close(internals->txq[i].fd);
- internals->txq[i].fd = -1;
+ if (*internals->txq[i].fd != -1) {
+ close(*internals->txq[i].fd);
+ *internals->txq[i].fd = -1;
}
}

diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
index 44e2773..4146f5c 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -46,7 +46,7 @@ struct rx_queue {
struct rte_mempool *mp; /* Mempool for RX packets */
uint32_t trigger_seen; /* Last seen Rx trigger value */
uint16_t in_port; /* Port ID */
- int fd;
+ int *fd;
struct pkt_stats stats; /* Stats for this RX queue */
uint16_t nb_rx_desc; /* max number of mbufs available */
struct rte_eth_rxmode *rxmode; /* RX features */
@@ -56,7 +56,7 @@ struct rx_queue {
};

struct tx_queue {
- int fd;
+ int *fd;
int type; /* Type field - TUN|TAP */
uint16_t *mtu; /* Pointer to MTU from dev_data */
uint16_t csum:1; /* Enable checksum offloading */
@@ -92,6 +92,11 @@ struct pmd_internals {
int ka_fd; /* keep-alive file descriptor */
};

+struct pmd_process_private {
+ int rxq_fds[RTE_PMD_TAP_MAX_QUEUES];
+ int txq_fds[RTE_PMD_TAP_MAX_QUEUES];
+};
+
/* tap_intr.c */

int tap_rx_intr_vec_set(struct rte_eth_dev *dev, int set);
diff --git a/drivers/net/tap/tap_intr.c b/drivers/net/tap/tap_intr.c
index fc59018..a4b212d 100644
--- a/drivers/net/tap/tap_intr.c
+++ b/drivers/net/tap/tap_intr.c
@@ -71,7 +71,7 @@ tap_rx_intr_vec_install(struct rte_eth_dev *dev)
struct rx_queue *rxq = pmd->dev->data->rx_queues[i];

/* Skip queues that cannot request interrupts. */
- if (!rxq || rxq->fd <= 0) {
+ if (!rxq || !rxq->fd || *rxq->fd <= 0) {
/* Use invalid intr_vec[] index to disable entry. */
intr_handle->intr_vec[i] =
RTE_INTR_VEC_RXTX_OFFSET +
@@ -79,7 +79,7 @@ tap_rx_intr_vec_install(struct rte_eth_dev *dev)
continue;
}
intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count;
- intr_handle->efds[count] = rxq->fd;
+ intr_handle->efds[count] = *rxq->fd;
count++;
}
if (!count)
--
2.7.4
Raslan Darawsheh
2018-09-27 11:19:56 UTC
Permalink
In the case the device is created by the primary process,
the secondary must request some file descriptors to attach the queues.
The file descriptors are shared via IPC Unix socket.

Thanks to the IPC synchronization, the secondary process
is now able to do Rx/Tx on a TAP created by the primary process.

Signed-off-by: Raslan Darawsheh <***@mellanox.com>
Signed-off-by: Thomas Monjalon <***@monjalon.net>

---
v2:
- translate file descriptors via IPC API
- add documentation
v3:
- rabse the commit
- use private static array for fd's to be local for each process

---
---
doc/guides/nics/tap.rst | 16 ++++
doc/guides/rel_notes/release_18_11.rst | 4 +
drivers/net/tap/Makefile | 1 +
drivers/net/tap/rte_eth_tap.c | 133 +++++++++++++++++++++++++++++++++
4 files changed, 154 insertions(+)

diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst
index 2714868..d1f3e1c 100644
--- a/doc/guides/nics/tap.rst
+++ b/doc/guides/nics/tap.rst
@@ -152,6 +152,22 @@ Distribute IPv4 TCP packets using RSS to a given MAC address over queues 0-3::
testpmd> flow create 0 priority 4 ingress pattern eth dst is 0a:0b:0c:0d:0e:0f \
/ ipv4 / tcp / end actions rss queues 0 1 2 3 end / end

+Multi-process sharing
+---------------------
+
+It is possible to attach an existing TAP device in a secondary process,
+by declaring it as a vdev with the same name as in the primary process,
+and without any parameter.
+
+The port attached in a secondary process will give access to the
+statistics and the queues.
+Therefore it can be used for monitoring or Rx/Tx processing.
+
+The IPC synchronization of Rx/Tx queues is currently limited:
+
+ - Only 8 queues
+ - Synchronized on probing, but not on later port update
+
Example
-------

diff --git a/doc/guides/rel_notes/release_18_11.rst b/doc/guides/rel_notes/release_18_11.rst
index 8c4bb54..a9dda5b 100644
--- a/doc/guides/rel_notes/release_18_11.rst
+++ b/doc/guides/rel_notes/release_18_11.rst
@@ -67,6 +67,10 @@ New Features
SR-IOV option in Hyper-V and Azure. This is an alternative to the previous
vdev_netvsc, tap, and failsafe drivers combination.

+* **Added TAP Rx/Tx queues sharing with a secondary process.**
+
+ A secondary process can attach a TAP device created in the primary process,
+ probe the queues, and process Rx/Tx in a secondary process.

API Changes
-----------
diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index 3243365..7748283 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -22,6 +22,7 @@ CFLAGS += -O3
CFLAGS += -I$(SRCDIR)
CFLAGS += -I.
CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -DALLOW_EXPERIMENTAL_API
LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
LDLIBS += -lrte_bus_vdev -lrte_gso
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 8cc4552..8d276eb 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -16,6 +16,8 @@
#include <rte_debug.h>
#include <rte_ip.h>
#include <rte_string_fns.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>

#include <assert.h>
#include <sys/types.h>
@@ -62,6 +64,9 @@
#define TAP_GSO_MBUFS_NUM \
(TAP_GSO_MBUFS_PER_CORE * TAP_GSO_MBUF_CACHE_SIZE)

+/* IPC key for queue fds sync */
+#define TAP_MP_KEY "tap_mp_sync_queues"
+
static struct rte_vdev_driver pmd_tap_drv;
static struct rte_vdev_driver pmd_tun_drv;
static struct pmd_process_private *process_private;
@@ -101,6 +106,17 @@ enum ioctl_mode {
REMOTE_ONLY,
};

+/* Message header to synchronize queues via IPC */
+struct ipc_queues {
+ char port_name[RTE_DEV_NAME_MAX_LEN];
+ int rxq_count;
+ int txq_count;
+ /*
+ * The file descriptors are in the dedicated part
+ * of the Unix message to be translated by the kernel.
+ */
+};
+
static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);

/**
@@ -1980,6 +1996,100 @@ rte_pmd_tun_probe(struct rte_vdev_device *dev)
return ret;
}

+/* Request queue file descriptors from secondary to primary. */
+static int
+tap_mp_attach_queues(const char *port_name, struct rte_eth_dev *dev)
+{
+ int ret;
+ struct timespec timeout = {.tv_sec = 1, .tv_nsec = 0};
+ struct rte_mp_msg request, *reply;
+ struct rte_mp_reply replies;
+ struct ipc_queues *request_param = (struct ipc_queues *)request.param;
+ struct ipc_queues *reply_param;
+ int queue, fd_iterator;
+
+ /* Prepare the request */
+ strcpy(request.name, TAP_MP_KEY);
+ strcpy(request_param->port_name, port_name);
+ request.len_param = sizeof(*request_param);
+ /* Send request and receive reply */
+ ret = rte_mp_request_sync(&request, &replies, &timeout);
+ if (ret < 0) {
+ TAP_LOG(ERR, "Failed to request queues from primary: %d",
+ rte_errno);
+ return -1;
+ }
+ /* FIXME: handle replies.nb_received > 1 */
+ reply = &replies.msgs[0];
+ reply_param = (struct ipc_queues *)reply->param;
+ TAP_LOG(DEBUG, "Received IPC reply for %s", reply_param->port_name);
+
+ /* Attach the queues from received file descriptors */
+
+ dev->data->nb_rx_queues = reply_param->rxq_count;
+ dev->data->nb_tx_queues = reply_param->txq_count;
+ fd_iterator = 0;
+ for (queue = 0; queue < reply_param->rxq_count; queue++)
+ process_private->rxq_fds[queue] = reply->fds[fd_iterator++];
+ for (queue = 0; queue < reply_param->txq_count; queue++)
+ process_private->txq_fds[queue] = reply->fds[fd_iterator++];
+
+
+ return 0;
+}
+
+/* Send the queue file descriptors from the primary process to secondary. */
+static int
+tap_mp_sync_queues(const struct rte_mp_msg *request, const void *peer)
+{
+ struct rte_eth_dev *dev;
+ struct rte_mp_msg reply;
+ const struct ipc_queues *request_param =
+ (const struct ipc_queues *)request->param;
+ struct ipc_queues *reply_param =
+ (struct ipc_queues *)reply.param;
+ uint16_t port_id;
+ int queue;
+ int ret;
+
+ /* Get requested port */
+ TAP_LOG(DEBUG, "Received IPC request for %s", request_param->port_name);
+ ret = rte_eth_dev_get_port_by_name(request_param->port_name, &port_id);
+ if (ret) {
+ TAP_LOG(ERR, "Failed to get port id for %s",
+ request_param->port_name);
+ return -1;
+ }
+ dev = &rte_eth_devices[port_id];
+
+ /* Fill file descriptors for all queues */
+ reply.num_fds = 0;
+ reply_param->rxq_count = 0;
+ for (queue = 0; queue < dev->data->nb_rx_queues; queue++) {
+ reply.fds[reply.num_fds++] = process_private->rxq_fds[queue];
+ reply_param->rxq_count++;
+ }
+ assert(reply_param->rxq_count == dev->data->nb_rx_queues);
+ reply_param->txq_count = 0;
+ for (queue = 0; queue < dev->data->nb_tx_queues; queue++) {
+ reply.fds[reply.num_fds++] = process_private->txq_fds[queue];
+ reply_param->txq_count++;
+ }
+ assert(reply_param->txq_count == dev->data->nb_tx_queues);
+ /* FIXME: split message if more queues than RTE_MP_MAX_FD_NUM */
+ RTE_ASSERT(reply.num_fds <= RTE_MP_MAX_FD_NUM);
+
+ /* Send reply */
+ strcpy(reply.name, request->name);
+ strcpy(reply_param->port_name, request_param->port_name);
+ reply.len_param = sizeof(*reply_param);
+ if (rte_mp_reply(&reply, peer) < 0) {
+ TAP_LOG(ERR, "Failed to reply an IPC request to sync queues");
+ return -1;
+ }
+ return 0;
+}
+
/* Open a TAP interface device.
*/
static int
@@ -2009,6 +2119,21 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
/* TODO: request info from primary to set up Rx and Tx */
eth_dev->dev_ops = &ops;
eth_dev->device = &dev->device;
+ eth_dev->rx_pkt_burst = pmd_rx_burst;
+ eth_dev->tx_pkt_burst = pmd_tx_burst;
+ if (!rte_eal_primary_proc_alive(NULL)) {
+ TAP_LOG(ERR, "Primary process is missing");
+ return -1;
+ }
+ process_private = (struct pmd_process_private *)
+ rte_zmalloc_socket(name,
+ sizeof(struct pmd_process_private),
+ RTE_CACHE_LINE_SIZE,
+ eth_dev->device->numa_node);
+
+ ret = tap_mp_attach_queues(name, eth_dev);
+ if (ret != 0)
+ return -1;
rte_eth_dev_probing_finish(eth_dev);
return 0;
}
@@ -2056,6 +2181,13 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(NOTICE, "Initializing pmd_tap for %s as %s",
name, tap_name);

+ /* Register IPC feed callback */
+ ret = rte_mp_action_register(TAP_MP_KEY, tap_mp_sync_queues);
+ if (ret < 0 && rte_errno != EEXIST) {
+ TAP_LOG(ERR, "%s: Failed to register IPC callback: %s",
+ tuntap_name, strerror(rte_errno));
+ goto leave;
+ }
ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac,
ETH_TUNTAP_TYPE_TAP);

@@ -2063,6 +2195,7 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
if (ret == -1) {
TAP_LOG(ERR, "Failed to create pmd for %s as %s",
name, tap_name);
+ rte_mp_action_unregister(TAP_MP_KEY);
tap_unit--; /* Restore the unit number */
}
rte_kvargs_free(kvlist);
--
2.7.4
Wiles, Keith
2018-09-27 13:04:27 UTC
Permalink
Post by Thomas Monjalon
In the case the device is created by the primary process,
the secondary must request some file descriptors to attach the queues.
The file descriptors are shared via IPC Unix socket.
Thanks to the IPC synchronization, the secondary process
is now able to do Rx/Tx on a TAP created by the primary process.
---
- translate file descriptors via IPC API
- add documentation
- rabse the commit
- use private static array for fd's to be local for each process
---
---
doc/guides/nics/tap.rst | 16 ++++
doc/guides/rel_notes/release_18_11.rst | 4 +
drivers/net/tap/Makefile | 1 +
drivers/net/tap/rte_eth_tap.c | 133 +++++++++++++++++++++++++++++++++
4 files changed, 154 insertions(+)
diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst
index 2714868..d1f3e1c 100644
--- a/doc/guides/nics/tap.rst
+++ b/doc/guides/nics/tap.rst
testpmd> flow create 0 priority 4 ingress pattern eth dst is 0a:0b:0c:0d:0e:0f \
/ ipv4 / tcp / end actions rss queues 0 1 2 3 end / end
+Multi-process sharing
+---------------------
+
+It is possible to attach an existing TAP device in a secondary process,
+by declaring it as a vdev with the same name as in the primary process,
+and without any parameter.
+
+The port attached in a secondary process will give access to the
+statistics and the queues.
+Therefore it can be used for monitoring or Rx/Tx processing.
+
+
+ - Only 8 queues
+ - Synchronized on probing, but not on later port update
+
Example
-------
diff --git a/doc/guides/rel_notes/release_18_11.rst b/doc/guides/rel_notes/release_18_11.rst
index 8c4bb54..a9dda5b 100644
--- a/doc/guides/rel_notes/release_18_11.rst
+++ b/doc/guides/rel_notes/release_18_11.rst
@@ -67,6 +67,10 @@ New Features
SR-IOV option in Hyper-V and Azure. This is an alternative to the previous
vdev_netvsc, tap, and failsafe drivers combination.
+* **Added TAP Rx/Tx queues sharing with a secondary process.**
+
+ A secondary process can attach a TAP device created in the primary process,
+ probe the queues, and process Rx/Tx in a secondary process.
API Changes
-----------
diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index 3243365..7748283 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -22,6 +22,7 @@ CFLAGS += -O3
CFLAGS += -I$(SRCDIR)
CFLAGS += -I.
CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -DALLOW_EXPERIMENTAL_API
LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
LDLIBS += -lrte_bus_vdev -lrte_gso
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 8cc4552..8d276eb 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -16,6 +16,8 @@
#include <rte_debug.h>
#include <rte_ip.h>
#include <rte_string_fns.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>
#include <assert.h>
#include <sys/types.h>
@@ -62,6 +64,9 @@
#define TAP_GSO_MBUFS_NUM \
(TAP_GSO_MBUFS_PER_CORE * TAP_GSO_MBUF_CACHE_SIZE)
+/* IPC key for queue fds sync */
+#define TAP_MP_KEY "tap_mp_sync_queues"
+
static struct rte_vdev_driver pmd_tap_drv;
static struct rte_vdev_driver pmd_tun_drv;
static struct pmd_process_private *process_private;
@@ -101,6 +106,17 @@ enum ioctl_mode {
REMOTE_ONLY,
};
+/* Message header to synchronize queues via IPC */
+struct ipc_queues {
+ char port_name[RTE_DEV_NAME_MAX_LEN];
+ int rxq_count;
+ int txq_count;
+ /*
+ * The file descriptors are in the dedicated part
+ * of the Unix message to be translated by the kernel.
+ */
+};
+
static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);
/**
@@ -1980,6 +1996,100 @@ rte_pmd_tun_probe(struct rte_vdev_device *dev)
return ret;
}
+/* Request queue file descriptors from secondary to primary. */
+static int
+tap_mp_attach_queues(const char *port_name, struct rte_eth_dev *dev)
+{
+ int ret;
+ struct timespec timeout = {.tv_sec = 1, .tv_nsec = 0};
+ struct rte_mp_msg request, *reply;
+ struct rte_mp_reply replies;
+ struct ipc_queues *request_param = (struct ipc_queues *)request.param;
+ struct ipc_queues *reply_param;
+ int queue, fd_iterator;
+
+ /* Prepare the request */
+ strcpy(request.name, TAP_MP_KEY);
+ strcpy(request_param->port_name, port_name);
Should we not be using the strlcpy() functions here.
Post by Thomas Monjalon
+ request.len_param = sizeof(*request_param);
+ /* Send request and receive reply */
+ ret = rte_mp_request_sync(&request, &replies, &timeout);
+ if (ret < 0) {
+ TAP_LOG(ERR, "Failed to request queues from primary: %d",
+ rte_errno);
+ return -1;
+ }
+ /* FIXME: handle replies.nb_received > 1 */
+ reply = &replies.msgs[0];
+ reply_param = (struct ipc_queues *)reply->param;
+ TAP_LOG(DEBUG, "Received IPC reply for %s", reply_param->port_name);
+
+ /* Attach the queues from received file descriptors */
+
+ dev->data->nb_rx_queues = reply_param->rxq_count;
+ dev->data->nb_tx_queues = reply_param->txq_count;
Do we really need a rxq_count and txq_count as they are always the same it seems? Just a comment not a request to change it.
Post by Thomas Monjalon
+ fd_iterator = 0;
+ for (queue = 0; queue < reply_param->rxq_count; queue++)
+ process_private->rxq_fds[queue] = reply->fds[fd_iterator++];
+ for (queue = 0; queue < reply_param->txq_count; queue++)
+ process_private->txq_fds[queue] = reply->fds[fd_iterator++];
+
+
Extra blank line here, needs to be removed.
Post by Thomas Monjalon
+ return 0;
+}
+
+/* Send the queue file descriptors from the primary process to secondary. */
+static int
+tap_mp_sync_queues(const struct rte_mp_msg *request, const void *peer)
+{
+ struct rte_eth_dev *dev;
+ struct rte_mp_msg reply;
+ const struct ipc_queues *request_param =
+ (const struct ipc_queues *)request->param;
+ struct ipc_queues *reply_param =
+ (struct ipc_queues *)reply.param;
+ uint16_t port_id;
+ int queue;
+ int ret;
+
+ /* Get requested port */
+ TAP_LOG(DEBUG, "Received IPC request for %s", request_param->port_name);
+ ret = rte_eth_dev_get_port_by_name(request_param->port_name, &port_id);
+ if (ret) {
+ TAP_LOG(ERR, "Failed to get port id for %s",
+ request_param->port_name);
+ return -1;
+ }
+ dev = &rte_eth_devices[port_id];
+
+ /* Fill file descriptors for all queues */
+ reply.num_fds = 0;
+ reply_param->rxq_count = 0;
+ for (queue = 0; queue < dev->data->nb_rx_queues; queue++) {
+ reply.fds[reply.num_fds++] = process_private->rxq_fds[queue];
+ reply_param->rxq_count++;
+ }
+ assert(reply_param->rxq_count == dev->data->nb_rx_queues);
Pick an assert method assert() or RTE_ASSERT() why have both? I would suggest using RTE_ASSERT everywhere.
Post by Thomas Monjalon
+ reply_param->txq_count = 0;
+ for (queue = 0; queue < dev->data->nb_tx_queues; queue++) {
+ reply.fds[reply.num_fds++] = process_private->txq_fds[queue];
+ reply_param->txq_count++;
+ }
+ assert(reply_param->txq_count == dev->data->nb_tx_queues);
+ /* FIXME: split message if more queues than RTE_MP_MAX_FD_NUM */
+ RTE_ASSERT(reply.num_fds <= RTE_MP_MAX_FD_NUM);
I do not like FIXME or TODO statements in the code. For FIXME we need to fix it or remove the comment. For TODO we need to put it in a enhancement list and do it later.
Post by Thomas Monjalon
+
+ /* Send reply */
+ strcpy(reply.name, request->name);
+ strcpy(reply_param->port_name, request_param->port_name);
strlcpy() here as well.
Post by Thomas Monjalon
+ reply.len_param = sizeof(*reply_param);
+ if (rte_mp_reply(&reply, peer) < 0) {
+ TAP_LOG(ERR, "Failed to reply an IPC request to sync queues");
+ return -1;
+ }
+ return 0;
+}
+
/* Open a TAP interface device.
*/
static int
@@ -2009,6 +2119,21 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
/* TODO: request info from primary to set up Rx and Tx */
eth_dev->dev_ops = &ops;
eth_dev->device = &dev->device;
+ eth_dev->rx_pkt_burst = pmd_rx_burst;
+ eth_dev->tx_pkt_burst = pmd_tx_burst;
+ if (!rte_eal_primary_proc_alive(NULL)) {
+ TAP_LOG(ERR, "Primary process is missing");
+ return -1;
+ }
+ process_private = (struct pmd_process_private *)
+ rte_zmalloc_socket(name,
+ sizeof(struct pmd_process_private),
+ RTE_CACHE_LINE_SIZE,
+ eth_dev->device->numa_node);
+
+ ret = tap_mp_attach_queues(name, eth_dev);
+ if (ret != 0)
+ return -1;
rte_eth_dev_probing_finish(eth_dev);
return 0;
}
@@ -2056,6 +2181,13 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(NOTICE, "Initializing pmd_tap for %s as %s",
name, tap_name);
+ /* Register IPC feed callback */
+ ret = rte_mp_action_register(TAP_MP_KEY, tap_mp_sync_queues);
+ if (ret < 0 && rte_errno != EEXIST) {
+ TAP_LOG(ERR, "%s: Failed to register IPC callback: %s",
+ tuntap_name, strerror(rte_errno));
+ goto leave;
+ }
ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac,
ETH_TUNTAP_TYPE_TAP);
@@ -2063,6 +2195,7 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
if (ret == -1) {
TAP_LOG(ERR, "Failed to create pmd for %s as %s",
name, tap_name);
+ rte_mp_action_unregister(TAP_MP_KEY);
tap_unit--; /* Restore the unit number */
}
rte_kvargs_free(kvlist);
--
2.7.4
Regards,
Keith
Thomas Monjalon
2018-09-27 18:53:46 UTC
Permalink
Hi Keith and Raslan,

Thanks for your review Keith.
It reminds me that you had some comments in my v3 which were not addressed.

Raslan, it seems you missed the v3 I had sent (you sent another v3).
Please check the comments done by Keith and Ferruh during the summer.

Thank you all
Raslan Darawsheh
2018-10-02 10:34:10 UTC
Permalink
change the fds for the queues to be pointers and add new process private
structure and make the queue fds point to it.

Signed-off-by: Raslan Darawsheh <***@mellanox.com>
---
drivers/net/tap/rte_eth_tap.c | 63 ++++++++++++++++++++++++-------------------
drivers/net/tap/rte_eth_tap.h | 9 +++++--
drivers/net/tap/tap_intr.c | 4 +--
3 files changed, 44 insertions(+), 32 deletions(-)

diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index ad5ae98..8cc4552 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -64,6 +64,7 @@

static struct rte_vdev_driver pmd_tap_drv;
static struct rte_vdev_driver pmd_tun_drv;
+static struct pmd_process_private *process_private;

static const char *valid_arguments[] = {
ETH_TAP_IFACE_ARG,
@@ -331,7 +332,7 @@ pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
uint16_t data_off = rte_pktmbuf_headroom(mbuf);
int len;

- len = readv(rxq->fd, *rxq->iovecs,
+ len = readv(*rxq->fd, *rxq->iovecs,
1 +
(rxq->rxmode->offloads & DEV_RX_OFFLOAD_SCATTER ?
rxq->nb_rx_desc : 1));
@@ -595,7 +596,7 @@ tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
tap_tx_l4_cksum(l4_cksum, l4_phdr_cksum, l4_raw_cksum);

/* copy the tx frame data */
- n = writev(txq->fd, iovecs, j);
+ n = writev(*txq->fd, iovecs, j);
if (n <= 0)
break;
(*num_packets)++;
@@ -976,13 +977,13 @@ tap_dev_close(struct rte_eth_dev *dev)
tap_flow_implicit_flush(internals, NULL);

for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
- if (internals->rxq[i].fd != -1) {
- close(internals->rxq[i].fd);
- internals->rxq[i].fd = -1;
+ if (*internals->rxq[i].fd != -1) {
+ close(*internals->rxq[i].fd);
+ *internals->rxq[i].fd = -1;
}
- if (internals->txq[i].fd != -1) {
- close(internals->txq[i].fd);
- internals->txq[i].fd = -1;
+ if (*internals->txq[i].fd != -1) {
+ close(*internals->txq[i].fd);
+ *internals->txq[i].fd = -1;
}
}

@@ -1007,9 +1008,9 @@ tap_rx_queue_release(void *queue)
{
struct rx_queue *rxq = queue;

- if (rxq && (rxq->fd > 0)) {
- close(rxq->fd);
- rxq->fd = -1;
+ if (rxq && rxq->fd && (*rxq->fd > 0)) {
+ close(*rxq->fd);
+ *rxq->fd = -1;
rte_pktmbuf_free(rxq->pool);
rte_free(rxq->iovecs);
rxq->pool = NULL;
@@ -1022,9 +1023,9 @@ tap_tx_queue_release(void *queue)
{
struct tx_queue *txq = queue;

- if (txq && (txq->fd > 0)) {
- close(txq->fd);
- txq->fd = -1;
+ if (txq && txq->fd && (*txq->fd > 0)) {
+ close(*txq->fd);
+ *txq->fd = -1;
}
}

@@ -1214,13 +1215,13 @@ tap_setup_queue(struct rte_eth_dev *dev,
struct rte_gso_ctx *gso_ctx;

if (is_rx) {
- fd = &rx->fd;
- other_fd = &tx->fd;
+ fd = rx->fd;
+ other_fd = tx->fd;
dir = "rx";
gso_ctx = NULL;
} else {
- fd = &tx->fd;
- other_fd = &rx->fd;
+ fd = tx->fd;
+ other_fd = rx->fd;
dir = "tx";
gso_ctx = &tx->gso_ctx;
}
@@ -1331,7 +1332,7 @@ tap_rx_queue_setup(struct rte_eth_dev *dev,
}

TAP_LOG(DEBUG, " RX TUNTAP device name %s, qid %d on fd %d",
- internals->name, rx_queue_id, internals->rxq[rx_queue_id].fd);
+ internals->name, rx_queue_id, *internals->rxq[rx_queue_id].fd);

return 0;

@@ -1371,7 +1372,7 @@ tap_tx_queue_setup(struct rte_eth_dev *dev,
return -1;
TAP_LOG(DEBUG,
" TX TUNTAP device name %s, qid %d on fd %d csum %s",
- internals->name, tx_queue_id, internals->txq[tx_queue_id].fd,
+ internals->name, tx_queue_id, *internals->txq[tx_queue_id].fd,
txq->csum ? "on" : "off");

return 0;
@@ -1633,6 +1634,10 @@ eth_dev_tap_create(struct rte_vdev_device *vdev, char *tap_name,
goto error_exit_nodev;
}

+ process_private = (struct pmd_process_private *)
+ rte_zmalloc_socket(tap_name, sizeof(struct pmd_process_private),
+ RTE_CACHE_LINE_SIZE, dev->device->numa_node);
+
pmd = dev->data->dev_private;
pmd->dev = dev;
snprintf(pmd->name, sizeof(pmd->name), "%s", tap_name);
@@ -1669,8 +1674,10 @@ eth_dev_tap_create(struct rte_vdev_device *vdev, char *tap_name,
/* Presetup the fds to -1 as being not valid */
pmd->ka_fd = -1;
for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
- pmd->rxq[i].fd = -1;
- pmd->txq[i].fd = -1;
+ process_private->rxq_fds[i] = -1;
+ process_private->txq_fds[i] = -1;
+ pmd->rxq[i].fd = &process_private->rxq_fds[i];
+ pmd->txq[i].fd = &process_private->txq_fds[i];
}

if (pmd->type == ETH_TUNTAP_TYPE_TAP) {
@@ -2089,13 +2096,13 @@ rte_pmd_tap_remove(struct rte_vdev_device *dev)
tap_nl_final(internals->nlsk_fd);
}
for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
- if (internals->rxq[i].fd != -1) {
- close(internals->rxq[i].fd);
- internals->rxq[i].fd = -1;
+ if (*internals->rxq[i].fd != -1) {
+ close(*internals->rxq[i].fd);
+ *internals->rxq[i].fd = -1;
}
- if (internals->txq[i].fd != -1) {
- close(internals->txq[i].fd);
- internals->txq[i].fd = -1;
+ if (*internals->txq[i].fd != -1) {
+ close(*internals->txq[i].fd);
+ *internals->txq[i].fd = -1;
}
}

diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
index 44e2773..4146f5c 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -46,7 +46,7 @@ struct rx_queue {
struct rte_mempool *mp; /* Mempool for RX packets */
uint32_t trigger_seen; /* Last seen Rx trigger value */
uint16_t in_port; /* Port ID */
- int fd;
+ int *fd;
struct pkt_stats stats; /* Stats for this RX queue */
uint16_t nb_rx_desc; /* max number of mbufs available */
struct rte_eth_rxmode *rxmode; /* RX features */
@@ -56,7 +56,7 @@ struct rx_queue {
};

struct tx_queue {
- int fd;
+ int *fd;
int type; /* Type field - TUN|TAP */
uint16_t *mtu; /* Pointer to MTU from dev_data */
uint16_t csum:1; /* Enable checksum offloading */
@@ -92,6 +92,11 @@ struct pmd_internals {
int ka_fd; /* keep-alive file descriptor */
};

+struct pmd_process_private {
+ int rxq_fds[RTE_PMD_TAP_MAX_QUEUES];
+ int txq_fds[RTE_PMD_TAP_MAX_QUEUES];
+};
+
/* tap_intr.c */

int tap_rx_intr_vec_set(struct rte_eth_dev *dev, int set);
diff --git a/drivers/net/tap/tap_intr.c b/drivers/net/tap/tap_intr.c
index fc59018..a4b212d 100644
--- a/drivers/net/tap/tap_intr.c
+++ b/drivers/net/tap/tap_intr.c
@@ -71,7 +71,7 @@ tap_rx_intr_vec_install(struct rte_eth_dev *dev)
struct rx_queue *rxq = pmd->dev->data->rx_queues[i];

/* Skip queues that cannot request interrupts. */
- if (!rxq || rxq->fd <= 0) {
+ if (!rxq || !rxq->fd || *rxq->fd <= 0) {
/* Use invalid intr_vec[] index to disable entry. */
intr_handle->intr_vec[i] =
RTE_INTR_VEC_RXTX_OFFSET +
@@ -79,7 +79,7 @@ tap_rx_intr_vec_install(struct rte_eth_dev *dev)
continue;
}
intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count;
- intr_handle->efds[count] = rxq->fd;
+ intr_handle->efds[count] = *rxq->fd;
count++;
}
if (!count)
--
2.7.4
Raslan Darawsheh
2018-10-02 10:34:11 UTC
Permalink
In the case the device is created by the primary process,
the secondary must request some file descriptors to attach the queues.
The file descriptors are shared via IPC Unix socket.

Thanks to the IPC synchronization, the secondary process
is now able to do Rx/Tx on a TAP created by the primary process.

Signed-off-by: Raslan Darawsheh <***@mellanox.com>
Signed-off-by: Thomas Monjalon <***@monjalon.net>

---
v2:
- translate file descriptors via IPC API
- add documentation
v3:
- rabse the commit
- use private static array for fd's to be local for each process

v4:
- change strcpy to be strlcpy
- remove the fixme and todo from comments.

---
---
doc/guides/nics/tap.rst | 16 ++++
doc/guides/rel_notes/release_18_11.rst | 4 +
drivers/net/tap/Makefile | 1 +
drivers/net/tap/rte_eth_tap.c | 133 ++++++++++++++++++++++++++++++++-
4 files changed, 153 insertions(+), 1 deletion(-)

diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst
index 2714868..d1f3e1c 100644
--- a/doc/guides/nics/tap.rst
+++ b/doc/guides/nics/tap.rst
@@ -152,6 +152,22 @@ Distribute IPv4 TCP packets using RSS to a given MAC address over queues 0-3::
testpmd> flow create 0 priority 4 ingress pattern eth dst is 0a:0b:0c:0d:0e:0f \
/ ipv4 / tcp / end actions rss queues 0 1 2 3 end / end

+Multi-process sharing
+---------------------
+
+It is possible to attach an existing TAP device in a secondary process,
+by declaring it as a vdev with the same name as in the primary process,
+and without any parameter.
+
+The port attached in a secondary process will give access to the
+statistics and the queues.
+Therefore it can be used for monitoring or Rx/Tx processing.
+
+The IPC synchronization of Rx/Tx queues is currently limited:
+
+ - Only 8 queues
+ - Synchronized on probing, but not on later port update
+
Example
-------

diff --git a/doc/guides/rel_notes/release_18_11.rst b/doc/guides/rel_notes/release_18_11.rst
index 8c4bb54..a9dda5b 100644
--- a/doc/guides/rel_notes/release_18_11.rst
+++ b/doc/guides/rel_notes/release_18_11.rst
@@ -67,6 +67,10 @@ New Features
SR-IOV option in Hyper-V and Azure. This is an alternative to the previous
vdev_netvsc, tap, and failsafe drivers combination.

+* **Added TAP Rx/Tx queues sharing with a secondary process.**
+
+ A secondary process can attach a TAP device created in the primary process,
+ probe the queues, and process Rx/Tx in a secondary process.

API Changes
-----------
diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index 3243365..7748283 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -22,6 +22,7 @@ CFLAGS += -O3
CFLAGS += -I$(SRCDIR)
CFLAGS += -I.
CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -DALLOW_EXPERIMENTAL_API
LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
LDLIBS += -lrte_bus_vdev -lrte_gso
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 8cc4552..a751f76 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -16,6 +16,8 @@
#include <rte_debug.h>
#include <rte_ip.h>
#include <rte_string_fns.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>

#include <assert.h>
#include <sys/types.h>
@@ -62,6 +64,9 @@
#define TAP_GSO_MBUFS_NUM \
(TAP_GSO_MBUFS_PER_CORE * TAP_GSO_MBUF_CACHE_SIZE)

+/* IPC key for queue fds sync */
+#define TAP_MP_KEY "tap_mp_sync_queues"
+
static struct rte_vdev_driver pmd_tap_drv;
static struct rte_vdev_driver pmd_tun_drv;
static struct pmd_process_private *process_private;
@@ -101,6 +106,17 @@ enum ioctl_mode {
REMOTE_ONLY,
};

+/* Message header to synchronize queues via IPC */
+struct ipc_queues {
+ char port_name[RTE_DEV_NAME_MAX_LEN];
+ int rxq_count;
+ int txq_count;
+ /*
+ * The file descriptors are in the dedicated part
+ * of the Unix message to be translated by the kernel.
+ */
+};
+
static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);

/**
@@ -1980,6 +1996,99 @@ rte_pmd_tun_probe(struct rte_vdev_device *dev)
return ret;
}

+/* Request queue file descriptors from secondary to primary. */
+static int
+tap_mp_attach_queues(const char *port_name, struct rte_eth_dev *dev)
+{
+ int ret;
+ struct timespec timeout = {.tv_sec = 1, .tv_nsec = 0};
+ struct rte_mp_msg request, *reply;
+ struct rte_mp_reply replies;
+ struct ipc_queues *request_param = (struct ipc_queues *)request.param;
+ struct ipc_queues *reply_param;
+ int queue, fd_iterator;
+
+ /* Prepare the request */
+ strlcpy(request.name, TAP_MP_KEY, sizeof(request.name));
+ strlcpy(request_param->port_name, port_name,
+ sizeof(request_param->port_name));
+ request.len_param = sizeof(*request_param);
+ /* Send request and receive reply */
+ ret = rte_mp_request_sync(&request, &replies, &timeout);
+ if (ret < 0) {
+ TAP_LOG(ERR, "Failed to request queues from primary: %d",
+ rte_errno);
+ return -1;
+ }
+ reply = &replies.msgs[0];
+ reply_param = (struct ipc_queues *)reply->param;
+ TAP_LOG(DEBUG, "Received IPC reply for %s", reply_param->port_name);
+
+ /* Attach the queues from received file descriptors */
+
+ dev->data->nb_rx_queues = reply_param->rxq_count;
+ dev->data->nb_tx_queues = reply_param->txq_count;
+ fd_iterator = 0;
+ for (queue = 0; queue < reply_param->rxq_count; queue++)
+ process_private->rxq_fds[queue] = reply->fds[fd_iterator++];
+ for (queue = 0; queue < reply_param->txq_count; queue++)
+ process_private->txq_fds[queue] = reply->fds[fd_iterator++];
+
+ return 0;
+}
+
+/* Send the queue file descriptors from the primary process to secondary. */
+static int
+tap_mp_sync_queues(const struct rte_mp_msg *request, const void *peer)
+{
+ struct rte_eth_dev *dev;
+ struct rte_mp_msg reply;
+ const struct ipc_queues *request_param =
+ (const struct ipc_queues *)request->param;
+ struct ipc_queues *reply_param =
+ (struct ipc_queues *)reply.param;
+ uint16_t port_id;
+ int queue;
+ int ret;
+
+ /* Get requested port */
+ TAP_LOG(DEBUG, "Received IPC request for %s", request_param->port_name);
+ ret = rte_eth_dev_get_port_by_name(request_param->port_name, &port_id);
+ if (ret) {
+ TAP_LOG(ERR, "Failed to get port id for %s",
+ request_param->port_name);
+ return -1;
+ }
+ dev = &rte_eth_devices[port_id];
+
+ /* Fill file descriptors for all queues */
+ reply.num_fds = 0;
+ reply_param->rxq_count = 0;
+ for (queue = 0; queue < dev->data->nb_rx_queues; queue++) {
+ reply.fds[reply.num_fds++] = process_private->rxq_fds[queue];
+ reply_param->rxq_count++;
+ }
+ RTE_ASSERT(reply_param->rxq_count == dev->data->nb_rx_queues);
+ reply_param->txq_count = 0;
+ for (queue = 0; queue < dev->data->nb_tx_queues; queue++) {
+ reply.fds[reply.num_fds++] = process_private->txq_fds[queue];
+ reply_param->txq_count++;
+ }
+ RTE_ASSERT(reply_param->txq_count == dev->data->nb_tx_queues);
+ RTE_ASSERT(reply.num_fds <= RTE_MP_MAX_FD_NUM);
+
+ /* Send reply */
+ strlcpy(reply.name, request->name, sizeof(reply.name));
+ strlcpy(reply_param->port_name, request_param->port_name,
+ sizeof(reply_param->port_name));
+ reply.len_param = sizeof(*reply_param);
+ if (rte_mp_reply(&reply, peer) < 0) {
+ TAP_LOG(ERR, "Failed to reply an IPC request to sync queues");
+ return -1;
+ }
+ return 0;
+}
+
/* Open a TAP interface device.
*/
static int
@@ -2006,9 +2115,23 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(ERR, "Failed to probe %s", name);
return -1;
}
- /* TODO: request info from primary to set up Rx and Tx */
eth_dev->dev_ops = &ops;
eth_dev->device = &dev->device;
+ eth_dev->rx_pkt_burst = pmd_rx_burst;
+ eth_dev->tx_pkt_burst = pmd_tx_burst;
+ if (!rte_eal_primary_proc_alive(NULL)) {
+ TAP_LOG(ERR, "Primary process is missing");
+ return -1;
+ }
+ process_private = (struct pmd_process_private *)
+ rte_zmalloc_socket(name,
+ sizeof(struct pmd_process_private),
+ RTE_CACHE_LINE_SIZE,
+ eth_dev->device->numa_node);
+
+ ret = tap_mp_attach_queues(name, eth_dev);
+ if (ret != 0)
+ return -1;
rte_eth_dev_probing_finish(eth_dev);
return 0;
}
@@ -2056,6 +2179,13 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(NOTICE, "Initializing pmd_tap for %s as %s",
name, tap_name);

+ /* Register IPC feed callback */
+ ret = rte_mp_action_register(TAP_MP_KEY, tap_mp_sync_queues);
+ if (ret < 0 && rte_errno != EEXIST) {
+ TAP_LOG(ERR, "%s: Failed to register IPC callback: %s",
+ tuntap_name, strerror(rte_errno));
+ goto leave;
+ }
ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac,
ETH_TUNTAP_TYPE_TAP);

@@ -2063,6 +2193,7 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
if (ret == -1) {
TAP_LOG(ERR, "Failed to create pmd for %s as %s",
name, tap_name);
+ rte_mp_action_unregister(TAP_MP_KEY);
tap_unit--; /* Restore the unit number */
}
rte_kvargs_free(kvlist);
--
2.7.4
Thomas Monjalon
2018-10-02 10:41:18 UTC
Permalink
Post by Raslan Darawsheh
@@ -2056,6 +2179,13 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(NOTICE, "Initializing pmd_tap for %s as %s",
name, tap_name);
+ /* Register IPC feed callback */
+ ret = rte_mp_action_register(TAP_MP_KEY, tap_mp_sync_queues);
+ if (ret < 0 && rte_errno != EEXIST) {
+ TAP_LOG(ERR, "%s: Failed to register IPC callback: %s",
+ tuntap_name, strerror(rte_errno));
+ goto leave;
+ }
ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac,
ETH_TUNTAP_TYPE_TAP);
Is it an issue registering tap_mp_sync_queues at each tap probing?
Should we do it only once?
Raslan Darawsheh
2018-10-02 10:50:55 UTC
Permalink
It should be as of per device so we should do it for each port alone since several ports can have different queues.

Moreover, if the port that has the registration was closed or unplugged we'll not be able to sync qeues for other ports.

Kindest regards,
Raslan Darawsheh

-----Original Message-----
From: Thomas Monjalon <***@monjalon.net>
Sent: Tuesday, October 2, 2018 1:41 PM
To: Raslan Darawsheh <***@mellanox.com>
Cc: ***@dpdk.org; ***@intel.com; Shahaf Shuler <***@mellanox.com>; Ori Kam <***@mellanox.com>
Subject: Re: [dpdk-dev] [PATCH v4 2/2] net/tap: add queues when attaching from secondary process
Post by Raslan Darawsheh
@@ -2056,6 +2179,13 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(NOTICE, "Initializing pmd_tap for %s as %s",
name, tap_name);
+ /* Register IPC feed callback */
+ ret = rte_mp_action_register(TAP_MP_KEY, tap_mp_sync_queues);
+ if (ret < 0 && rte_errno != EEXIST) {
+ TAP_LOG(ERR, "%s: Failed to register IPC callback: %s",
+ tuntap_name, strerror(rte_errno));
+ goto leave;
+ }
ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac,
ETH_TUNTAP_TYPE_TAP);
Is it an issue registering tap_mp_sync_queues at each tap probing?
Should we do it only once?
Thomas Monjalon
2018-10-02 11:38:09 UTC
Permalink
Post by Raslan Darawsheh
Post by Thomas Monjalon
Post by Raslan Darawsheh
@@ -2056,6 +2179,13 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(NOTICE, "Initializing pmd_tap for %s as %s",
name, tap_name);
+ /* Register IPC feed callback */
+ ret = rte_mp_action_register(TAP_MP_KEY, tap_mp_sync_queues);
+ if (ret < 0 && rte_errno != EEXIST) {
+ TAP_LOG(ERR, "%s: Failed to register IPC callback: %s",
+ tuntap_name, strerror(rte_errno));
+ goto leave;
+ }
ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac,
ETH_TUNTAP_TYPE_TAP);
Is it an issue registering tap_mp_sync_queues at each tap probing?
Should we do it only once?
It should be as of per device so we should do it for each port alone since several ports can have different queues.
Moreover, if the port that has the registration was closed or unplugged we'll not be able to sync qeues for other ports.
I think we should do register on first tap device probing and never unregisters.

Ferruh, any opinion?
Ferruh Yigit
2018-10-03 16:28:06 UTC
Permalink
Post by Thomas Monjalon
Post by Raslan Darawsheh
Post by Thomas Monjalon
Post by Raslan Darawsheh
@@ -2056,6 +2179,13 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(NOTICE, "Initializing pmd_tap for %s as %s",
name, tap_name);
+ /* Register IPC feed callback */
+ ret = rte_mp_action_register(TAP_MP_KEY, tap_mp_sync_queues);
+ if (ret < 0 && rte_errno != EEXIST) {
+ TAP_LOG(ERR, "%s: Failed to register IPC callback: %s",
+ tuntap_name, strerror(rte_errno));
+ goto leave;
+ }
ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac,
ETH_TUNTAP_TYPE_TAP);
Is it an issue registering tap_mp_sync_queues at each tap probing?
Should we do it only once?
It should be as of per device so we should do it for each port alone since several ports can have different queues.
Moreover, if the port that has the registration was closed or unplugged we'll not be able to sync qeues for other ports.
I think we should do register on first tap device probing and never unregisters.
Ferruh, any opinion?
I think it is good to unregister, but we need to keep number of active tap
devices, so register if "active_tap == 1", unregister when "active_tap == 0" ?
Thomas Monjalon
2018-10-02 10:43:42 UTC
Permalink
Post by Raslan Darawsheh
--- a/doc/guides/rel_notes/release_18_11.rst
+++ b/doc/guides/rel_notes/release_18_11.rst
@@ -67,6 +67,10 @@ New Features
SR-IOV option in Hyper-V and Azure. This is an alternative to the previous
vdev_netvsc, tap, and failsafe drivers combination.
+* **Added TAP Rx/Tx queues sharing with a secondary process.**
+
+ A secondary process can attach a TAP device created in the primary process,
+ probe the queues, and process Rx/Tx in a secondary process.
A blank line is missing here.
Post by Raslan Darawsheh
@@ -2006,9 +2115,23 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(ERR, "Failed to probe %s", name);
return -1;
}
- /* TODO: request info from primary to set up Rx and Tx */
eth_dev->dev_ops = &ops;
eth_dev->device = &dev->device;
+ eth_dev->rx_pkt_burst = pmd_rx_burst;
+ eth_dev->tx_pkt_burst = pmd_tx_burst;
+ if (!rte_eal_primary_proc_alive(NULL)) {
+ TAP_LOG(ERR, "Primary process is missing");
+ return -1;
+ }
+ process_private = (struct pmd_process_private *)
+ rte_zmalloc_socket(name,
+ sizeof(struct pmd_process_private),
+ RTE_CACHE_LINE_SIZE,
+ eth_dev->device->numa_node);
+
+ ret = tap_mp_attach_queues(name, eth_dev);
+ if (ret != 0)
+ return -1;
rte_eth_dev_probing_finish(eth_dev);
return 0;
}
Should we manage rte_pmd_tun_probe too?
Ferruh Yigit
2018-10-03 16:31:03 UTC
Permalink
Post by Thomas Monjalon
Post by Raslan Darawsheh
--- a/doc/guides/rel_notes/release_18_11.rst
+++ b/doc/guides/rel_notes/release_18_11.rst
@@ -67,6 +67,10 @@ New Features
SR-IOV option in Hyper-V and Azure. This is an alternative to the previous
vdev_netvsc, tap, and failsafe drivers combination.
+* **Added TAP Rx/Tx queues sharing with a secondary process.**
+
+ A secondary process can attach a TAP device created in the primary process,
+ probe the queues, and process Rx/Tx in a secondary process.
A blank line is missing here.
Post by Raslan Darawsheh
@@ -2006,9 +2115,23 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(ERR, "Failed to probe %s", name);
return -1;
}
- /* TODO: request info from primary to set up Rx and Tx */
eth_dev->dev_ops = &ops;
eth_dev->device = &dev->device;
+ eth_dev->rx_pkt_burst = pmd_rx_burst;
+ eth_dev->tx_pkt_burst = pmd_tx_burst;
+ if (!rte_eal_primary_proc_alive(NULL)) {
+ TAP_LOG(ERR, "Primary process is missing");
+ return -1;
+ }
+ process_private = (struct pmd_process_private *)
+ rte_zmalloc_socket(name,
+ sizeof(struct pmd_process_private),
+ RTE_CACHE_LINE_SIZE,
+ eth_dev->device->numa_node);
+
+ ret = tap_mp_attach_queues(name, eth_dev);
+ if (ret != 0)
+ return -1;
rte_eth_dev_probing_finish(eth_dev);
return 0;
}
Should we manage rte_pmd_tun_probe too?
I was about to ask same one, they share dev_ops, so tun also should be
supporting multiple queue, if so why not update it?
Ferruh Yigit
2018-10-03 18:00:14 UTC
Permalink
Post by Thomas Monjalon
In the case the device is created by the primary process,
the secondary must request some file descriptors to attach the queues.
The file descriptors are shared via IPC Unix socket.
Thanks to the IPC synchronization, the secondary process
is now able to do Rx/Tx on a TAP created by the primary process.
<...>
Post by Thomas Monjalon
+/* Request queue file descriptors from secondary to primary. */
+static int
+tap_mp_attach_queues(const char *port_name, struct rte_eth_dev *dev)
+{
+ int ret;
+ struct timespec timeout = {.tv_sec = 1, .tv_nsec = 0};
+ struct rte_mp_msg request, *reply;
+ struct rte_mp_reply replies;
+ struct ipc_queues *request_param = (struct ipc_queues *)request.param;
+ struct ipc_queues *reply_param;
+ int queue, fd_iterator;
+
+ /* Prepare the request */
+ strlcpy(request.name, TAP_MP_KEY, sizeof(request.name));
+ strlcpy(request_param->port_name, port_name,
+ sizeof(request_param->port_name));
+ request.len_param = sizeof(*request_param);
+ /* Send request and receive reply */
+ ret = rte_mp_request_sync(&request, &replies, &timeout);
+ if (ret < 0) {
+ TAP_LOG(ERR, "Failed to request queues from primary: %d",
+ rte_errno);
+ return -1;
+ }
+ reply = &replies.msgs[0];
+ reply_param = (struct ipc_queues *)reply->param;
+ TAP_LOG(DEBUG, "Received IPC reply for %s", reply_param->port_name);
+
+ /* Attach the queues from received file descriptors */
+
+ dev->data->nb_rx_queues = reply_param->rxq_count;
+ dev->data->nb_tx_queues = reply_param->txq_count;
+ fd_iterator = 0;
+ for (queue = 0; queue < reply_param->rxq_count; queue++)
+ process_private->rxq_fds[queue] = reply->fds[fd_iterator++];
+ for (queue = 0; queue < reply_param->txq_count; queue++)
+ process_private->txq_fds[queue] = reply->fds[fd_iterator++];
Based on comment in prev patch,
"process_private" is the address of the memory block for last eth_dev probed, we
need this something linked to dev, like "dev->data->nb_rx_queues".
But this will be work fine because it is called just after memory allocated,
please check below one.

Also the problem here is, this is secondary process and nobody sets
dev->data->rxq[i].fd to point process_private? So you have correct fds on
"process_private" but your redirection doesn't point here.
(Please remember redirection set in eth_dev_tap_create() which called only for
primary.)
Post by Thomas Monjalon
+
+ return 0;
+}
+
+/* Send the queue file descriptors from the primary process to secondary. */
+static int
+tap_mp_sync_queues(const struct rte_mp_msg *request, const void *peer)
+{
+ struct rte_eth_dev *dev;
+ struct rte_mp_msg reply;
+ const struct ipc_queues *request_param =
+ (const struct ipc_queues *)request->param;
+ struct ipc_queues *reply_param =
+ (struct ipc_queues *)reply.param;
+ uint16_t port_id;
+ int queue;
+ int ret;
+
+ /* Get requested port */
+ TAP_LOG(DEBUG, "Received IPC request for %s", request_param->port_name);
+ ret = rte_eth_dev_get_port_by_name(request_param->port_name, &port_id);
+ if (ret) {
+ TAP_LOG(ERR, "Failed to get port id for %s",
+ request_param->port_name);
+ return -1;
+ }
+ dev = &rte_eth_devices[port_id];
+
+ /* Fill file descriptors for all queues */
+ reply.num_fds = 0;
+ reply_param->rxq_count = 0;
+ for (queue = 0; queue < dev->data->nb_rx_queues; queue++) {
+ reply.fds[reply.num_fds++] = process_private->rxq_fds[queue];
This is "process_private" in primary. If you have multiple tap device this point
the memory block for last one.
When secondary asks for fds for a device "dev", you are always sending the last
ones, which is wrong.
This can work if you have only one tap device but not if you have more.


Briefly there are two problems with this patch:
1- "process_private" should be per eth_dev instead of globally for tap
2- Need a non shared memory location to save fds (or pointers to fds),
redirection is not helping here.

Ferruh Yigit
2018-10-03 17:59:09 UTC
Permalink
Post by Raslan Darawsheh
change the fds for the queues to be pointers and add new process private
structure and make the queue fds point to it.
---
drivers/net/tap/rte_eth_tap.c | 63 ++++++++++++++++++++++++-------------------
drivers/net/tap/rte_eth_tap.h | 9 +++++--
drivers/net/tap/tap_intr.c | 4 +--
3 files changed, 44 insertions(+), 32 deletions(-)
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index ad5ae98..8cc4552 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -64,6 +64,7 @@
static struct rte_vdev_driver pmd_tap_drv;
static struct rte_vdev_driver pmd_tun_drv;
+static struct pmd_process_private *process_private;
This is the handlers for eth_dev queues, this should be _per eth_dev_, you can't
have a single global variable for this, we will see the problems below.

<...>
Post by Raslan Darawsheh
@@ -1633,6 +1634,10 @@ eth_dev_tap_create(struct rte_vdev_device *vdev, char *tap_name,
goto error_exit_nodev;
}
+ process_private = (struct pmd_process_private *)
+ rte_zmalloc_socket(tap_name, sizeof(struct pmd_process_private),
+ RTE_CACHE_LINE_SIZE, dev->device->numa_node);
For each tap device, you overwrite the "process_private",
- So it has the address of last created tap device, we will come back this one
- and problem is how to free this back if an eth_dev removed? We lost references
except last one.
Post by Raslan Darawsheh
+
pmd = dev->data->dev_private;
pmd->dev = dev;
snprintf(pmd->name, sizeof(pmd->name), "%s", tap_name);
@@ -1669,8 +1674,10 @@ eth_dev_tap_create(struct rte_vdev_device *vdev, char *tap_name,
/* Presetup the fds to -1 as being not valid */
pmd->ka_fd = -1;
for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
- pmd->rxq[i].fd = -1;
- pmd->txq[i].fd = -1;
+ process_private->rxq_fds[i] = -1;
+ process_private->txq_fds[i] = -1;
+ pmd->rxq[i].fd = &process_private->rxq_fds[i];
+ pmd->txq[i].fd = &process_private->txq_fds[i];
for eth_dev, dev->data->dev_private->rxq[i].fd points to a memory address,
and remember "dev->data->dev_private" is shared between primary and secondary,

when secondary updates its rxq[i].fd to point its memory block, won't it corrupt
the primary???

I think redirection is not helping here, I am not sure to the this within the
shared memory, you need a not shared area.
Wiles, Keith
2018-09-27 13:17:41 UTC
Permalink
Post by Raslan Darawsheh
change the fds for the queues to be pointers and add new process private
structure and make the queue fds point to it.
---
drivers/net/tap/rte_eth_tap.c | 63 ++++++++++++++++++++++++-------------------
drivers/net/tap/rte_eth_tap.h | 9 +++++--
drivers/net/tap/tap_intr.c | 4 +--
3 files changed, 44 insertions(+), 32 deletions(-)
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index ad5ae98..8cc4552 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -64,6 +64,7 @@
static struct rte_vdev_driver pmd_tap_drv;
static struct rte_vdev_driver pmd_tun_drv;
+static struct pmd_process_private *process_private;
Maybe I do not see some minor point for making fd a pointer to fd when we could have an array of process_private[RTE_PMD_TAP_MAX_QUEUES] instead of a pointer type here. Then we do not need to allocate the memory each PMD and they would still have a private copy. Remove the array of rx/tx fds in the structure. This way it appears we can remove the code below that is making fd a pointer to fd. It just seems overly complex to me at the cost of a few more bytes of memory.

This would remove int fd; from the structure and add a pointer to the pid_process_private instead, which is private by default.

Did I miss some detail here that makes my comment wrong?
Post by Raslan Darawsheh
static const char *valid_arguments[] = {
ETH_TAP_IFACE_ARG,
@@ -331,7 +332,7 @@ pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
uint16_t data_off = rte_pktmbuf_headroom(mbuf);
int len;
- len = readv(rxq->fd, *rxq->iovecs,
+ len = readv(*rxq->fd, *rxq->iovecs,
1 +
(rxq->rxmode->offloads & DEV_RX_OFFLOAD_SCATTER ?
rxq->nb_rx_desc : 1));
@@ -595,7 +596,7 @@ tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
tap_tx_l4_cksum(l4_cksum, l4_phdr_cksum, l4_raw_cksum);
/* copy the tx frame data */
- n = writev(txq->fd, iovecs, j);
+ n = writev(*txq->fd, iovecs, j);
if (n <= 0)
break;
(*num_packets)++;
@@ -976,13 +977,13 @@ tap_dev_close(struct rte_eth_dev *dev)
tap_flow_implicit_flush(internals, NULL);
for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
- if (internals->rxq[i].fd != -1) {
- close(internals->rxq[i].fd);
- internals->rxq[i].fd = -1;
+ if (*internals->rxq[i].fd != -1) {
+ close(*internals->rxq[i].fd);
+ *internals->rxq[i].fd = -1;
}
- if (internals->txq[i].fd != -1) {
- close(internals->txq[i].fd);
- internals->txq[i].fd = -1;
+ if (*internals->txq[i].fd != -1) {
+ close(*internals->txq[i].fd);
+ *internals->txq[i].fd = -1;
}
}
@@ -1007,9 +1008,9 @@ tap_rx_queue_release(void *queue)
{
struct rx_queue *rxq = queue;
- if (rxq && (rxq->fd > 0)) {
- close(rxq->fd);
- rxq->fd = -1;
+ if (rxq && rxq->fd && (*rxq->fd > 0)) {
+ close(*rxq->fd);
+ *rxq->fd = -1;
rte_pktmbuf_free(rxq->pool);
rte_free(rxq->iovecs);
rxq->pool = NULL;
@@ -1022,9 +1023,9 @@ tap_tx_queue_release(void *queue)
{
struct tx_queue *txq = queue;
- if (txq && (txq->fd > 0)) {
- close(txq->fd);
- txq->fd = -1;
+ if (txq && txq->fd && (*txq->fd > 0)) {
+ close(*txq->fd);
+ *txq->fd = -1;
}
}
@@ -1214,13 +1215,13 @@ tap_setup_queue(struct rte_eth_dev *dev,
struct rte_gso_ctx *gso_ctx;
if (is_rx) {
- fd = &rx->fd;
- other_fd = &tx->fd;
+ fd = rx->fd;
+ other_fd = tx->fd;
dir = "rx";
gso_ctx = NULL;
} else {
- fd = &tx->fd;
- other_fd = &rx->fd;
+ fd = tx->fd;
+ other_fd = rx->fd;
dir = "tx";
gso_ctx = &tx->gso_ctx;
}
@@ -1331,7 +1332,7 @@ tap_rx_queue_setup(struct rte_eth_dev *dev,
}
TAP_LOG(DEBUG, " RX TUNTAP device name %s, qid %d on fd %d",
- internals->name, rx_queue_id, internals->rxq[rx_queue_id].fd);
+ internals->name, rx_queue_id, *internals->rxq[rx_queue_id].fd);
return 0;
@@ -1371,7 +1372,7 @@ tap_tx_queue_setup(struct rte_eth_dev *dev,
return -1;
TAP_LOG(DEBUG,
" TX TUNTAP device name %s, qid %d on fd %d csum %s",
- internals->name, tx_queue_id, internals->txq[tx_queue_id].fd,
+ internals->name, tx_queue_id, *internals->txq[tx_queue_id].fd,
txq->csum ? "on" : "off");
return 0;
@@ -1633,6 +1634,10 @@ eth_dev_tap_create(struct rte_vdev_device *vdev, char *tap_name,
goto error_exit_nodev;
}
+ process_private = (struct pmd_process_private *)
+ rte_zmalloc_socket(tap_name, sizeof(struct pmd_process_private),
+ RTE_CACHE_LINE_SIZE, dev->device->numa_node);
+
pmd = dev->data->dev_private;
pmd->dev = dev;
snprintf(pmd->name, sizeof(pmd->name), "%s", tap_name);
@@ -1669,8 +1674,10 @@ eth_dev_tap_create(struct rte_vdev_device *vdev, char *tap_name,
/* Presetup the fds to -1 as being not valid */
pmd->ka_fd = -1;
for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
- pmd->rxq[i].fd = -1;
- pmd->txq[i].fd = -1;
+ process_private->rxq_fds[i] = -1;
+ process_private->txq_fds[i] = -1;
+ pmd->rxq[i].fd = &process_private->rxq_fds[i];
+ pmd->txq[i].fd = &process_private->txq_fds[i];
}
if (pmd->type == ETH_TUNTAP_TYPE_TAP) {
@@ -2089,13 +2096,13 @@ rte_pmd_tap_remove(struct rte_vdev_device *dev)
tap_nl_final(internals->nlsk_fd);
}
for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
- if (internals->rxq[i].fd != -1) {
- close(internals->rxq[i].fd);
- internals->rxq[i].fd = -1;
+ if (*internals->rxq[i].fd != -1) {
+ close(*internals->rxq[i].fd);
+ *internals->rxq[i].fd = -1;
}
- if (internals->txq[i].fd != -1) {
- close(internals->txq[i].fd);
- internals->txq[i].fd = -1;
+ if (*internals->txq[i].fd != -1) {
+ close(*internals->txq[i].fd);
+ *internals->txq[i].fd = -1;
}
}
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
index 44e2773..4146f5c 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -46,7 +46,7 @@ struct rx_queue {
struct rte_mempool *mp; /* Mempool for RX packets */
uint32_t trigger_seen; /* Last seen Rx trigger value */
uint16_t in_port; /* Port ID */
- int fd;
+ int *fd;
struct pkt_stats stats; /* Stats for this RX queue */
uint16_t nb_rx_desc; /* max number of mbufs available */
struct rte_eth_rxmode *rxmode; /* RX features */
@@ -56,7 +56,7 @@ struct rx_queue {
};
struct tx_queue {
- int fd;
+ int *fd;
int type; /* Type field - TUN|TAP */
uint16_t *mtu; /* Pointer to MTU from dev_data */
uint16_t csum:1; /* Enable checksum offloading */
@@ -92,6 +92,11 @@ struct pmd_internals {
int ka_fd; /* keep-alive file descriptor */
};
+struct pmd_process_private {
+ int rxq_fds[RTE_PMD_TAP_MAX_QUEUES];
+ int txq_fds[RTE_PMD_TAP_MAX_QUEUES];
+};
+
/* tap_intr.c */
int tap_rx_intr_vec_set(struct rte_eth_dev *dev, int set);
diff --git a/drivers/net/tap/tap_intr.c b/drivers/net/tap/tap_intr.c
index fc59018..a4b212d 100644
--- a/drivers/net/tap/tap_intr.c
+++ b/drivers/net/tap/tap_intr.c
@@ -71,7 +71,7 @@ tap_rx_intr_vec_install(struct rte_eth_dev *dev)
struct rx_queue *rxq = pmd->dev->data->rx_queues[i];
/* Skip queues that cannot request interrupts. */
- if (!rxq || rxq->fd <= 0) {
+ if (!rxq || !rxq->fd || *rxq->fd <= 0) {
/* Use invalid intr_vec[] index to disable entry. */
intr_handle->intr_vec[i] =
RTE_INTR_VEC_RXTX_OFFSET +
@@ -79,7 +79,7 @@ tap_rx_intr_vec_install(struct rte_eth_dev *dev)
continue;
}
intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count;
- intr_handle->efds[count] = rxq->fd;
+ intr_handle->efds[count] = *rxq->fd;
count++;
}
if (!count)
--
2.7.4
Regards,
Keith
Raslan Darawsheh
2018-10-02 10:30:15 UTC
Permalink
Hi,

what I'm really doing is simply do some private array for all the fd's that each process will allocate it separately which will allow that each process will be
able to access the fd's for the queues in order not to overwrite the shared ones and it's working for me this way.

Now coming to your comment I'm not sure I fully understand it but, what you are suggesting is to create an array which will be accessed by the process_id to store these fd's in it.
As far as I know we don't have something as process_id in dpdk we only have the system process id which is not relevant for the array of fd's.

Please correct me if I'm wrong,
I think this way we'll be limiting the number of secondary processes to number of queues by tap.
Meanwhile, in my solution we don't have such limitation.

Kindest regards,
Raslan Darawsheh

-----Original Message-----
From: Wiles, Keith <***@intel.com>
Sent: Thursday, September 27, 2018 4:18 PM
To: Raslan Darawsheh <***@mellanox.com>
Cc: Thomas Monjalon <***@monjalon.net>; ***@dpdk.org; Shahaf Shuler <***@mellanox.com>; Ori Katz <***@mellanox.com>
Subject: Re: [PATCH v3 1/2] net/tap: change queue fd to be pointers to process private
Post by Raslan Darawsheh
change the fds for the queues to be pointers and add new process
private structure and make the queue fds point to it.
---
drivers/net/tap/rte_eth_tap.c | 63
++++++++++++++++++++++++-------------------
drivers/net/tap/rte_eth_tap.h | 9 +++++--
drivers/net/tap/tap_intr.c | 4 +--
3 files changed, 44 insertions(+), 32 deletions(-)
diff --git a/drivers/net/tap/rte_eth_tap.c
b/drivers/net/tap/rte_eth_tap.c index ad5ae98..8cc4552 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -64,6 +64,7 @@
static struct rte_vdev_driver pmd_tap_drv; static struct
rte_vdev_driver pmd_tun_drv;
+static struct pmd_process_private *process_private;
Maybe I do not see some minor point for making fd a pointer to fd when we could have an array of process_private[RTE_PMD_TAP_MAX_QUEUES] instead of a pointer type here. Then we do not need to allocate the memory each PMD and they would still have a private copy. Remove the array of rx/tx fds in the structure. This way it appears we can remove the code below that is making fd a pointer to fd. It just seems overly complex to me at the cost of a few more bytes of memory.

This would remove int fd; from the structure and add a pointer to the pid_process_private instead, which is private by default.

Did I miss some detail here that makes my comment wrong?
Post by Raslan Darawsheh
static const char *valid_arguments[] = {
ETH_TAP_IFACE_ARG,
@@ -331,7 +332,7 @@ pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
uint16_t data_off = rte_pktmbuf_headroom(mbuf);
int len;
- len = readv(rxq->fd, *rxq->iovecs,
+ len = readv(*rxq->fd, *rxq->iovecs,
1 +
(rxq->rxmode->offloads & DEV_RX_OFFLOAD_SCATTER ?
rxq->nb_rx_desc : 1));
@@ -595,7 +596,7 @@ tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
tap_tx_l4_cksum(l4_cksum, l4_phdr_cksum, l4_raw_cksum);
/* copy the tx frame data */
- n = writev(txq->fd, iovecs, j);
+ n = writev(*txq->fd, iovecs, j);
if (n <= 0)
break;
(*num_packets)++;
@@ -976,13 +977,13 @@ tap_dev_close(struct rte_eth_dev *dev)
tap_flow_implicit_flush(internals, NULL);
for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
- if (internals->rxq[i].fd != -1) {
- close(internals->rxq[i].fd);
- internals->rxq[i].fd = -1;
+ if (*internals->rxq[i].fd != -1) {
+ close(*internals->rxq[i].fd);
+ *internals->rxq[i].fd = -1;
}
- if (internals->txq[i].fd != -1) {
- close(internals->txq[i].fd);
- internals->txq[i].fd = -1;
+ if (*internals->txq[i].fd != -1) {
+ close(*internals->txq[i].fd);
+ *internals->txq[i].fd = -1;
}
}
@@ -1007,9 +1008,9 @@ tap_rx_queue_release(void *queue) {
struct rx_queue *rxq = queue;
- if (rxq && (rxq->fd > 0)) {
- close(rxq->fd);
- rxq->fd = -1;
+ if (rxq && rxq->fd && (*rxq->fd > 0)) {
+ close(*rxq->fd);
+ *rxq->fd = -1;
rte_pktmbuf_free(rxq->pool);
rte_free(rxq->iovecs);
rxq->pool = NULL;
@@ -1022,9 +1023,9 @@ tap_tx_queue_release(void *queue) {
struct tx_queue *txq = queue;
- if (txq && (txq->fd > 0)) {
- close(txq->fd);
- txq->fd = -1;
+ if (txq && txq->fd && (*txq->fd > 0)) {
+ close(*txq->fd);
+ *txq->fd = -1;
}
}
@@ -1214,13 +1215,13 @@ tap_setup_queue(struct rte_eth_dev *dev,
struct rte_gso_ctx *gso_ctx;
if (is_rx) {
- fd = &rx->fd;
- other_fd = &tx->fd;
+ fd = rx->fd;
+ other_fd = tx->fd;
dir = "rx";
gso_ctx = NULL;
} else {
- fd = &tx->fd;
- other_fd = &rx->fd;
+ fd = tx->fd;
+ other_fd = rx->fd;
dir = "tx";
gso_ctx = &tx->gso_ctx;
}
@@ -1331,7 +1332,7 @@ tap_rx_queue_setup(struct rte_eth_dev *dev,
}
TAP_LOG(DEBUG, " RX TUNTAP device name %s, qid %d on fd %d",
- internals->name, rx_queue_id, internals->rxq[rx_queue_id].fd);
+ internals->name, rx_queue_id, *internals->rxq[rx_queue_id].fd);
return 0;
@@ -1371,7 +1372,7 @@ tap_tx_queue_setup(struct rte_eth_dev *dev,
return -1;
TAP_LOG(DEBUG,
" TX TUNTAP device name %s, qid %d on fd %d csum %s",
- internals->name, tx_queue_id, internals->txq[tx_queue_id].fd,
+ internals->name, tx_queue_id, *internals->txq[tx_queue_id].fd,
txq->csum ? "on" : "off");
return 0;
@@ -1633,6 +1634,10 @@ eth_dev_tap_create(struct rte_vdev_device *vdev, char *tap_name,
goto error_exit_nodev;
}
+ process_private = (struct pmd_process_private *)
+ rte_zmalloc_socket(tap_name, sizeof(struct pmd_process_private),
+ RTE_CACHE_LINE_SIZE, dev->device->numa_node);
+
pmd = dev->data->dev_private;
pmd->dev = dev;
/* Presetup the fds to -1 as being not valid */
pmd->ka_fd = -1;
for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
- pmd->rxq[i].fd = -1;
- pmd->txq[i].fd = -1;
+ process_private->rxq_fds[i] = -1;
+ process_private->txq_fds[i] = -1;
+ pmd->rxq[i].fd = &process_private->rxq_fds[i];
+ pmd->txq[i].fd = &process_private->txq_fds[i];
}
rte_pmd_tap_remove(struct rte_vdev_device *dev)
tap_nl_final(internals->nlsk_fd);
}
for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
- if (internals->rxq[i].fd != -1) {
- close(internals->rxq[i].fd);
- internals->rxq[i].fd = -1;
+ if (*internals->rxq[i].fd != -1) {
+ close(*internals->rxq[i].fd);
+ *internals->rxq[i].fd = -1;
}
- if (internals->txq[i].fd != -1) {
- close(internals->txq[i].fd);
- internals->txq[i].fd = -1;
+ if (*internals->txq[i].fd != -1) {
+ close(*internals->txq[i].fd);
+ *internals->txq[i].fd = -1;
}
}
diff --git a/drivers/net/tap/rte_eth_tap.h
b/drivers/net/tap/rte_eth_tap.h index 44e2773..4146f5c 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -46,7 +46,7 @@ struct rx_queue {
struct rte_mempool *mp; /* Mempool for RX packets */
uint32_t trigger_seen; /* Last seen Rx trigger value */
uint16_t in_port; /* Port ID */
- int fd;
+ int *fd;
struct pkt_stats stats; /* Stats for this RX queue */
uint16_t nb_rx_desc; /* max number of mbufs available */
struct rx_queue { };
struct tx_queue {
- int fd;
+ int *fd;
int type; /* Type field - TUN|TAP */
uint16_t *mtu; /* Pointer to MTU from dev_data */
uint16_t csum:1; /* Enable checksum offloading */
@@ -92,6 +92,11 @@ struct pmd_internals {
int ka_fd; /* keep-alive file descriptor */
};
+struct pmd_process_private {
+ int rxq_fds[RTE_PMD_TAP_MAX_QUEUES];
+ int txq_fds[RTE_PMD_TAP_MAX_QUEUES]; };
+
/* tap_intr.c */
int tap_rx_intr_vec_set(struct rte_eth_dev *dev, int set); diff --git
a/drivers/net/tap/tap_intr.c b/drivers/net/tap/tap_intr.c index
fc59018..a4b212d 100644
--- a/drivers/net/tap/tap_intr.c
+++ b/drivers/net/tap/tap_intr.c
@@ -71,7 +71,7 @@ tap_rx_intr_vec_install(struct rte_eth_dev *dev)
struct rx_queue *rxq = pmd->dev->data->rx_queues[i];
/* Skip queues that cannot request interrupts. */
- if (!rxq || rxq->fd <= 0) {
+ if (!rxq || !rxq->fd || *rxq->fd <= 0) {
/* Use invalid intr_vec[] index to disable entry. */
intr_handle->intr_vec[i] =
RTE_INTR_VEC_RXTX_OFFSET +
@@ -79,7 +79,7 @@ tap_rx_intr_vec_install(struct rte_eth_dev *dev)
continue;
}
intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count;
- intr_handle->efds[count] = rxq->fd;
+ intr_handle->efds[count] = *rxq->fd;
count++;
}
if (!count)
--
2.7.4
Regards,
Keith
Wiles, Keith
2018-10-02 12:58:50 UTC
Permalink
Post by Raslan Darawsheh
Hi,
what I'm really doing is simply do some private array for all the fd's that each process will allocate it separately which will allow that each process will be
able to access the fd's for the queues in order not to overwrite the shared ones and it's working for me this way.
Now coming to your comment I'm not sure I fully understand it but, what you are suggesting is to create an array which will be accessed by the process_id to store these fd's in it.
As far as I know we don't have something as process_id in dpdk we only have the system process id which is not relevant for the array of fd's.
Please correct me if I'm wrong,
I think this way we'll be limiting the number of secondary processes to number of queues by tap.
Meanwhile, in my solution we don't have such limitation.
I did not explain myself very well in the previous email let me try again.

Right now you have a pointer to an allocated piece of memory holding the FD’s struct pmd_process_private *process_private; if we converted the pointer to an array of pmd_process_private structures using RTE_PMD_TAP_MAX_QUEUES as an index.

struct pmd_process_private {
int rx_fd;
int tx_fd;
};

static struct pmd_process_private private_process[RTE_PMD_TAP_MAX_QUEUES]; /* each process has its own array */

struct rx_queue *rxq = pmd->dev->data->rx_queues[i];

struct rx_queue {
struct pmd_process_private *priv; // rxq->priv = &private_process[rx_queue_idx];
. . .
};

rxq->priv->rx_fd; // instead of the *rxq->fd seems shorter but less clear to me

Anyway my email compiler is not very good and maybe this will not work.

We can leave the current design in place for now, till I have time to really look at doing a PoC for my suggestion.
Post by Raslan Darawsheh
Kindest regards,
Raslan Darawsheh
-----Original Message-----
Sent: Thursday, September 27, 2018 4:18 PM
Subject: Re: [PATCH v3 1/2] net/tap: change queue fd to be pointers to process private
Post by Raslan Darawsheh
change the fds for the queues to be pointers and add new process
private structure and make the queue fds point to it.
---
drivers/net/tap/rte_eth_tap.c | 63
++++++++++++++++++++++++-------------------
drivers/net/tap/rte_eth_tap.h | 9 +++++--
drivers/net/tap/tap_intr.c | 4 +--
3 files changed, 44 insertions(+), 32 deletions(-)
diff --git a/drivers/net/tap/rte_eth_tap.c
b/drivers/net/tap/rte_eth_tap.c index ad5ae98..8cc4552 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -64,6 +64,7 @@
static struct rte_vdev_driver pmd_tap_drv; static struct
rte_vdev_driver pmd_tun_drv;
+static struct pmd_process_private *process_private;
Maybe I do not see some minor point for making fd a pointer to fd when we could have an array of process_private[RTE_PMD_TAP_MAX_QUEUES] instead of a pointer type here. Then we do not need to allocate the memory each PMD and they would still have a private copy. Remove the array of rx/tx fds in the structure. This way it appears we can remove the code below that is making fd a pointer to fd. It just seems overly complex to me at the cost of a few more bytes of memory.
This would remove int fd; from the structure and add a pointer to the pid_process_private instead, which is private by default.
Did I miss some detail here that makes my comment wrong?
Post by Raslan Darawsheh
static const char *valid_arguments[] = {
ETH_TAP_IFACE_ARG,
@@ -331,7 +332,7 @@ pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
uint16_t data_off = rte_pktmbuf_headroom(mbuf);
int len;
- len = readv(rxq->fd, *rxq->iovecs,
+ len = readv(*rxq->fd, *rxq->iovecs,
1 +
(rxq->rxmode->offloads & DEV_RX_OFFLOAD_SCATTER ?
rxq->nb_rx_desc : 1));
@@ -595,7 +596,7 @@ tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
tap_tx_l4_cksum(l4_cksum, l4_phdr_cksum, l4_raw_cksum);
/* copy the tx frame data */
- n = writev(txq->fd, iovecs, j);
+ n = writev(*txq->fd, iovecs, j);
if (n <= 0)
break;
(*num_packets)++;
@@ -976,13 +977,13 @@ tap_dev_close(struct rte_eth_dev *dev)
tap_flow_implicit_flush(internals, NULL);
for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
- if (internals->rxq[i].fd != -1) {
- close(internals->rxq[i].fd);
- internals->rxq[i].fd = -1;
+ if (*internals->rxq[i].fd != -1) {
+ close(*internals->rxq[i].fd);
+ *internals->rxq[i].fd = -1;
}
- if (internals->txq[i].fd != -1) {
- close(internals->txq[i].fd);
- internals->txq[i].fd = -1;
+ if (*internals->txq[i].fd != -1) {
+ close(*internals->txq[i].fd);
+ *internals->txq[i].fd = -1;
}
}
@@ -1007,9 +1008,9 @@ tap_rx_queue_release(void *queue) {
struct rx_queue *rxq = queue;
- if (rxq && (rxq->fd > 0)) {
- close(rxq->fd);
- rxq->fd = -1;
+ if (rxq && rxq->fd && (*rxq->fd > 0)) {
+ close(*rxq->fd);
+ *rxq->fd = -1;
rte_pktmbuf_free(rxq->pool);
rte_free(rxq->iovecs);
rxq->pool = NULL;
@@ -1022,9 +1023,9 @@ tap_tx_queue_release(void *queue) {
struct tx_queue *txq = queue;
- if (txq && (txq->fd > 0)) {
- close(txq->fd);
- txq->fd = -1;
+ if (txq && txq->fd && (*txq->fd > 0)) {
+ close(*txq->fd);
+ *txq->fd = -1;
}
}
@@ -1214,13 +1215,13 @@ tap_setup_queue(struct rte_eth_dev *dev,
struct rte_gso_ctx *gso_ctx;
if (is_rx) {
- fd = &rx->fd;
- other_fd = &tx->fd;
+ fd = rx->fd;
+ other_fd = tx->fd;
dir = "rx";
gso_ctx = NULL;
} else {
- fd = &tx->fd;
- other_fd = &rx->fd;
+ fd = tx->fd;
+ other_fd = rx->fd;
dir = "tx";
gso_ctx = &tx->gso_ctx;
}
@@ -1331,7 +1332,7 @@ tap_rx_queue_setup(struct rte_eth_dev *dev,
}
TAP_LOG(DEBUG, " RX TUNTAP device name %s, qid %d on fd %d",
- internals->name, rx_queue_id, internals->rxq[rx_queue_id].fd);
+ internals->name, rx_queue_id, *internals->rxq[rx_queue_id].fd);
return 0;
@@ -1371,7 +1372,7 @@ tap_tx_queue_setup(struct rte_eth_dev *dev,
return -1;
TAP_LOG(DEBUG,
" TX TUNTAP device name %s, qid %d on fd %d csum %s",
- internals->name, tx_queue_id, internals->txq[tx_queue_id].fd,
+ internals->name, tx_queue_id, *internals->txq[tx_queue_id].fd,
txq->csum ? "on" : "off");
return 0;
@@ -1633,6 +1634,10 @@ eth_dev_tap_create(struct rte_vdev_device *vdev, char *tap_name,
goto error_exit_nodev;
}
+ process_private = (struct pmd_process_private *)
+ rte_zmalloc_socket(tap_name, sizeof(struct pmd_process_private),
+ RTE_CACHE_LINE_SIZE, dev->device->numa_node);
+
pmd = dev->data->dev_private;
pmd->dev = dev;
/* Presetup the fds to -1 as being not valid */
pmd->ka_fd = -1;
for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
- pmd->rxq[i].fd = -1;
- pmd->txq[i].fd = -1;
+ process_private->rxq_fds[i] = -1;
+ process_private->txq_fds[i] = -1;
+ pmd->rxq[i].fd = &process_private->rxq_fds[i];
+ pmd->txq[i].fd = &process_private->txq_fds[i];
}
rte_pmd_tap_remove(struct rte_vdev_device *dev)
tap_nl_final(internals->nlsk_fd);
}
for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
- if (internals->rxq[i].fd != -1) {
- close(internals->rxq[i].fd);
- internals->rxq[i].fd = -1;
+ if (*internals->rxq[i].fd != -1) {
+ close(*internals->rxq[i].fd);
+ *internals->rxq[i].fd = -1;
}
- if (internals->txq[i].fd != -1) {
- close(internals->txq[i].fd);
- internals->txq[i].fd = -1;
+ if (*internals->txq[i].fd != -1) {
+ close(*internals->txq[i].fd);
+ *internals->txq[i].fd = -1;
}
}
diff --git a/drivers/net/tap/rte_eth_tap.h
b/drivers/net/tap/rte_eth_tap.h index 44e2773..4146f5c 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -46,7 +46,7 @@ struct rx_queue {
struct rte_mempool *mp; /* Mempool for RX packets */
uint32_t trigger_seen; /* Last seen Rx trigger value */
uint16_t in_port; /* Port ID */
- int fd;
+ int *fd;
struct pkt_stats stats; /* Stats for this RX queue */
uint16_t nb_rx_desc; /* max number of mbufs available */
struct rx_queue { };
struct tx_queue {
- int fd;
+ int *fd;
int type; /* Type field - TUN|TAP */
uint16_t *mtu; /* Pointer to MTU from dev_data */
uint16_t csum:1; /* Enable checksum offloading */
@@ -92,6 +92,11 @@ struct pmd_internals {
int ka_fd; /* keep-alive file descriptor */
};
+struct pmd_process_private {
+ int rxq_fds[RTE_PMD_TAP_MAX_QUEUES];
+ int txq_fds[RTE_PMD_TAP_MAX_QUEUES]; };
+
/* tap_intr.c */
int tap_rx_intr_vec_set(struct rte_eth_dev *dev, int set); diff --git
a/drivers/net/tap/tap_intr.c b/drivers/net/tap/tap_intr.c index
fc59018..a4b212d 100644
--- a/drivers/net/tap/tap_intr.c
+++ b/drivers/net/tap/tap_intr.c
@@ -71,7 +71,7 @@ tap_rx_intr_vec_install(struct rte_eth_dev *dev)
struct rx_queue *rxq = pmd->dev->data->rx_queues[i];
/* Skip queues that cannot request interrupts. */
- if (!rxq || rxq->fd <= 0) {
+ if (!rxq || !rxq->fd || *rxq->fd <= 0) {
/* Use invalid intr_vec[] index to disable entry. */
intr_handle->intr_vec[i] =
RTE_INTR_VEC_RXTX_OFFSET +
@@ -79,7 +79,7 @@ tap_rx_intr_vec_install(struct rte_eth_dev *dev)
continue;
}
intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count;
- intr_handle->efds[count] = rxq->fd;
+ intr_handle->efds[count] = *rxq->fd;
count++;
}
if (!count)
--
2.7.4
Regards,
Kei
Ferruh Yigit
2018-10-03 17:27:08 UTC
Permalink
Post by Wiles, Keith
Post by Raslan Darawsheh
Hi,
what I'm really doing is simply do some private array for all the fd's that each process will allocate it separately which will allow that each process will be
able to access the fd's for the queues in order not to overwrite the shared ones and it's working for me this way.
Now coming to your comment I'm not sure I fully understand it but, what you are suggesting is to create an array which will be accessed by the process_id to store these fd's in it.
As far as I know we don't have something as process_id in dpdk we only have the system process id which is not relevant for the array of fd's.
Please correct me if I'm wrong,
I think this way we'll be limiting the number of secondary processes to number of queues by tap.
Meanwhile, in my solution we don't have such limitation.
I did not explain myself very well in the previous email let me try again.
Right now you have a pointer to an allocated piece of memory holding the FD’s struct pmd_process_private *process_private; if we converted the pointer to an array of pmd_process_private structures using RTE_PMD_TAP_MAX_QUEUES as an index.
struct pmd_process_private {
int rx_fd;
int tx_fd;
};
static struct pmd_process_private private_process[RTE_PMD_TAP_MAX_QUEUES]; /* each process has its own array */
struct rx_queue *rxq = pmd->dev->data->rx_queues[i];
struct rx_queue {
struct pmd_process_private *priv; // rxq->priv = &private_process[rx_queue_idx];
. . .
};
rxq->priv->rx_fd; // instead of the *rxq->fd seems shorter but less clear to me
Anyway my email compiler is not very good and maybe this will not work.
This should work, it is another kind of redirection.

But I think there are other problems, current implementation seems broken, I
will comment more detailed to the patch.
Briefly, rx_pkt_burst/tx_pkt_burst are not in eth_dev->data, but eth_dev because
they need to be private to process (can't share function pointers across
processes), this is the same problem.
Post by Wiles, Keith
We can leave the current design in place for now, till I have time to really look at doing a PoC for my suggestion.
Post by Raslan Darawsheh
Kindest regards,
Raslan Darawsheh
-----Original Message-----
Sent: Thursday, September 27, 2018 4:18 PM
Subject: Re: [PATCH v3 1/2] net/tap: change queue fd to be pointers to process private
Post by Raslan Darawsheh
change the fds for the queues to be pointers and add new process
private structure and make the queue fds point to it.
---
drivers/net/tap/rte_eth_tap.c | 63
++++++++++++++++++++++++-------------------
drivers/net/tap/rte_eth_tap.h | 9 +++++--
drivers/net/tap/tap_intr.c | 4 +--
3 files changed, 44 insertions(+), 32 deletions(-)
diff --git a/drivers/net/tap/rte_eth_tap.c
b/drivers/net/tap/rte_eth_tap.c index ad5ae98..8cc4552 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -64,6 +64,7 @@
static struct rte_vdev_driver pmd_tap_drv; static struct
rte_vdev_driver pmd_tun_drv;
+static struct pmd_process_private *process_private;
Maybe I do not see some minor point for making fd a pointer to fd when we could have an array of process_private[RTE_PMD_TAP_MAX_QUEUES] instead of a pointer type here. Then we do not need to allocate the memory each PMD and they would still have a private copy. Remove the array of rx/tx fds in the structure. This way it appears we can remove the code below that is making fd a pointer to fd. It just seems overly complex to me at the cost of a few more bytes of memory.
This would remove int fd; from the structure and add a pointer to the pid_process_private instead, which is private by default.
Did I miss some detail here that makes my comment wrong?
Post by Raslan Darawsheh
static const char *valid_arguments[] = {
ETH_TAP_IFACE_ARG,
@@ -331,7 +332,7 @@ pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
uint16_t data_off = rte_pktmbuf_headroom(mbuf);
int len;
- len = readv(rxq->fd, *rxq->iovecs,
+ len = readv(*rxq->fd, *rxq->iovecs,
1 +
(rxq->rxmode->offloads & DEV_RX_OFFLOAD_SCATTER ?
rxq->nb_rx_desc : 1));
@@ -595,7 +596,7 @@ tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
tap_tx_l4_cksum(l4_cksum, l4_phdr_cksum, l4_raw_cksum);
/* copy the tx frame data */
- n = writev(txq->fd, iovecs, j);
+ n = writev(*txq->fd, iovecs, j);
if (n <= 0)
break;
(*num_packets)++;
@@ -976,13 +977,13 @@ tap_dev_close(struct rte_eth_dev *dev)
tap_flow_implicit_flush(internals, NULL);
for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
- if (internals->rxq[i].fd != -1) {
- close(internals->rxq[i].fd);
- internals->rxq[i].fd = -1;
+ if (*internals->rxq[i].fd != -1) {
+ close(*internals->rxq[i].fd);
+ *internals->rxq[i].fd = -1;
}
- if (internals->txq[i].fd != -1) {
- close(internals->txq[i].fd);
- internals->txq[i].fd = -1;
+ if (*internals->txq[i].fd != -1) {
+ close(*internals->txq[i].fd);
+ *internals->txq[i].fd = -1;
}
}
@@ -1007,9 +1008,9 @@ tap_rx_queue_release(void *queue) {
struct rx_queue *rxq = queue;
- if (rxq && (rxq->fd > 0)) {
- close(rxq->fd);
- rxq->fd = -1;
+ if (rxq && rxq->fd && (*rxq->fd > 0)) {
+ close(*rxq->fd);
+ *rxq->fd = -1;
rte_pktmbuf_free(rxq->pool);
rte_free(rxq->iovecs);
rxq->pool = NULL;
@@ -1022,9 +1023,9 @@ tap_tx_queue_release(void *queue) {
struct tx_queue *txq = queue;
- if (txq && (txq->fd > 0)) {
- close(txq->fd);
- txq->fd = -1;
+ if (txq && txq->fd && (*txq->fd > 0)) {
+ close(*txq->fd);
+ *txq->fd = -1;
}
}
@@ -1214,13 +1215,13 @@ tap_setup_queue(struct rte_eth_dev *dev,
struct rte_gso_ctx *gso_ctx;
if (is_rx) {
- fd = &rx->fd;
- other_fd = &tx->fd;
+ fd = rx->fd;
+ other_fd = tx->fd;
dir = "rx";
gso_ctx = NULL;
} else {
- fd = &tx->fd;
- other_fd = &rx->fd;
+ fd = tx->fd;
+ other_fd = rx->fd;
dir = "tx";
gso_ctx = &tx->gso_ctx;
}
@@ -1331,7 +1332,7 @@ tap_rx_queue_setup(struct rte_eth_dev *dev,
}
TAP_LOG(DEBUG, " RX TUNTAP device name %s, qid %d on fd %d",
- internals->name, rx_queue_id, internals->rxq[rx_queue_id].fd);
+ internals->name, rx_queue_id, *internals->rxq[rx_queue_id].fd);
return 0;
@@ -1371,7 +1372,7 @@ tap_tx_queue_setup(struct rte_eth_dev *dev,
return -1;
TAP_LOG(DEBUG,
" TX TUNTAP device name %s, qid %d on fd %d csum %s",
- internals->name, tx_queue_id, internals->txq[tx_queue_id].fd,
+ internals->name, tx_queue_id, *internals->txq[tx_queue_id].fd,
txq->csum ? "on" : "off");
return 0;
@@ -1633,6 +1634,10 @@ eth_dev_tap_create(struct rte_vdev_device *vdev, char *tap_name,
goto error_exit_nodev;
}
+ process_private = (struct pmd_process_private *)
+ rte_zmalloc_socket(tap_name, sizeof(struct pmd_process_private),
+ RTE_CACHE_LINE_SIZE, dev->device->numa_node);
+
pmd = dev->data->dev_private;
pmd->dev = dev;
/* Presetup the fds to -1 as being not valid */
pmd->ka_fd = -1;
for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
- pmd->rxq[i].fd = -1;
- pmd->txq[i].fd = -1;
+ process_private->rxq_fds[i] = -1;
+ process_private->txq_fds[i] = -1;
+ pmd->rxq[i].fd = &process_private->rxq_fds[i];
+ pmd->txq[i].fd = &process_private->txq_fds[i];
}
rte_pmd_tap_remove(struct rte_vdev_device *dev)
tap_nl_final(internals->nlsk_fd);
}
for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
- if (internals->rxq[i].fd != -1) {
- close(internals->rxq[i].fd);
- internals->rxq[i].fd = -1;
+ if (*internals->rxq[i].fd != -1) {
+ close(*internals->rxq[i].fd);
+ *internals->rxq[i].fd = -1;
}
- if (internals->txq[i].fd != -1) {
- close(internals->txq[i].fd);
- internals->txq[i].fd = -1;
+ if (*internals->txq[i].fd != -1) {
+ close(*internals->txq[i].fd);
+ *internals->txq[i].fd = -1;
}
}
diff --git a/drivers/net/tap/rte_eth_tap.h
b/drivers/net/tap/rte_eth_tap.h index 44e2773..4146f5c 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -46,7 +46,7 @@ struct rx_queue {
struct rte_mempool *mp; /* Mempool for RX packets */
uint32_t trigger_seen; /* Last seen Rx trigger value */
uint16_t in_port; /* Port ID */
- int fd;
+ int *fd;
struct pkt_stats stats; /* Stats for this RX queue */
uint16_t nb_rx_desc; /* max number of mbufs available */
struct rx_queue { };
struct tx_queue {
- int fd;
+ int *fd;
int type; /* Type field - TUN|TAP */
uint16_t *mtu; /* Pointer to MTU from dev_data */
uint16_t csum:1; /* Enable checksum offloading */
@@ -92,6 +92,11 @@ struct pmd_internals {
int ka_fd; /* keep-alive file descriptor */
};
+struct pmd_process_private {
+ int rxq_fds[RTE_PMD_TAP_MAX_QUEUES];
+ int txq_fds[RTE_PMD_TAP_MAX_QUEUES]; };
+
/* tap_intr.c */
int tap_rx_intr_vec_set(struct rte_eth_dev *dev, int set); diff --git
a/drivers/net/tap/tap_intr.c b/drivers/net/tap/tap_intr.c index
fc59018..a4b212d 100644
--- a/drivers/net/tap/tap_intr.c
+++ b/drivers/net/tap/tap_intr.c
@@ -71,7 +71,7 @@ tap_rx_intr_vec_install(struct rte_eth_dev *dev)
struct rx_queue *rxq = pmd->dev->data->rx_queues[i];
/* Skip queues that cannot request interrupts. */
- if (!rxq || rxq->fd <= 0) {
+ if (!rxq || !rxq->fd || *rxq->fd <= 0) {
/* Use invalid intr_vec[] index to disable entry. */
intr_handle->intr_vec[i] =
RTE_INTR_VEC_RXTX_OFFSET +
@@ -79,7 +79,7 @@ tap_rx_intr_vec_install(struct rte_eth_dev *dev)
continue;
}
intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count;
- intr_handle->efds[count] = rxq->fd;
+ intr_handle->efds[count] = *rxq->fd;
count++;
}
if (!count)
--
2.7.4
Regards,
Keith
Regards,
Keith
Thomas Monjalon
2018-07-20 11:15:52 UTC
Permalink
From: Raslan Darawsheh <***@mellanox.com>

In the case the device is created by the primary process,
the secondary must request some file descriptors to attach the queues.
The file descriptors are shared via IPC Unix socket.

Thanks to the IPC synchronization, the secondary process
is now able to do Rx/Tx on a TAP created by the primary process.

Signed-off-by: Raslan Darawsheh <***@mellanox.com>
Signed-off-by: Thomas Monjalon <***@monjalon.net>
---
Note: there is a bug in EAL IPC regarding fd translation.
A fix will be sent later for EAL.

v3:
- split some long lines
v2:
- translate file descriptors via IPC API
- add documentation
---
doc/guides/nics/tap.rst | 16 +++
doc/guides/rel_notes/release_18_08.rst | 5 +
drivers/net/tap/Makefile | 1 +
drivers/net/tap/rte_eth_tap.c | 131 ++++++++++++++++++++++++-
4 files changed, 152 insertions(+), 1 deletion(-)

diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst
index 27148681c..d1f3e1c24 100644
--- a/doc/guides/nics/tap.rst
+++ b/doc/guides/nics/tap.rst
@@ -152,6 +152,22 @@ Distribute IPv4 TCP packets using RSS to a given MAC address over queues 0-3::
testpmd> flow create 0 priority 4 ingress pattern eth dst is 0a:0b:0c:0d:0e:0f \
/ ipv4 / tcp / end actions rss queues 0 1 2 3 end / end

+Multi-process sharing
+---------------------
+
+It is possible to attach an existing TAP device in a secondary process,
+by declaring it as a vdev with the same name as in the primary process,
+and without any parameter.
+
+The port attached in a secondary process will give access to the
+statistics and the queues.
+Therefore it can be used for monitoring or Rx/Tx processing.
+
+The IPC synchronization of Rx/Tx queues is currently limited:
+
+ - Only 8 queues
+ - Synchronized on probing, but not on later port update
+
Example
-------

diff --git a/doc/guides/rel_notes/release_18_08.rst b/doc/guides/rel_notes/release_18_08.rst
index dd611b571..ec6a81236 100644
--- a/doc/guides/rel_notes/release_18_08.rst
+++ b/doc/guides/rel_notes/release_18_08.rst
@@ -74,6 +74,11 @@ New Features
* Add handlers to add/delete VxLAN port number.
* Add devarg to specify ingress VLAN rewrite mode.

+* **Added TAP Rx/Tx queues sharing with a secondary process.**
+
+ A secondary process can attach a TAP device created in the primary process,
+ probe the queues, and process Rx/Tx in a secondary process.
+

API Changes
-----------
diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index 324336535..3dcf05a72 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -27,6 +27,7 @@ LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
LDLIBS += -lrte_bus_vdev -lrte_gso

CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
+CFLAGS += -DALLOW_EXPERIMENTAL_API

#
# all source are stored in SRCS-y
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 4493507ed..98cbdf614 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -16,6 +16,8 @@
#include <rte_debug.h>
#include <rte_ip.h>
#include <rte_string_fns.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>

#include <assert.h>
#include <sys/types.h>
@@ -62,6 +64,9 @@
#define TAP_GSO_MBUFS_NUM \
(TAP_GSO_MBUFS_PER_CORE * TAP_GSO_MBUF_CACHE_SIZE)

+/* IPC key for queue fds sync */
+#define TAP_MP_KEY "tap_mp_sync_queues"
+
static struct rte_vdev_driver pmd_tap_drv;
static struct rte_vdev_driver pmd_tun_drv;

@@ -100,6 +105,17 @@ enum ioctl_mode {
REMOTE_ONLY,
};

+/* Message header to synchronize queues via IPC */
+struct ipc_queues {
+ char port_name[RTE_DEV_NAME_MAX_LEN];
+ int rxq_count;
+ int txq_count;
+ /*
+ * The file descriptors are in the dedicated part
+ * of the Unix message to be translated by the kernel.
+ */
+};
+
static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);

/**
@@ -1920,6 +1936,100 @@ rte_pmd_tun_probe(struct rte_vdev_device *dev)
return ret;
}

+/* Request queue file descriptors from secondary to primary. */
+static int
+tap_mp_attach_queues(const char *port_name, struct rte_eth_dev *dev)
+{
+ int ret;
+ struct pmd_internals *devpriv;
+ struct timespec timeout = {.tv_sec = 1, .tv_nsec = 0};
+ struct rte_mp_msg request, *reply;
+ struct rte_mp_reply replies;
+ struct ipc_queues *request_param = (struct ipc_queues *)request.param;
+ struct ipc_queues *reply_param;
+ int queue, fd_iterator;
+
+ /* Prepare the request */
+ strcpy(request.name, TAP_MP_KEY);
+ strcpy(request_param->port_name, port_name);
+ request.len_param = sizeof(*request_param);
+
+ /* Send request and receive reply */
+ ret = rte_mp_request_sync(&request, &replies, &timeout);
+ if (ret < 0) {
+ TAP_LOG(ERR, "Failed to request queues from primary: %d",
+ rte_errno);
+ return -1;
+ }
+ /* FIXME: handle replies.nb_received > 1 */
+ reply = &replies.msgs[0];
+ reply_param = (struct ipc_queues *)reply->param;
+ TAP_LOG(DEBUG, "Received IPC reply for %s", reply_param->port_name);
+
+ /* Attach the queues from received file descriptors */
+ devpriv = dev->data->dev_private;
+ fd_iterator = 0;
+ for (queue = 0; queue < reply_param->rxq_count; queue++)
+ devpriv->rxq[queue].fd = reply->fds[fd_iterator++];
+ for (queue = 0; queue < reply_param->txq_count; queue++)
+ devpriv->txq[queue].fd = reply->fds[fd_iterator++];
+
+ return 0;
+}
+
+/* Send the queue file descriptors from the primary process to secondary. */
+static int
+tap_mp_sync_queues(const struct rte_mp_msg *request, const void *peer)
+{
+ struct rte_eth_dev *dev;
+ struct pmd_internals *devpriv;
+ struct rte_mp_msg reply;
+ const struct ipc_queues *request_param =
+ (const struct ipc_queues *)request->param;
+ struct ipc_queues *reply_param =
+ (struct ipc_queues *)reply.param;
+ uint16_t port_id;
+ int queue;
+ int ret;
+
+ /* Get requested port */
+ TAP_LOG(DEBUG, "Received IPC request for %s", request_param->port_name);
+ ret = rte_eth_dev_get_port_by_name(request_param->port_name, &port_id);
+ if (ret) {
+ TAP_LOG(ERR, "Failed to get port id for %s",
+ request_param->port_name);
+ return -1;
+ }
+ dev = &rte_eth_devices[port_id];
+ devpriv = dev->data->dev_private;
+
+ /* Fill file descriptors for all queues */
+ reply.num_fds = 0;
+ reply_param->rxq_count = 0;
+ for (queue = 0; queue < dev->data->nb_rx_queues; queue++) {
+ reply.fds[reply.num_fds++] = devpriv->rxq[queue].fd;
+ reply_param->rxq_count++;
+ }
+ reply_param->txq_count = 0;
+ for (queue = 0; queue < dev->data->nb_tx_queues; queue++) {
+ reply.fds[reply.num_fds++] = devpriv->txq[queue].fd;
+ reply_param->txq_count++;
+ }
+ /* FIXME: split message if more queues than RTE_MP_MAX_FD_NUM */
+ RTE_ASSERT(reply.num_fds <= RTE_MP_MAX_FD_NUM);
+
+ /* Send reply */
+ strcpy(reply.name, request->name);
+ strcpy(reply_param->port_name, request_param->port_name);
+ reply.len_param = sizeof(*reply_param);
+ if (rte_mp_reply(&reply, peer) < 0) {
+ TAP_LOG(ERR, "Failed to reply an IPC request to sync queues");
+ return -1;
+ }
+
+ return 0;
+}
+
/* Open a TAP interface device.
*/
static int
@@ -1946,8 +2056,18 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(ERR, "Failed to probe %s", name);
return -1;
}
- /* TODO: request info from primary to set up Rx and Tx */
eth_dev->dev_ops = &ops;
+ eth_dev->rx_pkt_burst = pmd_rx_burst;
+ eth_dev->tx_pkt_burst = pmd_tx_burst;
+
+ if (!rte_eal_primary_proc_alive(NULL)) {
+ TAP_LOG(ERR, "Primary process is missing");
+ return -1;
+ }
+ ret = tap_mp_attach_queues(name, eth_dev);
+ if (ret != 0)
+ return -1;
+
rte_eth_dev_probing_finish(eth_dev);
return 0;
}
@@ -1998,10 +2118,19 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac,
ETH_TUNTAP_TYPE_TAP);

+ /* Register IPC feed callback */
+ ret = rte_mp_action_register(TAP_MP_KEY, tap_mp_sync_queues);
+ if (ret < 0 && rte_errno != EEXIST) {
+ TAP_LOG(ERR, "%s: Failed to register IPC callback: %s",
+ tuntap_name, strerror(rte_errno));
+ goto leave;
+ }
+
leave:
if (ret == -1) {
TAP_LOG(ERR, "Failed to create pmd for %s as %s",
name, tap_name);
+ rte_mp_action_unregister(TAP_MP_KEY);
tap_unit--; /* Restore the unit number */
}
rte_kvargs_free(kvlist);
--
2.17.1
Wiles, Keith
2018-07-20 15:35:14 UTC
Permalink
Post by Thomas Monjalon
In the case the device is created by the primary process,
the secondary must request some file descriptors to attach the queues.
The file descriptors are shared via IPC Unix socket.
Thanks to the IPC synchronization, the secondary process
is now able to do Rx/Tx on a TAP created by the primary process.
---
Note: there is a bug in EAL IPC regarding fd translation.
A fix will be sent later for EAL.
- split some long lines
- translate file descriptors via IPC API
- add documentation
---
doc/guides/nics/tap.rst | 16 +++
doc/guides/rel_notes/release_18_08.rst | 5 +
drivers/net/tap/Makefile | 1 +
drivers/net/tap/rte_eth_tap.c | 131 ++++++++++++++++++++++++-
4 files changed, 152 insertions(+), 1 deletion(-)
diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst
index 27148681c..d1f3e1c24 100644
--- a/doc/guides/nics/tap.rst
+++ b/doc/guides/nics/tap.rst
testpmd> flow create 0 priority 4 ingress pattern eth dst is 0a:0b:0c:0d:0e:0f \
/ ipv4 / tcp / end actions rss queues 0 1 2 3 end / end
+Multi-process sharing
+---------------------
+
+It is possible to attach an existing TAP device in a secondary process,
+by declaring it as a vdev with the same name as in the primary process,
+and without any parameter.
+
+The port attached in a secondary process will give access to the
+statistics and the queues.
+Therefore it can be used for monitoring or Rx/Tx processing.
+
+
+ - Only 8 queues
+ - Synchronized on probing, but not on later port update
+
Example
-------
diff --git a/doc/guides/rel_notes/release_18_08.rst b/doc/guides/rel_notes/release_18_08.rst
index dd611b571..ec6a81236 100644
--- a/doc/guides/rel_notes/release_18_08.rst
+++ b/doc/guides/rel_notes/release_18_08.rst
@@ -74,6 +74,11 @@ New Features
* Add handlers to add/delete VxLAN port number.
* Add devarg to specify ingress VLAN rewrite mode.
+* **Added TAP Rx/Tx queues sharing with a secondary process.**
+
+ A secondary process can attach a TAP device created in the primary process,
+ probe the queues, and process Rx/Tx in a secondary process.
+
API Changes
-----------
diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index 324336535..3dcf05a72 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -27,6 +27,7 @@ LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
LDLIBS += -lrte_bus_vdev -lrte_gso
CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
+CFLAGS += -DALLOW_EXPERIMENTAL_API
#
# all source are stored in SRCS-y
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 4493507ed..98cbdf614 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -16,6 +16,8 @@
#include <rte_debug.h>
#include <rte_ip.h>
#include <rte_string_fns.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>
#include <assert.h>
#include <sys/types.h>
@@ -62,6 +64,9 @@
#define TAP_GSO_MBUFS_NUM \
(TAP_GSO_MBUFS_PER_CORE * TAP_GSO_MBUF_CACHE_SIZE)
+/* IPC key for queue fds sync */
+#define TAP_MP_KEY "tap_mp_sync_queues"
+
static struct rte_vdev_driver pmd_tap_drv;
static struct rte_vdev_driver pmd_tun_drv;
@@ -100,6 +105,17 @@ enum ioctl_mode {
REMOTE_ONLY,
};
+/* Message header to synchronize queues via IPC */
+struct ipc_queues {
+ char port_name[RTE_DEV_NAME_MAX_LEN];
+ int rxq_count;
+ int txq_count;
+ /*
+ * The file descriptors are in the dedicated part
+ * of the Unix message to be translated by the kernel.
+ */
+};
+
static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);
/**
@@ -1920,6 +1936,100 @@ rte_pmd_tun_probe(struct rte_vdev_device *dev)
return ret;
}
+/* Request queue file descriptors from secondary to primary. */
+static int
+tap_mp_attach_queues(const char *port_name, struct rte_eth_dev *dev)
+{
+ int ret;
+ struct pmd_internals *devpriv;
+ struct timespec timeout = {.tv_sec = 1, .tv_nsec = 0};
+ struct rte_mp_msg request, *reply;
+ struct rte_mp_reply replies;
+ struct ipc_queues *request_param = (struct ipc_queues *)request.param;
+ struct ipc_queues *reply_param;
+ int queue, fd_iterator;
+
+ /* Prepare the request */
+ strcpy(request.name, TAP_MP_KEY);
+ strcpy(request_param->port_name, port_name);
+ request.len_param = sizeof(*request_param);
+
+ /* Send request and receive reply */
+ ret = rte_mp_request_sync(&request, &replies, &timeout);
+ if (ret < 0) {
+ TAP_LOG(ERR, "Failed to request queues from primary: %d",
+ rte_errno);
+ return -1;
+ }
+ /* FIXME: handle replies.nb_received > 1 */
I am not a big fan of having TODO or FIXME comments in the code. Can we remove them and just describe the problem and what would happen or not happen if the condition occurs? If we need to add this support in the future then we need to put these in a enhancement tracker or someplace else.
Post by Thomas Monjalon
+ reply = &replies.msgs[0];
+ reply_param = (struct ipc_queues *)reply->param;
+ TAP_LOG(DEBUG, "Received IPC reply for %s", reply_param->port_name);
+
+ /* Attach the queues from received file descriptors */
+ devpriv = dev->data->dev_private;
+ fd_iterator = 0;
+ for (queue = 0; queue < reply_param->rxq_count; queue++)
+ devpriv->rxq[queue].fd = reply->fds[fd_iterator++];
+ for (queue = 0; queue < reply_param->txq_count; queue++)
+ devpriv->txq[queue].fd = reply->fds[fd_iterator++];
+
+ return 0;
+}
+
+/* Send the queue file descriptors from the primary process to secondary. */
+static int
+tap_mp_sync_queues(const struct rte_mp_msg *request, const void *peer)
+{
+ struct rte_eth_dev *dev;
+ struct pmd_internals *devpriv;
+ struct rte_mp_msg reply;
+ const struct ipc_queues *request_param =
+ (const struct ipc_queues *)request->param;
+ struct ipc_queues *reply_param =
+ (struct ipc_queues *)reply.param;
+ uint16_t port_id;
+ int queue;
+ int ret;
+
+ /* Get requested port */
+ TAP_LOG(DEBUG, "Received IPC request for %s", request_param->port_name);
+ ret = rte_eth_dev_get_port_by_name(request_param->port_name, &port_id);
+ if (ret) {
+ TAP_LOG(ERR, "Failed to get port id for %s",
+ request_param->port_name);
+ return -1;
+ }
+ dev = &rte_eth_devices[port_id];
+ devpriv = dev->data->dev_private;
+
+ /* Fill file descriptors for all queues */
+ reply.num_fds = 0;
+ reply_param->rxq_count = 0;
+ for (queue = 0; queue < dev->data->nb_rx_queues; queue++) {
+ reply.fds[reply.num_fds++] = devpriv->rxq[queue].fd;
+ reply_param->rxq_count++;
+ }
+ reply_param->txq_count = 0;
+ for (queue = 0; queue < dev->data->nb_tx_queues; queue++) {
+ reply.fds[reply.num_fds++] = devpriv->txq[queue].fd;
+ reply_param->txq_count++;
+ }
+ /* FIXME: split message if more queues than RTE_MP_MAX_FD_NUM */
Here too.
Post by Thomas Monjalon
+ RTE_ASSERT(reply.num_fds <= RTE_MP_MAX_FD_NUM);
+
+ /* Send reply */
+ strcpy(reply.name, request->name);
+ strcpy(reply_param->port_name, request_param->port_name);
Normally we use the snprintf or strlcpy() functions for the above should we do that here too?
Post by Thomas Monjalon
+ reply.len_param = sizeof(*reply_param);
+ if (rte_mp_reply(&reply, peer) < 0) {
+ TAP_LOG(ERR, "Failed to reply an IPC request to sync queues");
+ return -1;
+ }
+
+ return 0;
+}
+
/* Open a TAP interface device.
*/
static int
@@ -1946,8 +2056,18 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(ERR, "Failed to probe %s", name);
return -1;
}
- /* TODO: request info from primary to set up Rx and Tx */
eth_dev->dev_ops = &ops;
+ eth_dev->rx_pkt_burst = pmd_rx_burst;
+ eth_dev->tx_pkt_burst = pmd_tx_burst;
+
+ if (!rte_eal_primary_proc_alive(NULL)) {
+ TAP_LOG(ERR, "Primary process is missing");
+ return -1;
+ }
+ ret = tap_mp_attach_queues(name, eth_dev);
+ if (ret != 0)
+ return -1;
Does the call above need to be wrapped using if secondary process or is this for both primary and secondary?
Post by Thomas Monjalon
+
rte_eth_dev_probing_finish(eth_dev);
return 0;
}
@@ -1998,10 +2118,19 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac,
ETH_TUNTAP_TYPE_TAP);
+ /* Register IPC feed callback */
+ ret = rte_mp_action_register(TAP_MP_KEY, tap_mp_sync_queues);
+ if (ret < 0 && rte_errno != EEXIST) {
+ TAP_LOG(ERR, "%s: Failed to register IPC callback: %s",
+ tuntap_name, strerror(rte_errno));
+ goto leave;
+ }
Same for this one as above?
Post by Thomas Monjalon
+
if (ret == -1) {
TAP_LOG(ERR, "Failed to create pmd for %s as %s",
name, tap_name);
+ rte_mp_action_unregister(TAP_MP_KEY);
tap_unit--; /* Restore the unit number */
}
rte_kvargs_free(kvlist);
--
2.17.1
This looks fine to me except for the couple tests and the FIXME comments. Let me know about the other comments and with the FIXME changes I can Ack it.

Regards,
Keith
Thomas Monjalon
2018-07-20 21:51:39 UTC
Permalink
Post by Wiles, Keith
Post by Thomas Monjalon
+ /* FIXME: handle replies.nb_received > 1 */
I am not a big fan of having TODO or FIXME comments in the code.
What don't you like in such comments?
Post by Wiles, Keith
Can we remove them and just describe the problem and what would happen
or not happen if the condition occurs?
You mean describing the problem in the code?
Post by Wiles, Keith
If we need to add this support in the future then we need to put these
in a enhancement tracker or someplace else.
The limitation is documented in the guide (limit of 8 queues).
Post by Wiles, Keith
Post by Thomas Monjalon
+ reply = &replies.msgs[0];
[...]
Post by Wiles, Keith
Post by Thomas Monjalon
+ /* FIXME: split message if more queues than RTE_MP_MAX_FD_NUM */
Here too.
This limitation is related to the previous one (send only one message,
receive only message).
Post by Wiles, Keith
Post by Thomas Monjalon
+ RTE_ASSERT(reply.num_fds <= RTE_MP_MAX_FD_NUM);
+
+ /* Send reply */
+ strcpy(reply.name, request->name);
+ strcpy(reply_param->port_name, request_param->port_name);
Normally we use the snprintf or strlcpy() functions for the above should we do that here too?
Yes it looks to be a good idea.
Post by Wiles, Keith
Post by Thomas Monjalon
@@ -1946,8 +2056,18 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(ERR, "Failed to probe %s", name);
return -1;
}
- /* TODO: request info from primary to set up Rx and Tx */
eth_dev->dev_ops = &ops;
+ eth_dev->rx_pkt_burst = pmd_rx_burst;
+ eth_dev->tx_pkt_burst = pmd_tx_burst;
+
+ if (!rte_eal_primary_proc_alive(NULL)) {
+ TAP_LOG(ERR, "Primary process is missing");
+ return -1;
+ }
+ ret = tap_mp_attach_queues(name, eth_dev);
+ if (ret != 0)
+ return -1;
Does the call above need to be wrapped using if secondary process or is this for both primary and secondary?
It is already in a "secondary only" block.
Post by Wiles, Keith
Post by Thomas Monjalon
+ /* Register IPC feed callback */
+ ret = rte_mp_action_register(TAP_MP_KEY, tap_mp_sync_queues);
+ if (ret < 0 && rte_errno != EEXIST) {
+ TAP_LOG(ERR, "%s: Failed to register IPC callback: %s",
+ tuntap_name, strerror(rte_errno));
+ goto leave;
+ }
Same for this one as above?
This code path is executed only in primary or creation of port in secondary.
I think it is fine.

However I am thinking it should be registered only once for all TAP ports.
Wiles, Keith
2018-07-21 13:44:41 UTC
Permalink
Post by Thomas Monjalon
Post by Wiles, Keith
Post by Thomas Monjalon
+ /* FIXME: handle replies.nb_received > 1 */
I am not a big fan of having TODO or FIXME comments in the code.
What don't you like in such comments?
We should not have FIXME or TODO in the code it does not look like it is complete, if we need to fix something then fix it or put it on a todo list not in the code. The same thing for the TODO which to me means a future enhancement we just need to add it to a future todo list.

If the code in these sections have a limitation them describe the limitation and remove the FIXME and TODOs from the code.
Post by Thomas Monjalon
Post by Wiles, Keith
Can we remove them and just describe the problem and what would happen
or not happen if the condition occurs?
You mean describing the problem in the code?
Post by Wiles, Keith
If we need to add this support in the future then we need to put these
in a enhancement tracker or someplace else.
The limitation is documented in the guide (limit of 8 queues).
Post by Wiles, Keith
Post by Thomas Monjalon
+ reply = &replies.msgs[0];
[...]
Post by Wiles, Keith
Post by Thomas Monjalon
+ /* FIXME: split message if more queues than RTE_MP_MAX_FD_NUM */
Here too.
This limitation is related to the previous one (send only one message,
receive only message).
Post by Wiles, Keith
Post by Thomas Monjalon
+ RTE_ASSERT(reply.num_fds <= RTE_MP_MAX_FD_NUM);
+
+ /* Send reply */
+ strcpy(reply.name, request->name);
+ strcpy(reply_param->port_name, request_param->port_name);
Normally we use the snprintf or strlcpy() functions for the above should we do that here too?
Yes it looks to be a good idea.
Post by Wiles, Keith
Post by Thomas Monjalon
@@ -1946,8 +2056,18 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(ERR, "Failed to probe %s", name);
return -1;
}
- /* TODO: request info from primary to set up Rx and Tx */
eth_dev->dev_ops = &ops;
+ eth_dev->rx_pkt_burst = pmd_rx_burst;
+ eth_dev->tx_pkt_burst = pmd_tx_burst;
+
+ if (!rte_eal_primary_proc_alive(NULL)) {
+ TAP_LOG(ERR, "Primary process is missing");
+ return -1;
+ }
+ ret = tap_mp_attach_queues(name, eth_dev);
+ if (ret != 0)
+ return -1;
Does the call above need to be wrapped using if secondary process or is this for both primary and secondary?
It is already in a "secondary only" block.
Post by Wiles, Keith
Post by Thomas Monjalon
+ /* Register IPC feed callback */
+ ret = rte_mp_action_register(TAP_MP_KEY, tap_mp_sync_queues);
+ if (ret < 0 && rte_errno != EEXIST) {
+ TAP_LOG(ERR, "%s: Failed to register IPC callback: %s",
+ tuntap_name, strerror(rte_errno));
+ goto leave;
+ }
Same for this one as above?
This code path is executed only in primary or creation of port in secondary.
I think it is fine.
However I am thinking it should be registered only once for all TAP ports.
Regards,
Keith
Ferruh Yigit
2018-08-23 11:51:14 UTC
Permalink
Post by Thomas Monjalon
In the case the device is created by the primary process,
the secondary must request some file descriptors to attach the queues.
The file descriptors are shared via IPC Unix socket.
Thanks to the IPC synchronization, the secondary process
is now able to do Rx/Tx on a TAP created by the primary process.
---
Note: there is a bug in EAL IPC regarding fd translation.
A fix will be sent later for EAL.
- split some long lines
- translate file descriptors via IPC API
- add documentation
---
doc/guides/nics/tap.rst | 16 +++
doc/guides/rel_notes/release_18_08.rst | 5 +
Needs to be 18.11 release notes now.

<...>
Post by Thomas Monjalon
@@ -1946,8 +2056,18 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
TAP_LOG(ERR, "Failed to probe %s", name);
return -1;
}
- /* TODO: request info from primary to set up Rx and Tx */
eth_dev->dev_ops = &ops;
+ eth_dev->rx_pkt_burst = pmd_rx_burst;
+ eth_dev->tx_pkt_burst = pmd_tx_burst;
+
+ if (!rte_eal_primary_proc_alive(NULL)) {
+ TAP_LOG(ERR, "Primary process is missing");
+ return -1;
+ }
+ ret = tap_mp_attach_queues(name, eth_dev);
+ if (ret != 0)
+ return -1;
+
Should update rte_pmd_tun_probe() too?
Post by Thomas Monjalon
rte_eth_dev_probing_finish(eth_dev);
return 0;
}
@@ -1998,10 +2118,19 @@ rte_pmd_tap_probe(struct rte_vdev_device *dev)
ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac,
ETH_TUNTAP_TYPE_TAP);
+ /* Register IPC feed callback */
+ ret = rte_mp_action_register(TAP_MP_KEY, tap_mp_sync_queues);
+ if (ret < 0 && rte_errno != EEXIST) {
+ TAP_LOG(ERR, "%s: Failed to register IPC callback: %s",
+ tuntap_name, strerror(rte_errno));
+ goto leave;
+ }
This is causing an error if there are multiple tap devices, since it tries to
register multiple times.
Loading...